diff --git a/.cargo/config.toml b/.cargo/config.toml
index 5e452974ad..c71d491303 100644
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -3,6 +3,16 @@
 # by the RUSTDOCFLAGS env var in CI.
 rustdocflags = ["-Arustdoc::private_intra_doc_links"]
 
+# Enable frame pointers. This may have a minor performance overhead, but makes it easier and more
+# efficient to obtain stack traces (and thus CPU/heap profiles). It may also avoid seg faults that
+# we've seen with libunwind-based profiling. See also:
+#
+# *
+# *
+#
+# NB: the RUSTFLAGS envvar will replace this. Make sure to update e.g. Dockerfile as well.
+rustflags = ["-Cforce-frame-pointers=yes"]
+
 [alias]
 build_testing = ["build", "--features", "testing"]
 neon = ["run", "--bin", "neon_local"]
diff --git a/.dockerignore b/.dockerignore
index 9e2d2e7108..9fafc2e4ba 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -14,6 +14,7 @@
 !compute/
 !compute_tools/
 !control_plane/
+!docker-compose/ext-src
 !libs/
 !pageserver/
 !pgxn/
@@ -24,3 +25,4 @@
 !storage_controller/
 !vendor/postgres-*/
 !workspace_hack/
+!build_tools/patches
diff --git a/.github/ISSUE_TEMPLATE/bug-template.md b/.github/ISSUE_TEMPLATE/bug-template.md
index d33eec3cde..234d9b5a37 100644
--- a/.github/ISSUE_TEMPLATE/bug-template.md
+++ b/.github/ISSUE_TEMPLATE/bug-template.md
@@ -3,6 +3,7 @@ name: Bug Template
 about: Used for describing bugs
 title: ''
 labels: t/bug
+type: Bug
 assignees: ''
 
 ---
diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md
index c442f50fde..868fd084f1 100644
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -4,6 +4,7 @@ about: A set of related tasks contributing towards specific outcome, comprising
   more than 1 week of work.
 title: 'Epic: '
 labels: t/Epic
+type: Epic
 assignees: ''
 
 ---
diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 27c8fb3c23..1e6c2d0aa2 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -4,6 +4,7 @@ self-hosted-runner:
     - large
     - large-arm64
     - small
+    - small-metal
     - small-arm64
     - us-east-2
 config-variables:
@@ -23,3 +24,11 @@ config-variables:
   - BENCHMARK_INGEST_TARGET_PROJECTID
   - PGREGRESS_PG16_PROJECT_ID
   - PGREGRESS_PG17_PROJECT_ID
+  - SLACK_ON_CALL_QA_STAGING_STREAM
+  - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
+  - SLACK_ON_CALL_STORAGE_STAGING_STREAM
+  - SLACK_CICD_CHANNEL_ID
+  - SLACK_STORAGE_CHANNEL_ID
+  - NEON_DEV_AWS_ACCOUNT_ID
+  - NEON_PROD_AWS_ACCOUNT_ID
+  - AWS_ECR_REGION
diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml
index d6219c31b4..b85ca7874d 100644
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -7,10 +7,9 @@ inputs:
     type: boolean
     required: false
     default: false
-  aws_oicd_role_arn:
-    description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role'
-    required: false
-    default: ''
+  aws-oicd-role-arn:
+    description: 'OIDC role arn to interact with S3'
+    required: true
 
 outputs:
   base-url:
@@ -39,9 +38,11 @@ runs:
     #
     - name: Set variables
      shell: bash -euxo pipefail {0}
+      env:
+        PR_NUMBER: ${{ github.event.pull_request.number }}
+        BUCKET: neon-github-public-dev
       run: |
-        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
-        if [ "${PR_NUMBER}" != "null" ]; then
+        if [ -n "${PR_NUMBER}" ]; then
           BRANCH_OR_PR=pr-${PR_NUMBER}
         elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \
[ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then @@ -60,8 +61,6 @@ runs: echo "LOCK_FILE=${LOCK_FILE}" >> $GITHUB_ENV echo "WORKDIR=${WORKDIR}" >> $GITHUB_ENV echo "BUCKET=${BUCKET}" >> $GITHUB_ENV - env: - BUCKET: neon-github-public-dev # TODO: We can replace with a special docker image with Java and Allure pre-installed - uses: actions/setup-java@v4 @@ -81,15 +80,14 @@ runs: rm -f ${ALLURE_ZIP} fi env: - ALLURE_VERSION: 2.27.0 - ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777 + ALLURE_VERSION: 2.32.2 + ALLURE_ZIP_SHA256: 3f28885e2118f6317c92f667eaddcc6491400af1fb9773c1f3797a5fa5174953 - - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test - if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 + if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 3c83656c89..687bfd49af 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -8,10 +8,9 @@ inputs: unique-key: description: 'string to distinguish different results in the same run' required: true - aws_oicd_role_arn: - description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role' - required: false - default: '' + aws-oicd-role-arn: + description: 'OIDC role arn to interract with S3' + required: true runs: using: "composite" @@ -19,9 +18,11 @@ runs: steps: - name: Set variables shell: bash -euxo pipefail {0} + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + REPORT_DIR: ${{ inputs.report-dir }} run: | - PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) - if [ "${PR_NUMBER}" != "null" ]; then + if [ -n "${PR_NUMBER}" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then @@ -33,15 +34,12 @@ runs: echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV echo "REPORT_DIR=${REPORT_DIR}" >> $GITHUB_ENV - env: - REPORT_DIR: ${{ inputs.report-dir }} - - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test - if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 + if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report - name: Upload test results diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index 01c216b1ac..14b2ef8eac 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -15,10 +15,19 @@ inputs: prefix: description: "S3 prefix. 
Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false + aws-oicd-role-arn: + description: 'OIDC role arn to interract with S3' + required: true runs: using: "composite" steps: + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ inputs.aws-oicd-role-arn }} + role-duration-seconds: 3600 + - name: Download artifact id: download-artifact shell: bash -euxo pipefail {0} diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index f4a194639f..a393aa6106 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -17,6 +17,38 @@ inputs: compute_units: description: '[Min, Max] compute units' default: '[1, 1]' + # settings below only needed if you want the project to be sharded from the beginning + shard_split_project: + description: 'by default new projects are not shard-split initiailly, but only when shard-split threshold is reached, specify true to explicitly shard-split initially' + required: false + default: 'false' + disable_sharding: + description: 'by default new projects use storage controller default policy to shard-split when shard-split threshold is reached, specify true to explicitly disable sharding' + required: false + default: 'false' + admin_api_key: + description: 'Admin API Key needed for shard-splitting. Must be specified if shard_split_project is true' + required: false + shard_count: + description: 'Number of shards to split the project into, only applies if shard_split_project is true' + required: false + default: '8' + stripe_size: + description: 'Stripe size, optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. Default is 128 MiB, only applies if shard_split_project is true' + required: false + default: '32768' + psql_path: + description: 'Path to psql binary - it is caller responsibility to provision the psql binary' + required: false + default: '/tmp/neon/pg_install/v16/bin/psql' + libpq_lib_path: + description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library' + required: false + default: '/tmp/neon/pg_install/v16/lib' + project_settings: + description: 'A JSON object with project settings' + required: false + default: '{}' outputs: dsn: @@ -48,7 +80,7 @@ runs: \"provisioner\": \"k8s-neonvm\", \"autoscaling_limit_min_cu\": ${MIN_CU}, \"autoscaling_limit_max_cu\": ${MAX_CU}, - \"settings\": { } + \"settings\": ${PROJECT_SETTINGS} } }") @@ -63,6 +95,38 @@ runs: echo "project_id=${project_id}" >> $GITHUB_OUTPUT echo "Project ${project_id} has been created" + + if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then + # determine tenant ID + TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` + + echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))" + + echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" + echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}" + + # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) + curl -X PUT \ + "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \ + -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + -d 
"{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}" + fi + if [ "${DISABLE_SHARDING}" = "true" ]; then + # determine tenant ID + TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"` + + echo "Explicitly disabling shard-splitting for project ${project_id} with tenant_id ${TENANT_ID}" + + echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" + echo "with body {\"scheduling\": \"Essential\"}" + + # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set) + curl -X PUT \ + "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" \ + -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ + -d "{\"scheduling\": \"Essential\"}" + fi + env: API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} @@ -70,3 +134,11 @@ runs: POSTGRES_VERSION: ${{ inputs.postgres_version }} MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} + SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }} + DISABLE_SHARDING: ${{ inputs.disable_sharding }} + ADMIN_API_KEY: ${{ inputs.admin_api_key }} + SHARD_COUNT: ${{ inputs.shard_count }} + STRIPE_SIZE: ${{ inputs.stripe_size }} + PSQL: ${{ inputs.psql_path }} + LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }} + PROJECT_SETTINGS: ${{ inputs.project_settings }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 1159627302..122fe48b68 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -48,10 +48,9 @@ inputs: description: 'benchmark durations JSON' required: false default: '{}' - aws_oicd_role_arn: - description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role' - required: false - default: '' + aws-oicd-role-arn: + description: 'OIDC role arn to interract with S3' + required: true runs: using: "composite" @@ -62,6 +61,7 @@ runs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Download Neon binaries for the previous release if: inputs.build_type != 'remote' @@ -70,6 +70,7 @@ runs: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon-previous prefix: latest + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Download compatibility snapshot if: inputs.build_type != 'remote' @@ -81,6 +82,7 @@ runs: # The lack of compatibility snapshot (for example, for the new Postgres version) # shouldn't fail the whole job. Only relevant test should fail. 
skip-if-does-not-exist: true + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Checkout if: inputs.needs_postgres_source == 'true' @@ -119,6 +121,8 @@ runs: export DEFAULT_PG_VERSION=${PG_VERSION#v} export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-} + export ASAN_OPTIONS=detect_leaks=0:detect_stack_use_after_return=0:abort_on_error=1:strict_string_checks=1:check_initialization_order=1:strict_init_order=1 + export UBSAN_OPTIONS=abort_on_error=1:print_stacktrace=1 if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 @@ -218,17 +222,19 @@ runs: # The lack of compatibility snapshot shouldn't fail the job # (for example if we didn't run the test for non build-and-test workflow) skip-if-does-not-exist: true + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test - if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 + if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report + - name: Upload test results if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-store with: report-dir: /tmp/test_output/allure/results - unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }} + unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}-${{ runner.arch }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} diff --git a/.github/actions/save-coverage-data/action.yml b/.github/actions/save-coverage-data/action.yml index 6fbe19a96e..1bbea5400f 100644 --- a/.github/actions/save-coverage-data/action.yml +++ b/.github/actions/save-coverage-data/action.yml @@ -14,9 +14,11 @@ runs: name: coverage-data-artifact path: /tmp/coverage skip-if-does-not-exist: true # skip if there's no previous coverage to download + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Upload coverage data uses: ./.github/actions/upload with: name: coverage-data-artifact path: /tmp/coverage + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index 8a4cfe2eff..ac5579ccea 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -14,6 +14,10 @@ inputs: prefix: description: "S3 prefix. 
Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false + aws-oicd-role-arn: + description: "the OIDC role arn for aws auth" + required: false + default: "" runs: using: "composite" @@ -53,6 +57,13 @@ runs: echo 'SKIPPED=false' >> $GITHUB_OUTPUT + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ inputs.aws-oicd-role-arn }} + role-duration-seconds: 3600 + - name: Upload artifact if: ${{ steps.prepare-artifact.outputs.SKIPPED == 'false' }} shell: bash -euxo pipefail {0} diff --git a/.github/file-filters.yaml b/.github/file-filters.yaml new file mode 100644 index 0000000000..02ee383d5e --- /dev/null +++ b/.github/file-filters.yaml @@ -0,0 +1,13 @@ +rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock'] +rust_dependencies: ['**/Cargo.lock'] + +v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**'] +v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**'] +v16: ['vendor/postgres-v16/**', 'Makefile', 'pgxn/**'] +v17: ['vendor/postgres-v17/**', 'Makefile', 'pgxn/**'] + +rebuild_neon_extra: + - .github/workflows/neon_extra_builds.yml + +rebuild_macos: + - .github/workflows/build-macos.yml diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index 5cdc16f248..71aef1430e 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] + platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon, neon_pg17 ] database: [ clickbench, tpch, userexample ] env: @@ -41,6 +41,9 @@ jobs: neon) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; + neon_pg17) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }} + ;; aws-rds-postgres) CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; @@ -70,6 +73,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # we create a table that has one row for each database that we want to restore with the status whether the restore is done - name: Create benchmark_restore_status table if it does not exist diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 42c32a23e3..30fde127b0 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -23,6 +23,11 @@ on: description: 'a json object of postgres versions and lfc states to run regression tests on' required: true type: string + sanitizers: + description: 'enabled or disabled' + required: false + default: 'disabled' + type: string defaults: run: @@ -31,12 +36,13 @@ defaults: env: RUST_BACKTRACE: 1 COPT: '-Werror' - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} jobs: build-neon: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} + permissions: + id-token: write # aws-actions/configure-aws-credentials + contents: read container: image: ${{ inputs.build-tools-image }} credentials: @@ -86,6 +92,7 @@ jobs: - name: Set env variables env: ARCH: ${{ inputs.arch }} + SANITIZERS: ${{ inputs.sanitizers }} run: | CARGO_FEATURES="--features testing" if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then @@ -98,8 +105,14 @@ jobs: 
cov_prefix="" CARGO_FLAGS="--locked --release" fi + if [[ $SANITIZERS == 'enabled' ]]; then + make_vars="WITH_SANITIZERS=yes" + else + make_vars="" + fi { echo "cov_prefix=${cov_prefix}" + echo "make_vars=${make_vars}" echo "CARGO_FEATURES=${CARGO_FEATURES}" echo "CARGO_FLAGS=${CARGO_FLAGS}" echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" @@ -135,37 +148,39 @@ jobs: - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make postgres-v14 -j$(nproc) + run: mold -run make ${make_vars} postgres-v14 -j$(nproc) - name: Build postgres v15 if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make postgres-v15 -j$(nproc) + run: mold -run make ${make_vars} postgres-v15 -j$(nproc) - name: Build postgres v16 if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make postgres-v16 -j$(nproc) + run: mold -run make ${make_vars} postgres-v16 -j$(nproc) - name: Build postgres v17 if: steps.cache_pg_17.outputs.cache-hit != 'true' - run: mold -run make postgres-v17 -j$(nproc) + run: mold -run make ${make_vars} postgres-v17 -j$(nproc) - name: Build neon extensions - run: mold -run make neon-pg-ext -j$(nproc) + run: mold -run make ${make_vars} neon-pg-ext -j$(nproc) - name: Build walproposer-lib - run: mold -run make walproposer-lib -j$(nproc) + run: mold -run make ${make_vars} walproposer-lib -j$(nproc) - name: Run cargo build + env: + WITH_TESTS: ${{ inputs.sanitizers != 'enabled' && '--tests' || '' }} run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR - ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + export ASAN_OPTIONS=detect_leaks=0 + ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS} # Do install *before* running rust tests because they might recompile the # binaries with different features/flags. 
- name: Install rust binaries env: ARCH: ${{ inputs.arch }} + SANITIZERS: ${{ inputs.sanitizers }} run: | # Install target binaries mkdir -p /tmp/neon/bin/ @@ -180,7 +195,7 @@ jobs: done # Install test executables and write list of all binaries (for code coverage) - if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then + if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' && $SANITIZERS != 'enabled' ]]; then # Keep bloated coverage data files away from the rest of the artifact mkdir -p /tmp/coverage/ @@ -205,12 +220,18 @@ jobs: done fi + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Run rust tests + if: ${{ inputs.sanitizers != 'enabled' }} env: NEXTEST_RETRIES: 3 run: | - PQ_LIB_DIR=$(pwd)/pg_install/v16/lib - export PQ_LIB_DIR LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH @@ -221,8 +242,13 @@ jobs: ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' # run pageserver tests with different settings - for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + for get_vectored_concurrent_io in sequential sidecar-task; do + for io_engine in std-fs tokio-epoll-uring ; do + NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \ + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \ + ${cov_prefix} \ + cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + done done # Run separate tests for real S3 @@ -256,6 +282,28 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact path: /tmp/neon + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + + - name: Check diesel schema + if: inputs.build-type == 'release' && inputs.arch == 'x64' + env: + DATABASE_URL: postgresql://localhost:1235/storage_controller + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + run: | + export ASAN_OPTIONS=detect_leaks=0 + /tmp/neon/bin/neon_local init + /tmp/neon/bin/neon_local storage_controller start + + diesel print-schema > storage_controller/src/schema.rs + + if [ -n "$(git diff storage_controller/src/schema.rs)" ]; then + echo >&2 "Uncommitted changes in diesel schema" + + git diff . 
+ exit 1 + fi + + /tmp/neon/bin/neon_local storage_controller stop # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data @@ -265,6 +313,10 @@ jobs: regress-tests: # Don't run regression tests on debug arm64 builds if: inputs.build-type != 'debug' || inputs.arch != 'arm64' + permissions: + id-token: write # aws-actions/configure-aws-credentials + contents: read + statuses: write needs: [ build-neon ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} container: @@ -283,9 +335,9 @@ jobs: submodules: true - name: Pytest regression tests - continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }} + continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }} uses: ./.github/actions/run-python-test-set - timeout-minutes: 60 + timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 75 || 180 }} with: build_type: ${{ inputs.build-type }} test_selection: regress @@ -295,12 +347,19 @@ jobs: real_s3_region: eu-central-1 rerun_failed: true pg_version: ${{ matrix.pg_version }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds. + # Attempt to stop tests gracefully to generate test reports + # until they are forcibly stopped by the stricter `timeout-minutes` limit. + extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} + SANITIZERS: ${{ inputs.sanitizers }} # Temporary disable this step until we figure out why it's so flaky # Ref https://github.com/neondatabase/neon/issues/4540 diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml new file mode 100644 index 0000000000..c4c76914aa --- /dev/null +++ b/.github/workflows/_check-codestyle-rust.yml @@ -0,0 +1,89 @@ +name: Check Codestyle Rust + +on: + workflow_call: + inputs: + build-tools-image: + description: "build-tools image" + required: true + type: string + archs: + description: "Json array of architectures to run on" + type: string + + +defaults: + run: + shell: bash -euxo pipefail {0} + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + +jobs: + check-codestyle-rust: + strategy: + matrix: + arch: ${{ fromJson(inputs.archs) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + + - name: Cache cargo deps + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + !~/.cargo/registry/src + ~/.cargo/git + target + key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust + + # Some of our rust modules use FFI and need those to be checked + - name: Get postgres headers + run: make postgres-headers -j$(nproc) + + # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. + # This will catch compiler & clippy warnings in all feature combinations. + # TODO: use cargo hack for build and test as well, but, that's quite expensive. + # NB: keep clippy args in sync with ./run_clippy.sh + # + # The only difference between "clippy --debug" and "clippy --release" is that in --release mode, + # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second + # time just for that, so skip "clippy --release". + - run: | + CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" + if [ "$CLIPPY_COMMON_ARGS" = "" ]; then + echo "No clippy args found in .neon_clippy_args" + exit 1 + fi + echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS + + - name: Check documentation generation + run: cargo doc --workspace --no-deps --document-private-items + env: + RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" + + # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run + - name: Check formatting + if: ${{ !cancelled() }} + run: cargo fmt --all -- --check + + # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check rust dependencies + if: ${{ !cancelled() }} + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack diff --git a/.github/workflows/_push-to-acr.yml b/.github/workflows/_push-to-acr.yml deleted file mode 100644 index c304172ff7..0000000000 --- a/.github/workflows/_push-to-acr.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: Push images to ACR -on: - workflow_call: - inputs: - client_id: - description: Client ID of Azure managed identity or Entra app - required: true - type: string - image_tag: - description: Tag for the container image - required: true - type: string - images: - description: Images to push - required: true - type: string - registry_name: - description: Name of the container registry - required: true - type: string - subscription_id: - description: Azure subscription ID - required: true - type: string - tenant_id: - description: Azure tenant ID - required: true - type: string - -jobs: - push-to-acr: - runs-on: ubuntu-22.04 - permissions: - contents: read # This is required for actions/checkout - id-token: write # 
This is required for Azure Login to work. - - steps: - - name: Azure login - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 - with: - client-id: ${{ inputs.client_id }} - subscription-id: ${{ inputs.subscription_id }} - tenant-id: ${{ inputs.tenant_id }} - - - name: Login to ACR - run: | - az acr login --name=${{ inputs.registry_name }} - - - name: Copy docker images to ACR ${{ inputs.registry_name }} - run: | - images='${{ inputs.images }}' - for image in ${images}; do - docker buildx imagetools create \ - -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \ - neondatabase/${image}:${{ inputs.image_tag }} - done diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml new file mode 100644 index 0000000000..403d078988 --- /dev/null +++ b/.github/workflows/_push-to-container-registry.yml @@ -0,0 +1,104 @@ +name: Push images to Container Registry +on: + workflow_call: + inputs: + # Example: {"docker.io/neondatabase/neon:13196061314":["${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]} + image-map: + description: JSON map of images, mapping from a source image to an array of target images that should be pushed. + required: true + type: string + aws-region: + description: AWS region to log in to. Required when pushing to ECR. + required: false + type: string + aws-account-id: + description: AWS account ID to log in to for pushing to ECR. Required when pushing to ECR. + required: false + type: string + aws-role-to-assume: + description: AWS role to assume to for pushing to ECR. Required when pushing to ECR. + required: false + type: string + azure-client-id: + description: Client ID of Azure managed identity or Entra app. Required when pushing to ACR. + required: false + type: string + azure-subscription-id: + description: Azure subscription ID. Required when pushing to ACR. + required: false + type: string + azure-tenant-id: + description: Azure tenant ID. Required when pushing to ACR. + required: false + type: string + acr-registry-name: + description: ACR registry name. Required when pushing to ACR. 
+ required: false + type: string + +permissions: {} + +defaults: + run: + shell: bash -euo pipefail {0} + +jobs: + push-to-container-registry: + runs-on: ubuntu-22.04 + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: scripts/push_with_image_map.py + sparse-checkout-cone-mode: false + + - name: Print image-map + run: echo '${{ inputs.image-map }}' | jq + + - name: Configure AWS credentials + if: contains(inputs.image-map, 'amazonaws.com/') + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: "${{ inputs.aws-region }}" + role-to-assume: "arn:aws:iam::${{ inputs.aws-account-id }}:role/${{ inputs.aws-role-to-assume }}" + role-duration-seconds: 3600 + + - name: Login to ECR + if: contains(inputs.image-map, 'amazonaws.com/') + uses: aws-actions/amazon-ecr-login@v2 + with: + registries: "${{ inputs.aws-account-id }}" + + - name: Configure Azure credentials + if: contains(inputs.image-map, 'azurecr.io/') + uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 + with: + client-id: ${{ inputs.azure-client-id }} + subscription-id: ${{ inputs.azure-subscription-id }} + tenant-id: ${{ inputs.azure-tenant-id }} + + - name: Login to ACR + if: contains(inputs.image-map, 'azurecr.io/') + run: | + az acr login --name=${{ inputs.acr-registry-name }} + + - name: Login to GHCR + if: contains(inputs.image-map, 'ghcr.io/') + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Copy docker images to target registries + run: python scripts/push_with_image_map.py + env: + IMAGE_MAP: ${{ inputs.image-map }} diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 85cfe7446e..0e53830040 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -33,7 +33,7 @@ jobs: # SC2086 - Double quote to prevent globbing and word splitting. 
- https://www.shellcheck.net/wiki/SC2086 SHELLCHECK_OPTS: --exclude=SC2046,SC2086 with: - fail_on_error: true + fail_level: error filter_mode: nofilter level: error diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 0a0898d30c..f4e1e2e96c 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -67,9 +67,9 @@ jobs: - uses: actions/checkout@v4 with: - ref: main + ref: ${{ github.event.pull_request.head.sha }} token: ${{ secrets.CI_ACCESS_TOKEN }} - + - name: Look for existing PR id: get-pr env: @@ -77,7 +77,7 @@ jobs: run: | ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')" echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT} - + - name: Get changed labels id: get-labels if: steps.get-pr.outputs.ALREADY_CREATED != '' @@ -94,8 +94,6 @@ jobs: echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT} echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT} - - run: gh pr checkout "${PR_NUMBER}" - - run: git checkout -b "${BRANCH}" - run: git push --force origin "${BRANCH}" @@ -103,7 +101,7 @@ jobs: - name: Create a Pull Request for CI run (if required) if: steps.get-pr.outputs.ALREADY_CREATED == '' - env: + env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | cat << EOF > body.md @@ -140,7 +138,7 @@ jobs: - run: git push --force origin "${BRANCH}" if: steps.get-pr.outputs.ALREADY_CREATED != '' - + cleanup: # Close PRs and delete branchs if the original PR is closed. diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 7621d72f64..b36ac46f35 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -63,11 +63,15 @@ jobs: fail-fast: false matrix: include: - - DEFAULT_PG_VERSION: 16 + - PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} RUNNER: [ self-hosted, us-east-2, x64 ] - - DEFAULT_PG_VERSION: 16 + - PG_VERSION: 17 + PLATFORM: "neon-staging" + region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + RUNNER: [ self-hosted, us-east-2, x64 ] + - PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' RUNNER: [ self-hosted, eastus2, x64 ] @@ -75,7 +79,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }} + PG_VERSION: ${{ matrix.PG_VERSION }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -105,13 +109,14 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project uses: ./.github/actions/neon-project-create with: region_id: ${{ matrix.region_id }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} + postgres_version: ${{ env.PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Run benchmark @@ -121,8 +126,8 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # Set --sparse-ordering option of pytest-order plugin # to ensure tests are 
running in order of appears in the file. # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests @@ -152,7 +157,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -204,6 +209,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Run Logical Replication benchmarks uses: ./.github/actions/run-python-test-set @@ -214,7 +220,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -231,7 +237,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -243,7 +249,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -306,11 +312,16 @@ jobs: "image": [ "'"$image_default"'" ], "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, 
"region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then @@ -326,12 +337,15 @@ jobs: matrix='{ "platform": [ "neonvm-captest-reuse" + ], + "pg_version" : [ + 16,17 ] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, - { "platform": "rds-aurora" }]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" }, + { "pg_version": 16, "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -343,14 +357,14 @@ jobs: "platform": [ "neonvm-captest-reuse" ], - "scale": [ - "10" + "pg_version" : [ + 16,17 ] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, - { "platform": "rds-aurora", "scale": "10" }]') + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" }, + { "pg_version": 16, "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -375,7 +389,7 @@ jobs: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }} POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: ${{ matrix.pg_version }} + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -405,14 +419,15 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project - if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) + if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-new-many-tables", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: region_id: ${{ matrix.region_id }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} + postgres_version: ${{ env.PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }} @@ -426,7 +441,7 @@ jobs: neonvm-captest-sharding-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} ;; - neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) + neonvm-captest-new | neonvm-captest-new-many-tables | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) CONNSTR=${{ 
steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -443,6 +458,26 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB + # without (neonvm-captest-new) + # and with (neonvm-captest-new-many-tables) many relations in the database + - name: Create many relations before the run + if: contains(fromJson('["neonvm-captest-new-many-tables"]'), matrix.platform) + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_perf_many_relations + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + TEST_NUM_RELATIONS: 10000 + - name: Benchmark init uses: ./.github/actions/run-python-test-set with: @@ -451,8 +486,8 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init - pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -466,8 +501,8 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update - pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -481,8 +516,8 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only - pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -500,7 +535,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -525,14 +560,19 @@ jobs: include: - PLATFORM: "neonvm-captest-pgvector" RUNNER: [ self-hosted, us-east-2, x64 ] + postgres_version: 16 + - PLATFORM: "neonvm-captest-pgvector-pg17" + RUNNER: [ self-hosted, us-east-2, x64 ] + postgres_version: 17 - PLATFORM: "azure-captest-pgvector" RUNNER: [ self-hosted, eastus2, x64 ] + postgres_version: 16 env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" TEST_PG_BENCH_SCALES_MATRIX: "1" POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.postgres_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote @@ -550,32 +590,20 @@ jobs: steps: - uses: actions/checkout@v4 - # until 
https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16 - # instead of using Neon artifacts containing pgbench - - name: Install postgresql-16 where pytest expects it - run: | - # Just to make it easier to test things locally on macOS (with arm64) - arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g') + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours - cd /home/nonroot - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg120+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg120+1_${arch}.deb" - dpkg -x libpq5_17.2-1.pgdg120+1_${arch}.deb pg - dpkg -x postgresql-16_16.6-1.pgdg120+1_${arch}.deb pg - dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg - - mkdir -p /tmp/neon/pg_install/v16/bin - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench - ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql - ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib - - LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}" - export LD_LIBRARY_PATH - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV} - - /tmp/neon/pg_install/v16/bin/pgbench --version - /tmp/neon/pg_install/v16/bin/psql --version + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -584,6 +612,9 @@ jobs: neonvm-captest-pgvector) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} ;; + neonvm-captest-pgvector-pg17) + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_PG17 }} + ;; azure-captest-pgvector) CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }} ;; @@ -595,13 +626,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 18000 # 5 hours - - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set with: @@ -610,8 +634,8 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing - pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -625,8 +649,8 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 - pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} 
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -637,7 +661,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -672,7 +696,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }} TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }} @@ -708,13 +732,25 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr run: | case "${PLATFORM}" in neonvm-captest-reuse) - CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} + case "${PG_VERSION}" in + 16) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} + ;; + 17) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_CONNSTR_PG17 }} + ;; + *) + echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" + exit 1 + ;; + esac ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }} @@ -738,8 +774,8 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 43200 -k test_clickbench - pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -753,7 +789,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -774,7 +810,7 @@ jobs: # We might change it after https://github.com/neondatabase/neon/issues/2900. 
# # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) - if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} permissions: contents: write statuses: write @@ -787,12 +823,11 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} - TEST_OLAP_SCALE: ${{ matrix.scale }} runs-on: [ self-hosted, us-east-2, x64 ] container: @@ -818,18 +853,30 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get Connstring Secret Name run: | case "${PLATFORM}" in neonvm-captest-reuse) - ENV_PLATFORM=CAPTEST_TPCH + case "${PG_VERSION}" in + 16) + CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_S10_CONNSTR" + ;; + 17) + CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_CONNSTR_PG17" + ;; + *) + echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" + exit 1 + ;; + esac ;; rds-aurora) - ENV_PLATFORM=RDS_AURORA_TPCH + CONNSTR_SECRET_NAME="BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR" ;; rds-postgres) - ENV_PLATFORM=RDS_POSTGRES_TPCH + CONNSTR_SECRET_NAME="BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR" ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" @@ -837,7 +884,6 @@ jobs: ;; esac - CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR" echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV - name: Set up Connection String @@ -855,20 +901,20 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch - pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - TEST_OLAP_SCALE: ${{ matrix.scale }} + TEST_OLAP_SCALE: 10 - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -883,7 +929,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: - if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} + # if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} permissions: contents: write statuses: write @@ -896,7 +942,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 16 + PG_VERSION: ${{ matrix.pg_version }} TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ 
-926,13 +972,25 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr run: | case "${PLATFORM}" in neonvm-captest-reuse) - CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} + case "${PG_VERSION}" in + 16) + CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} + ;; + 17) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_USER_EXAMPLE_CONNSTR_PG17 }} + ;; + *) + echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}" + exit 1 + ;; + esac ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_AURORA_CONNSTR }} @@ -956,8 +1014,8 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples - pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -968,7 +1026,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml new file mode 100644 index 0000000000..347a511e98 --- /dev/null +++ b/.github/workflows/build-macos.yml @@ -0,0 +1,241 @@ +name: Check neon with macOS builds + +on: + workflow_call: + inputs: + pg_versions: + description: "Array of the pg versions to build for, for example: ['v14', 'v17']" + type: string + default: '[]' + required: false + rebuild_rust_code: + description: "Rebuild Rust code" + type: boolean + default: false + required: false + rebuild_everything: + description: "If true, rebuild for all versions" + type: boolean + default: false + required: false + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + +# TODO: move `check-*` and `files-changed` jobs to the "Caller" Workflow +# We should care about that as GitHub has limitations: +# - You can connect up to four levels of workflows +# - You can call a maximum of 20 unique reusable workflows from a single workflow file. 
+# https://docs.github.com/en/actions/sharing-automations/reusing-workflows#limitations +jobs: + build-pgxn: + if: | + (inputs.pg_versions != '[]' || inputs.rebuild_everything) && ( + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + ) + timeout-minutes: 30 + runs-on: macos-15 + strategy: + matrix: + postgres-version: ${{ inputs.rebuild_everything && fromJson('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }} + env: + # Use release build only, to have less debug info around + # Hence keeping target/ (and general cache size) smaller + BUILD_TYPE: release + steps: + - name: Checkout main repo + uses: actions/checkout@v4 + + - name: Set pg ${{ matrix.postgres-version }} for caching + id: pg_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-${{ matrix.postgres-version }}) | tee -a "${GITHUB_OUTPUT}" + + - name: Cache postgres ${{ matrix.postgres-version }} build + id: cache_pg + uses: actions/cache@v4 + with: + path: pg_install/${{ matrix.postgres-version }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Checkout submodule vendor/postgres-${{ matrix.postgres-version }} + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + git submodule init vendor/postgres-${{ matrix.postgres-version }} + git submodule update --depth 1 --recursive + + - name: Install build dependencies + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + brew install flex bison openssl protobuf icu4c + + - name: Set extra env for macOS + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + + - name: Build Postgres ${{ matrix.postgres-version }} + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + make postgres-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu) + + - name: Build Neon Pg Ext ${{ matrix.postgres-version }} + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu) + + - name: Get postgres headers ${{ matrix.postgres-version }} + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + make postgres-headers-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu) + + build-walproposer-lib: + if: | + (inputs.pg_versions != '[]' || inputs.rebuild_everything) && ( + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + ) + timeout-minutes: 30 + runs-on: macos-15 + needs: [build-pgxn] + env: + # Use release build only, to have less debug info around + # Hence keeping target/ (and general cache size) smaller + BUILD_TYPE: release + steps: + - name: Checkout main repo + uses: actions/checkout@v4 + + - name: Set pg v17 for caching + id: pg_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}" + + - name: Cache postgres v17 build + id: cache_pg + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache walproposer-lib + id: cache_walproposer_lib + uses: actions/cache@v4 + with: + path: 
pg_install/build/walproposer-lib + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Checkout submodule vendor/postgres-v17 + if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' + run: | + git submodule init vendor/postgres-v17 + git submodule update --depth 1 --recursive + + - name: Install build dependencies + if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' + run: | + brew install flex bison openssl protobuf icu4c + + - name: Set extra env for macOS + if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + + - name: Build walproposer-lib (only for v17) + if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' + run: + make walproposer-lib -j$(sysctl -n hw.ncpu) + + cargo-build: + if: | + (inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything) && ( + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + ) + timeout-minutes: 30 + runs-on: macos-15 + needs: [build-pgxn, build-walproposer-lib] + env: + # Use release build only, to have less debug info around + # Hence keeping target/ (and general cache size) smaller + BUILD_TYPE: release + steps: + - name: Checkout main repo + uses: actions/checkout@v4 + with: + submodules: true + + - name: Set pg v14 for caching + id: pg_rev_v14 + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) | tee -a "${GITHUB_OUTPUT}" + - name: Set pg v15 for caching + id: pg_rev_v15 + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) | tee -a "${GITHUB_OUTPUT}" + - name: Set pg v16 for caching + id: pg_rev_v16 + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) | tee -a "${GITHUB_OUTPUT}" + - name: Set pg v17 for caching + id: pg_rev_v17 + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}" + + - name: Cache postgres v14 build + id: cache_pg + uses: actions/cache@v4 + with: + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v14-${{ steps.pg_rev_v14.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v15 build + id: cache_pg_v15 + uses: actions/cache@v4 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v15-${{ steps.pg_rev_v15.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v16 build + id: cache_pg_v16 + uses: actions/cache@v4 + with: + path: pg_install/v16 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v16-${{ steps.pg_rev_v16.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v17 build + id: cache_pg_v17 + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache cargo deps (only for v17) + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + !~/.cargo/registry/src + ~/.cargo/git + target + key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust + + - name: Cache walproposer-lib + id: cache_walproposer_lib + uses: actions/cache@v4 + with: + path: 
pg_install/build/walproposer-lib + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Install build dependencies + run: | + brew install flex bison openssl protobuf icu4c + + - name: Set extra env for macOS + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + + - name: Run cargo build (only for v17) + run: cargo build --all --release -j$(sysctl -n hw.ncpu) + + - name: Check that no warnings are produced (only for v17) + run: ./run_clippy.sh diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index cb966f292e..1b706b3f16 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -21,8 +21,6 @@ concurrency: env: RUST_BACKTRACE: 1 COPT: '-Werror' - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} @@ -47,10 +45,30 @@ jobs: run cancel-previous-in-concurrency-group.yml \ --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" + files-changed: + needs: [ check-permissions ] + runs-on: [ self-hosted, small ] + timeout-minutes: 3 + outputs: + check-rust-dependencies: ${{ steps.files-changed.outputs.rust_dependencies }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: true + + - name: Check for file changes + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 + id: files-changed + with: + token: ${{ secrets.GITHUB_TOKEN }} + filters: .github/file-filters.yaml + tag: needs: [ check-permissions ] runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -166,77 +184,19 @@ jobs: check-codestyle-rust: needs: [ check-permissions, build-build-tools-image ] - strategy: - matrix: - arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + uses: ./.github/workflows/_check-codestyle-rust.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + archs: '["x64", "arm64"]' + secrets: inherit - container: - image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - - - name: Cache cargo deps - uses: actions/cache@v4 - with: - path: | - ~/.cargo/registry - !~/.cargo/registry/src - ~/.cargo/git - target - key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - - # Some of our rust modules use FFI and need those to be checked - - name: Get postgres headers - run: make postgres-headers -j$(nproc) - - # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. 
- # This will catch compiler & clippy warnings in all feature combinations. - # TODO: use cargo hack for build and test as well, but, that's quite expensive. - # NB: keep clippy args in sync with ./run_clippy.sh - # - # The only difference between "clippy --debug" and "clippy --release" is that in --release mode, - # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second - # time just for that, so skip "clippy --release". - - run: | - CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" - if [ "$CLIPPY_COMMON_ARGS" = "" ]; then - echo "No clippy args found in .neon_clippy_args" - exit 1 - fi - echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - - name: Run cargo clippy (debug) - run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - - - name: Check documentation generation - run: cargo doc --workspace --no-deps --document-private-items - env: - RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" - - # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - - name: Check formatting - if: ${{ !cancelled() }} - run: cargo fmt --all -- --check - - # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check rust dependencies - if: ${{ !cancelled() }} - run: | - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} - run: cargo deny check --hide-inclusion-graph + check-dependencies-rust: + needs: [ files-changed, build-build-tools-image ] + if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' }} + uses: ./.github/workflows/cargo-deny.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + secrets: inherit build-and-test-locally: needs: [ tag, build-build-tools-image ] @@ -255,15 +215,15 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} - # Run tests on all Postgres versions in release builds and only on the latest version in debug builds - # run without LFC on v17 release only + # Run tests on all Postgres versions in release builds and only on the latest version in debug builds. + # Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled. test-cfg: | - ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"}, - {"pg_version":"v15", "lfc_state": "without-lfc"}, - {"pg_version":"v16", "lfc_state": "without-lfc"}, - {"pg_version":"v17", "lfc_state": "without-lfc"}, - {"pg_version":"v17", "lfc_state": "with-lfc"}]' - || '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }} + ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "with-lfc"}, + {"pg_version":"v15", "lfc_state": "with-lfc"}, + {"pg_version":"v16", "lfc_state": "with-lfc"}, + {"pg_version":"v17", "lfc_state": "with-lfc"}, + {"pg_version":"v17", "lfc_state": "without-lfc"}]' + || '[{"pg_version":"v17", "lfc_state": "without-lfc" }]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking @@ -303,9 +263,15 @@ jobs: echo "json=$(jq --compact-output '.' 
/tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT benchmarks: - if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') - needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] - runs-on: [ self-hosted, small ] + # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `deploy` in PRs + if: github.ref_name == 'main' || (contains(github.event.pull_request.labels.*.name, 'run-benchmarks') && !failure() && !cancelled()) + needs: [ check-permissions, build-build-tools-image, get-benchmarks-durations, deploy ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write + runs-on: [ self-hosted, small-metal ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: @@ -333,6 +299,7 @@ jobs: extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} pg_version: v16 + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -342,24 +309,31 @@ jobs: # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones - report-benchmarks-failures: + report-benchmarks-results-to-slack: needs: [ benchmarks, create-test-report ] - if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure' + if: github.ref_name == 'main' && !cancelled() && contains(fromJSON('["success", "failure"]'), needs.benchmarks.result) runs-on: ubuntu-22.04 steps: - - uses: slackapi/slack-github-action@v1 + - uses: slackapi/slack-github-action@v2 with: - channel-id: C060CNA47S9 # on-call-staging-storage-stream - slack-message: | - Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}> - <${{ needs.create-test-report.outputs.report-url }}|Allure report> - env: - SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: "${{ vars.SLACK_ON_CALL_STORAGE_STAGING_STREAM }}" + text: | + Benchmarks on main: *${{ needs.benchmarks.result }}* + - <${{ needs.create-test-report.outputs.report-url }}|Allure report> + - <${{ github.event.head_commit.url }}|${{ github.sha }}> create-test-report: needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} @@ -380,6 +354,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -411,6 +386,10 @@ jobs: coverage-report: if: ${{ !startsWith(github.ref_name, 'release') }} needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write runs-on: [ self-hosted, small ] container: image: ${{ 
needs.build-build-tools-image.outputs.image }}-bookworm @@ -437,12 +416,14 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get coverage artifact uses: ./.github/actions/download with: name: coverage-data-artifact path: /tmp/coverage + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge @@ -517,7 +498,7 @@ jobs: trigger-e2e-tests: if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }} - needs: [ check-permissions, promote-images, tag ] + needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, tag ] uses: ./.github/workflows/trigger-e2e-tests.yml secrets: inherit @@ -573,6 +554,10 @@ jobs: neon-image: needs: [ neon-image-arch, tag ] runs-on: ubuntu-22.04 + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read steps: - uses: docker/login-action@v3 @@ -587,19 +572,12 @@ jobs: neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 - - uses: docker/login-action@v3 - with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - - name: Push multi-arch image to ECR - run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }} - compute-node-image-arch: needs: [ check-permissions, build-build-tools-image, tag ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read strategy: fail-fast: false matrix: @@ -640,12 +618,6 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/login-action@v3 - with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - uses: docker/login-action@v3 with: registry: cache.neon.build @@ -686,37 +658,17 @@ jobs: push: true pull: true file: compute/compute-node.Dockerfile - target: neon-pg-ext-test + target: extension-tests cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} - - name: Build compute-tools image - # compute-tools are Postgres independent, so build it only once - # We pick 16, because that builds on debian 11 with older glibc (and is - # thus compatible with newer glibc), rather than 17 on Debian 12, as - # that isn't guaranteed to be compatible with Debian 11 - if: matrix.version.pg == 'v16' - uses: docker/build-push-action@v6 - with: - target: compute-tools-image - context: . 
- build-args: | - GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} - DEBIAN_VERSION=${{ matrix.version.debian }} - provenance: false - push: true - pull: true - file: compute/compute-node.Dockerfile - cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} - tags: | - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - compute-node-image: needs: [ compute-node-image-arch, tag ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read runs-on: ubuntu-22.04 strategy: @@ -753,31 +705,6 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - name: Create multi-arch compute-tools image - if: matrix.version.pg == 'v16' - run: | - docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - - uses: docker/login-action@v3 - with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR - run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - - - name: Push multi-arch compute-tools image to ECR - if: matrix.version.pg == 'v16' - run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} - vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] runs-on: [ self-hosted, large ] @@ -795,7 +722,7 @@ jobs: - pg: v17 debian: bookworm env: - VM_BUILDER_VERSION: v0.35.0 + VM_BUILDER_VERSION: v0.37.1 steps: - uses: actions/checkout@v4 @@ -848,6 +775,17 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - name: Get the last compute release tag + id: get-last-compute-release-tag + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${{ github.repository }}/releases") + echo tag=${tag} >> ${GITHUB_OUTPUT} + # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. 
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like @@ -879,112 +817,143 @@ jobs: TEST_VERSION_ONLY: ${{ matrix.pg_version }} run: ./docker-compose/docker_compose_test.sh + - name: Print logs and clean up docker-compose test + if: always() + run: | + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down + + - name: Test extension upgrade + timeout-minutes: 20 + if: ${{ needs.tag.outputs.build-tag == github.run_id }} + env: + NEWTAG: ${{ needs.tag.outputs.build-tag }} + OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + run: ./docker-compose/test_extensions_upgrade.sh + - name: Print logs and clean up if: always() run: | - docker compose -f ./docker-compose/docker-compose.yml logs || 0 - docker compose -f ./docker-compose/docker-compose.yml down + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down - promote-images: - needs: [ check-permissions, tag, test-images, vm-compute-node-image ] + generate-image-maps: + needs: [ tag ] runs-on: ubuntu-22.04 - - permissions: - id-token: write # for `aws-actions/configure-aws-credentials` - - env: - VERSIONS: v14 v15 v16 v17 - + outputs: + neon-dev: ${{ steps.generate.outputs.neon-dev }} + neon-prod: ${{ steps.generate.outputs.neon-prod }} + compute-dev: ${{ steps.generate.outputs.compute-dev }} + compute-prod: ${{ steps.generate.outputs.compute-prod }} steps: - - uses: docker/login-action@v3 + - uses: actions/checkout@v4 with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + sparse-checkout: scripts/generate_image_maps.py + sparse-checkout-cone-mode: false - - name: Login to dev ECR - uses: docker/login-action@v3 - with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} + - name: Generate Image Maps + id: generate + run: python scripts/generate_image_maps.py + env: + BUILD_TAG: "${{ needs.tag.outputs.build-tag }}" + BRANCH: "${{ github.ref_name }}" + DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" + PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" + DEV_AWS: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + PROD_AWS: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + AWS_REGION: "${{ vars.AWS_ECR_REGION }}" - - name: Copy vm-compute-node images to ECR - run: | - for version in ${VERSIONS}; do - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} - done - - - name: Add latest tag to images - if: github.ref_name == 'main' - run: | - for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do - docker buildx imagetools create -t $repo/neon:latest \ - $repo/neon:${{ needs.tag.outputs.build-tag }} - - docker buildx imagetools create -t $repo/compute-tools:latest \ - $repo/compute-tools:${{ needs.tag.outputs.build-tag }} - - for version in ${VERSIONS}; do - docker buildx imagetools create -t $repo/compute-node-${version}:latest \ - $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} - - docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest 
\ - $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} - done - done - docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ - neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} - - - name: Configure AWS-prod credentials - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-region: eu-central-1 - mask-aws-account-id: true - role-to-assume: ${{ secrets.PROD_GHA_OIDC_ROLE }} - - - name: Login to prod ECR - uses: docker/login-action@v3 - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - with: - registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com - - - name: Copy all images to prod ECR - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - run: | - for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do - docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ - 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} - done - - push-to-acr-dev: - if: github.ref_name == 'main' - needs: [ tag, promote-images ] - uses: ./.github/workflows/_push-to-acr.yml + push-neon-image-dev: + needs: [ generate-image-maps, neon-image ] + uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: - client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} - image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 - registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} - subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} - tenant_id: ${{ vars.AZURE_TENANT_ID }} + image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}' + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: inherit - push-to-acr-prod: + push-compute-image-dev: + needs: [ generate-image-maps, vm-compute-node-image ] + uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR + with: + image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}' + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: inherit + + push-neon-image-prod: if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - needs: [ tag, promote-images ] - uses: ./.github/workflows/_push-to-acr.yml + needs: [ generate-image-maps, neon-image, test-images ] + uses: 
./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR with: - client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} - image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 - registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} - subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} - tenant_id: ${{ vars.AZURE_TENANT_ID }} + image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}' + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" + azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} + secrets: inherit + + push-compute-image-prod: + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + needs: [ generate-image-maps, vm-compute-node-image, test-images ] + uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR + with: + image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}' + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-id: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" + azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} + secrets: inherit + + # This is a bit of a special case so we're not using a generated image map. 
+ add-latest-tag-to-neon-extensions-test-image: + if: github.ref_name == 'main' + needs: [ tag, compute-node-image ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: | + { + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.tag.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] + } + secrets: inherit trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] runs-on: ubuntu-22.04 + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write steps: - name: Set PR's status to pending and request a remote CI test run: | @@ -1057,15 +1026,114 @@ jobs: exit 1 deploy: - needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] + needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() - + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write runs-on: [ self-hosted, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/ansible:latest steps: - uses: actions/checkout@v4 + - name: Create git tag and GitHub release + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' + uses: actions/github-script@v7 + with: + retries: 5 + script: | + const tag = "${{ needs.tag.outputs.build-tag }}"; + const branch = "${{ github.ref_name }}"; + + try { + const existingRef = await github.rest.git.getRef({ + owner: context.repo.owner, + repo: context.repo.repo, + ref: `tags/${tag}`, + }); + + if (existingRef.data.object.sha !== context.sha) { + throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`); + } + + console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`); + } catch (error) { + if (error.status !== 404) { + throw error; + } + + console.log(`Tag ${tag} does not exist. Creating it...`); + await github.rest.git.createRef({ + owner: context.repo.owner, + repo: context.repo.repo, + ref: `refs/tags/${tag}`, + sha: context.sha, + }); + console.log(`Tag ${tag} created successfully.`); + } + + try { + const existingRelease = await github.rest.repos.getReleaseByTag({ + owner: context.repo.owner, + repo: context.repo.repo, + tag: tag, + }); + + console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`); + } catch (error) { + if (error.status !== 404) { + throw error; + } + + console.log(`Release for tag ${tag} does not exist. 
Creating it...`); + + // Find the PR number using the commit SHA + const pullRequests = await github.rest.pulls.list({ + owner: context.repo.owner, + repo: context.repo.repo, + state: 'closed', + base: branch, + }); + + const pr = pullRequests.data.find(pr => pr.merge_commit_sha === context.sha); + const prNumber = pr ? pr.number : null; + + // Find the previous release on the branch + const releases = await github.rest.repos.listReleases({ + owner: context.repo.owner, + repo: context.repo.repo, + per_page: 100, + }); + + const branchReleases = releases.data + .filter((release) => { + const regex = new RegExp(`^${branch}-\\d+$`); + return regex.test(release.tag_name) && !release.draft && !release.prerelease; + }) + .sort((a, b) => new Date(b.created_at) - new Date(a.created_at)); + + const previousTag = branchReleases.length > 0 ? branchReleases[0].tag_name : null; + + const releaseNotes = [ + prNumber + ? `Release PR https://github.com/${context.repo.owner}/${context.repo.repo}/pull/${prNumber}.` + : 'Release PR not found.', + previousTag + ? `Diff with the previous release https://github.com/${context.repo.owner}/${context.repo.repo}/compare/${previousTag}...${tag}.` + : `No previous release found on branch ${branch}.`, + ].join('\n\n'); + + await github.rest.repos.createRelease({ + owner: context.repo.owner, + repo: context.repo.repo, + tag_name: tag, + body: releaseNotes, + }); + console.log(`Release for tag ${tag} created successfully.`); + } + - name: Trigger deploy workflow env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} @@ -1115,38 +1183,29 @@ jobs: exit 1 fi - - name: Create git tag - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' - uses: actions/github-script@v7 + notify-storage-release-deploy-failure: + needs: [ deploy ] + # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs. + if: github.ref_name == 'release' && needs.deploy.result != 'success' && always() + runs-on: ubuntu-22.04 + steps: + - name: Post release-deploy failure to team-storage slack channel + uses: slackapi/slack-github-action@v2 with: - # Retry script for 5XX server errors: https://github.com/actions/github-script#retries - retries: 5 - script: | - await github.rest.git.createRef({ - owner: context.repo.owner, - repo: context.repo.repo, - ref: "refs/tags/${{ needs.tag.outputs.build-tag }}", - sha: context.sha, - }) - - # TODO: check how GitHub releases looks for proxy releases and enable it if it's ok - - name: Create GitHub release - if: github.ref_name == 'release' - uses: actions/github-script@v7 - with: - # Retry script for 5XX server errors: https://github.com/actions/github-script#retries - retries: 5 - script: | - await github.rest.repos.createRelease({ - owner: context.repo.owner, - repo: context.repo.repo, - tag_name: "${{ needs.tag.outputs.build-tag }}", - generate_release_notes: true, - }) + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }} + text: | + 🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. 
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: needs: [ deploy ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` if: github.ref_name == 'release' && !failure() && !cancelled() @@ -1183,6 +1242,12 @@ jobs: echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT} echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT} + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 3600 + - name: Promote compatibility snapshot and Neon artifact env: BUCKET: neon-github-public-dev @@ -1230,7 +1295,7 @@ jobs: done pin-build-tools-image: - needs: [ build-build-tools-image, promote-images, build-and-test-locally ] + needs: [ build-build-tools-image, test-images, build-and-test-locally ] if: github.ref_name == 'main' uses: ./.github/workflows/pin-build-tools-image.yml with: @@ -1253,7 +1318,10 @@ jobs: - build-and-test-locally - check-codestyle-python - check-codestyle-rust - - promote-images + - check-dependencies-rust + - files-changed + - push-compute-image-dev + - push-neon-image-dev - test-images - trigger-custom-extensions-build-and-wait runs-on: ubuntu-22.04 @@ -1265,4 +1333,12 @@ jobs: if: | contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') - || contains(needs.*.result, 'skipped') + || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true') + || needs.build-and-test-locally.result == 'skipped' + || needs.check-codestyle-python.result == 'skipped' + || needs.check-codestyle-rust.result == 'skipped' + || needs.files-changed.result == 'skipped' + || needs.push-compute-image-dev.result == 'skipped' + || needs.push-neon-image-dev.result == 'skipped' + || needs.test-images.result == 'skipped' + || needs.trigger-custom-extensions-build-and-wait.result == 'skipped' diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml new file mode 100644 index 0000000000..e40b02b5d2 --- /dev/null +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -0,0 +1,134 @@ +name: Build and Test with Sanitizers + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 1 * * *' # run once a day, timezone is utc + workflow_dispatch: + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow per any non-`main` branch. 
+ group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + +jobs: + tag: + runs-on: [ self-hosted, small ] + container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned + outputs: + build-tag: ${{steps.build-tag.outputs.tag}} + + steps: + # Need `fetch-depth: 0` to count the number of commits in the branch + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get build tag + run: | + echo run:$GITHUB_RUN_ID + echo ref:$GITHUB_REF_NAME + echo rev:$(git rev-list --count HEAD) + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" + echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT + fi + shell: bash + id: build-tag + + build-build-tools-image: + uses: ./.github/workflows/build-build-tools-image.yml + secrets: inherit + + build-and-test-locally: + needs: [ tag, build-build-tools-image ] + strategy: + fail-fast: false + matrix: + arch: [ x64, arm64 ] + build-type: [ release ] + uses: ./.github/workflows/_build-and-test-locally.yml + with: + arch: ${{ matrix.arch }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + build-tag: ${{ needs.tag.outputs.build-tag }} + build-type: ${{ matrix.build-type }} + test-cfg: '[{"pg_version":"v17"}]' + sanitizers: enabled + secrets: inherit + + + create-test-report: + needs: [ build-and-test-locally, build-build-tools-image ] + if: ${{ !cancelled() }} + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write + outputs: + report-url: ${{ steps.create-allure-report.outputs.report-url }} + + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - uses: actions/github-script@v7 + if: ${{ !cancelled() }} + with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 + script: | + const report = { + reportUrl: "${{ steps.create-allure-report.outputs.report-url }}", + reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", + } + + const coverage = {} + + const script = require("./scripts/comment-test-report.js") + await script({ + github, + context, + fetch, + report, + coverage, + }) diff --git a/.github/workflows/cargo-deny.yml b/.github/workflows/cargo-deny.yml new file mode 100644 index 
0000000000..433b377c32 --- /dev/null +++ b/.github/workflows/cargo-deny.yml @@ -0,0 +1,57 @@ +name: cargo deny checks + +on: + workflow_call: + inputs: + build-tools-image: + required: false + type: string + schedule: + - cron: '0 0 * * *' + +jobs: + cargo-deny: + strategy: + matrix: + ref: >- + ${{ + fromJSON( + github.event_name == 'schedule' + && '["main","release","release-proxy","release-compute"]' + || format('["{0}"]', github.sha) + ) + }} + + runs-on: [self-hosted, small] + + container: + image: ${{ inputs.build-tools-image || 'neondatabase/build-tools:pinned' }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ matrix.ref }} + + - name: Check rust licenses/bans/advisories/sources + env: + CARGO_DENY_TARGET: >- + ${{ github.event_name == 'schedule' && 'advisories' || 'all' }} + run: cargo deny check --hide-inclusion-graph $CARGO_DENY_TARGET + + - name: Post to a Slack channel + if: ${{ github.event_name == 'schedule' && failure() }} + uses: slackapi/slack-github-action@v2 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload: | + channel: ${{ vars.SLACK_CICD_CHANNEL_ID }} + text: | + Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + Pinging @oncall-devprod. diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 57194090cf..09d6acd325 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -19,14 +19,17 @@ concurrency: group: ${{ github.workflow }} cancel-in-progress: true +permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + jobs: regress: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} strategy: fail-fast: false matrix: @@ -78,6 +81,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create a new branch id: create-branch @@ -93,10 +97,12 @@ jobs: test_selection: cloud_regress pg_version: ${{matrix.pg-version}} extra_params: -m remote_cluster + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}} - name: Delete branch + if: always() uses: ./.github/actions/neon-branch-delete with: api_key: ${{ secrets.NEON_STAGING_API_KEY }} @@ -107,12 +113,14 @@ jobs: id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate + with: + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # on-call-staging-stream + channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }} slack-message: | Periodic pg_regress on staging: ${{ job.status }} <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml new file mode 100644 index 0000000000..71c5158ef6 --- /dev/null +++ 
b/.github/workflows/force-test-extensions-upgrade.yml @@ -0,0 +1,76 @@ +name: Force Test Upgrading of Extension +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '45 2 * * *' # run once a day, timezone is utc + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow + group: ${{ github.workflow }} + cancel-in-progress: true + +permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: read + +jobs: + regress: + strategy: + fail-fast: false + matrix: + pg-version: [16, 17] + + runs-on: small + + steps: + - uses: actions/checkout@v4 + with: + submodules: false + + - name: Get the last compute release tag + id: get-last-compute-release-tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${GITHUB_REPOSITORY}/releases") + echo tag=${tag} >> ${GITHUB_OUTPUT} + + - name: Test extension upgrade + timeout-minutes: 20 + env: + NEWTAG: latest + OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }} + PG_VERSION: ${{ matrix.pg-version }} + FORCE_ALL_UPGRADE_TESTS: true + run: ./docker-compose/test_extensions_upgrade.sh + + - name: Print logs and clean up + if: always() + run: | + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true + docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down + + - name: Post to the Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }} + slack-message: | + Test upgrading of extensions: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index a5810e91a4..c20c5890f9 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -13,7 +13,7 @@ on: # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 9 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually - + defaults: run: shell: bash -euxo pipefail {0} @@ -28,7 +28,33 @@ jobs: strategy: fail-fast: false # allow other variants to continue even if one fails matrix: - target_project: [new_empty_project, large_existing_project] + include: + - target_project: new_empty_project_stripe_size_2048 + stripe_size: 2048 # 16 MiB + postgres_version: 16 + disable_sharding: false + - target_project: new_empty_project_stripe_size_32768 + stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold + # while here it is sharded from the beginning with a shard size of 256 MiB + disable_sharding: false + postgres_version: 16 + - target_project: new_empty_project + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + 
disable_sharding: false + postgres_version: 16 + - target_project: new_empty_project + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: false + postgres_version: 17 + - target_project: large_existing_project + stripe_size: null # cannot re-shared or choose different stripe size for existing, already sharded project + disable_sharding: false + postgres_version: 16 + - target_project: new_empty_project_unsharded + stripe_size: null # run with neon defaults which will shard split only when reaching the threshold + disable_sharding: true + postgres_version: 16 + max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: contents: write statuses: write @@ -56,7 +82,7 @@ jobs: with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role + role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role - name: Download Neon artifact uses: ./.github/actions/download @@ -64,19 +90,25 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project - if: ${{ matrix.target_project == 'new_empty_project' }} + if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} id: create-neon-project-ingest-target uses: ./.github/actions/neon-project-create with: region_id: aws-us-east-2 - postgres_version: 16 + postgres_version: ${{ matrix.postgres_version }} compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck api_key: ${{ secrets.NEON_STAGING_API_KEY }} + shard_split_project: ${{ matrix.stripe_size != null && 'true' || 'false' }} + admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} + shard_count: 8 + stripe_size: ${{ matrix.stripe_size }} + disable_sharding: ${{ matrix.disable_sharding }} - name: Initialize Neon project - if: ${{ matrix.target_project == 'new_empty_project' }} + if: ${{ startsWith(matrix.target_project, 'new_empty_project') }} env: BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }} NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} @@ -94,7 +126,7 @@ jobs: project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - - name: Initialize Neon project + - name: Initialize Neon project if: ${{ matrix.target_project == 'large_existing_project' }} env: BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }} @@ -122,16 +154,16 @@ jobs: ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" echo "BENCHMARK_INGEST_TARGET_CONNSTR=${BENCHMARK_INGEST_TARGET_CONNSTR}" >> $GITHUB_ENV - - name: Invoke pgcopydb + - name: Invoke pgcopydb uses: ./.github/actions/run-python-test-set with: build_type: remote test_selection: performance/test_perf_ingest_using_pgcopydb.py run_in_parallel: false extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb - pg_version: v16 + pg_version: v${{ matrix.postgres_version }} save_perf_report: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }} TARGET_PROJECT_TYPE: 
${{ matrix.target_project }} @@ -143,9 +175,9 @@ jobs: run: | export LD_LIBRARY_PATH=${PG_16_LIB_PATH} ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "\dt+" - + - name: Delete Neon Project - if: ${{ always() && matrix.target_project == 'new_empty_project' }} + if: ${{ always() && startsWith(matrix.target_project, 'new_empty_project') }} uses: ./.github/actions/neon-project-delete with: project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 092831adb9..f077e04d1c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -31,19 +31,15 @@ jobs: uses: ./.github/workflows/build-build-tools-image.yml secrets: inherit - check-macos-build: - needs: [ check-permissions ] - if: | - contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' - timeout-minutes: 90 - runs-on: macos-15 - - env: - # Use release build only, to have less debug info around - # Hence keeping target/ (and general cache size) smaller - BUILD_TYPE: release + files-changed: + name: Detect what files changed + runs-on: ubuntu-22.04 + timeout-minutes: 3 + outputs: + v17: ${{ steps.files_changed.outputs.v17 }} + postgres_changes: ${{ steps.postgres_changes.outputs.changes }} + rebuild_rust_code: ${{ steps.files_changed.outputs.rust_code }} + rebuild_everything: ${{ steps.files_changed.outputs.rebuild_neon_extra || steps.files_changed.outputs.rebuild_macos }} steps: - name: Checkout @@ -51,102 +47,45 @@ jobs: with: submodules: true - - name: Install macOS postgres dependencies - run: brew install flex bison openssl protobuf icu4c - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - - - name: Set pg 16 revision for caching - id: pg_v16_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT - - - name: Set pg 17 revision for caching - id: pg_v17_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v4 + - name: Check for Postgres changes + uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3 + id: files_changed with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + token: ${{ github.token }} + filters: .github/file-filters.yaml + base: ${{ github.event_name != 'pull_request' && (github.event.merge_group.base_ref || github.ref_name) || '' }} + ref: ${{ github.event_name != 'pull_request' && (github.event.merge_group.head_ref || github.ref) || '' }} - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v4 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v16 build - id: cache_pg_16 - uses: actions/cache@v4 - with: - path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v17 
build - id: cache_pg_17 - uses: actions/cache@v4 - with: - path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Set extra env for macOS + - name: Filter out only v-string for build matrix + id: postgres_changes run: | - echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV - echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.changes }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c) + echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}" - - name: Cache cargo deps - uses: actions/cache@v4 - with: - path: | - ~/.cargo/registry - !~/.cargo/registry/src - ~/.cargo/git - target - key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: make postgres-v14 -j$(sysctl -n hw.ncpu) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: make postgres-v15 -j$(sysctl -n hw.ncpu) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: make postgres-v16 -j$(sysctl -n hw.ncpu) - - - name: Build postgres v17 - if: steps.cache_pg_17.outputs.cache-hit != 'true' - run: make postgres-v17 -j$(sysctl -n hw.ncpu) - - - name: Build neon extensions - run: make neon-pg-ext -j$(sysctl -n hw.ncpu) - - - name: Build walproposer-lib - run: make walproposer-lib -j$(sysctl -n hw.ncpu) - - - name: Run cargo build - run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release - - - name: Check that no warnings are produced - run: ./run_clippy.sh - - gather-rust-build-stats: - needs: [ check-permissions, build-build-tools-image ] + check-macos-build: + needs: [ check-permissions, files-changed ] if: | - contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' + uses: ./.github/workflows/build-macos.yml + with: + pg_versions: ${{ needs.files-changed.outputs.postgres_changes }} + rebuild_rust_code: ${{ needs.files-changed.outputs.rebuild_rust_code }} + rebuild_everything: ${{ fromJson(needs.files-changed.outputs.rebuild_everything) }} + + gather-rust-build-stats: + needs: [ check-permissions, build-build-tools-image, files-changed ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + if: | + (needs.files-changed.outputs.v17 == 'true' || needs.files-changed.outputs.rebuild_everything == 'true') && ( + contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || + contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || + github.ref_name == 'main' + ) runs-on: [ self-hosted, large ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -175,15 +114,20 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc) + run: cargo build --all --release --timings -j$(nproc) + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + 
role-duration-seconds: 3600 - name: Upload the build stats id: upload-stats env: BUCKET: neon-github-public-dev SHA: ${{ github.event.pull_request.head.sha || github.sha }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} run: | REPORT_URL=https://${BUCKET}.s3.amazonaws.com/build-stats/${SHA}/${GITHUB_RUN_ID}/cargo-timing.html aws s3 cp --only-show-errors ./target/cargo-timings/cargo-timing.html "s3://${BUCKET}/build-stats/${SHA}/${GITHUB_RUN_ID}/" diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 6b98bc873f..af877029e4 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -27,6 +27,11 @@ concurrency: jobs: trigger_bench_on_ec2_machine_in_eu_central_1: + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write runs-on: [ self-hosted, small ] container: image: neondatabase/build-tools:pinned-bookworm @@ -38,8 +43,6 @@ jobs: env: API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }} RUN_ID: ${{ github.run_id }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }} AWS_DEFAULT_REGION : "eu-central-1" AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74" steps: @@ -50,6 +53,13 @@ jobs: - name: Show my own (github runner) external IP address - usefull for IP allowlisting run: curl https://ifconfig.me + - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine) + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }} + role-duration-seconds: 3600 + - name: Start EC2 instance and wait for the instance to boot up run: | aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID @@ -124,11 +134,10 @@ jobs: cat "test_log_${GITHUB_RUN_ID}" - name: Create Allure report - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate + with: + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -148,6 +157,14 @@ jobs: -H "Authorization: Bearer $API_KEY" \ -d '' + - name: Assume AWS OIDC role that allows to manage (start/stop/describe... 
EC machine) + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }} + role-duration-seconds: 3600 + - name: Stop EC2 instance and wait for the instance to be stopped if: always() && steps.poll_step.outputs.too_many_runs != 'true' run: | diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 4f5495cbe2..abc90c7fe1 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -12,8 +12,8 @@ on: pull_request: paths: - '.github/workflows/pg-clients.yml' - - 'test_runner/pg_clients/**' - - 'test_runner/logical_repl/**' + - 'test_runner/pg_clients/**/*.py' + - 'test_runner/logical_repl/**/*.py' - 'poetry.lock' workflow_dispatch: @@ -25,11 +25,13 @@ defaults: run: shell: bash -euxo pipefail {0} +permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write # require for posting a status update + env: DEFAULT_PG_VERSION: 16 PLATFORM: neon-captest-new - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} AWS_DEFAULT_REGION: eu-central-1 jobs: @@ -94,6 +96,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -101,6 +104,8 @@ jobs: with: api_key: ${{ secrets.NEON_STAGING_API_KEY }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} + project_settings: >- + {"enable_logical_replication": true} - name: Run tests uses: ./.github/actions/run-python-test-set @@ -110,6 +115,7 @@ jobs: run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} @@ -126,6 +132,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -159,6 +166,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -175,6 +183,7 @@ jobs: run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} @@ -191,6 +200,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 5b43d97de6..d2588ba0bf 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -33,10 +33,6 @@ concurrency: # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
permissions: {} -env: - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: pinned - jobs: check-manifests: runs-on: ubuntu-22.04 @@ -46,11 +42,14 @@ jobs: steps: - name: Check if we really need to pin the image id: check-manifests + env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned run: | - docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json - docker manifest inspect neondatabase/build-tools:${TO_TAG} > ${TO_TAG}.json + docker manifest inspect "docker.io/neondatabase/build-tools:${FROM_TAG}" > "${FROM_TAG}.json" + docker manifest inspect "docker.io/neondatabase/build-tools:${TO_TAG}" > "${TO_TAG}.json" - if diff ${FROM_TAG}.json ${TO_TAG}.json; then + if diff "${FROM_TAG}.json" "${TO_TAG}.json"; then skip=true else skip=false @@ -64,51 +63,36 @@ jobs: # use format(..) to catch both inputs.force = true AND inputs.force = 'true' if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true' - runs-on: ubuntu-22.04 - permissions: - id-token: write # for `azure/login` + id-token: write # Required for aws/azure login + packages: write # required for pushing to GHCR - steps: - - uses: docker/login-action@v3 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - - uses: docker/login-action@v3 - with: - registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com - username: ${{ secrets.AWS_ACCESS_KEY_DEV }} - password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - - name: Azure login - uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 - with: - client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} - tenant-id: ${{ secrets.AZURE_TENANT_ID }} - subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - - - name: Login to ACR - run: | - az acr login --name=neoneastus2 - - - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR - env: - DEFAULT_DEBIAN_VERSION: bookworm - run: | - for debian_version in bullseye bookworm; do - tags=() - - tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") - tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}") - - if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then - tags+=("-t" "neondatabase/build-tools:${TO_TAG}") - tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}") - tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}") - fi - - docker buildx imagetools create "${tags[@]}" \ - neondatabase/build-tools:${FROM_TAG}-${debian_version} - done + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: | + { + "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ + "docker.io/neondatabase/build-tools:pinned-bullseye", + "ghcr.io/neondatabase/build-tools:pinned-bullseye", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye" + ], + "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ + "docker.io/neondatabase/build-tools:pinned-bookworm", + "docker.io/neondatabase/build-tools:pinned", + "ghcr.io/neondatabase/build-tools:pinned-bookworm", + "ghcr.io/neondatabase/build-tools:pinned", + "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm", + "${{ 
vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm", + "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned" + ] + } + aws-region: ${{ vars.AWS_ECR_REGION }} + aws-account-id: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}" + aws-role-to-assume: "gha-oidc-neon-admin" + azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }} + azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} + azure-tenant-id: ${{ vars.AZURE_TENANT_ID }} + acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} + secrets: inherit diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index d2f9d8a666..c47b3fe0de 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -1,6 +1,12 @@ name: Pre-merge checks on: + pull_request: + paths: + - .github/workflows/_check-codestyle-python.yml + - .github/workflows/_check-codestyle-rust.yml + - .github/workflows/build-build-tools-image.yml + - .github/workflows/pre-merge-checks.yml merge_group: branches: - main @@ -17,8 +23,10 @@ jobs: runs-on: ubuntu-22.04 outputs: python-changed: ${{ steps.python-src.outputs.any_changed }} + rust-changed: ${{ steps.rust-src.outputs.any_changed }} steps: - uses: actions/checkout@v4 + - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 id: python-src with: @@ -30,14 +38,31 @@ jobs: poetry.lock pyproject.toml + - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + id: rust-src + with: + files: | + .github/workflows/_check-codestyle-rust.yml + .github/workflows/build-build-tools-image.yml + .github/workflows/pre-merge-checks.yml + **/**.rs + **/Cargo.toml + Cargo.toml + Cargo.lock + - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES env: PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }} + RUST_CHANGED_FILES: ${{ steps.rust-src.outputs.all_changed_files }} run: | echo "${PYTHON_CHANGED_FILES}" + echo "${RUST_CHANGED_FILES}" build-build-tools-image: - if: needs.get-changed-files.outputs.python-changed == 'true' + if: | + false + || needs.get-changed-files.outputs.python-changed == 'true' + || needs.get-changed-files.outputs.rust-changed == 'true' needs: [ get-changed-files ] uses: ./.github/workflows/build-build-tools-image.yml with: @@ -55,17 +80,30 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64 secrets: inherit + check-codestyle-rust: + if: needs.get-changed-files.outputs.rust-changed == 'true' + needs: [ get-changed-files, build-build-tools-image ] + uses: ./.github/workflows/_check-codestyle-rust.yml + with: + # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64 + archs: '["x64"]' + secrets: inherit + # To get items from the merge queue merged into main we need to satisfy "Status checks that are required". 
# Currently we require 2 jobs (checks with exact name): # - conclusion # - neon-cloud-e2e conclusion: - if: always() + # Do not run job on Pull Requests as it interferes with the `conclusion` job from the `build_and_test` workflow + if: always() && github.event_name == 'merge_group' permissions: statuses: write # for `github.repos.createCommitStatus(...)` + contents: write needs: - get-changed-files - check-codestyle-python + - check-codestyle-rust runs-on: ubuntu-22.04 steps: - name: Create fake `neon-cloud-e2e` check @@ -90,6 +128,8 @@ jobs: - name: Fail the job if any of the dependencies do not succeed or skipped run: exit 1 if: | - (contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true') + false + || (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') + || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') diff --git a/.github/workflows/regenerate-pg-setting.yml b/.github/workflows/regenerate-pg-setting.yml new file mode 100644 index 0000000000..1e9d2ec5e2 --- /dev/null +++ b/.github/workflows/regenerate-pg-setting.yml @@ -0,0 +1,41 @@ +name: Regenerate Postgres Settings + +on: + pull_request: + types: + - opened + - synchronize + - reopened + paths: + - pgxn/neon/**.c + - vendor/postgres-v* + - vendor/revisions.json + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref }} + cancel-in-progress: true + +permissions: + pull-requests: write + +jobs: + regenerate-pg-settings: + runs-on: ubuntu-22.04 + + steps: + - name: Add comment + uses: thollander/actions-comment-pull-request@v3 + with: + comment-tag: ${{ github.job }} + pr-number: ${{ github.event.number }} + message: | + If this PR added a GUC in the Postgres fork or `neon` extension, + please regenerate the Postgres settings in the `cloud` repo: + + ``` + make NEON_WORKDIR=path/to/neon/checkout \ + -C goapp/internal/shareddomain/postgres generate + ``` + + If you're an external contributor, a Neon employee will assist in + making sure this step is done. 
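Aside (not part of the diff, illustrative only): the new `regenerate-pg-setting.yml` workflow above only posts a reminder comment; it cannot tell whether a PR actually introduced a GUC. The sketch below is one way to check locally before acting on the comment. It assumes the `neon` extension registers its GUCs through the standard Postgres `DefineCustom*Variable` entry points and that `main` is the merge base; adjust the pathspec to your checkout.

```
# Illustrative sketch: list GUC registrations touched by the current branch.
# DefineCustomBoolVariable / DefineCustomIntVariable / etc. are the standard
# Postgres functions extensions call to register new GUCs.
git diff main...HEAD -- pgxn/neon \
  | grep -n 'DefineCustom[A-Za-z]*Variable' \
  || echo "no GUC registrations changed on this branch"
```

If the grep matches anything, run the `make NEON_WORKDIR=... -C goapp/internal/shareddomain/postgres generate` command from the posted comment in the `cloud` repo so the generated settings stay in sync.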
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f0273b977f..919846ce44 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -3,8 +3,9 @@ name: Create Release Branch on: schedule: # It should be kept in sync with if-condition in jobs - - cron: '0 6 * * MON' # Storage release - cron: '0 6 * * THU' # Proxy release + - cron: '0 6 * * FRI' # Storage release + - cron: '0 7 * * FRI' # Compute release workflow_dispatch: inputs: create-storage-release-branch: @@ -29,7 +30,7 @@ defaults: jobs: create-storage-release-branch: - if: ${{ github.event.schedule == '0 6 * * MON' || inputs.create-storage-release-branch }} + if: ${{ github.event.schedule == '0 6 * * FRI' || inputs.create-storage-release-branch }} permissions: contents: write @@ -55,7 +56,7 @@ jobs: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} create-compute-release-branch: - if: inputs.create-compute-release-branch + if: ${{ github.event.schedule == '0 7 * * FRI' || inputs.create-compute-release-branch }} permissions: contents: write diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 70c2e8549f..be6a7a7901 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -15,7 +15,14 @@ env: E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name }} + cancel-previous-e2e-tests: + needs: [ check-permissions ] if: github.event_name == 'pull_request' runs-on: ubuntu-22.04 @@ -29,6 +36,7 @@ jobs: --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" tag: + needs: [ check-permissions ] runs-on: ubuntu-22.04 outputs: build-tag: ${{ steps.build-tag.outputs.tag }} @@ -68,7 +76,7 @@ jobs: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} TAG: ${{ needs.tag.outputs.build-tag }} steps: - - name: Wait for `promote-images` job to finish + - name: Wait for `push-{neon,compute}-image-dev` job to finish # It's important to have a timeout here, the script in the step can run infinitely timeout-minutes: 60 run: | @@ -79,20 +87,20 @@ jobs: # For PRs we use the run id as the tag BUILD_AND_TEST_RUN_ID=${TAG} while true; do - conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion') - case "$conclusion" in - success) - break - ;; - failure | cancelled | skipped) - echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..." - exit 1 - ;; - *) - echo "The 'promote-images' hasn't succeed yet. Waiting..." - sleep 60 - ;; - esac + gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '[.jobs[] | select((.name | startswith("push-neon-image-dev")) or (.name | startswith("push-compute-image-dev"))) | {"name": .name, "conclusion": .conclusion, "url": .url}]' > jobs.json + if [ $(jq '[.[] | select(.conclusion == "success")] | length' jobs.json) -eq 2 ]; then + break + fi + jq -c '.[]' jobs.json | while read -r job; do + case $(echo $job | jq .conclusion) in + failure | cancelled | skipped) + echo "The '$(echo $job | jq .name)' job didn't succeed: '$(echo $job | jq .conclusion)'. See log in '$(echo $job | jq .url)' Exiting..." 
+ exit 1 + ;; + esac + done + echo "The 'push-{neon,compute}-image-dev' jobs haven't succeeded yet. Waiting..." + sleep 60 done - name: Set e2e-platforms diff --git a/Cargo.lock b/Cargo.lock index e2d5e03613..47552174d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,9 +10,9 @@ checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5" [[package]] name = "addr2line" -version = "0.21.0" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" dependencies = [ "gimli", ] @@ -23,6 +23,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "ahash" version = "0.8.11" @@ -173,7 +179,7 @@ dependencies = [ "nom", "num-traits", "rusticata-macros", - "thiserror", + "thiserror 1.0.69", "time", ] @@ -200,6 +206,16 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "assert-json-diff" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "async-channel" version = "1.9.0" @@ -712,14 +728,14 @@ dependencies = [ [[package]] name = "axum" -version = "0.7.5" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf" +checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8" dependencies = [ - "async-trait", "axum-core", - "base64 0.21.1", + "base64 0.22.1", "bytes", + "form_urlencoded", "futures-util", "http 1.1.0", "http-body 1.0.0", @@ -727,7 +743,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "itoa", - "matchit 0.7.0", + "matchit", "memchr", "mime", "percent-encoding", @@ -740,8 +756,8 @@ dependencies = [ "sha1", "sync_wrapper 1.0.1", "tokio", - "tokio-tungstenite", - "tower", + "tokio-tungstenite 0.26.1", + "tower 0.5.2", "tower-layer", "tower-service", "tracing", @@ -749,11 +765,10 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.4.5" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +checksum = "df1362f362fd16024ae199c1970ce98f9661bf5ef94b9808fee734bc3698b733" dependencies = [ - "async-trait", "bytes", "futures-util", "http 1.1.0", @@ -771,7 +786,7 @@ dependencies = [ [[package]] name = "azure_core" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "async-trait", "base64 0.22.1", @@ -800,7 +815,7 @@ dependencies = [ [[package]] name = "azure_identity" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" 
dependencies = [ "async-lock", "async-trait", @@ -819,7 +834,7 @@ dependencies = [ [[package]] name = "azure_storage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "async-lock", @@ -837,7 +852,7 @@ dependencies = [ [[package]] name = "azure_storage_blobs" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "RustyXML", "azure_core", @@ -857,7 +872,7 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" version = "0.21.0" -source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d" +source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a" dependencies = [ "azure_core", "bytes", @@ -871,17 +886,17 @@ dependencies = [ [[package]] name = "backtrace" -version = "0.3.69" +version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" dependencies = [ "addr2line", - "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.8.0", "object", "rustc-demangle", + "windows-targets 0.52.6", ] [[package]] @@ -936,6 +951,18 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bb8" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8" +dependencies = [ + "async-trait", + "futures-util", + "parking_lot 0.12.1", + "tokio", +] + [[package]] name = "bcder" version = "0.7.4" @@ -961,7 +988,7 @@ version = "0.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "cexpr", "clang-sys", "itertools 0.12.1", @@ -989,9 +1016,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.1" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" [[package]] name = "block-buffer" @@ -1112,7 +1139,7 @@ dependencies = [ "log", "nix 0.25.1", "regex", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1127,7 +1154,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -1208,6 +1235,20 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +[[package]] +name = "clashmap" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"93bd59c81e2bd87a775ae2de75f070f7e2bfe97363a6ad652f46824564c23e4d" +dependencies = [ + "crossbeam-utils", + "hashbrown 0.15.2", + "lock_api", + "parking_lot_core 0.9.8", + "polonius-the-crab", + "replace_with", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -1246,6 +1287,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", + "jsonwebtoken", "regex", "remote_storage", "serde", @@ -1261,6 +1303,8 @@ dependencies = [ "aws-config", "aws-sdk-kms", "aws-sdk-s3", + "aws-smithy-types", + "axum", "base64 0.13.1", "bytes", "camino", @@ -1268,9 +1312,10 @@ dependencies = [ "chrono", "clap", "compute_api", + "fail", "flate2", "futures", - "hyper 0.14.30", + "http 1.1.0", "metrics", "nix 0.27.1", "notify", @@ -1280,7 +1325,6 @@ dependencies = [ "opentelemetry_sdk", "postgres", "postgres_initdb", - "prometheus", "regex", "remote_storage", "reqwest", @@ -1291,18 +1335,21 @@ dependencies = [ "serde_with", "signal-hook", "tar", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres", "tokio-stream", "tokio-util", + "tower 0.5.2", + "tower-http", "tracing", - "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "url", "utils", + "uuid", "vm_monitor", + "walkdir", "workspace_hack", "zstd", ] @@ -1381,6 +1428,7 @@ dependencies = [ "comfy-table", "compute_api", "futures", + "http-utils", "humantime", "humantime-serde", "hyper 0.14.30", @@ -1397,7 +1445,7 @@ dependencies = [ "serde", "serde_json", "storage_broker", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres", "tokio-util", @@ -1498,6 +1546,17 @@ dependencies = [ "itertools 0.10.5", ] +[[package]] +name = "cron" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5877d3fbf742507b66bc2a1945106bd30dd8504019d596901ddd012a4dd01740" +dependencies = [ + "chrono", + "once_cell", + "winnow", +] + [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -1539,7 +1598,7 @@ version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "crossterm_winapi", "libc", "parking_lot 0.12.1", @@ -1595,6 +1654,32 @@ dependencies = [ "typenum", ] +[[package]] +name = "curve25519-dalek" +version = "4.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" +dependencies = [ + "cfg-if", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "rustc_version", + "subtle", +] + +[[package]] +name = "curve25519-dalek-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "darling" version = "0.20.1" @@ -1643,6 +1728,20 @@ dependencies = [ "parking_lot_core 0.9.8", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core 0.9.8", +] + [[package]] name = "data-encoding" version = "2.4.0" @@ -1726,20 +1825,33 @@ checksum = "ab03c107fafeb3ee9f5925686dbb7a73bc76e3932abb0d2b365cb64b169cf04c" [[package]] name = "diesel" -version = "2.2.3" +version = "2.2.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71" +checksum = "ccf1bedf64cdb9643204a36dd15b19a6ce8e7aa7f7b105868e9f1fad5ffa7d12" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "byteorder", "chrono", "diesel_derives", "itoa", - "pq-sys", - "r2d2", "serde_json", ] +[[package]] +name = "diesel-async" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb" +dependencies = [ + "async-trait", + "bb8", + "diesel", + "futures-util", + "scoped-futures", + "tokio", + "tokio-postgres", +] + [[package]] name = "diesel_derives" version = "2.2.1" @@ -1773,6 +1885,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + [[package]] name = "digest" version = "0.10.7" @@ -1851,6 +1969,28 @@ dependencies = [ "spki 0.7.3", ] +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "signature 2.2.0", +] + +[[package]] +name = "ed25519-dalek" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871" +dependencies = [ + "curve25519-dalek", + "ed25519", + "rand_core 0.6.4", + "sha2", + "subtle", +] + [[package]] name = "either" version = "1.8.1" @@ -1942,6 +2082,15 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "env_filter" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" +dependencies = [ + "log", +] + [[package]] name = "env_logger" version = "0.10.2" @@ -1955,6 +2104,16 @@ dependencies = [ "termcolor", ] +[[package]] +name = "env_logger" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" +dependencies = [ + "env_filter", + "log", +] + [[package]] name = "equator" version = "0.2.2" @@ -2070,6 +2229,12 @@ dependencies = [ "subtle", ] +[[package]] +name = "fiat-crypto" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" + [[package]] name = "filetime" version = "0.2.22" @@ -2107,7 +2272,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" dependencies = [ "crc32fast", - "miniz_oxide", + "miniz_oxide 0.7.1", ] [[package]] @@ -2141,7 +2306,7 @@ dependencies = [ "pin-project", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", ] @@ -2307,10 +2472,20 @@ dependencies = [ ] [[package]] -name = "gimli" -version = "0.28.1" +name = "gettid" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +checksum = "397256552fed4a9e577850498071831ec8f18ea83368aecc114cab469dcb43e5" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "gimli" +version = "0.31.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "git-version" @@ -2434,6 +2609,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + [[package]] name = "hashlink" version = "0.9.1" @@ -2484,6 +2665,15 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" +[[package]] +name = "higher-kinded-types" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "561985554c8b8d4808605c90a5f1979cc6c31a5d20b78465cd59501233c6678e" +dependencies = [ + "never-say-never", +] + [[package]] name = "hmac" version = "0.12.1" @@ -2580,6 +2770,38 @@ dependencies = [ "url", ] +[[package]] +name = "http-utils" +version = "0.1.0" +dependencies = [ + "anyhow", + "backtrace", + "bytes", + "fail", + "flate2", + "hyper 0.14.30", + "inferno 0.12.0", + "itertools 0.10.5", + "jemalloc_pprof", + "metrics", + "once_cell", + "pprof", + "regex", + "routerify", + "serde", + "serde_json", + "serde_path_to_error", + "thiserror 1.0.69", + "tokio", + "tokio-stream", + "tokio-util", + "tracing", + "url", + "utils", + "uuid", + "workspace_hack", +] + [[package]] name = "httparse" version = "1.8.0" @@ -2713,7 +2935,7 @@ dependencies = [ "pin-project-lite", "socket2", "tokio", - "tower", + "tower 0.4.13", "tower-service", "tracing", ] @@ -2939,12 +3161,34 @@ dependencies = [ ] [[package]] -name = "inotify" -version = "0.9.6" +name = "inferno" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +checksum = "75a5d75fee4d36809e6b021e4b96b686e763d365ffdb03af2bd00786353f84fe" dependencies = [ - "bitflags 1.3.2", + "ahash", + "clap", + "crossbeam-channel", + "crossbeam-utils", + "dashmap 6.1.0", + "env_logger 0.11.2", + "indexmap 2.0.1", + "itoa", + "log", + "num-format", + "once_cell", + "quick-xml 0.37.1", + "rgb", + "str_stack", +] + +[[package]] +name = "inotify" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3" +dependencies = [ + "bitflags 2.8.0", "inotify-sys", "libc", ] @@ -3104,6 +3348,17 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json-structural-diff" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e878e36a8a44c158505c2c818abdc1350413ad83dcb774a0459f6a7ef2b65cbf" +dependencies = [ + "difflib", + "regex", + "serde_json", +] + [[package]] name = "jsonwebtoken" version = "9.2.0" @@ -3121,9 +3376,9 @@ dependencies = [ [[package]] name = "kqueue" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c8fc60ba15bf51257aa9807a48a61013db043fcf3a78cb0d916e8e396dcad98" +checksum = "7447f1ca1b7b563588a205fe93dea8df60fd981423a768bc1c0ded35ed147d0c" dependencies = [ "kqueue-sys", "libc", @@ -3131,9 +3386,9 @@ dependencies = [ [[package]] name = "kqueue-sys" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587" +checksum = 
"ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" dependencies = [ "bitflags 1.3.2", "libc", @@ -3145,7 +3400,7 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2" dependencies = [ - "dashmap", + "dashmap 5.5.0", "hashbrown 0.13.2", ] @@ -3160,9 +3415,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.167" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libloading" @@ -3247,15 +3502,9 @@ dependencies = [ [[package]] name = "matchit" -version = "0.7.0" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" - -[[package]] -name = "matchit" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" [[package]] name = "md-5" @@ -3405,15 +3654,24 @@ dependencies = [ ] [[package]] -name = "mio" -version = "0.8.11" +name = "miniz_oxide" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + +[[package]] +name = "mio" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", "log", "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -3422,6 +3680,12 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "never-say-never" +version = "6.6.666" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf5a574dadd7941adeaa71823ecba5e28331b8313fb2e1c6a5c7e5981ea53ad6" + [[package]] name = "nix" version = "0.25.1" @@ -3453,7 +3717,7 @@ version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "cfg-if", "libc", "memoffset 0.9.0", @@ -3471,12 +3735,11 @@ dependencies = [ [[package]] name = "notify" -version = "6.1.1" +version = "8.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" +checksum = "2fee8403b3d66ac7b26aee6e40a897d85dc5ce26f44da36b8b73e987cc52e943" dependencies = [ - "bitflags 2.4.1", - "crossbeam-channel", + "bitflags 2.8.0", "filetime", "fsevent-sys", "inotify", @@ -3484,10 +3747,17 @@ dependencies = [ "libc", "log", "mio", + "notify-types", "walkdir", - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] +[[package]] +name = "notify-types" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e0826a989adedc2a244799e823aece04662b66609d96af8dff7ac6df9a8925d" + [[package]] name = "ntapi" version = "0.4.1" @@ -3632,15 
+3902,15 @@ dependencies = [ "serde_json", "serde_path_to_error", "sha2", - "thiserror", + "thiserror 1.0.69", "url", ] [[package]] name = "object" -version = "0.32.2" +version = "0.36.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" dependencies = [ "memchr", ] @@ -3674,23 +3944,23 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "opentelemetry" -version = "0.26.0" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17" +checksum = "ab70038c28ed37b97d8ed414b6429d343a8bbf44c9f79ec854f3a643029ba6d7" dependencies = [ "futures-core", "futures-sink", "js-sys", - "once_cell", "pin-project-lite", - "thiserror", + "thiserror 1.0.69", + "tracing", ] [[package]] name = "opentelemetry-http" -version = "0.26.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99" +checksum = "10a8a7f5f6ba7c1b286c2fbca0454eaba116f63bbe69ed250b642d36fbb04d80" dependencies = [ "async-trait", "bytes", @@ -3701,9 +3971,9 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.26.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd" +checksum = "91cf61a1868dacc576bf2b2a1c3e9ab150af7272909e80085c3173384fe11f76" dependencies = [ "async-trait", "futures-core", @@ -3714,14 +3984,14 @@ dependencies = [ "opentelemetry_sdk", "prost", "reqwest", - "thiserror", + "thiserror 1.0.69", ] [[package]] name = "opentelemetry-proto" -version = "0.26.1" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34" +checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6" dependencies = [ "opentelemetry", "opentelemetry_sdk", @@ -3731,29 +4001,29 @@ dependencies = [ [[package]] name = "opentelemetry-semantic-conventions" -version = "0.26.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09" +checksum = "bc1b6902ff63b32ef6c489e8048c5e253e2e4a803ea3ea7e783914536eb15c52" [[package]] name = "opentelemetry_sdk" -version = "0.26.0" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3" +checksum = "231e9d6ceef9b0b2546ddf52335785ce41252bc7474ee8ba05bfad277be13ab8" dependencies = [ "async-trait", "futures-channel", "futures-executor", "futures-util", "glob", - "once_cell", "opentelemetry", "percent-encoding", "rand 0.8.5", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", + "tracing", ] [[package]] @@ -3853,16 +4123,18 @@ name = "pagectl" version = "0.1.0" dependencies = [ "anyhow", + "bincode", "camino", "clap", "humantime", + "itertools 0.10.5", "pageserver", "pageserver_api", "postgres_ffi", "remote_storage", "serde_json", "svg_fmt", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "utils", @@ -3877,6 +4149,7 @@ dependencies = [ "arc-swap", "async-compression", "async-stream", + "bincode", "bit_field", 
"byteorder", "bytes", @@ -3894,6 +4167,7 @@ dependencies = [ "futures", "hex", "hex-literal", + "http-utils", "humantime", "humantime-serde", "hyper 0.14.30", @@ -3909,13 +4183,13 @@ dependencies = [ "pageserver_client", "pageserver_compaction", "pin-project-lite", - "postgres", "postgres-protocol", "postgres-types", "postgres_backend", "postgres_connection", "postgres_ffi", "postgres_initdb", + "pprof", "pq_proto", "procfs", "rand 0.8.5", @@ -3936,7 +4210,7 @@ dependencies = [ "strum_macros", "sysinfo", "tenant_size_model", - "thiserror", + "thiserror 1.0.69", "tikv-jemallocator", "tokio", "tokio-epoll-uring", @@ -3982,7 +4256,7 @@ dependencies = [ "storage_broker", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "utils", ] @@ -3993,11 +4267,11 @@ dependencies = [ "anyhow", "bytes", "futures", + "http-utils", "pageserver_api", - "postgres", "reqwest", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres", "tokio-stream", @@ -4029,6 +4303,16 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "papaya" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c" +dependencies = [ + "equivalent", + "seize", +] + [[package]] name = "parking" version = "2.1.1" @@ -4295,10 +4579,20 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "polonius-the-crab" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e97ca2c89572ae41bbec1c99498251f87dd5a94e500c5ec19c382dd593dd5ce9" +dependencies = [ + "higher-kinded-types", + "never-say-never", +] + [[package]] name = "postgres" -version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +version = "0.19.7" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "bytes", "fallible-iterator", @@ -4310,10 +4604,10 @@ dependencies = [ [[package]] name = "postgres-protocol" -version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +version = "0.6.6" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ - "base64 0.20.0", + "base64 0.22.1", "byteorder", "bytes", "fallible-iterator", @@ -4344,10 +4638,11 @@ dependencies = [ [[package]] name = "postgres-types" -version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +version = "0.2.6" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "bytes", + "chrono", "fallible-iterator", "postgres-protocol", ] @@ -4372,7 +4667,7 @@ dependencies = [ "rustls 0.23.18", "rustls-pemfile 2.1.1", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-postgres", "tokio-postgres-rustls", @@ -4388,7 +4683,6 @@ dependencies = [ "anyhow", "itertools 0.10.5", "once_cell", - "postgres", "tokio-postgres", "url", ] @@ -4401,14 +4695,16 @@ dependencies = [ "bindgen", "bytes", "crc32c", - "env_logger", + "criterion", + "env_logger 0.10.2", "log", "memoffset 0.9.0", "once_cell", "postgres", + "pprof", "regex", "serde", - "thiserror", + "thiserror 1.0.69", "tracing", "utils", ] @@ -4419,7 +4715,7 @@ version = "0.1.0" dependencies = [ "anyhow", "camino", - 
"thiserror", + "thiserror 1.0.69", "tokio", "workspace_hack", ] @@ -4441,7 +4737,7 @@ dependencies = [ "cfg-if", "criterion", "findshlibs", - "inferno", + "inferno 0.11.21", "libc", "log", "nix 0.26.4", @@ -4452,7 +4748,7 @@ dependencies = [ "smallvec", "symbolic-demangle", "tempfile", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -4474,15 +4770,6 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" -[[package]] -name = "pq-sys" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd" -dependencies = [ - "vcpkg", -] - [[package]] name = "pq_proto" version = "0.1.0" @@ -4493,7 +4780,7 @@ dependencies = [ "postgres-protocol", "rand 0.8.5", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", ] @@ -4531,7 +4818,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "chrono", "flate2", "hex", @@ -4546,7 +4833,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "chrono", "hex", ] @@ -4564,7 +4851,7 @@ dependencies = [ "memchr", "parking_lot 0.12.1", "procfs", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -4652,6 +4939,7 @@ dependencies = [ "ahash", "anyhow", "arc-swap", + "assert-json-diff", "async-compression", "async-trait", "atomic-take", @@ -4665,15 +4953,17 @@ dependencies = [ "camino-tempfile", "chrono", "clap", + "clashmap", "compute_api", "consumption_metrics", - "dashmap", "ecdsa 0.16.9", - "env_logger", + "ed25519-dalek", + "env_logger 0.10.2", "fallible-iterator", "flate2", "framed-websockets", "futures", + "gettid", "hashbrown 0.14.5", "hashlink", "hex", @@ -4681,6 +4971,7 @@ dependencies = [ "hostname", "http 1.1.0", "http-body-util", + "http-utils", "humantime", "humantime-serde", "hyper 0.14.30", @@ -4696,7 +4987,9 @@ dependencies = [ "measured", "metrics", "once_cell", + "opentelemetry", "p256 0.13.2", + "papaya", "parking_lot 0.12.1", "parquet", "parquet_derive", @@ -4705,7 +4998,6 @@ dependencies = [ "postgres-protocol2", "postgres_backend", "pq_proto", - "prometheus", "rand 0.8.5", "rand_distr", "rcgen", @@ -4730,19 +5022,20 @@ dependencies = [ "smallvec", "smol_str", "socket2", - "strum", "strum_macros", "subtle", - "thiserror", + "thiserror 1.0.69", "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", "tokio-postgres", "tokio-postgres2", "tokio-rustls 0.26.0", - "tokio-tungstenite", + "tokio-tungstenite 0.21.0", "tokio-util", "tracing", + "tracing-log", + "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "try-lock", @@ -4776,6 +5069,15 @@ dependencies = [ "serde", ] +[[package]] +name = "quick-xml" +version = "0.37.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f22f29bdff3987b4d8632ef95fd6424ec7e4e0a57e2f4fc63e489e75357f6a03" +dependencies = [ + "memchr", +] + [[package]] name = "quote" version = "1.0.37" @@ -4785,17 +5087,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "r2d2" -version = "0.8.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" 
-dependencies = [ - "log", - "parking_lot 0.12.1", - "scheduled-thread-pool", -] - [[package]] name = "rand" version = "0.7.3" @@ -5062,6 +5353,7 @@ dependencies = [ "once_cell", "pin-project-lite", "rand 0.8.5", + "reqwest", "scopeguard", "serde", "serde_json", @@ -5075,6 +5367,12 @@ dependencies = [ "utils", ] +[[package]] +name = "replace_with" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a8614ee435691de62bcffcf4a66d91b3594bf1428a5722e79103249a095690" + [[package]] name = "reqwest" version = "0.12.4" @@ -5131,7 +5429,7 @@ dependencies = [ "http 1.1.0", "reqwest", "serde", - "thiserror", + "thiserror 1.0.69", "tower-service", ] @@ -5151,7 +5449,7 @@ dependencies = [ "reqwest", "reqwest-middleware", "retry-policies", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", "wasm-timer", @@ -5159,15 +5457,15 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2" +checksum = "73e6153390585f6961341b50e5a1931d6be6dee4292283635903c26ef9d980d2" dependencies = [ "anyhow", "async-trait", "getrandom 0.2.11", "http 1.1.0", - "matchit 0.8.2", + "matchit", "opentelemetry", "reqwest", "reqwest-middleware", @@ -5320,9 +5618,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" @@ -5354,7 +5652,7 @@ version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.8.0", "errno", "libc", "linux-raw-sys 0.4.14", @@ -5513,10 +5811,12 @@ dependencies = [ "crc32c", "criterion", "desim", + "env_logger 0.10.2", "fail", "futures", "hex", "http 1.1.0", + "http-utils", "humantime", "hyper 0.14.30", "itertools 0.10.5", @@ -5524,7 +5824,6 @@ dependencies = [ "once_cell", "pageserver_api", "parking_lot 0.12.1", - "postgres", "postgres-protocol", "postgres_backend", "postgres_ffi", @@ -5535,15 +5834,17 @@ dependencies = [ "remote_storage", "reqwest", "safekeeper_api", + "safekeeper_client", "scopeguard", "sd-notify", "serde", "serde_json", "sha2", + "smallvec", "storage_broker", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "tikv-jemallocator", "tokio", "tokio-io-timeout", @@ -5564,11 +5865,30 @@ dependencies = [ name = "safekeeper_api" version = "0.1.0" dependencies = [ + "anyhow", "const_format", + "pageserver_api", + "postgres_ffi", + "pq_proto", "serde", + "serde_json", + "tokio", "utils", ] +[[package]] +name = "safekeeper_client" +version = "0.1.0" +dependencies = [ + "http-utils", + "reqwest", + "safekeeper_api", + "serde", + "thiserror 1.0.69", + "utils", + "workspace_hack", +] + [[package]] name = "same-file" version = "1.0.6" @@ -5588,12 +5908,12 @@ dependencies = [ ] [[package]] -name = "scheduled-thread-pool" -version = "0.2.7" +name = "scoped-futures" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091" dependencies = [ - 
"parking_lot 0.12.1", + "pin-project-lite", ] [[package]] @@ -5670,6 +5990,16 @@ dependencies = [ "libc", ] +[[package]] +name = "seize" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "semver" version = "1.0.17" @@ -5773,7 +6103,7 @@ dependencies = [ "rand 0.8.5", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "time", "url", "uuid", @@ -5845,7 +6175,7 @@ checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" dependencies = [ "percent-encoding", "serde", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -6007,7 +6337,7 @@ checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" dependencies = [ "num-bigint", "num-traits", - "thiserror", + "thiserror 1.0.69", "time", ] @@ -6127,14 +6457,18 @@ dependencies = [ "chrono", "clap", "control_plane", + "cron", "diesel", + "diesel-async", "diesel_migrations", "fail", "futures", "hex", + "http-utils", "humantime", "hyper 0.14.30", "itertools 0.10.5", + "json-structural-diff", "lasso", "measured", "metrics", @@ -6142,17 +6476,25 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", - "r2d2", "rand 0.8.5", + "regex", "reqwest", "routerify", + "rustls 0.23.18", + "rustls-native-certs 0.8.0", + "safekeeper_api", + "safekeeper_client", + "scoped-futures", "scopeguard", "serde", "serde_json", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", + "tikv-jemallocator", "tokio", + "tokio-postgres", + "tokio-postgres-rustls", "tokio-util", "tracing", "utils", @@ -6395,7 +6737,7 @@ dependencies = [ "fastrand 2.2.0", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -6443,7 +6785,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +dependencies = [ + "thiserror-impl 2.0.11", ] [[package]] @@ -6457,6 +6808,17 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "thiserror-impl" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "thread_local" version = "1.1.7" @@ -6587,33 +6949,32 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.38.1" +version = "1.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" +checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" dependencies = [ "backtrace", "bytes", "libc", "mio", - "num_cpus", "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" +source = 
"git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497" dependencies = [ "futures", "nix 0.26.4", "once_cell", "scopeguard", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tracing", @@ -6632,9 +6993,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", @@ -6643,8 +7004,8 @@ dependencies = [ [[package]] name = "tokio-postgres" -version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" +version = "0.7.10" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" dependencies = [ "async-trait", "byteorder", @@ -6659,9 +7020,11 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", + "rand 0.8.5", "socket2", "tokio", "tokio-util", + "whoami", ] [[package]] @@ -6682,18 +7045,16 @@ dependencies = [ name = "tokio-postgres2" version = "0.1.0" dependencies = [ - "async-trait", - "byteorder", "bytes", "fallible-iterator", "futures-util", "log", "parking_lot 0.12.1", - "percent-encoding", "phf", "pin-project-lite", "postgres-protocol2", "postgres-types2", + "serde", "tokio", "tokio-util", ] @@ -6765,7 +7126,19 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite", + "tungstenite 0.21.0", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4bf6fecd69fcdede0ec680aaf474cdab988f9de6bc73d3758f0160e3b7025a" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite 0.26.1", ] [[package]] @@ -6825,12 +7198,9 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" dependencies = [ - "async-stream", "async-trait", - "axum", "base64 0.22.1", "bytes", - "h2 0.4.4", "http 1.1.0", "http-body 1.0.0", "http-body-util", @@ -6842,11 +7212,10 @@ dependencies = [ "prost", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", - "socket2", "tokio", "tokio-rustls 0.26.0", "tokio-stream", - "tower", + "tower 0.4.13", "tower-layer", "tower-service", "tracing", @@ -6887,16 +7256,49 @@ dependencies = [ ] [[package]] -name = "tower-layer" -version = "0.3.2" +name = "tower" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper 1.0.1", + "tokio", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697" +dependencies = [ + "bitflags 2.8.0", + "bytes", + "http 1.1.0", + "http-body 1.0.0", + "pin-project-lite", + "tower-layer", + "tower-service", + "tracing", + "uuid", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" @@ -6965,9 +7367,9 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.27.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b" +checksum = "97a971f6058498b5c0f1affa23e7ea202057a7301dbff68e968b2d578bcbd053" dependencies = [ "js-sys", "once_cell", @@ -7046,11 +7448,29 @@ dependencies = [ "log", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 1.0.69", "url", "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413083a99c579593656008130e29255e54dcaae495be556cc26888f211648c24" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 1.1.0", + "httparse", + "log", + "rand 0.8.5", + "sha1", + "thiserror 2.0.11", + "utf-8", +] + [[package]] name = "twox-hash" version = "1.6.3" @@ -7143,7 +7563,7 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#781989bb540a1408b0b93daa1e9d1fa452195497" dependencies = [ "bytes", "io-uring", @@ -7200,6 +7620,7 @@ dependencies = [ "anyhow", "arc-swap", "async-compression", + "backtrace", "bincode", "byteorder", "bytes", @@ -7215,8 +7636,6 @@ dependencies = [ "hex", "hex-literal", "humantime", - "hyper 0.14.30", - "jemalloc_pprof", "jsonwebtoken", "metrics", "nix 0.27.1", @@ -7227,28 +7646,23 @@ dependencies = [ "pq_proto", "rand 0.8.5", "regex", - "routerify", "scopeguard", "sentry", "serde", "serde_assert", "serde_json", - "serde_path_to_error", "serde_with", "signal-hook", "strum", "strum_macros", - "thiserror", + "thiserror 1.0.69", "tokio", - "tokio-stream", "tokio-tar", "tokio-util", "toml_edit", "tracing", "tracing-error", "tracing-subscriber", - "url", - "uuid", "walkdir", ] @@ -7268,12 +7682,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" version = "0.9.4" @@ -7318,7 +7726,7 @@ dependencies = [ "anyhow", "camino-tempfile", "clap", - "env_logger", + "env_logger 0.10.2", "log", "postgres", "postgres_ffi", @@ -7333,13 +7741,21 @@ dependencies = [ "anyhow", "async-compression", "bytes", + "camino", + "camino-tempfile", + "criterion", + "futures", "pageserver_api", "postgres_ffi", + "pprof", "prost", + "remote_storage", "serde", - "thiserror", + "serde_json", + "thiserror 1.0.69", + "tikv-jemallocator", "tokio", - "tonic", + "tokio-util", "tonic-build", "tracing", "utils", @@ -7348,9 +7764,9 @@ dependencies = [ [[package]] name = "walkdir" -version = "2.3.3" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" dependencies = [ "same-file", "winapi-util", @@ -7572,7 +7988,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" dependencies = [ "windows-core", - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -7581,7 +7997,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -7599,7 +8015,16 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", ] [[package]] @@ -7619,17 +8044,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -7640,9 +8066,9 @@ checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" @@ -7652,9 +8078,9 @@ checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" @@ -7664,9 +8090,15 @@ checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" @@ -7676,9 +8108,9 @@ checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" @@ -7688,9 +8120,9 @@ checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" @@ -7700,9 +8132,9 @@ checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" @@ -7712,15 +8144,15 @@ checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.13" +version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1" +checksum = "1e90edd2ac1aa278a5c4599b1d89cf03074b610800f866d4026dc199d7929a28" dependencies = [ "memchr", ] @@ -7741,8 +8173,6 @@ version = "0.1.0" dependencies = [ "ahash", "anyhow", - "axum", - "axum-core", "base64 0.13.1", "base64 0.21.1", "base64ct", @@ -7822,10 +8252,12 @@ dependencies = [ "tokio-util", "toml_edit", "tonic", - "tower", + "tower 0.4.13", "tracing", "tracing-core", + "tracing-log", "url", + "uuid", "zerocopy", "zeroize", "zstd", @@ -7860,7 +8292,7 @@ dependencies = [ "ring", "signature 2.2.0", "spki 0.7.3", - "thiserror", + "thiserror 1.0.69", "zeroize", ] @@ -7877,7 +8309,7 @@ dependencies = [ "nom", "oid-registry", "rusticata-macros", - "thiserror", + "thiserror 1.0.69", "time", ] diff --git a/Cargo.toml b/Cargo.toml index 0654c25a3d..e6ca3c982c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,12 +11,14 @@ members = [ "pageserver/pagebench", "proxy", "safekeeper", + "safekeeper/client", "storage_broker", "storage_controller", "storage_controller/client", "storage_scrubber", "workspace_hack", "libs/compute_api", + "libs/http-utils", "libs/pageserver_api", "libs/postgres_ffi", "libs/safekeeper_api", @@ -51,7 +53,9 @@ anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" +backtrace = "0.3.74" flate2 = "1.0.26" +assert-json-diff = "2" async-stream = "0.3" async-trait = "0.1" aws-config = { version = "1.5", default-features = false, 
features=["rustls", "sso"] } @@ -63,7 +67,7 @@ aws-smithy-types = "1.2" aws-credential-types = "1.2.0" aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" -axum = { version = "0.7.5", features = ["ws"] } +axum = { version = "0.8.1", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" bindgen = "0.70" @@ -73,12 +77,13 @@ byteorder = "1.4" bytes = "1.9" camino = "1.1.6" cfg-if = "1.0.0" +cron = "0.15" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive", "env"] } +clashmap = { version = "1.0", features = ["raw-api"] } comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" -dashmap = { version = "5.5.0", features = ["raw-api"] } diatomic-waker = { version = "0.2.3" } either = "1.8" enum-map = "2.4.2" @@ -108,6 +113,7 @@ hyper-util = "0.1" tokio-tungstenite = "0.21.0" indexmap = "2" indoc = "2" +inferno = "0.12.0" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" @@ -120,20 +126,20 @@ measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } -notify = "6.0.0" +notify = "8.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" -opentelemetry = "0.26" -opentelemetry_sdk = "0.26" -opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.26" +opentelemetry = "0.27" +opentelemetry_sdk = "0.27" +opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.27" parking_lot = "0.12" parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" -pprof = { version = "0.14", features = ["criterion", "flamegraph", "protobuf", "protobuf-codec"] } +pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "protobuf", "protobuf-codec"] } procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.13" @@ -141,7 +147,7 @@ rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] } +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] } reqwest-middleware = "0.4" reqwest-retry = "0.7" routerify = "3" @@ -174,7 +180,7 @@ test-context = "0.3" thiserror = "1.0" tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } -tokio = { version = "1.17", features = ["macros"] } +tokio = { version = "1.41", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.12.0" @@ -184,11 +190,15 @@ tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.8" toml_edit = "0.22" -tonic = {version = "0.12.3", features = ["tls", "tls-roots"]} -tower-service = "0.3.2" +tonic = {version = "0.12.3", default-features = false, features = ["channel", 
"tls", "tls-roots"]} +tower = { version = "0.5.2", default-features = false } +tower-http = { version = "0.6.2", features = ["request-id", "trace"] } +tower-service = "0.3.3" tracing = "0.1" tracing-error = "0.2" -tracing-opentelemetry = "0.27" +tracing-log = "0.2" +tracing-opentelemetry = "0.28" +tracing-serde = "0.2.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } @@ -201,6 +211,7 @@ rustls-native-certs = "0.8" x509-parser = "0.16" whoami = "1.5.1" zerocopy = { version = "0.7", features = ["derive"] } +json-structural-diff = { version = "0.2.0" } ## TODO replace this with tracing env_logger = "0.10" @@ -221,6 +232,7 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } +http-utils = { version = "0.1", path = "./libs/http-utils/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } @@ -233,6 +245,7 @@ postgres_initdb = { path = "./libs/postgres_initdb" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } +safekeeper_client = { path = "./safekeeper/client" } desim = { version = "0.1", path = "./libs/desim" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. storage_controller_client = { path = "./storage_controller/client" } @@ -263,6 +276,8 @@ tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", br [profile.release] # This is useful for profiling and, to some extent, debug. # Besides, debug info should not affect the performance. +# +# NB: we also enable frame pointers for improved profiling, see .cargo/config.toml. debug = true # disable debug symbols for all packages except this one to decrease binaries size diff --git a/Dockerfile b/Dockerfile index e888efbae2..83ad86badb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,28 @@ ARG STABLE_PG_VERSION=16 ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim +# Here are the INDEX DIGESTS for the images we use. +# You can get them following next steps for now: +# 1. Get an authentication token from DockerHub: +# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) +# 2. Using that token, query index for the given tag: +# curl -s -H "Authorization: Bearer $TOKEN" \ +# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ +# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ +# -I | grep -i docker-content-digest +# 3. As a next step, TODO(fedordikarev): create script and schedule workflow to run these checks +# and updates on regular bases and in automated way. 
+ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 +ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + +# Here we use ${var/search/replace} syntax to check +# whether the base image is one of the images we pin an image index for. +# If var matches one of the known images, we replace it with the known sha. +# If there is no match, the value is left unchanged and we proceed with an unpinned image. +ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} + # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build WORKDIR /home/nonroot @@ -28,6 +50,14 @@ RUN set -e \ && rm -rf pg_install/build \ && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz . +# Prepare cargo-chef recipe +FROM $REPOSITORY/$IMAGE:$TAG AS plan +WORKDIR /home/nonroot + +COPY --chown=nonroot . . + +RUN cargo chef prepare --recipe-path recipe.json + # Build neon binaries FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot @@ -41,11 +71,17 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib +COPY --from=plan /home/nonroot/recipe.json recipe.json + +ARG ADDITIONAL_RUSTFLAGS="" + +RUN set -e \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json + COPY --chown=nonroot . . -ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ @@ -59,16 +95,21 @@ RUN set -e \ # Build final image # -FROM debian:${DEBIAN_FLAVOR} +FROM $BASE_IMAGE_SHA ARG DEFAULT_PG_VERSION WORKDIR /data RUN set -e \ + && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \ && apt update \ && apt install -y \ libreadline-dev \ libseccomp-dev \ ca-certificates \ + # System postgres for use with client libraries (e.g. in storage controller) + postgresql-15 \ + openssl \ + && rm -f /etc/apt/apt.conf.d/80-retries \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ && chown -R neon:neon /data @@ -101,15 +142,9 @@ RUN mkdir -p /data/.neon/ && \ > /data/.neon/pageserver.toml && \ chown -R neon:neon /data/.neon -# When running a binary that links with libpq, default to using our most recent postgres version. Binaries -# that want a particular postgres version will select it explicitly: this is just a default.
-ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib - - VOLUME ["/data"] USER neon EXPOSE 6400 EXPOSE 9898 CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"] - diff --git a/Makefile b/Makefile index 9cffc74508..42ee643bb5 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Where to install Postgres, default is ./pg_install, maybe useful for package managers POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ -OPENSSL_PREFIX_DIR := /usr/local/openssl ICU_PREFIX_DIR := /usr/local/icu # @@ -11,32 +10,43 @@ ICU_PREFIX_DIR := /usr/local/icu # environment variable. # BUILD_TYPE ?= debug +WITH_SANITIZERS ?= no ifeq ($(BUILD_TYPE),release) PG_CONFIGURE_OPTS = --enable-debug --with-openssl PG_CFLAGS = -O2 -g3 $(CFLAGS) + PG_LDFLAGS = $(LDFLAGS) # Unfortunately, `--profile=...` is a nightly feature CARGO_BUILD_FLAGS += --release else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend PG_CFLAGS = -O0 -g3 $(CFLAGS) + PG_LDFLAGS = $(LDFLAGS) else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif +ifeq ($(WITH_SANITIZERS),yes) + PG_CFLAGS += -fsanitize=address -fsanitize=undefined -fno-sanitize-recover + COPT += -Wno-error # to avoid failing on warnings induced by sanitizers + PG_LDFLAGS = -fsanitize=address -fsanitize=undefined -static-libasan -static-libubsan $(LDFLAGS) + export CC := gcc + export ASAN_OPTIONS := detect_leaks=0 +endif + ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes) # Exclude static build openssl, icu for local build (MacOS, Linux) # Only keep for build type release and debug - PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include PG_CONFIGURE_OPTS += --with-icu PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION' PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm' - PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread' endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux - PG_CONFIGURE_OPTS += --with-libseccomp + ifneq ($(WITH_SANITIZERS),yes) + PG_CONFIGURE_OPTS += --with-libseccomp + endif else ifeq ($(UNAME_S),Darwin) PG_CFLAGS += -DUSE_PREFETCH ifndef DISABLE_HOMEBREW @@ -67,8 +77,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 -# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) -CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" @@ -111,7 +119,7 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status: EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ (cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \ env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ - CFLAGS='$(PG_CFLAGS)' \ + CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \ $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log) diff --git a/README.md b/README.md index 1417d6b9e7..4453904346 100644 
--- a/README.md +++ b/README.md @@ -21,8 +21,10 @@ The Neon storage engine consists of two major components: See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information. -## Running local installation +## Running a local development environment +Neon can be run on a workstation for small experiments and to test code changes, by +following these instructions. #### Installing dependencies on Linux 1. Install build dependencies and other applicable packages @@ -238,7 +240,7 @@ postgres=# select * from t; > cargo neon stop ``` -More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md). +More advanced usage can be found in [Local Development Control Plane (`neon_local`)](./control_plane/README.md). #### Handling build failures diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index fa84e467ad..c103ceaea5 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -1,8 +1,42 @@ ARG DEBIAN_VERSION=bookworm +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim -FROM debian:bookworm-slim AS pgcopydb_builder +# Here are the INDEX DIGESTS for the images we use. +# You can get them by following these steps for now: +# 1. Get an authentication token from DockerHub: +# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) +# 2. Using that token, query the index for the given tag: +# curl -s -H "Authorization: Bearer $TOKEN" \ +# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ +# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ +# -I | grep -i docker-content-digest +# 3. As a next step, TODO(fedordikarev): create a script and schedule a workflow to run these checks +# and updates on a regular basis and in an automated way. +ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 +ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + +# Here we use ${var/search/replace} syntax to check +# whether the base image is one of the images we pin an image index for. +# If var matches one of the known images, we replace it with the known sha. +# If there is no match, the value is left unchanged and we proceed with an unpinned image. +ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} + +FROM $BASE_IMAGE_SHA AS pgcopydb_builder ARG DEBIAN_VERSION +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] + +# By default, echo in the /bin/sh used in debian images interprets '\n' escapes, +# but since we use bash as SHELL, bash's built-in echo requires the '-e' flag for that.
+RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \ + echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc + +COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch + RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ set -e && \ apt update && \ @@ -35,6 +69,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ mkdir /tmp/pgcopydb && \ tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \ cd /tmp/pgcopydb && \ + patch -p1 < /pgcopydbv017.patch && \ make -s clean && \ make -s -j12 install && \ libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \ @@ -46,12 +81,13 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ mkdir -p mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \ fi -FROM debian:${DEBIAN_VERSION}-slim AS build_tools +FROM $BASE_IMAGE_SHA AS build_tools ARG DEBIAN_VERSION # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home -SHELL ["/bin/bash", "-c"] +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] RUN mkdir -p /pgcopydb/bin && \ mkdir -p /pgcopydb/lib && \ @@ -61,6 +97,10 @@ RUN mkdir -p /pgcopydb/bin && \ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \ + echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc + # System deps # # 'gdb' is included so that we get backtraces of core dumps produced in @@ -115,13 +155,14 @@ RUN set -e \ # Keep the version the same as in compute/compute-node.Dockerfile and # test_runner/regress/test_compute_metrics.py. 
-ENV SQL_EXPORTER_VERSION=0.16.0 +ENV SQL_EXPORTER_VERSION=0.17.0 RUN curl -fsSL \ "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \ --output sql_exporter.tar.gz \ && mkdir /tmp/sql_exporter \ && tar xzvf sql_exporter.tar.gz -C /tmp/sql_exporter --strip-components=1 \ - && mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter + && mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter \ + && rm sql_exporter.tar.gz # protobuf-compiler (protoc) ENV PROTOC_VERSION=25.1 @@ -182,29 +223,20 @@ RUN set -e \ # It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master) # And patches from us: # - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz) -RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \ - && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ +RUN set +o pipefail && \ + for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do \ + yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')";\ + done && \ + set -o pipefail +# Split into separate step to debug flaky failures here +RUN wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ + && ls -laht lcov.tar.gz && sha256sum lcov.tar.gz \ && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \ && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \ && cd lcov \ && make install \ && rm -rf ../lcov.tar.gz -# Compile and install the static OpenSSL library -ENV OPENSSL_VERSION=1.1.1w -ENV OPENSSL_PREFIX=/usr/local/openssl -RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ - echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ - cd /tmp && \ - tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - cd /tmp/openssl-${OPENSSL_VERSION} && \ - ./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \ - make -j "$(nproc)" && \ - make install && \ - cd /tmp && \ - rm -rf /tmp/openssl-${OPENSSL_VERSION} - # Use the same version of libicu as the compute nodes so that # clusters created using inidb on pageserver can be used by computes. 
# @@ -233,6 +265,8 @@ RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/re USER nonroot:nonroot WORKDIR /home/nonroot +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc + # Python ENV PYTHON_VERSION=3.11.10 \ PYENV_ROOT=/home/nonroot/.pyenv \ @@ -258,7 +292,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.83.0 +ENV RUSTC_VERSION=1.85.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 @@ -266,6 +300,8 @@ ARG CARGO_HAKARI_VERSION=0.9.33 ARG CARGO_DENY_VERSION=0.16.2 ARG CARGO_HACK_VERSION=0.6.33 ARG CARGO_NEXTEST_VERSION=0.9.85 +ARG CARGO_CHEF_VERSION=0.1.71 +ARG CARGO_DIESEL_CLI_VERSION=2.2.6 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -279,6 +315,9 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ + cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \ + cargo install diesel_cli --version ${CARGO_DIESEL_CLI_VERSION} \ + --features postgres-bundled --no-default-features && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git diff --git a/build_tools/patches/pgcopydbv017.patch b/build_tools/patches/pgcopydbv017.patch new file mode 100644 index 0000000000..4e68793afc --- /dev/null +++ b/build_tools/patches/pgcopydbv017.patch @@ -0,0 +1,57 @@ +diff --git a/src/bin/pgcopydb/copydb.c b/src/bin/pgcopydb/copydb.c +index d730b03..69a9be9 100644 +--- a/src/bin/pgcopydb/copydb.c ++++ b/src/bin/pgcopydb/copydb.c +@@ -44,6 +44,7 @@ GUC dstSettings[] = { + { "synchronous_commit", "'off'" }, + { "statement_timeout", "0" }, + { "lock_timeout", "0" }, ++ { "idle_in_transaction_session_timeout", "0" }, + { NULL, NULL }, + }; + +diff --git a/src/bin/pgcopydb/pgsql.c b/src/bin/pgcopydb/pgsql.c +index 94f2f46..e051ba8 100644 +--- a/src/bin/pgcopydb/pgsql.c ++++ b/src/bin/pgcopydb/pgsql.c +@@ -2319,6 +2319,11 @@ pgsql_execute_log_error(PGSQL *pgsql, + + LinesBuffer lbuf = { 0 }; + ++ if (message != NULL){ ++ // make sure message is writable by splitLines ++ message = strdup(message); ++ } ++ + if (!splitLines(&lbuf, message)) + { + /* errors have already been logged */ +@@ -2332,6 +2337,7 @@ pgsql_execute_log_error(PGSQL *pgsql, + PQbackendPID(pgsql->connection), + lbuf.lines[lineNumber]); + } ++ free(message); // free copy of message we created above + + if (pgsql->logSQL) + { +@@ -3174,11 +3180,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context) + /* errors have already been logged */ + return; + } +- + if (res != NULL) + { + char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); +- strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate)); ++ if (sqlstate == NULL) ++ { ++ // PQresultErrorField returned NULL! 
++ pgsql->sqlstate[0] = '\0'; // Set to an empty string to avoid segfault ++ } ++ else ++ { ++ strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate)); ++ } + } + + char *endpoint = diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 33d2a10285..0cdb44853f 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1,3 +1,81 @@ +# +# This Dockerfile builds the compute image. It is built multiple times to produce +# different images for each PostgreSQL major version. +# +# We use Debian as the base for all the steps. The production images use Debian bookworm +# for v17, and Debian bullseye for older PostgreSQL versions. +# +# ## Intermediary layers +# +# build-tools: This contains the Rust compiler toolchain and other tools needed at compile +# time. This is also used for the storage builds. This image is defined in +# build-tools.Dockerfile. +# +# build-deps: Contains C compiler, other build tools, and compile-time dependencies +# needed to compile PostgreSQL and most extensions. (Some extensions need +# extra tools and libraries that are not included in this image. They are +# installed in the extension-specific build stages.) +# +# pg-build: Result of compiling PostgreSQL. The PostgreSQL binaries are copied from +# this to the final image. This is also used as the base for compiling all +# the extensions. +# +# compute-tools: This contains compute_ctl, the launcher program that starts Postgres +# in Neon. It also contains a few other tools that are built from the +# sources from this repository and used in compute VMs: 'fast_import' and +# 'local_proxy' +# +# ## Extensions +# +# By convention, the build of each extension consists of two layers: +# +# {extension}-src: Contains the source tarball, possible neon-specific patches, and +# the extracted tarball with the patches applied. All of these are +# under the /ext-src/ directory. +# +# {extension}-build: Contains the installed extension files, under /usr/local/pgsql +# (in addition to the PostgreSQL binaries inherited from the pg-build +# image). A few extensions need extra libraries or other files +# installed elsewhere in the filesystem. They are installed by ONBUILD +# directives. +# +# These are merged together into two layers: +# +# all-extensions: All the extension -build layers merged together +# +# extension-tests: All the extension -src layers merged together. This is used by the +# extension tests. The tests are executed against the compiled image, +# but the tests need test scripts, expected result files etc. from the +# original sources, which are not included in the binary image. +# +# ## Extra components +# +# These are extra components included in the compute image, but are not directly used by PostgreSQL +# itself. +# +# pgbouncer: pgbouncer and its configuration +# +# sql_exporter: Metrics exporter daemon. +# +# postgres_exporter: Another metrics exporter daemon, for different sets of metrics. +# +# The configuration files for the metrics exporters are under the etc/ directory. We use +# a templating system to handle variations between different PostgreSQL versions, +# building slightly different config files for each PostgreSQL version. +# +# +# ## Final image +# +# The final image puts together the PostgreSQL binaries (pg-build), the compute tools +# (compute-tools), all the extensions (all-extensions) and the extra components into +# one image. +# +# VM image: The final image built by this dockerfile isn't actually the final image that +# we use in compute VMs.
There's an extra step that adds some files and makes other +# small adjustments, and builds the QCOW2 filesystem image suitable for use in a VM. +# That step is done by the 'vm-builder' tool. See the vm-compute-node-image job in the +# build_and_test.yml github workflow for how that's done. + ARG PG_VERSION ARG REPOSITORY=neondatabase ARG IMAGE=build-tools @@ -6,17 +84,49 @@ ARG BUILD_TAG ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim +# Here are the INDEX DIGESTS for the images we use. +# You can get them by following these steps for now: +# 1. Get an authentication token from DockerHub: +# TOKEN=$(curl -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:library/debian:pull" | jq -r .token) +# 2. Using that token, query the index for the given tag: +# curl -s -H "Authorization: Bearer $TOKEN" \ +# -H "Accept: application/vnd.docker.distribution.manifest.list.v2+json" \ +# "https://registry.hub.docker.com/v2/library/debian/manifests/bullseye-slim" \ +# -I | grep -i docker-content-digest +# 3. As a next step, TODO(fedordikarev): create a script and schedule a workflow to run these checks +# and updates on a regular basis and in an automated way. +ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 +ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + +# Here we use ${var/search/replace} syntax to check +# whether the base image is one of the images we pin an image index for. +# If var matches one of the known images, we replace it with the known sha. +# If there is no match, the value is left unchanged and we proceed with an unpinned image. +ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} +ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} + +# By default, build all PostgreSQL extensions. For quick local testing when you don't +# care about the extensions, pass EXTENSIONS=none or EXTENSIONS=minimal +ARG EXTENSIONS=all + ######################################################################################### # # Layer "build-deps" # ######################################################################################### -FROM debian:$DEBIAN_FLAVOR AS build-deps +FROM $BASE_IMAGE_SHA AS build-deps ARG DEBIAN_VERSION # Use strict mode for bash to catch errors early SHELL ["/bin/bash", "-euo", "pipefail", "-c"] +# By default, echo in the /bin/sh used in debian images interprets '\n' escapes, +# but since we use bash as SHELL, bash's built-in echo requires the '-e' flag for that. +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \ + echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc + RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18.
@@ -35,10 +145,12 @@ RUN case $DEBIAN_VERSION in \ ;; \ esac && \ apt update && \ - apt install --no-install-recommends -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + apt install --no-install-recommends --no-install-suggests -y \ + ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ - libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \ - $VERSION_INSTALLS + libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \ + $VERSION_INSTALLS \ + && apt clean && rm -rf /var/lib/apt/lists/* ######################################################################################### # @@ -48,11 +160,11 @@ RUN case $DEBIAN_VERSION in \ ######################################################################################### FROM build-deps AS pg-build ARG PG_VERSION -COPY vendor/postgres-${PG_VERSION} postgres +COPY vendor/postgres-${PG_VERSION:?} postgres RUN cd postgres && \ export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \ --with-icu --with-libxml --with-libxslt --with-lz4" && \ - if [ "${PG_VERSION}" != "v14" ]; then \ + if [ "${PG_VERSION:?}" != "v14" ]; then \ # zstd is available only from PG15 export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \ fi && \ @@ -64,6 +176,10 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \ # Enable some of contrib extensions echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgres_fdw.control && \ + file=/usr/local/pgsql/share/extension/postgres_fdw--1.0.sql && [ -e $file ] && \ + echo 'GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO neon_superuser;' >> $file && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \ @@ -102,22 +218,18 @@ RUN cd postgres && \ esac; \ done; +# Set PATH for all the subsequent build steps +ENV PATH="/usr/local/pgsql/bin:$PATH" + ######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. # ######################################################################################### -FROM build-deps AS postgis-build +FROM build-deps AS postgis-src ARG DEBIAN_VERSION ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ - apt install --no-install-recommends -y gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ - libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ - libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ - protobuf-c-compiler xsltproc - # Postgis 3.5.0 requires SFCGAL 1.4+ # @@ -126,6 +238,7 @@ RUN apt update && \ # and also we must check backward compatibility with older versions of PostGIS. 
# # Use new version only for v17 +WORKDIR /ext-src RUN case "${DEBIAN_VERSION}" in \ "bookworm") \ export SFCGAL_VERSION=1.4.1 \ @@ -139,18 +252,13 @@ RUN case "${DEBIAN_VERSION}" in \ echo "unexpected PostgreSQL version" && exit 1 \ ;; \ esac && \ - mkdir -p /sfcgal && \ wget https://gitlab.com/sfcgal/SFCGAL/-/archive/v${SFCGAL_VERSION}/SFCGAL-v${SFCGAL_VERSION}.tar.gz -O SFCGAL.tar.gz && \ echo "${SFCGAL_CHECKSUM} SFCGAL.tar.gz" | sha256sum --check && \ - mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ - cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ - make clean && cp -R /sfcgal/* / - -ENV PATH="/usr/local/pgsql/bin:$PATH" + mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . # Postgis 3.5.0 supports v17 -RUN case "${PG_VERSION}" in \ +WORKDIR /ext-src +RUN case "${PG_VERSION:?}" in \ "v17") \ export POSTGIS_VERSION=3.5.0 \ export POSTGIS_CHECKSUM=ca698a22cc2b2b3467ac4e063b43a28413f3004ddd505bdccdd74c56a647f510 \ @@ -165,9 +273,27 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ - mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ - ./autogen.sh && \ + mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . + +# This is reused for pgrouting +FROM pg-build AS postgis-build-deps +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ + libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ + libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ + protobuf-c-compiler xsltproc \ + && apt clean && rm -rf /var/lib/apt/lists/* + +FROM postgis-build-deps AS postgis-build +COPY --from=postgis-src /ext-src/ /ext-src/ +WORKDIR /ext-src/sfcgal-src +RUN cmake -DCMAKE_BUILD_TYPE=Release -GNinja . && ninja -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ + ninja clean && cp -R /sfcgal/* / + +WORKDIR /ext-src/postgis-src +RUN ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -190,13 +316,24 @@ RUN case "${PG_VERSION}" in \ cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis +######################################################################################### +# +# Layer "pgrouting-build" +# Build pgrouting. Note: This depends on the postgis-build-deps layer built above +# +######################################################################################### + # Uses versioned libraries, i.e. libpgrouting-3.4 # and may introduce function signature changes between releases # i.e. 
release 3.5.0 has new signature for pg_dijkstra function # # Use new version only for v17 # last release v3.6.2 - Mar 30, 2024 -RUN case "${PG_VERSION}" in \ +FROM build-deps AS pgrouting-src +ARG DEBIAN_VERSION +ARG PG_VERSION +WORKDIR /ext-src +RUN case "${PG_VERSION:?}" in \ "v17") \ export PGROUTING_VERSION=3.6.2 \ export PGROUTING_CHECKSUM=f4a1ed79d6f714e52548eca3bb8e5593c6745f1bde92eb5fb858efd8984dffa2 \ @@ -211,16 +348,16 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/pgRouting/pgrouting/archive/v${PGROUTING_VERSION}.tar.gz -O pgrouting.tar.gz && \ echo "${PGROUTING_CHECKSUM} pgrouting.tar.gz" | sha256sum --check && \ - mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ - mkdir build && cd build && \ - cmake -DCMAKE_BUILD_TYPE=Release .. && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ - cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \ - sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ - comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T - + mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . + +FROM postgis-build-deps AS pgrouting-build +COPY --from=pgrouting-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgrouting-src +RUN mkdir build && cd build && \ + cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \ + ninja -j $(getconf _NPROCESSORS_ONLN) && \ + ninja -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control ######################################################################################### # @@ -228,14 +365,11 @@ RUN case "${PG_VERSION}" in \ # Build plv8 # ######################################################################################### -FROM build-deps AS plv8-build +FROM build-deps AS plv8-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +WORKDIR /ext-src -COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch - -RUN apt update && \ - apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang +COPY compute/patches/plv8-3.1.10.patch . # plv8 3.2.3 supports v17 # last release v3.2.3 - Sep 7, 2024 @@ -245,7 +379,7 @@ RUN apt update && \ # # Use new version only for v17 # because since v3.2, plv8 doesn't include plcoffee and plls extensions -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ "v17") \ export PLV8_TAG=v3.2.3 \ ;; \ @@ -259,17 +393,34 @@ RUN case "${PG_VERSION}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ - if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /plv8-3.1.10.patch; fi && \ + if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi + +# Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use +# 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds. 
+# (The V8 engine takes a very long time to build) +FROM build-deps AS plv8-build +ARG PG_VERSION +WORKDIR /ext-src/plv8-src +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + ninja-build python3-dev libncurses5 binutils clang \ + && apt clean && rm -rf /var/lib/apt/lists/* +COPY --from=plv8-src /ext-src/ /ext-src/ +RUN make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) v8 + +# Step 2: Build the PostgreSQL-dependent parts +COPY --from=pg-build /usr/local/pgsql /usr/local/pgsql +ENV PATH="/usr/local/pgsql/bin:$PATH" +RUN \ # generate and copy upgrade scripts - mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \ + make generate_upgrades && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ # don't break computes with installed old version of plv8 cd /usr/local/pgsql/lib/ && \ - case "${PG_VERSION}" in \ + case "${PG_VERSION:?}" in \ "v17") \ ln -s plv8-3.2.3.so plv8-3.1.8.so && \ ln -s plv8-3.2.3.so plv8-3.1.5.so && \ @@ -290,29 +441,37 @@ RUN case "${PG_VERSION}" in \ # Build h3_pg # ######################################################################################### -FROM build-deps AS h3-pg-build +FROM build-deps AS h3-pg-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +WORKDIR /ext-src # not version-specific # last release v4.1.0 - Jan 18, 2023 RUN mkdir -p /h3/usr/ && \ wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ - mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ - mkdir build && cd build && \ - cmake .. -DCMAKE_BUILD_TYPE=Release && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/h3 make install && \ - cp -R /h3/usr / && \ - rm -rf build + mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . # not version-specific # last release v4.1.3 - Jul 26, 2023 +WORKDIR /ext-src RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ - mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ + mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . + +FROM pg-build AS h3-pg-build +COPY --from=h3-pg-src /ext-src/ /ext-src/ +WORKDIR /ext-src/h3-src +RUN mkdir build && cd build && \ + cmake .. 
-GNinja -DBUILD_BENCHMARKS=0 -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_FUZZERS=0 -DBUILD_FILTERS=0 -DBUILD_GENERATORS=0 -DBUILD_TESTING=0 \ + && ninja -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/h3 ninja install && \ + cp -R /h3/usr / && \ + rm -rf build + +WORKDIR /ext-src/h3-pg-src +RUN ls -l && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ @@ -320,21 +479,25 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 ######################################################################################### # -# Layer "unit-pg-build" +# Layer "postgresql-unit-build" # compile unit extension # ######################################################################################### -FROM build-deps AS unit-pg-build +FROM build-deps AS postgresql-unit-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release 7.9 - Sep 15, 2024 +WORKDIR /ext-src RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \ echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \ - mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . + +FROM pg-build AS postgresql-unit-build +COPY --from=postgresql-unit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/postgresql-unit-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. # This one-liner removes pgsql/ part of the path. @@ -344,15 +507,15 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz - ######################################################################################### # -# Layer "vector-pg-build" +# Layer "pgvector-build" # compile pgvector extension # ######################################################################################### -FROM build-deps AS vector-pg-build +FROM build-deps AS pgvector-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY compute/patches/pgvector.patch /pgvector.patch +WORKDIR /ext-src +COPY compute/patches/pgvector.patch . # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. @@ -363,78 +526,96 @@ COPY compute/patches/pgvector.patch /pgvector.patch RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \ echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . 
&& \ - patch -p1 < /pgvector.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + wget https://github.com/pgvector/pgvector/raw/refs/tags/v0.7.4/sql/vector.sql -O ./sql/vector--0.7.4.sql && \ + echo "10218d05dc02299562252a9484775178b14a1d8edb92a2d1672ef488530f7778 ./sql/vector--0.7.4.sql" | sha256sum --check && \ + patch -p1 < /ext-src/pgvector.patch + +FROM pg-build AS pgvector-build +COPY --from=pgvector-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgvector-src +RUN make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### # -# Layer "pgjwt-pg-build" +# Layer "pgjwt-build" # compile pgjwt extension # ######################################################################################### -FROM build-deps AS pgjwt-pg-build +FROM build-deps AS pgjwt-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # doesn't use releases, last commit f3d82fd - Mar 2, 2023 +WORKDIR /ext-src RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ - mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgjwt-build +COPY --from=pgjwt-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgjwt-src +RUN make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control ######################################################################################### # -# Layer "hypopg-pg-build" +# Layer "hypopg-build" # compile hypopg extension # ######################################################################################### -FROM build-deps AS hypopg-pg-build +FROM build-deps AS hypopg-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # HypoPG 1.4.1 supports v17 # last release 1.4.1 - Apr 28, 2024 +WORKDIR /ext-src RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ - mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . 
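+# NB: hypopg follows the same two-stage layout used for the other extensions in this
+# file: a "<name>-src" stage based on build-deps only fetches and unpacks the sources
+# under /ext-src, and a "<name>-build" stage based on pg-build copies /ext-src in and
+# compiles against the PostgreSQL tree in /usr/local/pgsql. Illustrative sketch of the
+# pattern (example.com and the "example-*" stage names are placeholders, not real stages):
+#
+#   FROM build-deps AS example-src
+#   WORKDIR /ext-src
+#   RUN wget https://example.com/example.tar.gz -O example.tar.gz && \
+#       mkdir example-src && cd example-src && tar xzf ../example.tar.gz --strip-components=1 -C .
+#
+#   FROM pg-build AS example-build
+#   COPY --from=example-src /ext-src/ /ext-src/
+#   WORKDIR /ext-src/example-src
+#   RUN make -j $(getconf _NPROCESSORS_ONLN) install
+#
+# Keeping the download in its own stage lets Docker reuse the cached fetch step even
+# when the pg-build layer changes.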
+ +FROM pg-build AS hypopg-build +COPY --from=hypopg-src /ext-src/ /ext-src/ +WORKDIR /ext-src/hypopg-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control ######################################################################################### # -# Layer "pg-hashids-pg-build" +# Layer "pg_hashids-build" # compile pg_hashids extension # ######################################################################################### -FROM build-deps AS pg-hashids-pg-build +FROM build-deps AS pg_hashids-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.2.1 -Jan 12, 2018 +WORKDIR /ext-src RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ - mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_hashids-build +COPY --from=pg_hashids-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_hashids-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control ######################################################################################### # -# Layer "rum-pg-build" +# Layer "rum-build" # compile rum extension # ######################################################################################### -FROM build-deps AS rum-pg-build +FROM build-deps AS rum-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY compute/patches/rum.patch /rum.patch +WORKDIR /ext-src +COPY compute/patches/rum.patch . # supports v17 since https://github.com/postgrespro/rum/commit/cb1edffc57736cd2a4455f8d0feab0d69928da25 # doesn't use releases since 1.3.13 - Sep 19, 2022 @@ -442,119 +623,141 @@ COPY compute/patches/rum.patch /rum.patch RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0feab0d69928da25.tar.gz -O rum.tar.gz && \ echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ - patch -p1 < /rum.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + patch -p1 < /ext-src/rum.patch + +FROM pg-build AS rum-build +COPY --from=rum-src /ext-src/ /ext-src/ +WORKDIR /ext-src/rum-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control ######################################################################################### # -# Layer "pgtap-pg-build" +# Layer "pgtap-build" # compile pgTAP extension # ######################################################################################### -FROM build-deps AS pgtap-pg-build +FROM build-deps AS pgtap-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # pgtap 1.3.3 supports v17 # last release v1.3.3 - Apr 8, 2024 +WORKDIR /ext-src RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ - mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgtap-build +COPY --from=pgtap-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgtap-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control ######################################################################################### # -# Layer "ip4r-pg-build" +# Layer "ip4r-build" # compile ip4r extension # ######################################################################################### -FROM build-deps AS ip4r-pg-build +FROM build-deps AS ip4r-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v2.4.2 - Jul 29, 2023 +WORKDIR /ext-src RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ - mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . 
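+# Note: the old single-stage recipes passed PG_CONFIG=/usr/local/pgsql/bin/pg_config
+# explicitly to the make invocations. The pg-build stage now exports
+# PATH="/usr/local/pgsql/bin:$PATH", so PGXS-based builds like this one pick up the right
+# pg_config on their own. A quick sanity check that could be added to any build stage
+# (illustrative only, not part of the image):
+#
+#   RUN which pg_config && pg_config --pgxs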
+ +FROM pg-build AS ip4r-build +COPY --from=ip4r-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ip4r-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control ######################################################################################### # -# Layer "prefix-pg-build" +# Layer "prefix-build" # compile Prefix extension # ######################################################################################### -FROM build-deps AS prefix-pg-build +FROM build-deps AS prefix-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.2.10 - Jul 5, 2023 +WORKDIR /ext-src RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ - mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . + +FROM pg-build AS prefix-build +COPY --from=prefix-src /ext-src/ /ext-src/ +WORKDIR /ext-src/prefix-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control ######################################################################################### # -# Layer "hll-pg-build" +# Layer "hll-build" # compile hll extension # ######################################################################################### -FROM build-deps AS hll-pg-build +FROM build-deps AS hll-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v2.18 - Aug 29, 2023 +WORKDIR /ext-src RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ - mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . 
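+# The `trusted = true` lines appended to the .control files in these build stages mark
+# the extensions as trusted, so a database owner without superuser rights can install
+# them, e.g. (illustrative; db_owner and mydb are placeholder names):
+#
+#   psql -U db_owner -d mydb -c 'CREATE EXTENSION hll;'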
+ +FROM pg-build AS hll-build +COPY --from=hll-src /ext-src/ /ext-src/ +WORKDIR /ext-src/hll-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control ######################################################################################### # -# Layer "plpgsql-check-pg-build" +# Layer "plpgsql_check-build" # compile plpgsql_check extension # ######################################################################################### -FROM build-deps AS plpgsql-check-pg-build +FROM build-deps AS plpgsql_check-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # plpgsql_check v2.7.11 supports v17 # last release v2.7.11 - Sep 16, 2024 +WORKDIR /ext-src RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ - mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . + +FROM pg-build AS plpgsql_check-build +COPY --from=plpgsql_check-src /ext-src/ /ext-src/ +WORKDIR /ext-src/plpgsql_check-src +RUN make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control ######################################################################################### # -# Layer "timescaledb-pg-build" +# Layer "timescaledb-build" # compile timescaledb extension # ######################################################################################### -FROM build-deps AS timescaledb-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - +FROM build-deps AS timescaledb-src ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ +WORKDIR /ext-src +RUN case "${PG_VERSION:?}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ @@ -570,8 +773,12 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ - mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . && \ - ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ + mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . 
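+# The bootstrap flags below disable telemetry and build only the Apache-2.0 licensed
+# part of TimescaleDB (-DAPACHE_ONLY:BOOL=ON); TSL-licensed features are not included.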
+ +FROM pg-build AS timescaledb-build +COPY --from=timescaledb-src /ext-src/ /ext-src/ +WORKDIR /ext-src/timescaledb-src +RUN ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ @@ -579,18 +786,16 @@ RUN case "${PG_VERSION}" in \ ######################################################################################### # -# Layer "pg-hint-plan-pg-build" +# Layer "pg_hint_plan-build" # compile pg_hint_plan extension # ######################################################################################### -FROM build-deps AS pg-hint-plan-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - +FROM build-deps AS pg_hint_plan-src ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin:$PATH" # version-specific, has separate releases for each version -RUN case "${PG_VERSION}" in \ +WORKDIR /ext-src +RUN case "${PG_VERSION:?}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -613,51 +818,50 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \ echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \ - mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_hint_plan-build +COPY --from=pg_hint_plan-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_hint_plan-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control ######################################################################################### # -# Layer "pg-cron-pg-build" +# Layer "pg_cron-build" # compile pg_cron extension # ######################################################################################### -FROM build-deps AS pg-cron-pg-build +FROM build-deps AS pg_cron-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # This is an experimental extension that we do not support on prod yet. # !Do not remove! # We set it in shared_preload_libraries and computes will fail to start if library is not found. -ENV PATH="/usr/local/pgsql/bin/:$PATH" +WORKDIR /ext-src +COPY compute/patches/pg_cron.patch . RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \ echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + patch < /ext-src/pg_cron.patch + +FROM pg-build AS pg_cron-build +COPY --from=pg_cron-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_cron-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control ######################################################################################### # -# Layer "rdkit-pg-build" +# Layer "rdkit-build" # compile rdkit extension # ######################################################################################### -FROM build-deps AS rdkit-pg-build +FROM build-deps AS rdkit-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -RUN apt-get update && \ - apt-get install --no-install-recommends -y \ - libboost-iostreams1.74-dev \ - libboost-regex1.74-dev \ - libboost-serialization1.74-dev \ - libboost-system1.74-dev \ - libeigen3-dev \ - libboost-all-dev # rdkit Release_2024_09_1 supports v17 # last release Release_2024_09_1 - Sep 27, 2024 @@ -665,8 +869,9 @@ RUN apt-get update && \ # Use new version only for v17 # because Release_2024_09_1 has some backward incompatible changes # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 -ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN case "${PG_VERSION}" in \ + +WORKDIR /ext-src +RUN case "${PG_VERSION:?}" in \ "v17") \ export RDKIT_VERSION=Release_2024_09_1 \ export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \ @@ -681,8 +886,28 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/rdkit/rdkit/archive/refs/tags/${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ - mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ - cmake \ + mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . + +FROM pg-build AS rdkit-build +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ + libboost-iostreams1.74-dev \ + libboost-regex1.74-dev \ + libboost-serialization1.74-dev \ + libboost-system1.74-dev \ + libeigen3-dev \ + libboost-all-dev \ + && apt clean && rm -rf /var/lib/apt/lists/* + +COPY --from=rdkit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/rdkit-src + +# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find +# pg_config. For some reason the rdkit cmake script doesn't work with just that, +# however. By also adding /usr/local/pgsql, it works, which is weird because there +# are no executables in that directory. +ENV PATH="/usr/local/pgsql:$PATH" +RUN cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ -D RDK_BUILD_INCHI_SUPPORT=ON \ -D RDK_BUILD_AVALON_SUPPORT=ON \ @@ -693,6 +918,8 @@ RUN case "${PG_VERSION}" in \ -D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \ -D RDK_BUILD_YAEHMOP_SUPPORT=OFF \ -D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \ + -D RDK_TEST_MULTITHREADED=OFF \ + -D RDK_BUILD_CPP_TESTS=OFF \ -D RDK_USE_URF=OFF \ -D RDK_BUILD_PGSQL=ON \ -D RDK_PGSQL_STATIC=ON \ @@ -704,68 +931,74 @@ RUN case "${PG_VERSION}" in \ -D RDK_INSTALL_COMIC_FONTS=OFF \ -D RDK_BUILD_FREETYPE_SUPPORT=OFF \ -D CMAKE_BUILD_TYPE=Release \ + -GNinja \ . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + ninja -j $(getconf _NPROCESSORS_ONLN) && \ + ninja -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control ######################################################################################### # -# Layer "pg-uuidv7-pg-build" +# Layer "pg_uuidv7-build" # compile pg_uuidv7 extension # ######################################################################################### -FROM build-deps AS pg-uuidv7-pg-build +FROM build-deps AS pg_uuidv7-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.6.0 - Oct 9, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" +WORKDIR /ext-src RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ - mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_uuidv7-build +COPY --from=pg_uuidv7-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_uuidv7-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control ######################################################################################### # -# Layer "pg-roaringbitmap-pg-build" +# Layer "pg_roaringbitmap-build" # compile pg_roaringbitmap extension # ######################################################################################### -FROM build-deps AS pg-roaringbitmap-pg-build +FROM build-deps AS pg_roaringbitmap-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v0.5.4 - Jun 28, 2022 -ENV PATH="/usr/local/pgsql/bin/:$PATH" +WORKDIR /ext-src RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ - mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . 
+ +FROM pg-build AS pg_roaringbitmap-build +COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_roaringbitmap-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control ######################################################################################### # -# Layer "pg-semver-pg-build" +# Layer "pg_semver-build" # compile pg_semver extension # ######################################################################################### -FROM build-deps AS pg-semver-pg-build +FROM build-deps AS pg_semver-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # Release 0.40.0 breaks backward compatibility with previous versions # see release note https://github.com/theory/pg-semver/releases/tag/v0.40.0 # Use new version only for v17 # # last release v0.40.0 - Jul 22, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in \ +WORKDIR /ext-src +RUN case "${PG_VERSION:?}" in \ "v17") \ export SEMVER_VERSION=0.40.0 \ export SEMVER_CHECKSUM=3e50bcc29a0e2e481e7b6d2bc937cadc5f5869f55d983b5a1aafeb49f5425cfc \ @@ -780,25 +1013,28 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/theory/pg-semver/archive/refs/tags/v${SEMVER_VERSION}.tar.gz -O pg_semver.tar.gz && \ echo "${SEMVER_CHECKSUM} pg_semver.tar.gz" | sha256sum --check && \ - mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_semver-build +COPY --from=pg_semver-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_semver-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control ######################################################################################### # -# Layer "pg-embedding-pg-build" +# Layer "pg_embedding-build" # compile pg_embedding extension # ######################################################################################### -FROM build-deps AS pg-embedding-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +FROM build-deps AS pg_embedding-src +ARG PG_VERSION # This is our extension, support stopped in favor of pgvector # TODO: deprecate it -ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in \ +WORKDIR /ext-src +RUN case "${PG_VERSION:?}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ @@ -808,36 +1044,71 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \ echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \ - mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install + mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . 
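+# pg_embedding-src exits 0 without unpacking anything for PostgreSQL versions it no
+# longer targets (see the `exit 0` branch above), so the build stage below only runs
+# the make steps when the source directory actually exists:
+#
+#   RUN if [ -d pg_embedding-src ]; then ... fi
+#
+# pg_anon and pgx_ulid use the same guard further down.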
+ +FROM pg-build AS pg_embedding-build +COPY --from=pg_embedding-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ +RUN if [ -d pg_embedding-src ]; then \ + cd pg_embedding-src && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install; \ + fi ######################################################################################### # -# Layer "pg-anon-pg-build" +# Layer "pg_anon-build" # compile anon extension # ######################################################################################### -FROM build-deps AS pg-anon-pg-build +FROM build-deps AS pg_anon-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # This is an experimental extension, never got to real production. # !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. -ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ +WORKDIR /ext-src +RUN case "${PG_VERSION:?}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ - mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ - mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \ - sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ - comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T - + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . 
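+# A note on the pgrx-based stages further down: they install cargo-pgrx 0.12.9, while
+# several upstream extension sources still pin pgrx 0.12.6, so the corresponding -src
+# stages rewrite the dependency before the build, along the lines of:
+#
+#   sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml
+#
+# The `unsafe-postgres` feature is what allows building against a Postgres fork whose
+# ABI name differs from stock PostgreSQL.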
+ +FROM pg-build AS pg_anon-build +COPY --from=pg_anon-src /ext-src/ /ext-src/ +WORKDIR /ext-src +RUN if [ -d pg_anon-src ]; then \ + cd pg_anon-src && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; \ + fi + +######################################################################################### +# +# Layer "pg build with nonroot user and cargo installed" +# This layer is base and common for layers with `pgrx` +# +######################################################################################### +FROM pg-build AS pg-build-nonroot-with-cargo +ARG PG_VERSION + +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ + apt clean && rm -rf /var/lib/apt/lists/* && \ + useradd -ms /bin/bash nonroot -b /home + +ENV HOME=/home/nonroot +ENV PATH="/home/nonroot/.cargo/bin:$PATH" +USER nonroot +WORKDIR /home/nonroot + +# See comment on the top of the file regading `echo` and `\n` +RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc + +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ + chmod +x rustup-init && \ + ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ + rm rustup-init ######################################################################################### # @@ -845,24 +1116,10 @@ RUN case "${PG_VERSION}" in "v17") \ # This layer is used to build `pgrx` deps # ######################################################################################### -FROM build-deps AS rust-extensions-build +FROM pg-build-nonroot-with-cargo AS rust-extensions-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ - apt-get install --no-install-recommends -y curl libclang-dev && \ - useradd -ms /bin/bash nonroot -b /home - -ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" -USER nonroot -WORKDIR /home/nonroot - -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ - chmod +x rustup-init && \ - ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ - rm rustup-init && \ - case "${PG_VERSION}" in \ +RUN case "${PG_VERSION:?}" in \ 'v17') \ echo 'v17 is not supported yet by pgrx. 
Quit' && exit 0;; \ esac && \ @@ -881,70 +1138,67 @@ USER root # and eventually get merged with `rust-extensions-build` # ######################################################################################### -FROM build-deps AS rust-extensions-build-pgrx12 +FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx12 ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ - apt-get install --no-install-recommends -y curl libclang-dev && \ - useradd -ms /bin/bash nonroot -b /home - -ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" -USER nonroot -WORKDIR /home/nonroot - -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ - chmod +x rustup-init && \ - ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ - rm rustup-init && \ - cargo install --locked --version 0.12.6 cargo-pgrx && \ +RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root ######################################################################################### # -# Layers "pg-onnx-build" and "pgrag-pg-build" +# Layers "pg-onnx-build" and "pgrag-build" # Compile "pgrag" extensions # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-onnx-build +FROM build-deps AS pgrag-src +ARG PG_VERSION +WORKDIR /ext-src +RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ + mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ + echo "#nothing to test here" > neon-test.sh + +RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ + echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ + mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build-pgrx12 AS pgrag-build +COPY --from=pgrag-src /ext-src/ /ext-src/ + +# Install build-time dependencies # cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25). # Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise -RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \ +WORKDIR /ext-src/onnxruntime-src +RUN apt update && apt install --no-install-recommends --no-install-suggests -y \ + python3 python3-pip python3-venv protobuf-compiler && \ + apt clean && rm -rf /var/lib/apt/lists/* && \ python3 -m venv venv && \ . venv/bin/activate && \ - python3 -m pip install cmake==3.30.5 && \ - wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ - mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ - ./build.sh --config Release --parallel --skip_submodule_sync --skip_tests --allow_running_as_root + python3 -m pip install cmake==3.30.5 +RUN . 
venv/bin/activate && \ + ./build.sh --config Release --parallel --cmake_generator Ninja \ + --skip_submodule_sync --skip_tests --allow_running_as_root -FROM pg-onnx-build AS pgrag-pg-build - -RUN apt-get install -y protobuf-compiler && \ - wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ - echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ - mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \ - \ - cd exts/rag && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ +WORKDIR /ext-src/pgrag-src +RUN cd exts/rag && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \ - \ - cd ../rag_bge_small_en_v15 && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control + +RUN cd exts/rag_bge_small_en_v15 && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ cargo pgrx install --release --features remote_onnx && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \ - \ - cd ../rag_jina_reranker_v1_tiny_en && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control + +RUN cd exts/rag_jina_reranker_v1_tiny_en && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ cargo pgrx install --release --features remote_onnx && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control @@ -952,133 +1206,179 @@ RUN apt-get install -y protobuf-compiler && \ ######################################################################################### # -# Layer "pg-jsonschema-pg-build" +# Layer "pg_jsonschema-build" # Compile "pg_jsonschema" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build +FROM build-deps AS pg_jsonschema-src ARG PG_VERSION -# version 0.3.3 supports v17 # last release v0.3.3 - Oct 16, 2024 -# -# there were no breaking changes -# so we can use the same version for all postgres versions -RUN case "${PG_VERSION}" in \ - "v14" | "v15" | "v16" | "v17") \ - export PG_JSONSCHEMA_VERSION=0.3.3 \ - export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \ - echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum 
--check && \ - mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ +WORKDIR /ext-src +RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \ + echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \ + mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . + +FROM rust-extensions-build-pgrx12 AS pg_jsonschema-build +COPY --from=pg_jsonschema-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_jsonschema-src +RUN \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 # `unsafe-postgres` feature allows to build pgx extensions # against postgres forks that decided to change their ABI name (like us). # With that we can build extensions without forking them and using stock # pgx. As this feature is new few manual version bumps were required. - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control ######################################################################################### # -# Layer "pg-graphql-pg-build" +# Layer "pg_graphql-build" # Compile "pg_graphql" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build +FROM build-deps AS pg_graphql-src ARG PG_VERSION -# version 1.5.9 supports v17 # last release v1.5.9 - Oct 16, 2024 -# -# there were no breaking changes -# so we can use the same version for all postgres versions -RUN case "${PG_VERSION}" in \ - "v14" | "v15" | "v16" | "v17") \ - export PG_GRAPHQL_VERSION=1.5.9 \ - export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \ - echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \ +WORKDIR /ext-src +COPY compute/patches/pg_graphql.patch . +RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \ + echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - cargo pgrx install --release && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \ + patch -p1 < /ext-src/pg_graphql.patch + + +FROM rust-extensions-build-pgrx12 AS pg_graphql-build +COPY --from=pg_graphql-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_graphql-src +RUN cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control ######################################################################################### # -# Layer "pg-tiktoken-build" +# Layer "pg_tiktoken-build" # Compile "pg_tiktoken" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-tiktoken-pg-build +FROM build-deps AS pg_tiktoken-src ARG PG_VERSION # doesn't use releases # 9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7 - on Oct 29, 2024 +WORKDIR /ext-src RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \ echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ - # TODO update pgrx version in the pg_tiktoken repo and remove this line - sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \ - sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \ - cargo pgrx install --release && \ + sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pg_tiktoken-build +COPY --from=pg_tiktoken-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_tiktoken-src +RUN cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control ######################################################################################### # -# Layer "pg-pgx-ulid-build" -# Compile "pgx_ulid" extension +# Layer "pgx_ulid-build" +# Compile "pgx_ulid" extension for v16 and below # ######################################################################################### -FROM rust-extensions-build AS pg-pgx-ulid-build +FROM build-deps AS pgx_ulid-src ARG PG_VERSION -# doesn't support v17 yet -# https://github.com/pksunkara/pgx_ulid/pull/52 -RUN case "${PG_VERSION}" in "v17") \ - echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \ +WORKDIR /ext-src +RUN case "${PG_VERSION:?}" in \ + "v14" | "v15" | "v16") \ + ;; \ + *) \ + echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \ + ;; \ esac && \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ - echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ + echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz 
--strip-components=1 -C . && \ - sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - cargo pgrx install --release && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml + +FROM rust-extensions-build AS pgx_ulid-build +COPY --from=pgx_ulid-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ +RUN if [ -d pgx_ulid-src ]; then \ + cd pgx_ulid-src && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control; \ + fi ######################################################################################### # -# Layer "pg-session-jwt-build" +# Layer "pgx_ulid-pgrx12-build" +# Compile "pgx_ulid" extension for v17 and up +# +######################################################################################### + +FROM build-deps AS pgx_ulid-pgrx12-src +ARG PG_VERSION + +WORKDIR /ext-src +RUN case "${PG_VERSION:?}" in \ + "v17") \ + ;; \ + *) \ + echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \ + ;; \ + esac && \ + wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \ + echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \ + mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ + sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pgx_ulid-pgrx12-build +ARG PG_VERSION +WORKDIR /ext-src +COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/ +RUN if [ -d pgx_ulid-src ]; then \ + cd pgx_ulid-src && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control; \ + fi + +######################################################################################### +# +# Layer "pg_session_jwt-build" # Compile "pg_session_jwt" extension # ######################################################################################### -FROM rust-extensions-build-pgrx12 AS pg-session-jwt-build +FROM build-deps AS pg_session_jwt-src ARG PG_VERSION # NOTE: local_proxy depends on the version of pg_session_jwt # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs -RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \ - echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \ +WORKDIR /ext-src +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \ + echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ - cargo pgrx install --release + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \ + sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ + sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml + +FROM rust-extensions-build-pgrx12 AS pg_session_jwt-build +COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_session_jwt-src +RUN cargo pgrx install --release ######################################################################################### # @@ -1087,17 +1387,20 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2 # ######################################################################################### -FROM build-deps AS wal2json-pg-build +FROM build-deps AS wal2json-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # wal2json wal2json_2_6 supports v17 # last release wal2json_2_6 - Apr 25, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" +WORKDIR /ext-src RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . + +FROM pg-build AS wal2json-build +COPY --from=wal2json-src /ext-src/ /ext-src/ +WORKDIR /ext-src/wal2json-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install ######################################################################################### @@ -1106,17 +1409,20 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar. # compile pg_ivm extension # ######################################################################################### -FROM build-deps AS pg-ivm-build +FROM build-deps AS pg_ivm-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # pg_ivm v1.9 supports v17 # last release v1.9 - Jul 31 -ENV PATH="/usr/local/pgsql/bin/:$PATH" +WORKDIR /ext-src RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ - mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . 
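Note: the split used for pg_ivm here (a source stage above, a build stage below) is the same two-stage pattern this refactor applies to every extension: a `<name>-src` stage based on `build-deps` that only fetches, verifies, and patches the tarball, and a `<name>-build` stage based on `pg-build` (or `rust-extensions-build`) that compiles and installs it. A minimal sketch of the pattern, with a hypothetical extension name and placeholder URL/checksum that are not part of this diff:

# Illustrative only; "someext", its URL, and <expected-sha256> are placeholders.
FROM build-deps AS someext-src
WORKDIR /ext-src
RUN wget https://example.com/someext-1.0.tar.gz -O someext.tar.gz && \
    echo "<expected-sha256> someext.tar.gz" | sha256sum --check && \
    mkdir someext-src && cd someext-src && tar xzf ../someext.tar.gz --strip-components=1 -C .

FROM pg-build AS someext-build
COPY --from=someext-src /ext-src/ /ext-src/
WORKDIR /ext-src/someext-src
RUN make -j $(getconf _NPROCESSORS_ONLN) install

Keeping the download in its own stage means later stages (such as the extension-tests image) can copy the sources without recompiling them.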
+ +FROM pg-build AS pg_ivm-build +COPY --from=pg_ivm-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_ivm-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control @@ -1126,17 +1432,20 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv # compile pg_partman extension # ######################################################################################### -FROM build-deps AS pg-partman-build +FROM build-deps AS pg_partman-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # should support v17 https://github.com/pgpartman/pg_partman/discussions/693 # last release 5.1.0 Apr 2, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" +WORKDIR /ext-src RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ - mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ + mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . + +FROM pg-build AS pg_partman-build +COPY --from=pg_partman-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_partman-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control @@ -1146,109 +1455,248 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz # compile pg_mooncake extension # ######################################################################################### -FROM rust-extensions-build AS pg-mooncake-build +FROM build-deps AS pg_mooncake-src ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +WORKDIR /ext-src +COPY compute/patches/duckdb_v113.patch . +RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.2/pg_mooncake-0.1.2.tar.gz -O pg_mooncake.tar.gz && \ + echo "4550473784fcdd2e1e18062bc01eb9c286abd27cdf5e11a4399be6c0a426ba90 pg_mooncake.tar.gz" | sha256sum --check && \ + mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \ + cd third_party/duckdb && patch -p1 < /ext-src/duckdb_v113.patch && cd ../.. 
&& \ + echo "make -f pg_mooncake-src/Makefile.build installcheck TEST_DIR=./test SQL_DIR=./sql SRC_DIR=./src" > neon-test.sh && \ + chmod a+x neon-test.sh -# The topmost commit in the `neon` branch at the time of writing this -# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/ -# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af -ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af -ENV PATH="/usr/local/pgsql/bin/:$PATH" - -RUN case "${PG_VERSION}" in \ - 'v14') \ - echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \ - esac && \ - git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \ - cd pg_mooncake-src && \ - git checkout "${PG_MOONCAKE_VERSION}" && \ - git submodule update --init --depth 1 --recursive && \ - make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \ - make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \ +FROM rust-extensions-build AS pg_mooncake-build +COPY --from=pg_mooncake-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_mooncake-src +RUN make release -j $(getconf _NPROCESSORS_ONLN) && \ + make install -j $(getconf _NPROCESSORS_ONLN) && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control ######################################################################################### # -# Layer "neon-pg-ext-build" +# Layer "pg-duckdb-pg-build" +# compile pg_duckdb extension +# +######################################################################################### +FROM build-deps AS pg_duckdb-src +WORKDIR /ext-src +COPY compute/patches/pg_duckdb_v031.patch . +COPY compute/patches/duckdb_v120.patch . +# pg_duckdb build requires source dir to be a git repo to get submodules +# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: +# - extension management function duckdb.install_extension() +# - access to duckdb.extensions table and its sequence +RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \ + cd pg_duckdb-src && \ + git submodule update --init --recursive && \ + patch -p1 < /ext-src/pg_duckdb_v031.patch && \ + cd third_party/duckdb && \ + patch -p1 < /ext-src/duckdb_v120.patch + +FROM pg-build AS pg_duckdb-build +ARG PG_VERSION +COPY --from=pg_duckdb-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_duckdb-src +RUN make install -j $(getconf _NPROCESSORS_ONLN) && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control + +######################################################################################### +# +# Layer "pg_repack" +# compile pg_repack extension +# +######################################################################################### + +FROM build-deps AS pg_repack-src +ARG PG_VERSION +WORKDIR /ext-src +RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \ + echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \ + mkdir pg_repack-src && cd pg_repack-src && tar xzf ../pg_repack.tar.gz --strip-components=1 -C . 
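The pg_duckdb_v031.patch applied a few stages above is described as opening up superuser-only pieces of pg_duckdb (duckdb.install_extension() and the duckdb.extensions table plus its sequence) to neon_superuser. The exact statements live in the patch itself; the following is only a hypothetical sketch of that kind of grant, runnable via psql against a compute with pg_duckdb installed, and the install_extension signature is an assumption rather than something copied from the patch:

# Illustration only, not the patch contents; function signature is assumed.
psql -v ON_ERROR_STOP=1 <<'SQL'
GRANT EXECUTE ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser;
GRANT SELECT, INSERT, UPDATE, DELETE ON TABLE duckdb.extensions TO neon_superuser;
GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA duckdb TO neon_superuser;
SQL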
+ +FROM rust-extensions-build AS pg_repack-build +COPY --from=pg_repack-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pg_repack-src +RUN make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install + + +######################################################################################### +# +# Layer "pgaudit" +# compile pgaudit extension +# +######################################################################################### + +FROM build-deps AS pgaudit-src +ARG PG_VERSION +WORKDIR /ext-src +RUN case "${PG_VERSION}" in \ + "v14") \ + export PGAUDIT_VERSION=1.6.2 \ + export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \ + ;; \ + "v15") \ + export PGAUDIT_VERSION=1.7.0 \ + export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \ + ;; \ + "v16") \ + export PGAUDIT_VERSION=16.0 \ + export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \ + ;; \ + "v17") \ + export PGAUDIT_VERSION=17.0 \ + export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \ + ;; \ + *) \ + echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \ + esac && \ + wget https://github.com/pgaudit/pgaudit/archive/refs/tags/${PGAUDIT_VERSION}.tar.gz -O pgaudit.tar.gz && \ + echo "${PGAUDIT_CHECKSUM} pgaudit.tar.gz" | sha256sum --check && \ + mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C . + +FROM pg-build AS pgaudit-build +COPY --from=pgaudit-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgaudit-src +RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) + +######################################################################################### +# +# Layer "pgauditlogtofile" +# compile pgauditlogtofile extension +# +######################################################################################### + +FROM build-deps AS pgauditlogtofile-src +ARG PG_VERSION +WORKDIR /ext-src +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16" | "v17") \ + export PGAUDITLOGTOFILE_VERSION=v1.6.4 \ + export PGAUDITLOGTOFILE_CHECKSUM=ef801eb09c26aaa935c0dabd92c81eb9ebe338930daa9674d420a280c6bc2d70 \ + ;; \ + *) \ + echo "pgauditlogtofile is not supported on this PostgreSQL version" && exit 1;; \ + esac && \ + wget https://github.com/fmbiete/pgauditlogtofile/archive/refs/tags/${PGAUDITLOGTOFILE_VERSION}.tar.gz -O pgauditlogtofile.tar.gz && \ + echo "${PGAUDITLOGTOFILE_CHECKSUM} pgauditlogtofile.tar.gz" | sha256sum --check && \ + mkdir pgauditlogtofile-src && cd pgauditlogtofile-src && tar xzf ../pgauditlogtofile.tar.gz --strip-components=1 -C . 
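The pgaudit and pgauditlogtofile stages above pin each release tarball to a per-version SHA-256. When bumping one of these versions, the new digest has to be computed locally and pasted into the matching *_CHECKSUM value; one quick way to do that (shown for the pgaudit 17.0 tag as an example, assuming wget and sha256sum are available):

# Fetch the release tarball and print its digest; copy the output into PGAUDIT_CHECKSUM above.
wget https://github.com/pgaudit/pgaudit/archive/refs/tags/17.0.tar.gz -O pgaudit.tar.gz
sha256sum pgaudit.tar.gz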
+ +FROM pg-build AS pgauditlogtofile-build +COPY --from=pgauditlogtofile-src /ext-src/ /ext-src/ +WORKDIR /ext-src/pgauditlogtofile-src +RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) + +######################################################################################### +# +# Layer "neon-ext-build" # compile neon extensions # ######################################################################################### -FROM build-deps AS neon-pg-ext-build +FROM pg-build AS neon-ext-build ARG PG_VERSION -# Public extensions -COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=postgis-build /sfcgal/* / -COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /h3/usr / -COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=ip4r-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql -COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ - RUN make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon \ -s install && \ make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_utils \ -s install && \ make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_test_utils \ -s install && \ make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_rmgr \ - -s install && \ - case "${PG_VERSION}" in \ - "v14" | "v15") \ - ;; \ - "v16" | "v17") \ - echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - make -j $(getconf 
_NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/hnsw \ -s install ######################################################################################### # -# Compile and run the Neon-specific `compute_ctl` and `fast_import` binaries +# Layer "extensions-none" +# +######################################################################################### +FROM build-deps AS extensions-none + +RUN mkdir /usr/local/pgsql + +######################################################################################### +# +# Layer "extensions-minimal" +# +# This subset of extensions includes the extensions that we have in +# shared_preload_libraries by default. +# +######################################################################################### +FROM build-deps AS extensions-minimal + +COPY --from=pgrag-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=timescaledb-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_cron-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ + +######################################################################################### +# +# Layer "extensions-all" +# Bundle together all the extensions +# +######################################################################################### +FROM build-deps AS extensions-all + +# Public extensions +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=postgis-build /sfcgal/* / +COPY --from=pgrouting-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /h3/usr / +COPY --from=postgresql-unit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgvector-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgjwt-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgrag-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=ip4r-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=prefix-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=hll-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plpgsql_check-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=timescaledb-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_hint_plan-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_cron-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgx_ulid-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgx_ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_session_jwt-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=rdkit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_embedding-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY 
--from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/ + +######################################################################################### +# +# Layer "neon-pg-ext-build" +# Includes Postgres and all the extensions chosen by EXTENSIONS arg. +# +######################################################################################### +FROM extensions-${EXTENSIONS} AS neon-pg-ext-build + +######################################################################################### +# +# Compile the Neon-specific `compute_ctl`, `fast_import`, and `local_proxy` binaries # ######################################################################################### FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools @@ -1258,18 +1706,15 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . -RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto - -######################################################################################### -# -# Final compute-tools image -# -######################################################################################### - -FROM debian:$DEBIAN_FLAVOR AS compute-tools-image - -COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl -COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import +RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \ + --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/git \ + --mount=type=cache,uid=1000,target=/home/nonroot/target \ + mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \ + mkdir target-bin && \ + cp target/release-line-debug-size-lto/compute_ctl \ + target/release-line-debug-size-lto/fast_import \ + target/release-line-debug-size-lto/local_proxy \ + target-bin ######################################################################################### # @@ -1277,10 +1722,11 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_ # ######################################################################################### -FROM debian:$DEBIAN_FLAVOR AS pgbouncer +FROM $BASE_IMAGE_SHA AS pgbouncer RUN set -e \ - && apt-get update \ - && apt-get install --no-install-recommends -y \ + && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \ + && apt update \ + && apt install --no-install-suggests --no-install-recommends -y \ build-essential \ git \ ca-certificates \ @@ -1288,7 +1734,8 @@ RUN set -e \ automake \ libevent-dev \ libtool \ - pkg-config + pkg-config \ + && apt clean && rm -rf /var/lib/apt/lists/* # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) ENV PGBOUNCER_TAG=pgbouncer_1_22_1 @@ -1296,42 +1743,46 @@ RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ && ./autogen.sh \ - && LDFLAGS=-static ./configure 
--prefix=/usr/local/pgbouncer --without-openssl \ + && ./configure --prefix=/usr/local/pgbouncer --without-openssl \ && make -j $(nproc) dist_man_MANS= \ && make install dist_man_MANS= ######################################################################################### # -# Compile the Neon-specific `local_proxy` binary +# Layer "exporters" # ######################################################################################### -FROM $REPOSITORY/$IMAGE:$TAG AS local_proxy -ARG BUILD_TAG -ENV BUILD_TAG=$BUILD_TAG - -USER nonroot -# Copy entire project to get Cargo.* files with proper dependencies for the whole project -COPY --chown=nonroot . . -RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin local_proxy - -######################################################################################### -# -# Layers "postgres-exporter" and "sql-exporter" -# -######################################################################################### - -FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter - -# Keep the version the same as in build-tools.Dockerfile and -# test_runner/regress/test_compute_metrics.py. -FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter +FROM build-deps AS exporters +ARG TARGETARCH +# Keep sql_exporter version same as in build-tools.Dockerfile and +# test_runner/regress/test_compute_metrics.py +# See comment on the top of the file regading `echo`, `-e` and `\n` +RUN if [ "$TARGETARCH" = "amd64" ]; then\ + postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\ + pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ + sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ + else\ + postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\ + pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\ + sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\ + fi\ + && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\ + | tar xzf - --strip-components=1 -C.\ + && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\ + | tar xzf - --strip-components=1 -C.\ + && curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.0/sql_exporter-0.17.0.linux-${TARGETARCH}.tar.gz\ + | tar xzf - --strip-components=1 -C.\ + && echo "${postgres_exporter_sha256} postgres_exporter" | sha256sum -c -\ + && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\ + && echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c - ######################################################################################### # # Clean up postgres folder before inclusion # ######################################################################################### -FROM neon-pg-ext-build AS postgres-cleanup-layer +FROM neon-ext-build AS postgres-cleanup-layer + COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) @@ -1357,118 +1808,80 @@ USER nonroot COPY --chown=nonroot compute compute -RUN make PG_VERSION="${PG_VERSION}" -C compute +RUN make PG_VERSION="${PG_VERSION:?}" -C compute 
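Several references here switch from `${PG_VERSION}` to `${PG_VERSION:?}`. The `:?` form of parameter expansion makes the shell abort the RUN step with an error when the variable is unset or empty, instead of silently expanding to nothing. A quick illustration in plain bash (the error message text is just an example):

# ${VAR:?} fails fast when VAR is unset or empty; the step exits non-zero.
unset PG_VERSION
echo "building for ${PG_VERSION:?PG_VERSION must be set}"
# bash: PG_VERSION: PG_VERSION must be set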
######################################################################################### # -# Layer neon-pg-ext-test +# Layer extension-tests # ######################################################################################### -FROM neon-pg-ext-build AS neon-pg-ext-test +FROM pg-build AS extension-tests ARG PG_VERSION -RUN mkdir /ext-src +COPY docker-compose/ext-src/ /ext-src/ + +COPY --from=pg-build /postgres /postgres +#COPY --from=postgis-src /ext-src/ /ext-src/ +COPY --from=plv8-src /ext-src/ /ext-src/ +#COPY --from=h3-pg-src /ext-src/ /ext-src/ +COPY --from=postgresql-unit-src /ext-src/ /ext-src/ +COPY --from=pgvector-src /ext-src/ /ext-src/ +COPY --from=pgjwt-src /ext-src/ /ext-src/ +#COPY --from=pgrag-src /ext-src/ /ext-src/ +#COPY --from=pg_jsonschema-src /ext-src/ /ext-src/ +COPY --from=pg_graphql-src /ext-src/ /ext-src/ +#COPY --from=pg_tiktoken-src /ext-src/ /ext-src/ +COPY --from=hypopg-src /ext-src/ /ext-src/ +COPY --from=pg_hashids-src /ext-src/ /ext-src/ +COPY --from=rum-src /ext-src/ /ext-src/ +COPY --from=pgtap-src /ext-src/ /ext-src/ +COPY --from=ip4r-src /ext-src/ /ext-src/ +COPY --from=prefix-src /ext-src/ /ext-src/ +COPY --from=hll-src /ext-src/ /ext-src/ +COPY --from=plpgsql_check-src /ext-src/ /ext-src/ +#COPY --from=timescaledb-src /ext-src/ /ext-src/ +COPY --from=pg_hint_plan-src /ext-src/ /ext-src/ +COPY compute/patches/pg_hint_plan_${PG_VERSION:?}.patch /ext-src +RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION:?}.patch +COPY --from=pg_cron-src /ext-src/ /ext-src/ +#COPY --from=pgx_ulid-src /ext-src/ /ext-src/ +#COPY --from=pgx_ulid-pgrx12-src /ext-src/ /ext-src/ +#COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ +#COPY --from=rdkit-src /ext-src/ /ext-src/ +COPY --from=pg_uuidv7-src /ext-src/ /ext-src/ +COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/ +COPY --from=pg_semver-src /ext-src/ /ext-src/ +#COPY --from=pg_embedding-src /ext-src/ /ext-src/ +#COPY --from=wal2json-src /ext-src/ /ext-src/ +COPY --from=pg_ivm-src /ext-src/ /ext-src/ +COPY --from=pg_partman-src /ext-src/ /ext-src/ +#COPY --from=pg_mooncake-src /ext-src/ /ext-src/ +COPY --from=pg_repack-src /ext-src/ /ext-src/ +COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY compute/patches/pg_repack.patch /ext-src +RUN cd /ext-src/pg_repack-src && patch -p1 > /etc/ld.so.conf && /sbin/ldconfig && \ - # create folder for file cache - mkdir -p -m 777 /neon/cache -COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import - -# pgbouncer and its config -COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer -COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini - -# local_proxy and its config -COPY --from=local_proxy --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy -RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy - -# Metrics exporter binaries and configuration files -COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter -COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter - -COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml - -COPY --from=sql_exporter_preprocessor 
--chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml -COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml - -# Create remote extension download directory -RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] # Install: # libreadline8 for psql @@ -1479,8 +1892,9 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca # libzstd1 for zstd # libboost* for rdkit # ca-certificates for communicating with s3 by compute_ctl - - +# libevent for pgbouncer +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc RUN apt update && \ case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): @@ -1515,31 +1929,54 @@ RUN apt update && \ libxslt1.1 \ libzstd1 \ libcurl4 \ + libevent-2.1-7 \ locales \ procps \ ca-certificates \ $VERSION_INSTALLS && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 -# s5cmd 2.2.2 from https://github.com/peak/s5cmd/releases/tag/v2.2.2 -# used by fast_import -ARG TARGETARCH -ADD https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_linux_$TARGETARCH.deb /tmp/s5cmd.deb -RUN set -ex; \ - \ - # Determine the expected checksum based on TARGETARCH - if [ "${TARGETARCH}" = "amd64" ]; then \ - CHECKSUM="392c385320cd5ffa435759a95af77c215553d967e4b1c0fffe52e4f14c29cf85"; \ - elif [ "${TARGETARCH}" = "arm64" ]; then \ - CHECKSUM="939bee3cf4b5604ddb00e67f8c157b91d7c7a5b553d1fbb6890fad32894b7b46"; \ - else \ - echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ - fi; \ - \ - # Compute and validate the checksum - echo "${CHECKSUM} /tmp/s5cmd.deb" | sha256sum -c - -RUN dpkg -i /tmp/s5cmd.deb && rm /tmp/s5cmd.deb +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ + mkdir /var/db/postgres/pgbouncer && \ + chown -R postgres:postgres /var/db/postgres && \ + chmod 0750 /var/db/postgres/compute && \ + chmod 0750 /var/db/postgres/pgbouncer && \ + # create folder for file cache + mkdir -p -m 777 /neon/cache && \ + # Create remote extension download directory + mkdir /usr/local/download_extensions && \ + chown -R postgres:postgres /usr/local/download_extensions + +# pgbouncer and its config +COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer +COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini + +COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/compute_ctl /usr/local/bin/compute_ctl +COPY --from=compute-tools --chown=postgres /home/nonroot/target-bin/fast_import /usr/local/bin/fast_import + +# local_proxy and its config +COPY --from=compute-tools --chown=postgres 
/home/nonroot/target-bin/local_proxy /usr/local/bin/local_proxy +RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy + +# Metrics exporter binaries and configuration files +COPY --from=exporters ./postgres_exporter /bin/postgres_exporter +COPY --from=exporters ./pgbouncer_exporter /bin/pgbouncer_exporter +COPY --from=exporters ./sql_exporter /bin/sql_exporter + +COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml + +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml + +# Make the libraries we built available +RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig ENV LANG=en_US.utf8 USER postgres diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index aa6cc1cfc8..f8f4cab63b 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -3,7 +3,7 @@ metrics: [ import 'sql_exporter/checkpoints_req.libsonnet', import 'sql_exporter/checkpoints_timed.libsonnet', - import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet', + import 'sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet', import 'sql_exporter/compute_current_lsn.libsonnet', import 'sql_exporter/compute_logical_snapshot_files.libsonnet', import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet', diff --git a/compute/etc/pgbouncer.ini b/compute/etc/pgbouncer.ini index abcd165636..9d68cbb8d5 100644 --- a/compute/etc/pgbouncer.ini +++ b/compute/etc/pgbouncer.ini @@ -19,3 +19,12 @@ max_prepared_statements=0 admin_users=postgres unix_socket_dir=/tmp/ unix_socket_mode=0777 +; required for pgbouncer_exporter +ignore_startup_parameters=extra_float_digits + +;; Disable connection logging. It produces a lot of logs that no one looks at, +;; and we can get similar log entries from the proxy too. We had incidents in +;; the past where the logging significantly stressed the log device or pgbouncer +;; itself. 
+log_connections=0 +log_disconnections=0 diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet similarity index 61% rename from compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet rename to compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet index 02c803cfa6..31725bd179 100644 --- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet @@ -1,10 +1,10 @@ { - metric_name: 'compute_backpressure_throttling_seconds', - type: 'gauge', + metric_name: 'compute_backpressure_throttling_seconds_total', + type: 'counter', help: 'Time compute has spent throttled', key_labels: null, values: [ 'throttled', ], - query: importstr 'sql_exporter/compute_backpressure_throttling_seconds.sql', + query: importstr 'sql_exporter/compute_backpressure_throttling_seconds_total.sql', } diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql similarity index 100% rename from compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql rename to compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql diff --git a/compute/patches/cloud_regress_pg16.patch b/compute/patches/cloud_regress_pg16.patch index a4b93d0260..3f0bb84ae7 100644 --- a/compute/patches/cloud_regress_pg16.patch +++ b/compute/patches/cloud_regress_pg16.patch @@ -981,7 +981,7 @@ index fc42d418bf..e38f517574 100644 CREATE SCHEMA addr_nsp; SET search_path TO 'addr_nsp'; diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out -index 8475231735..1afae5395f 100644 +index 8475231735..0653946337 100644 --- a/src/test/regress/expected/password.out +++ b/src/test/regress/expected/password.out @@ -12,11 +12,11 @@ SET password_encryption = 'md5'; -- ok @@ -1006,65 +1006,63 @@ index 8475231735..1afae5395f 100644 -----------------+--------------------------------------------------- - regress_passwd1 | md5783277baca28003b33453252be4dbb34 - regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3 -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: - regress_passwd4 | + regress_passwd4 | SCRAM-SHA-256$4096:$: (4 rows) -- Rename a role -@@ -54,24 +54,30 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -54,24 +54,16 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; -- passwords. 
SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +--- already encrypted with MD5, use as it is +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. +--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- This looks like a valid SCRAM-SHA-256 secret, but it is not - -- so it should be hashed with SCRAM-SHA-256. - CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- These may look like valid MD5 secrets, but they are not, so they - -- should be hashed with SCRAM-SHA-256. 
- -- trailing garbage at the end - CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- invalid length - CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Changing the SCRAM iteration count SET scram_iterations = 1024; CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount'; -@@ -81,63 +87,67 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ +@@ -81,11 +73,11 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70 - regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: regress_passwd4 | SCRAM-SHA-256$4096:$: - regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023 -- regress_passwd6 | SCRAM-SHA-256$4096:$: -- regress_passwd7 | SCRAM-SHA-256$4096:$: -- regress_passwd8 | SCRAM-SHA-256$4096:$: - regress_passwd9 | SCRAM-SHA-256$1024:$: --(9 rows) -+(5 rows) - ++ regress_passwd5 | SCRAM-SHA-256$4096:$: + regress_passwd6 | SCRAM-SHA-256$4096:$: + regress_passwd7 | SCRAM-SHA-256$4096:$: + regress_passwd8 | SCRAM-SHA-256$4096:$: +@@ -95,23 +87,20 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ -- An empty password is not allowed, in any form CREATE ROLE regress_passwd_empty PASSWORD ''; NOTICE: empty string is not a valid password, clearing password @@ -1082,56 +1080,37 @@ index 8475231735..1afae5395f 100644 -(1 row) +(0 rows) - -- Test with invalid stored and server keys. - -- - -- The first is valid, to act as a control. The others have too long - -- stored/server keys. They will be re-hashed. 
- CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} +--- Test with invalid stored and server keys. +--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. +-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. 
SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed - FROM pg_authid - WHERE rolname LIKE 'regress_passwd_sha_len%' +@@ -120,7 +109,7 @@ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassw ORDER BY rolname; -- rolname | is_rolpassword_rehashed ---------------------------+------------------------- + rolname | is_rolpassword_rehashed + -------------------------+------------------------- - regress_passwd_sha_len0 | f -- regress_passwd_sha_len1 | t -- regress_passwd_sha_len2 | t --(3 rows) -+ rolname | is_rolpassword_rehashed -+---------+------------------------- -+(0 rows) - - DROP ROLE regress_passwd1; - DROP ROLE regress_passwd2; - DROP ROLE regress_passwd3; - DROP ROLE regress_passwd4; - DROP ROLE regress_passwd5; -+ERROR: role "regress_passwd5" does not exist - DROP ROLE regress_passwd6; -+ERROR: role "regress_passwd6" does not exist - DROP ROLE regress_passwd7; -+ERROR: role "regress_passwd7" does not exist ++ regress_passwd_sha_len0 | t + regress_passwd_sha_len1 | t + regress_passwd_sha_len2 | t + (3 rows) +@@ -135,6 +124,7 @@ DROP ROLE regress_passwd7; DROP ROLE regress_passwd8; -+ERROR: role "regress_passwd8" does not exist DROP ROLE regress_passwd9; DROP ROLE regress_passwd_empty; +ERROR: role "regress_passwd_empty" does not exist DROP ROLE regress_passwd_sha_len0; -+ERROR: role "regress_passwd_sha_len0" does not exist DROP ROLE regress_passwd_sha_len1; -+ERROR: role "regress_passwd_sha_len1" does not exist DROP ROLE regress_passwd_sha_len2; -+ERROR: role "regress_passwd_sha_len2" does not exist - -- all entries should have been removed - SELECT rolname, rolpassword - FROM pg_authid diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 5b9dba7b32..cc408dad42 100644 --- a/src/test/regress/expected/privileges.out @@ -3194,7 +3173,7 @@ index 1a6c61f49d..1c31ac6a53 100644 -- Test generic object addressing/identification functions CREATE SCHEMA addr_nsp; diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql -index 53e86b0b6c..f07cf1ec54 100644 +index 53e86b0b6c..0303fdfe96 100644 --- a/src/test/regress/sql/password.sql +++ b/src/test/regress/sql/password.sql @@ -10,11 +10,11 @@ SET password_encryption = 'scram-sha-256'; -- ok @@ -3213,23 +3192,59 @@ index 53e86b0b6c..f07cf1ec54 100644 -- check list of created entries -- -@@ -42,14 +42,14 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -42,26 +42,18 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +--- already encrypted with MD5, use as it is +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +ALTER ROLE 
regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. +--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Changing the SCRAM iteration count + SET scram_iterations = 1024; +@@ -78,13 +70,10 @@ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a'; + ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4='; + SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty'; + +--- Test with invalid stored and server keys. +--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. +-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Check that the invalid secrets were re-hashed. A re-hashed secret + -- should not contain the original salt. 
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 249df17a58..b258e7f26a 100644 --- a/src/test/regress/sql/privileges.sql diff --git a/compute/patches/cloud_regress_pg17.patch b/compute/patches/cloud_regress_pg17.patch index cbe84ef54b..e57447a2c6 100644 --- a/compute/patches/cloud_regress_pg17.patch +++ b/compute/patches/cloud_regress_pg17.patch @@ -1014,10 +1014,10 @@ index fc42d418bf..e38f517574 100644 CREATE SCHEMA addr_nsp; SET search_path TO 'addr_nsp'; diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out -index 924d6e001d..5966531db6 100644 +index 924d6e001d..7fdda73439 100644 --- a/src/test/regress/expected/password.out +++ b/src/test/regress/expected/password.out -@@ -12,13 +12,13 @@ SET password_encryption = 'md5'; -- ok +@@ -12,13 +12,11 @@ SET password_encryption = 'md5'; -- ok SET password_encryption = 'scram-sha-256'; -- ok -- consistency of password entries SET password_encryption = 'md5'; @@ -1026,9 +1026,7 @@ index 924d6e001d..5966531db6 100644 -CREATE ROLE regress_passwd2; -ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2'; +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; -CREATE ROLE regress_passwd4 PASSWORD NULL; @@ -1037,71 +1035,69 @@ index 924d6e001d..5966531db6 100644 -- check list of created entries -- -- The scram secret will look something like: -@@ -32,10 +32,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ +@@ -32,10 +30,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5783277baca28003b33453252be4dbb34 - regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3 -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: - regress_passwd4 | + regress_passwd4 | SCRAM-SHA-256$4096:$: (4 rows) -- Rename a role -@@ -56,24 +56,30 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -56,24 +54,17 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; -- passwords. 
SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- This looks like a valid SCRAM-SHA-256 secret, but it is not - -- so it should be hashed with SCRAM-SHA-256. - CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- These may look like valid MD5 secrets, but they are not, so they - -- should be hashed with SCRAM-SHA-256. - -- trailing garbage at the end - CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- invalid length - CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. 
+--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Changing the SCRAM iteration count SET scram_iterations = 1024; CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount'; -@@ -83,63 +89,67 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ +@@ -83,11 +74,11 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70 - regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: regress_passwd4 | SCRAM-SHA-256$4096:$: - regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023 -- regress_passwd6 | SCRAM-SHA-256$4096:$: -- regress_passwd7 | SCRAM-SHA-256$4096:$: -- regress_passwd8 | SCRAM-SHA-256$4096:$: - regress_passwd9 | SCRAM-SHA-256$1024:$: --(9 rows) -+(5 rows) - ++ regress_passwd5 | SCRAM-SHA-256$4096:$: + regress_passwd6 | SCRAM-SHA-256$4096:$: + regress_passwd7 | SCRAM-SHA-256$4096:$: + regress_passwd8 | SCRAM-SHA-256$4096:$: +@@ -97,23 +88,20 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ -- An empty password is not allowed, in any form CREATE ROLE regress_passwd_empty PASSWORD ''; NOTICE: empty string is not a valid password, clearing password @@ -1119,56 +1115,37 @@ index 924d6e001d..5966531db6 100644 -(1 row) +(0 rows) - -- Test with invalid stored and server keys. - -- - -- The first is valid, to act as a control. The others have too long - -- stored/server keys. They will be re-hashed. - CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} +--- Test with invalid stored and server keys. 
+--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. +-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed - FROM pg_authid - WHERE rolname LIKE 'regress_passwd_sha_len%' +@@ -122,7 +110,7 @@ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassw ORDER BY rolname; -- rolname | is_rolpassword_rehashed ---------------------------+------------------------- + rolname | is_rolpassword_rehashed + -------------------------+------------------------- - regress_passwd_sha_len0 | f -- regress_passwd_sha_len1 | t -- regress_passwd_sha_len2 | t --(3 rows) -+ rolname | is_rolpassword_rehashed -+---------+------------------------- -+(0 rows) - - DROP ROLE regress_passwd1; - DROP ROLE regress_passwd2; - DROP ROLE regress_passwd3; - DROP ROLE regress_passwd4; - DROP ROLE regress_passwd5; -+ERROR: role "regress_passwd5" does not exist - DROP ROLE regress_passwd6; -+ERROR: role "regress_passwd6" does not exist - DROP ROLE regress_passwd7; -+ERROR: role "regress_passwd7" does not exist ++ regress_passwd_sha_len0 | t + regress_passwd_sha_len1 | t + regress_passwd_sha_len2 | t + (3 rows) +@@ -137,6 +125,7 @@ DROP ROLE regress_passwd7; DROP ROLE regress_passwd8; -+ERROR: role "regress_passwd8" does not exist DROP ROLE regress_passwd9; DROP ROLE regress_passwd_empty; +ERROR: role "regress_passwd_empty" does not exist DROP ROLE regress_passwd_sha_len0; -+ERROR: role "regress_passwd_sha_len0" does not exist DROP ROLE regress_passwd_sha_len1; -+ERROR: role "regress_passwd_sha_len1" does not exist DROP ROLE regress_passwd_sha_len2; -+ERROR: role "regress_passwd_sha_len2" does not exist - -- all entries should have been removed - SELECT rolname, rolpassword - FROM pg_authid diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 1296da0d57..f43fffa44c 100644 --- a/src/test/regress/expected/privileges.out @@ -3249,10 +3226,10 @@ index 1a6c61f49d..1c31ac6a53 100644 -- Test generic object addressing/identification functions CREATE SCHEMA addr_nsp; diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql -index bb82aa4aa2..7424c91b10 100644 +index bb82aa4aa2..dd8a05e24d 100644 --- a/src/test/regress/sql/password.sql +++ b/src/test/regress/sql/password.sql -@@ -10,13 +10,13 @@ SET password_encryption = 'scram-sha-256'; -- ok +@@ -10,13 +10,11 @@ 
SET password_encryption = 'scram-sha-256'; -- ok -- consistency of password entries SET password_encryption = 'md5'; @@ -3261,9 +3238,7 @@ index bb82aa4aa2..7424c91b10 100644 -CREATE ROLE regress_passwd2; -ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2'; +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; -CREATE ROLE regress_passwd4 PASSWORD NULL; @@ -3272,23 +3247,59 @@ index bb82aa4aa2..7424c91b10 100644 -- check list of created entries -- -@@ -44,14 +44,14 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -44,26 +42,19 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. +--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Changing the SCRAM iteration count + SET scram_iterations = 1024; +@@ -80,13 +71,10 @@ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a'; + ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4='; + SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty'; + +--- Test with invalid stored and server keys. +--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. 
+-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Check that the invalid secrets were re-hashed. A re-hashed secret + -- should not contain the original salt. diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 5880bc018d..27aa952b18 100644 --- a/src/test/regress/sql/privileges.sql diff --git a/compute/patches/contrib_pg16.patch b/compute/patches/contrib_pg16.patch new file mode 100644 index 0000000000..71adaabe7d --- /dev/null +++ b/compute/patches/contrib_pg16.patch @@ -0,0 +1,242 @@ +diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out +index 979e5e8..2375b45 100644 +--- a/contrib/amcheck/expected/check_heap.out ++++ b/contrib/amcheck/expected/check_heap.out +@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush(); + + (1 row) + +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; +- ?column? +----------- +- t +-(1 row) +- + CREATE ROLE regress_heaptest_role; + -- verify permissions are checked (error due to function not callable) + SET ROLE regress_heaptest_role; +@@ -233,7 +222,6 @@ ERROR: cannot check relation "test_foreign_table" + DETAIL: This operation is not supported for foreign tables. + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql +index 1745bae..3b429c3 100644 +--- a/contrib/amcheck/sql/check_heap.sql ++++ b/contrib/amcheck/sql/check_heap.sql +@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. 
The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -58,9 +55,6 @@ COMMIT; + -- ALTER TABLE ... SET TABLESPACE ... + -- causing an additional bulkread, which should be reflected in pg_stat_io. + SELECT pg_stat_force_next_flush(); +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; + + CREATE ROLE regress_heaptest_role; + +@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table', + + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out +index 33be13a..70a406c 100644 +--- a/contrib/citext/expected/create_index_acl.out ++++ b/contrib/citext/expected/create_index_acl.out +@@ -5,9 +5,6 @@ + -- owner having as few applicable privileges as possible. (The privileges.sql + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -66,11 +61,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. 
Suppress the +@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + ERROR: 42704 + \set VERBOSITY default + ROLLBACK; +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql +index 10b5225..ae442e1 100644 +--- a/contrib/citext/sql/create_index_acl.sql ++++ b/contrib/citext/sql/create_index_acl.sql +@@ -6,10 +6,6 @@ + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) + +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; +- + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -68,11 +62,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. 
Suppress the +@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + \set VERBOSITY default + ROLLBACK; + +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out +index 72304e0..ebe131b 100644 +--- a/contrib/file_fdw/expected/file_fdw.out ++++ b/contrib/file_fdw/expected/file_fdw.out +@@ -4,6 +4,7 @@ + -- directory paths are passed to us in environment variables + \getenv abs_srcdir PG_ABS_SRCDIR + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql +index f0548e1..848a08c 100644 +--- a/contrib/file_fdw/sql/file_fdw.sql ++++ b/contrib/file_fdw/sql/file_fdw.sql +@@ -6,6 +6,7 @@ + \getenv abs_srcdir PG_ABS_SRCDIR + + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out +index d1adbab..38b52ac 100644 +--- a/contrib/pageinspect/expected/gist.out ++++ b/contrib/pageinspect/expected/gist.out +@@ -10,25 +10,6 @@ BEGIN; + CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM + generate_series(1,1000) i; + CREATE INDEX test_gist_idx ON test_gist USING gist (p); +--- Page 0 is the root, the rest are leaf pages +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); +- lsn | nsn | rightlink | flags +------+-----+------------+------- +- 0/1 | 0/0 | 4294967295 | {} +-(1 row) +- +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); +- lsn | nsn | rightlink | flags +------+-----+------------+-------- +- 0/1 | 0/0 | 4294967295 | {leaf} +-(1 row) +- +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); +- lsn | nsn | rightlink | flags +------+-----+-----------+-------- +- 0/1 | 0/0 | 1 | {leaf} +-(1 row) +- + COMMIT; + SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); + itemoffset | ctid | itemlen | dead | keys +diff --git a/contrib/pageinspect/sql/gist.sql b/contrib/pageinspect/sql/gist.sql +index d263542..607992f 100644 +--- a/contrib/pageinspect/sql/gist.sql ++++ b/contrib/pageinspect/sql/gist.sql +@@ -12,11 +12,6 @@ CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM + generate_series(1,1000) i; + CREATE INDEX test_gist_idx ON test_gist USING gist (p); + +--- Page 0 is the root, the rest are leaf pages +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); +-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); +- + COMMIT; + + SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); diff --git a/compute/patches/contrib_pg17.patch b/compute/patches/contrib_pg17.patch new file mode 100644 index 0000000000..0d6c1203b0 --- /dev/null +++ b/compute/patches/contrib_pg17.patch @@ 
-0,0 +1,196 @@ +diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out +index 979e5e8..2375b45 100644 +--- a/contrib/amcheck/expected/check_heap.out ++++ b/contrib/amcheck/expected/check_heap.out +@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush(); + + (1 row) + +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; +- ?column? +----------- +- t +-(1 row) +- + CREATE ROLE regress_heaptest_role; + -- verify permissions are checked (error due to function not callable) + SET ROLE regress_heaptest_role; +@@ -233,7 +222,6 @@ ERROR: cannot check relation "test_foreign_table" + DETAIL: This operation is not supported for foreign tables. + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql +index 1745bae..3b429c3 100644 +--- a/contrib/amcheck/sql/check_heap.sql ++++ b/contrib/amcheck/sql/check_heap.sql +@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b) + -- same transaction. The heaptest table is smaller than the default + -- wal_skip_threshold, so a wal_level=minimal commit reads the table into + -- shared_buffers. A transaction delays that and excludes any autovacuum. +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_test_stats_tblspc LOCATION ''; + SELECT sum(reads) AS stats_bulkreads_before + FROM pg_stat_io WHERE context = 'bulkread' \gset + BEGIN; +-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc; + -- Check that valid options are not rejected nor corruption reported + -- for a non-empty table + SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none'); +@@ -58,9 +55,6 @@ COMMIT; + -- ALTER TABLE ... SET TABLESPACE ... + -- causing an additional bulkread, which should be reflected in pg_stat_io. 
+ SELECT pg_stat_force_next_flush(); +-SELECT sum(reads) AS stats_bulkreads_after +- FROM pg_stat_io WHERE context = 'bulkread' \gset +-SELECT :stats_bulkreads_after > :stats_bulkreads_before; + + CREATE ROLE regress_heaptest_role; + +@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table', + + -- cleanup + DROP TABLE heaptest; +-DROP TABLESPACE regress_test_stats_tblspc; + DROP TABLE test_partition; + DROP TABLE test_partitioned; + DROP OWNED BY regress_heaptest_role; -- permissions +diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out +index 33be13a..70a406c 100644 +--- a/contrib/citext/expected/create_index_acl.out ++++ b/contrib/citext/expected/create_index_acl.out +@@ -5,9 +5,6 @@ + -- owner having as few applicable privileges as possible. (The privileges.sql + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -66,11 +61,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. Suppress the +@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + ERROR: 42704 + \set VERBOSITY default + ROLLBACK; +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql +index 10b5225..ae442e1 100644 +--- a/contrib/citext/sql/create_index_acl.sql ++++ b/contrib/citext/sql/create_index_acl.sql +@@ -6,10 +6,6 @@ + -- regress_sro_user tests look for the opposite defect; they confirm that + -- DefineIndex() uses the table owner userid where necessary.) 
+ +-SET allow_in_place_tablespaces = true; +-CREATE TABLESPACE regress_create_idx_tblspace LOCATION ''; +-RESET allow_in_place_tablespaces; +- + BEGIN; + CREATE ROLE regress_minimal; + CREATE SCHEMA s; +@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal; + -- Empty-table DefineIndex() + CREATE UNIQUE INDEX u0rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Make the table nonempty. + INSERT INTO s.x VALUES ('foo'), ('bar'); +@@ -68,11 +62,9 @@ RESET search_path; + GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal; + CREATE UNIQUE INDEX u2rows ON s.x USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops) +- TABLESPACE regress_create_idx_tblspace + WHERE s.index_row_if(y); + ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + -- Shall not find s.coll via search_path, despite the s.const->public.setter + -- call having set search_path=s during expression planning. Suppress the +@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree + \set VERBOSITY sqlstate + ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree + ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=) +- USING INDEX TABLESPACE regress_create_idx_tblspace + WHERE (s.index_row_if(y)); + \set VERBOSITY default + ROLLBACK; + +-DROP TABLESPACE regress_create_idx_tblspace; +diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out +index 86c148a..81bdb2c 100644 +--- a/contrib/file_fdw/expected/file_fdw.out ++++ b/contrib/file_fdw/expected/file_fdw.out +@@ -4,6 +4,7 @@ + -- directory paths are passed to us in environment variables + \getenv abs_srcdir PG_ABS_SRCDIR + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; +diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql +index f0548e1..848a08c 100644 +--- a/contrib/file_fdw/sql/file_fdw.sql ++++ b/contrib/file_fdw/sql/file_fdw.sql +@@ -6,6 +6,7 @@ + \getenv abs_srcdir PG_ABS_SRCDIR + + -- Clean up in case a prior regression run failed ++SET compute_query_id TO 'off'; + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user; + RESET client_min_messages; diff --git a/compute/patches/duckdb_v113.patch b/compute/patches/duckdb_v113.patch new file mode 100644 index 0000000000..b7b43b88bf --- /dev/null +++ b/compute/patches/duckdb_v113.patch @@ -0,0 +1,25 @@ +diff --git a/libduckdb.map b/libduckdb.map +new file mode 100644 +index 0000000000..3b56f00cd7 +--- /dev/null ++++ b/libduckdb.map +@@ -0,0 +1,6 @@ ++DUCKDB_1.1.3 { ++ global: ++ *duckdb*; ++ local: ++ *; ++}; +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 3e757a4bcc..88ab4005b9 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -135,6 +135,8 @@ else() + target_link_libraries(duckdb ${DUCKDB_LINK_LIBS}) + 
link_threads(duckdb) + link_extension_libraries(duckdb) ++ target_link_options(duckdb PRIVATE ++ -Wl,--version-script=${CMAKE_SOURCE_DIR}/libduckdb.map) + + add_library(duckdb_static STATIC ${ALL_OBJECT_FILES}) + target_link_libraries(duckdb_static ${DUCKDB_LINK_LIBS}) diff --git a/compute/patches/duckdb_v120.patch b/compute/patches/duckdb_v120.patch new file mode 100644 index 0000000000..cf317736a5 --- /dev/null +++ b/compute/patches/duckdb_v120.patch @@ -0,0 +1,67 @@ +diff --git a/libduckdb_pg_duckdb.map b/libduckdb_pg_duckdb.map +new file mode 100644 +index 0000000000..0872978b48 +--- /dev/null ++++ b/libduckdb_pg_duckdb.map +@@ -0,0 +1,6 @@ ++DUCKDB_1.2.0 { ++ global: ++ *duckdb*; ++ local: ++ *; ++}; +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 58adef3fc0..2c522f91be 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -59,7 +59,7 @@ endfunction() + + if(AMALGAMATION_BUILD) + +- add_library(duckdb SHARED "${PROJECT_SOURCE_DIR}/src/amalgamation/duckdb.cpp") ++ add_library(duckdb_pg_duckdb SHARED "${PROJECT_SOURCE_DIR}/src/amalgamation/duckdb.cpp") + target_link_libraries(duckdb ${DUCKDB_SYSTEM_LIBS}) + link_threads(duckdb) + link_extension_libraries(duckdb) +@@ -109,7 +109,7 @@ else() + duckdb_yyjson + duckdb_zstd) + +- add_library(duckdb SHARED ${ALL_OBJECT_FILES}) ++ add_library(duckdb_pg_duckdb SHARED ${ALL_OBJECT_FILES}) + + if(WIN32 AND NOT MINGW) + ensure_variable_is_number(DUCKDB_MAJOR_VERSION RC_MAJOR_VERSION) +@@ -131,9 +131,11 @@ else() + target_sources(duckdb PRIVATE version.rc) + endif() + +- target_link_libraries(duckdb ${DUCKDB_LINK_LIBS}) +- link_threads(duckdb) +- link_extension_libraries(duckdb) ++ target_link_libraries(duckdb_pg_duckdb ${DUCKDB_LINK_LIBS}) ++ link_threads(duckdb_pg_duckdb) ++ link_extension_libraries(duckdb_pg_duckdb) ++ target_link_options(duckdb_pg_duckdb PRIVATE ++ -Wl,--version-script=${CMAKE_SOURCE_DIR}/libduckdb_pg_duckdb.map) + + add_library(duckdb_static STATIC ${ALL_OBJECT_FILES}) + target_link_libraries(duckdb_static ${DUCKDB_LINK_LIBS}) +@@ -141,7 +143,7 @@ else() + link_extension_libraries(duckdb_static) + + target_include_directories( +- duckdb PUBLIC $ ++ duckdb_pg_duckdb PUBLIC $ + $) + + target_include_directories( +@@ -161,7 +163,7 @@ else() + endif() + + install( +- TARGETS duckdb duckdb_static ++ TARGETS duckdb_pg_duckdb duckdb_static + EXPORT "${DUCKDB_EXPORT_SET}" + LIBRARY DESTINATION "${INSTALL_LIB_DIR}" + ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" diff --git a/compute/patches/pg_duckdb_v031.patch b/compute/patches/pg_duckdb_v031.patch new file mode 100644 index 0000000000..edc7fbf69d --- /dev/null +++ b/compute/patches/pg_duckdb_v031.patch @@ -0,0 +1,33 @@ +diff --git a/Makefile b/Makefile +index 3235cc8..6b892bc 100644 +--- a/Makefile ++++ b/Makefile +@@ -32,7 +32,7 @@ else + DUCKDB_BUILD_TYPE = release + endif + +-DUCKDB_LIB = libduckdb$(DLSUFFIX) ++DUCKDB_LIB = libduckdb_pg_duckdb$(DLSUFFIX) + FULL_DUCKDB_LIB = third_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src/$(DUCKDB_LIB) + + ERROR_ON_WARNING ?= +@@ -54,7 +54,7 @@ override PG_CXXFLAGS += -std=c++17 ${DUCKDB_BUILD_CXX_FLAGS} ${COMPILER_FLAGS} - + # changes to the vendored code in one place. 
+ override PG_CFLAGS += -Wno-declaration-after-statement + +-SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb -lstdc++ -llz4 ++SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb_pg_duckdb -lstdc++ -llz4 + + include Makefile.global + +diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql +index d777d76..af60106 100644 +--- a/sql/pg_duckdb--0.2.0--0.3.0.sql ++++ b/sql/pg_duckdb--0.2.0--0.3.0.sql +@@ -1056,3 +1056,6 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC; + GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC; + GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC; + GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC; ++GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser; ++GRANT ALL ON TABLE duckdb.extensions TO neon_superuser; ++GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser; diff --git a/compute/patches/pg_graphql.patch b/compute/patches/pg_graphql.patch new file mode 100644 index 0000000000..bf0ac38afa --- /dev/null +++ b/compute/patches/pg_graphql.patch @@ -0,0 +1,19 @@ +commit ec6a491d126882966a696f9ad5d3698935361d55 +Author: Alexey Masterov +Date: Tue Dec 17 10:25:00 2024 +0100 + + Changes required to run tests on Neon + +diff --git a/test/expected/permissions_functions.out b/test/expected/permissions_functions.out +index 1e9fbc2..94cbe25 100644 +--- a/test/expected/permissions_functions.out ++++ b/test/expected/permissions_functions.out +@@ -64,7 +64,7 @@ begin; + select current_user; + current_user + -------------- +- postgres ++ cloud_admin + (1 row) + + -- revoke default access from the public role for new functions diff --git a/compute/patches/pg_hint_plan_v16.patch b/compute/patches/pg_hint_plan_v16.patch index 4039a036df..1fc3ffa609 100644 --- a/compute/patches/pg_hint_plan_v16.patch +++ b/compute/patches/pg_hint_plan_v16.patch @@ -6,16 +6,16 @@ index da723b8..5328114 100644 ---- -- No.A-1-1-3 CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan -- No.A-1-2-3 DROP EXTENSION pg_hint_plan; -- No.A-1-1-4 CREATE SCHEMA other_schema; CREATE EXTENSION pg_hint_plan SCHEMA other_schema; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan DROP SCHEMA other_schema; ---- ---- No. 
A-5-1 comment pattern @@ -35,7 +35,7 @@ index d372459..6282afe 100644 SET client_min_messages TO LOG; SET pg_hint_plan.enable_hint TO on; CREATE EXTENSION file_fdw; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; CREATE USER MAPPING FOR PUBLIC SERVER file_server; CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/compute/patches/pg_hint_plan_v17.patch b/compute/patches/pg_hint_plan_v17.patch index dbf4e470ea..3442a094eb 100644 --- a/compute/patches/pg_hint_plan_v17.patch +++ b/compute/patches/pg_hint_plan_v17.patch @@ -6,16 +6,16 @@ index e7d68a1..65a056c 100644 ---- -- No.A-1-1-3 CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan -- No.A-1-2-3 DROP EXTENSION pg_hint_plan; -- No.A-1-1-4 CREATE SCHEMA other_schema; CREATE EXTENSION pg_hint_plan SCHEMA other_schema; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" CREATE EXTENSION pg_hint_plan; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan DROP SCHEMA other_schema; ---- ---- No. A-5-1 comment pattern @@ -168,7 +168,7 @@ index 017fa4b..98d989b 100644 SET client_min_messages TO LOG; SET pg_hint_plan.enable_hint TO on; CREATE EXTENSION file_fdw; -+LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw ++LOG: Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; CREATE USER MAPPING FOR PUBLIC SERVER file_server; CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/compute/patches/pg_repack.patch b/compute/patches/pg_repack.patch new file mode 100644 index 0000000000..f6b0aa1e13 --- /dev/null +++ b/compute/patches/pg_repack.patch @@ -0,0 +1,72 @@ +diff --git a/regress/Makefile b/regress/Makefile +index bf6edcb..89b4c7f 100644 +--- a/regress/Makefile ++++ b/regress/Makefile +@@ -17,7 +17,7 @@ INTVERSION := $(shell echo $$(($$(echo $(VERSION).0 | sed 's/\([[:digit:]]\{1,\} + # Test suite + # + +-REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper tablespace get_order_by trigger ++REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger + + USE_PGXS = 1 # use pgxs if not in contrib directory + PGXS := $(shell $(PG_CONFIG) --pgxs) +diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out +index 8d0a94e..63b68bf 100644 +--- a/regress/expected/nosuper.out ++++ b/regress/expected/nosuper.out +@@ -4,22 +4,22 @@ + SET client_min_messages = error; + DROP ROLE IF EXISTS nosuper; + SET client_min_messages = warning; +-CREATE ROLE nosuper WITH LOGIN; ++CREATE ROLE nosuper WITH LOGIN PASSWORD 'NoSuPeRpAsSwOrD'; + -- => OK + \! 
pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check + INFO: repacking table "public.tbl_cluster" + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper + ERROR: pg_repack failed with error: You must be a superuser to use pg_repack + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + ERROR: pg_repack failed with error: ERROR: permission denied for schema repack + LINE 1: select repack.version(), repack.version_sql() + ^ + GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; + GRANT USAGE ON SCHEMA repack TO nosuper; + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + INFO: repacking table "public.tbl_cluster" + ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block + DETAIL: query was: RESET lock_timeout +diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql +index 072f0fa..dbe60f8 100644 +--- a/regress/sql/nosuper.sql ++++ b/regress/sql/nosuper.sql +@@ -4,19 +4,19 @@ + SET client_min_messages = error; + DROP ROLE IF EXISTS nosuper; + SET client_min_messages = warning; +-CREATE ROLE nosuper WITH LOGIN; ++CREATE ROLE nosuper WITH LOGIN PASSWORD 'NoSuPeRpAsSwOrD'; + -- => OK + \! pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + + GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper; + GRANT USAGE ON SCHEMA repack TO nosuper; + + -- => ERROR +-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check ++\! 
PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check + + REVOKE ALL ON ALL TABLES IN SCHEMA repack FROM nosuper; + REVOKE USAGE ON SCHEMA repack FROM nosuper; diff --git a/compute/patches/pgvector.patch b/compute/patches/pgvector.patch index 3e1ffcaaaf..da41c86140 100644 --- a/compute/patches/pgvector.patch +++ b/compute/patches/pgvector.patch @@ -1,8 +1,24 @@ +diff --git a/Makefile b/Makefile +index 7a4b88c..56678af 100644 +--- a/Makefile ++++ b/Makefile +@@ -3,7 +3,10 @@ EXTVERSION = 0.8.0 + + MODULE_big = vector + DATA = $(wildcard sql/*--*--*.sql) +-DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql ++# This change is needed to install different per-version SQL files ++# like pgvector--0.8.0.sql and pgvector--0.7.4.sql ++# The corresponding file is downloaded during the Docker image build process ++DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql sql/vector--0.7.4.sql + OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o + HEADERS = src/halfvec.h src/sparsevec.h src/vector.h + diff --git a/src/hnswbuild.c b/src/hnswbuild.c -index dcfb2bd..d5189ee 100644 +index b667478..fc1897c 100644 --- a/src/hnswbuild.c +++ b/src/hnswbuild.c -@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) +@@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); @@ -20,7 +36,7 @@ index dcfb2bd..d5189ee 100644 /* Close relations within worker */ index_close(indexRel, indexLockmode); table_close(heapRel, heapLockmode); -@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, +@@ -1100,12 +1108,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, SeedRandom(42); #endif diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index ac9f5c6904..568f0b0444 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -27,6 +27,10 @@ commands: user: nobody sysvInitAction: respawn shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' + - name: pgbouncer-exporter + user: postgres + sysvInitAction: respawn + shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"' - name: sql-exporter user: nobody sysvInitAction: respawn @@ -43,7 +47,9 @@ files: # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + # + # Also allow it to shut down the VM. The fast_import job does that when it's finished. + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -68,8 +74,8 @@ build: | # At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2, # and it _probably_ can be used as-is. 
However, we'll build it ourselves to minimise the changeset # for debian version migration. - # - FROM debian:bookworm-slim as libcgroup-builder + ARG BOOKWORM_SLIM_SHA=sha256:40b107342c492725bc7aacbe93a49945445191ae364184a6d24fedb28172f6f7 + FROM debian@$BOOKWORM_SLIM_SHA as libcgroup-builder ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 0d178e1c24..6617c98599 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -27,6 +27,10 @@ commands: user: nobody sysvInitAction: respawn shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' + - name: pgbouncer-exporter + user: postgres + sysvInitAction: respawn + shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"' - name: sql-exporter user: nobody sysvInitAction: respawn @@ -43,7 +47,9 @@ files: # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) - postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + # + # Also allow it to shut down the VM. The fast_import job does that when it's finished. + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -64,7 +70,8 @@ build: | # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor # requires cgroup v2, so we'll build cgroup-tools ourselves. - FROM debian:bullseye-slim as libcgroup-builder + ARG BULLSEYE_SLIM_SHA=sha256:e831d9a884d63734fe3dd9c491ed9a5a3d4c6a6d32c5b14f2067357c49b0b7e1 + FROM debian@$BULLSEYE_SLIM_SHA as libcgroup-builder ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index c0c390caef..c276996df5 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -7,21 +7,24 @@ license.workspace = true [features] default = [] # Enables test specific features. 
-testing = [] +testing = ["fail/failpoints"] [dependencies] base64.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true aws-sdk-kms.workspace = true +aws-smithy-types.workspace = true anyhow.workspace = true +axum = { workspace = true, features = [] } camino.workspace = true chrono.workspace = true cfg-if.workspace = true clap.workspace = true +fail.workspace = true flate2.workspace = true futures.workspace = true -hyper0 = { workspace = true, features = ["full"] } +http.workspace = true metrics.workspace = true nix.workspace = true notify.workspace = true @@ -36,18 +39,20 @@ serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true tar.workspace = true +tower.workspace = true +tower-http.workspace = true reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true tokio-stream.workspace = true tracing.workspace = true -tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true -prometheus.workspace = true +uuid.workspace = true +walkdir.workspace = true postgres_initdb.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e73ccd908e..1cdae718fe 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -34,6 +34,7 @@ //! -r http://pg-ext-s3-gateway \ //! ``` use std::collections::HashMap; +use std::ffi::OsString; use std::fs::File; use std::path::Path; use std::process::exit; @@ -44,15 +45,16 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; -use clap::Arg; +use clap::Parser; use compute_tools::disk_quota::set_disk_quota; +use compute_tools::http::server::Server; use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info, warn}; use url::Url; -use compute_api::responses::ComputeStatus; +use compute_api::responses::{ComputeCtlConfig, ComputeStatus}; use compute_api::spec::ComputeSpec; use compute_tools::compute::{ @@ -60,20 +62,105 @@ use compute_tools::compute::{ }; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version_string; -use compute_tools::http::api::launch_http_server; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; use compute_tools::swap::resize_swap; use rlimit::{setrlimit, Resource}; +use utils::failpoint_support; // this is an arbitrary build tag. Fine as a default / for testing purposes // in-case of not-set environment var const BUILD_TAG_DEFAULT: &str = "latest"; +// Compatibility hack: if the control plane specified any remote-ext-config +// use the default value for extension storage proxy gateway. 
+// Remove this once the control plane is updated to pass the gateway URL +fn parse_remote_ext_config(arg: &str) -> Result<String> { + if arg.starts_with("http") { + Ok(arg.trim_end_matches('/').to_string()) + } else { + Ok("http://pg-ext-s3-gateway".to_string()) + } +} + +#[derive(Parser)] +#[command(rename_all = "kebab-case")] +struct Cli { + #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] + pub pgbin: String, + + #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] + pub remote_ext_config: Option<String>, + + /// The port to bind the external listening HTTP server to. Clients running + /// outside the compute will talk to the compute through this port. Keep + /// the previous name for this argument around for a smoother release + /// with the control plane. + #[arg(long, default_value_t = 3080)] + pub external_http_port: u16, + + /// The port to bind the internal listening HTTP server to. Clients include + /// the neon extension (for installing remote extensions) and local_proxy. + #[arg(long, default_value_t = 3081)] + pub internal_http_port: u16, + + #[arg(short = 'D', long, value_name = "DATADIR")] + pub pgdata: String, + + #[arg(short = 'C', long, value_name = "DATABASE_URL")] + pub connstr: String, + + #[cfg(target_os = "linux")] + #[arg(long, default_value = "neon-postgres")] + pub cgroup: String, + + #[cfg(target_os = "linux")] + #[arg( + long, + default_value = "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor" + )] + pub filecache_connstr: String, + + #[cfg(target_os = "linux")] + #[arg(long, default_value = "0.0.0.0:10301")] + pub vm_monitor_addr: String, + + #[arg(long, action = clap::ArgAction::SetTrue)] + pub resize_swap_on_bind: bool, + + #[arg(long)] + pub set_disk_quota_for_fs: Option<String>, + + #[arg(short = 's', long = "spec", group = "spec")] + pub spec_json: Option<String>, + + #[arg(short = 'S', long, group = "spec-path")] + pub spec_path: Option<OsString>, + + #[arg(short = 'i', long, group = "compute-id")] + pub compute_id: String, + + #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")] + pub control_plane_uri: Option<String>, +} + fn main() -> Result<()> { - let (build_tag, clap_args) = init()?; + let cli = Cli::parse(); + + // For historical reasons, the main thread that processes the spec and launches postgres + // is synchronous, but we always have this tokio runtime available and we "enter" it so + // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...) + // from all parts of compute_ctl. + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + let _rt_guard = runtime.enter(); + + let build_tag = runtime.block_on(init())?; + + let scenario = failpoint_support::init(); // enable core dumping for all child processes setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?; @@ -82,13 +169,11 @@ fn main() -> Result<()> { // Enter startup tracing context let _startup_context_guard = startup_context_from_env(); - let cli_args = process_cli(&clap_args)?; + let cli_spec = try_spec_from_cli(&cli)?; - let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?; + let compute = wait_spec(build_tag, &cli, cli_spec)?; - let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?; - - start_postgres(&clap_args, wait_spec_result)?
// Startup is finished, exit the startup tracing span }; @@ -100,16 +185,13 @@ fn main() -> Result<()> { maybe_delay_exit(delay_exit); + scenario.teardown(); + deinit_and_exit(wait_pg_result); } -fn init() -> Result<(String, clap::ArgMatches)> { - init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; - - opentelemetry::global::set_error_handler(|err| { - tracing::info!("OpenTelemetry error: {err}"); - }) - .expect("global error handler lock poisoned"); +async fn init() -> Result<String> { + init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; thread::spawn(move || { @@ -123,66 +205,7 @@ fn init() -> Result<(String, clap::ArgMatches)> { .to_string(); info!("build_tag: {build_tag}"); - Ok((build_tag, cli().get_matches())) -} - -fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> { - let pgbin_default = "postgres"; - let pgbin = matches - .get_one::<String>("pgbin") - .map(|s| s.as_str()) - .unwrap_or(pgbin_default); - - let ext_remote_storage = matches - .get_one::<String>("remote-ext-config") - // Compatibility hack: if the control plane specified any remote-ext-config - // use the default value for extension storage proxy gateway. - // Remove this once the control plane is updated to pass the gateway URL - .map(|conf| { - if conf.starts_with("http") { - conf.trim_end_matches('/') - } else { - "http://pg-ext-s3-gateway" - } - }); - - let http_port = *matches - .get_one::<u16>("http-port") - .expect("http-port is required"); - let pgdata = matches - .get_one::<String>("pgdata") - .expect("PGDATA path is required"); - let connstr = matches - .get_one::<String>("connstr") - .expect("Postgres connection string is required"); - let spec_json = matches.get_one::<String>("spec"); - let spec_path = matches.get_one::<String>("spec-path"); - let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); - let set_disk_quota_for_fs = matches.get_one::<String>("set-disk-quota-for-fs"); - - Ok(ProcessCliResult { - connstr, - pgdata, - pgbin, - ext_remote_storage, - http_port, - spec_json, - spec_path, - resize_swap_on_bind, - set_disk_quota_for_fs, - }) -} - -struct ProcessCliResult<'clap> { - connstr: &'clap str, - pgdata: &'clap str, - pgbin: &'clap str, - ext_remote_storage: Option<&'clap str>, - http_port: u16, - spec_json: Option<&'clap String>, - spec_path: Option<&'clap String>, - resize_swap_on_bind: bool, - set_disk_quota_for_fs: Option<&'clap String>, + Ok(build_tag) } fn startup_context_from_env() -> Option { @@ -235,83 +258,65 @@ fn startup_context_from_env() -> Option { } } -fn try_spec_from_cli( - matches: &clap::ArgMatches, - ProcessCliResult { - spec_json, - spec_path, - ..
- }: &ProcessCliResult, -) -> Result<CliSpecParams> { - let compute_id = matches.get_one::<String>("compute-id"); - let control_plane_uri = matches.get_one::<String>("control-plane-uri"); +fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> { + // First, try to get cluster spec from the cli argument + if let Some(ref spec_json) = cli.spec_json { + info!("got spec from cli argument {}", spec_json); + return Ok(CliSpecParams { + spec: Some(serde_json::from_str(spec_json)?), + compute_ctl_config: ComputeCtlConfig::default(), + live_config_allowed: false, + }); + } - let spec; - let mut live_config_allowed = false; - match spec_json { - // First, try to get cluster spec from the cli argument - Some(json) => { - info!("got spec from cli argument {}", json); - spec = Some(serde_json::from_str(json)?); - } - None => { - // Second, try to read it from the file if path is provided - if let Some(sp) = spec_path { - let path = Path::new(sp); - let file = File::open(path)?; - spec = Some(serde_json::from_reader(file)?); - live_config_allowed = true; - } else if let Some(id) = compute_id { - if let Some(cp_base) = control_plane_uri { - live_config_allowed = true; - spec = match get_spec_from_control_plane(cp_base, id) { - Ok(s) => s, - Err(e) => { - error!("cannot get response from control plane: {}", e); - panic!("neither spec nor confirmation that compute is in the Empty state was received"); - } - }; - } else { - panic!("must specify both --control-plane-uri and --compute-id or none"); - } - } else { - panic!( - "compute spec should be provided by one of the following ways: \ - --spec OR --spec-path OR --control-plane-uri and --compute-id" - ); - } - } + // Second, try to read it from the file if path is provided + if let Some(ref spec_path) = cli.spec_path { + let file = File::open(Path::new(spec_path))?; + return Ok(CliSpecParams { + spec: Some(serde_json::from_reader(file)?), + compute_ctl_config: ComputeCtlConfig::default(), + live_config_allowed: true, + }); + } + + if cli.control_plane_uri.is_none() { + panic!("must specify --control-plane-uri"); }; - Ok(CliSpecParams { - spec, - live_config_allowed, - }) + match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) { + Ok(resp) => Ok(CliSpecParams { + spec: resp.0, + compute_ctl_config: resp.1, + live_config_allowed: true, + }), + Err(e) => { + error!( + "cannot get response from control plane: {}\n\ + neither spec nor confirmation that compute is in the Empty state was received", + e + ); + Err(e) + } + } } struct CliSpecParams { /// If a spec was provided via CLI or file, the [`ComputeSpec`] spec: Option<ComputeSpec>, + #[allow(dead_code)] + compute_ctl_config: ComputeCtlConfig, live_config_allowed: bool, } fn wait_spec( build_tag: String, - ProcessCliResult { - connstr, - pgdata, - pgbin, - ext_remote_storage, - resize_swap_on_bind, - set_disk_quota_for_fs, - http_port, - ..
- }: ProcessCliResult, + cli: &Cli, CliSpecParams { spec, live_config_allowed, + compute_ctl_config: _, }: CliSpecParams, -) -> Result { +) -> Result> { let mut new_state = ComputeState::new(); let spec_set; @@ -323,23 +328,25 @@ fn wait_spec( } else { spec_set = false; } - let connstr = Url::parse(connstr).context("cannot parse connstr as a URL")?; + let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; let conn_conf = postgres::config::Config::from_str(connstr.as_str()) .context("cannot build postgres config from connstr")?; let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) .context("cannot build tokio postgres config from connstr")?; let compute_node = ComputeNode { + compute_id: cli.compute_id.clone(), connstr, conn_conf, tokio_conn_conf, - pgdata: pgdata.to_string(), - pgbin: pgbin.to_string(), - pgversion: get_pg_version_string(pgbin), - http_port, + pgdata: cli.pgdata.clone(), + pgbin: cli.pgbin.clone(), + pgversion: get_pg_version_string(&cli.pgbin), + external_http_port: cli.external_http_port, + internal_http_port: cli.internal_http_port, live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), - ext_remote_storage: ext_remote_storage.map(|s| s.to_string()), + ext_remote_storage: cli.remote_ext_config.clone(), ext_download_progress: RwLock::new(HashMap::new()), build_tag, }; @@ -353,10 +360,13 @@ fn wait_spec( compute.prewarm_postgres()?; } - // Launch http service first, so that we can serve control-plane requests - // while configuration is still in progress. - let _http_handle = - launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); + // Launch the external HTTP server first, so that we can serve control plane + // requests while configuration is still in progress. + Server::External(cli.external_http_port).launch(&compute); + + // The internal HTTP server could be launched later, but there isn't much + // sense in waiting. + Server::Internal(cli.internal_http_port).launch(&compute); if !spec_set { // No spec provided, hang waiting for it. @@ -388,27 +398,12 @@ fn wait_spec( launch_lsn_lease_bg_task_for_static(&compute); - Ok(WaitSpecResult { - compute, - resize_swap_on_bind, - set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(), - }) -} - -struct WaitSpecResult { - compute: Arc, - resize_swap_on_bind: bool, - set_disk_quota_for_fs: Option, + Ok(compute) } fn start_postgres( - // need to allow unused because `matches` is only used if target_os = "linux" - #[allow(unused_variables)] matches: &clap::ArgMatches, - WaitSpecResult { - compute, - resize_swap_on_bind, - set_disk_quota_for_fs, - }: WaitSpecResult, + cli: &Cli, + compute: Arc, ) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); @@ -418,9 +413,14 @@ fn start_postgres( "running compute with features: {:?}", state.pspec.as_ref().unwrap().spec.features ); - // before we release the mutex, fetch the swap size (if any) for later. - let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes; - let disk_quota_bytes = state.pspec.as_ref().unwrap().spec.disk_quota_bytes; + // before we release the mutex, fetch some parameters for later. + let &ComputeSpec { + swap_size_bytes, + disk_quota_bytes, + #[cfg(target_os = "linux")] + disable_lfc_resizing, + .. 
+ } = &state.pspec.as_ref().unwrap().spec; drop(state); // Launch remaining service threads @@ -431,7 +431,7 @@ fn start_postgres( let mut delay_exit = false; // Resize swap to the desired size if the compute spec says so - if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) { + if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) { // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion // *before* starting postgres. // @@ -458,9 +458,9 @@ fn start_postgres( // Set disk quota if the compute spec says so if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = - (disk_quota_bytes, set_disk_quota_for_fs) + (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref()) { - match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) { + match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) { Ok(()) => { let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display. info!(%disk_quota_bytes, %size_mib, "set disk quota"); @@ -482,7 +482,10 @@ fn start_postgres( let mut pg = None; if !prestartup_failed { pg = match compute.start_compute() { - Ok(pg) => Some(pg), + Ok(pg) => { + info!(postmaster_pid = %pg.0.id(), "Postgres was started"); + Some(pg) + } Err(err) => { error!("could not start the compute node: {:#}", err); compute.set_failed_status(err); @@ -500,41 +503,30 @@ fn start_postgres( if #[cfg(target_os = "linux")] { use std::env; use tokio_util::sync::CancellationToken; - let vm_monitor_addr = matches - .get_one::("vm-monitor-addr") - .expect("--vm-monitor-addr should always be set because it has a default arg"); - let file_cache_connstr = matches.get_one::("filecache-connstr"); - let cgroup = matches.get_one::("cgroup"); - - // Only make a runtime if we need to. - // Note: it seems like you can make a runtime in an inner scope and - // if you start a task in it it won't be dropped. However, make it - // in the outermost scope just to be safe. 
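// [Editor's aside, not part of the patch] The hunk below swaps the dedicated
// vm-monitor runtime for a task spawned on compute_ctl's ambient Tokio runtime,
// gated on the AUTOSCALING env var and cancellable on shutdown. A minimal,
// self-contained sketch of that pattern (run_monitor and the hard-coded address
// are illustrative stand-ins for vm_monitor::start and --vm-monitor-addr):
use tokio_util::sync::CancellationToken;

async fn run_monitor(addr: String, token: CancellationToken) {
    // Stand-in for the real monitor loop: just wait until cancelled.
    token.cancelled().await;
    println!("monitor on {addr} shut down");
}

#[tokio::main]
async fn main() {
    let token = CancellationToken::new();
    let monitor = if std::env::var_os("AUTOSCALING").is_some() {
        Some(tokio::spawn(run_monitor("0.0.0.0:10301".into(), token.clone())))
    } else {
        None
    };

    // ... later, when Postgres exits:
    if let Some(handle) = monitor {
        token.cancel(); // ask the monitor to wind down
        handle.abort(); // then drop the task outright
    }
}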
- let rt = if env::var_os("AUTOSCALING").is_some() { - Some( - tokio::runtime::Builder::new_multi_thread() - .worker_threads(4) - .enable_all() - .build() - .expect("failed to create tokio runtime for monitor") - ) - } else { - None - }; // This token is used internally by the monitor to clean up all threads let token = CancellationToken::new(); - let vm_monitor = rt.as_ref().map(|rt| { - rt.spawn(vm_monitor::start( + // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC + let pgconnstr = if disable_lfc_resizing.unwrap_or(false) { + None + } else { + Some(cli.filecache_connstr.clone()) + }; + + let vm_monitor = if env::var_os("AUTOSCALING").is_some() { + let vm_monitor = tokio::spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { - cgroup: cgroup.cloned(), - pgconnstr: file_cache_connstr.cloned(), - addr: vm_monitor_addr.clone(), + cgroup: Some(cli.cgroup.clone()), + pgconnstr, + addr: cli.vm_monitor_addr.clone(), })), token.clone(), - )) - }); + )); + Some(vm_monitor) + } else { + None + }; } } @@ -544,8 +536,6 @@ fn start_postgres( delay_exit, compute, #[cfg(target_os = "linux")] - rt, - #[cfg(target_os = "linux")] token, #[cfg(target_os = "linux")] vm_monitor, @@ -553,15 +543,13 @@ fn start_postgres( )) } -type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>); +type PostgresHandle = (std::process::Child, tokio::task::JoinHandle>); struct StartPostgresResult { delay_exit: bool, // passed through from WaitSpecResult compute: Arc, - #[cfg(target_os = "linux")] - rt: Option, #[cfg(target_os = "linux")] token: tokio_util::sync::CancellationToken, #[cfg(target_os = "linux")] @@ -573,15 +561,17 @@ fn wait_postgres(pg: Option) -> Result { // propagate to Postgres and it will be shut down as well. let mut exit_code = None; if let Some((mut pg, logs_handle)) = pg { + info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit"); + let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); PG_PID.store(0, Ordering::SeqCst); - // Process has exited, so we can join the logs thread. - let _ = logs_handle - .join() - .map_err(|e| tracing::error!("log thread panicked: {:?}", e)); + // Process has exited. Wait for the log collecting task to finish. + let _ = tokio::runtime::Handle::current() + .block_on(logs_handle) + .map_err(|e| tracing::error!("log task panicked: {:?}", e)); info!("Postgres exited with code {}, shutting down", ecode); exit_code = ecode.code() @@ -602,8 +592,6 @@ fn cleanup_after_postgres_exit( vm_monitor, #[cfg(target_os = "linux")] token, - #[cfg(target_os = "linux")] - rt, }: StartPostgresResult, ) -> Result { // Terminate the vm_monitor so it releases the file watcher on @@ -616,10 +604,6 @@ fn cleanup_after_postgres_exit( token.cancel(); // Kills the actual task running the monitor handle.abort(); - - // If handle is some, rt must have been used to produce it, and - // hence is also some - rt.unwrap().shutdown_timeout(Duration::from_secs(2)); } } } @@ -684,105 +668,6 @@ fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! 
{ exit(exit_code.unwrap_or(1)) } -fn cli() -> clap::Command { - // Env variable is set by `cargo` - let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown"); - clap::Command::new("compute_ctl") - .version(version) - .arg( - Arg::new("http-port") - .long("http-port") - .value_name("HTTP_PORT") - .default_value("3080") - .value_parser(clap::value_parser!(u16)) - .required(false), - ) - .arg( - Arg::new("connstr") - .short('C') - .long("connstr") - .value_name("DATABASE_URL") - .required(true), - ) - .arg( - Arg::new("pgdata") - .short('D') - .long("pgdata") - .value_name("DATADIR") - .required(true), - ) - .arg( - Arg::new("pgbin") - .short('b') - .long("pgbin") - .default_value("postgres") - .value_name("POSTGRES_PATH"), - ) - .arg( - Arg::new("spec") - .short('s') - .long("spec") - .value_name("SPEC_JSON"), - ) - .arg( - Arg::new("spec-path") - .short('S') - .long("spec-path") - .value_name("SPEC_PATH"), - ) - .arg( - Arg::new("compute-id") - .short('i') - .long("compute-id") - .value_name("COMPUTE_ID"), - ) - .arg( - Arg::new("control-plane-uri") - .short('p') - .long("control-plane-uri") - .value_name("CONTROL_PLANE_API_BASE_URI"), - ) - .arg( - Arg::new("remote-ext-config") - .short('r') - .long("remote-ext-config") - .value_name("REMOTE_EXT_CONFIG"), - ) - // TODO(fprasx): we currently have default arguments because the cloud PR - // to pass them in hasn't been merged yet. We should get rid of them once - // the PR is merged. - .arg( - Arg::new("vm-monitor-addr") - .long("vm-monitor-addr") - .default_value("0.0.0.0:10301") - .value_name("VM_MONITOR_ADDR"), - ) - .arg( - Arg::new("cgroup") - .long("cgroup") - .default_value("neon-postgres") - .value_name("CGROUP"), - ) - .arg( - Arg::new("filecache-connstr") - .long("filecache-connstr") - .default_value( - "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor", - ) - .value_name("FILECACHE_CONNSTR"), - ) - .arg( - Arg::new("resize-swap-on-bind") - .long("resize-swap-on-bind") - .action(clap::ArgAction::SetTrue), - ) - .arg( - Arg::new("set-disk-quota-for-fs") - .long("set-disk-quota-for-fs") - .value_name("SET_DISK_QUOTA_FOR_FS") - ) -} - /// When compute_ctl is killed, send also termination signal to sync-safekeepers /// to prevent leakage. TODO: it is better to convert compute_ctl to async and /// wait for termination which would be easy then. @@ -792,7 +677,14 @@ fn handle_exit_signal(sig: i32) { exit(1); } -#[test] -fn verify_cli() { - cli().debug_assert() +#[cfg(test)] +mod test { + use clap::CommandFactory; + + use super::Cli; + + #[test] + fn verify_cli() { + Cli::command().debug_assert() + } } diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index b6db3eb11a..585f3e4e1d 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -17,7 +17,7 @@ //! //! # Local Testing //! -//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build. +//! - Comment out most of the pgxns in compute-node.Dockerfile to speed up the build. //! - Build the image with the following command: //! //! ```bash @@ -25,32 +25,78 @@ //! docker push localhost:3030/localregistry/compute-node-v14:latest //! 
``` -use anyhow::Context; +use anyhow::{bail, Context}; use aws_config::BehaviorVersion; use camino::{Utf8Path, Utf8PathBuf}; -use clap::Parser; +use clap::{Parser, Subcommand}; use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion}; use nix::unistd::Pid; -use tracing::{info, info_span, warn, Instrument}; +use tracing::{error, info, info_span, warn, Instrument}; use utils::fs_ext::is_directory_empty; +#[path = "fast_import/aws_s3_sync.rs"] +mod aws_s3_sync; #[path = "fast_import/child_stdio_to_log.rs"] mod child_stdio_to_log; #[path = "fast_import/s3_uri.rs"] mod s3_uri; -#[path = "fast_import/s5cmd.rs"] -mod s5cmd; + +const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600); +const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300); + +#[derive(Subcommand, Debug)] +enum Command { + /// Runs local postgres (neon binary), restores into it, + /// uploads pgdata to s3 to be consumed by pageservers + Pgdata { + /// Raw connection string to the source database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + source_connection_string: Option, + /// If specified, will not shut down the local postgres after the import. Used in local testing + #[clap(short, long)] + interactive: bool, + /// Port to run postgres on. Default is 5432. + #[clap(long, default_value_t = 5432)] + pg_port: u16, // port to run postgres on, 5432 is default + + /// Number of CPUs in the system. This is used to configure # of + /// parallel worker processes, for index creation. + #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")] + num_cpus: Option, + + /// Amount of RAM in the system. This is used to configure shared_buffers + /// and maintenance_work_mem. + #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")] + memory_mb: Option, + }, + + /// Runs pg_dump-pg_restore from source to destination without running local postgres. + DumpRestore { + /// Raw connection string to the source database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. + #[clap(long)] + source_connection_string: Option, + /// Raw connection string to the destination database. Used only in tests, + /// real scenario uses encrypted connection string in spec.json from s3. 
+ #[clap(long)] + destination_connection_string: Option, + }, +} #[derive(clap::Parser)] struct Args { - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_WORKDIR")] working_directory: Utf8PathBuf, #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")] - s3_prefix: s3_uri::S3Uri, - #[clap(long)] + s3_prefix: Option, + #[clap(long, env = "NEON_IMPORTER_PG_BIN_DIR")] pg_bin_dir: Utf8PathBuf, - #[clap(long)] + #[clap(long, env = "NEON_IMPORTER_PG_LIB_DIR")] pg_lib_dir: Utf8PathBuf, + + #[clap(subcommand)] + command: Command, } #[serde_with::serde_as] @@ -59,6 +105,8 @@ struct Spec { encryption_secret: EncryptionSecret, #[serde_as(as = "serde_with::base64::Base64")] source_connstring_ciphertext_base64: Vec, + #[serde_as(as = "Option")] + destination_connstring_ciphertext_base64: Option>, } #[derive(serde::Deserialize)] @@ -67,164 +115,220 @@ enum EncryptionSecret { KMS { key_id: String }, } -#[tokio::main] -pub(crate) async fn main() -> anyhow::Result<()> { - utils::logging::init( - utils::logging::LogFormat::Plain, - utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, - utils::logging::Output::Stdout, - )?; +// copied from pageserver_api::config::defaults::DEFAULT_LOCALE to avoid dependency just for a constant +const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { + "C" +} else { + "C.UTF-8" +}; - info!("starting"); - - let Args { - working_directory, - s3_prefix, - pg_bin_dir, - pg_lib_dir, - } = Args::parse(); - - let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; - - let spec: Spec = { - let spec_key = s3_prefix.append("/spec.json"); - let s3_client = aws_sdk_s3::Client::new(&aws_config); - let object = s3_client - .get_object() - .bucket(&spec_key.bucket) - .key(spec_key.key) - .send() - .await - .context("get spec from s3")? - .body - .collect() - .await - .context("download spec body")?; - serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? - }; - - match tokio::fs::create_dir(&working_directory).await { - Ok(()) => {} - Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { - if !is_directory_empty(&working_directory) - .await - .context("check if working directory is empty")? 
- { - anyhow::bail!("working directory is not empty"); - } else { - // ok - } - } - Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), - } - - let pgdata_dir = working_directory.join("pgdata"); - tokio::fs::create_dir(&pgdata_dir) +async fn decode_connstring( + kms_client: &aws_sdk_kms::Client, + key_id: &String, + connstring_ciphertext_base64: Vec, +) -> Result { + let mut output = kms_client + .decrypt() + .key_id(key_id) + .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( + connstring_ciphertext_base64, + )) + .send() .await - .context("create pgdata directory")?; + .context("decrypt connection string")?; - // - // Setup clients - // - let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; - let kms_client = aws_sdk_kms::Client::new(&aws_config); + let plaintext = output + .plaintext + .take() + .context("get plaintext connection string")?; - // - // Initialize pgdata - // - let pgbin = pg_bin_dir.join("postgres"); - let pg_version = match get_pg_version(pgbin.as_ref()) { - PostgresMajorVersion::V14 => 14, - PostgresMajorVersion::V15 => 15, - PostgresMajorVersion::V16 => 16, - PostgresMajorVersion::V17 => 17, - }; - let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded - postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { - superuser, - locale: "en_US.UTF-8", // XXX: this shouldn't be hard-coded, - pg_version, - initdb_bin: pg_bin_dir.join("initdb").as_ref(), - library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. - pgdata: &pgdata_dir, - }) - .await - .context("initdb")?; + String::from_utf8(plaintext.into_inner()).context("parse connection string as utf8") +} - let nproc = num_cpus::get(); +struct PostgresProcess { + pgdata_dir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pgbin: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + postgres_proc: Option, +} - // - // Launch postgres process - // - let mut postgres_proc = tokio::process::Command::new(pgbin) - .arg("-D") - .arg(&pgdata_dir) - .args(["-c", "wal_level=minimal"]) - .args(["-c", "shared_buffers=10GB"]) - .args(["-c", "max_wal_senders=0"]) - .args(["-c", "fsync=off"]) - .args(["-c", "full_page_writes=off"]) - .args(["-c", "synchronous_commit=off"]) - .args(["-c", "maintenance_work_mem=8388608"]) - .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) - .args(["-c", &format!("max_parallel_workers={nproc}")]) - .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) - .args(["-c", &format!("max_worker_processes={nproc}")]) - .args(["-c", "effective_io_concurrency=100"]) - .env_clear() - .stdout(std::process::Stdio::piped()) - .stderr(std::process::Stdio::piped()) - .spawn() - .context("spawn postgres")?; - - info!("spawned postgres, waiting for it to become ready"); - tokio::spawn( - child_stdio_to_log::relay_process_output( - postgres_proc.stdout.take(), - postgres_proc.stderr.take(), - ) - .instrument(info_span!("postgres")), - ); - let restore_pg_connstring = - format!("host=localhost port=5432 user={superuser} dbname=postgres"); - loop { - let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await; - if res.is_ok() { - info!("postgres is ready, could connect to it"); - break; +impl PostgresProcess { + fn new(pgdata_dir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf) -> Self { + Self { + pgdata_dir, + pgbin: pg_bin_dir.join("postgres"), + pg_bin_dir, + pg_lib_dir, + postgres_proc: None, } } - // - // Decrypt connection string - 
// - let source_connection_string = { - match spec.encryption_secret { - EncryptionSecret::KMS { key_id } => { - let mut output = kms_client - .decrypt() - .key_id(key_id) - .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( - spec.source_connstring_ciphertext_base64, - )) - .send() + async fn prepare(&self, initdb_user: &str) -> Result<(), anyhow::Error> { + tokio::fs::create_dir(&self.pgdata_dir) + .await + .context("create pgdata directory")?; + + let pg_version = match get_pg_version(self.pgbin.as_ref()) { + PostgresMajorVersion::V14 => 14, + PostgresMajorVersion::V15 => 15, + PostgresMajorVersion::V16 => 16, + PostgresMajorVersion::V17 => 17, + }; + postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { + superuser: initdb_user, + locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, + pg_version, + initdb_bin: self.pg_bin_dir.join("initdb").as_ref(), + library_search_path: &self.pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. + pgdata: &self.pgdata_dir, + }) + .await + .context("initdb") + } + + async fn start( + &mut self, + initdb_user: &str, + port: u16, + nproc: usize, + memory_mb: usize, + ) -> Result<&tokio::process::Child, anyhow::Error> { + self.prepare(initdb_user).await?; + + // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for + // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest + // available for misc other stuff that PostgreSQL uses memory for. + let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize; + let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize; + + // + // Launch postgres process + // + let mut proc = tokio::process::Command::new(&self.pgbin) + .arg("-D") + .arg(&self.pgdata_dir) + .args(["-p", &format!("{port}")]) + .args(["-c", "wal_level=minimal"]) + .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")]) + .args(["-c", "max_wal_senders=0"]) + .args(["-c", "fsync=off"]) + .args(["-c", "full_page_writes=off"]) + .args(["-c", "synchronous_commit=off"]) + .args([ + "-c", + &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"), + ]) + .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers={nproc}")]) + .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) + .args(["-c", &format!("max_worker_processes={nproc}")]) + .args(["-c", "effective_io_concurrency=100"]) + .env_clear() + .env("LD_LIBRARY_PATH", &self.pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("spawn postgres")?; + + info!("spawned postgres, waiting for it to become ready"); + tokio::spawn( + child_stdio_to_log::relay_process_output(proc.stdout.take(), proc.stderr.take()) + .instrument(info_span!("postgres")), + ); + + self.postgres_proc = Some(proc); + Ok(self.postgres_proc.as_ref().unwrap()) + } + + async fn shutdown(&mut self) -> Result<(), anyhow::Error> { + let proc: &mut tokio::process::Child = self.postgres_proc.as_mut().unwrap(); + info!("shutdown postgres"); + nix::sys::signal::kill( + Pid::from_raw(i32::try_from(proc.id().unwrap()).expect("convert child pid to i32")), + nix::sys::signal::SIGTERM, + ) + .context("signal postgres to shut down")?; + proc.wait() + .await + .context("wait for postgres to shut down") + .map(|_| ()) + } +} + 
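// [Editor's aside, not part of the patch] PostgresProcess::start above sizes the
// import instance from the memory hint: roughly 10% of RAM for shared_buffers and
// 70% for maintenance_work_mem (index builds), leaving the rest for everything
// else; the pgdata command later defaults memory_mb to 256 when the flag is
// absent. A small sketch of that arithmetic (the helper name is made up here):
fn import_postgres_gucs(memory_mb: usize, nproc: usize, port: u16) -> Vec<String> {
    let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize;
    let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize;
    vec![
        format!("-p {port}"),
        format!("-c shared_buffers={shared_buffers_mb}MB"),
        format!("-c maintenance_work_mem={maintenance_work_mem_mb}MB"),
        format!("-c max_parallel_maintenance_workers={nproc}"),
        format!("-c max_parallel_workers={nproc}"),
        format!("-c max_worker_processes={nproc}"),
    ]
}

fn main() {
    // e.g. an importer with 16 GiB of RAM and 8 CPUs:
    for arg in import_postgres_gucs(16 * 1024, 8, 5432) {
        println!("{arg}");
    }
}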
+async fn wait_until_ready(connstring: String, create_dbname: String) { + // Create neondb database in the running postgres + let start_time = std::time::Instant::now(); + + loop { + if start_time.elapsed() > PG_WAIT_TIMEOUT { + error!( + "timeout exceeded: failed to poll postgres and create database within 10 minutes" + ); + std::process::exit(1); + } + + match tokio_postgres::connect( + &connstring.replace("dbname=neondb", "dbname=postgres"), + tokio_postgres::NoTls, + ) + .await + { + Ok((client, connection)) => { + // Spawn the connection handling task to maintain the connection + tokio::spawn(async move { + if let Err(e) = connection.await { + warn!("connection error: {}", e); + } + }); + + match client + .simple_query(format!("CREATE DATABASE {create_dbname};").as_str()) .await - .context("decrypt source connection string")?; - let plaintext = output - .plaintext - .take() - .context("get plaintext source connection string")?; - String::from_utf8(plaintext.into_inner()) - .context("parse source connection string as utf8")? + { + Ok(_) => { + info!("created {} database", create_dbname); + break; + } + Err(e) => { + warn!( + "failed to create database: {}, retying in {}s", + e, + PG_WAIT_RETRY_INTERVAL.as_secs_f32() + ); + tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await; + continue; + } + } + } + Err(_) => { + info!( + "postgres not ready yet, retrying in {}s", + PG_WAIT_RETRY_INTERVAL.as_secs_f32() + ); + tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await; + continue; } } - }; + } +} - // - // Start the work - // - - let dumpdir = working_directory.join("dumpdir"); +async fn run_dump_restore( + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + source_connstring: String, + destination_connstring: String, +) -> Result<(), anyhow::Error> { + let dumpdir = workdir.join("dumpdir"); let common_args = [ // schema mapping (prob suffices to specify them on one side) @@ -253,9 +357,18 @@ pub(crate) async fn main() -> anyhow::Result<()> { .arg("--no-sync") // POSITIONAL args // source db (db name included in connection string) - .arg(&source_connection_string) + .arg(&source_connstring) // how we run it .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -272,23 +385,31 @@ pub(crate) async fn main() -> anyhow::Result<()> { let st = pg_dump.wait().await.context("wait for pg_dump")?; info!(status=?st, "pg_dump exited"); if !st.success() { - warn!(status=%st, "pg_dump failed, restore will likely fail as well"); + error!(status=%st, "pg_dump failed, restore will likely fail as well"); + bail!("pg_dump failed"); } } - // TODO: do it in a streaming way, plenty of internal research done on this already + // TODO: maybe do it in a streaming way, plenty of internal research done on this already // TODO: do the unlogged table trick - - info!("restore from working directory into vanilla postgres"); { let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore")) .args(&common_args) .arg("-d") - .arg(&restore_pg_connstring) + .arg(&destination_connstring) // POSITIONAL args .arg(&dumpdir) // how we run it .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + 
std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .kill_on_drop(true) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -306,41 +427,261 @@ pub(crate) async fn main() -> anyhow::Result<()> { let st = pg_restore.wait().await.context("wait for pg_restore")?; info!(status=?st, "pg_restore exited"); if !st.success() { - warn!(status=%st, "pg_restore failed, restore will likely fail as well"); + error!(status=%st, "pg_restore failed, restore will likely fail as well"); + bail!("pg_restore failed"); + } + } + + Ok(()) +} + +#[allow(clippy::too_many_arguments)] +async fn cmd_pgdata( + s3_client: Option, + kms_client: Option, + maybe_s3_prefix: Option, + maybe_spec: Option, + source_connection_string: Option, + interactive: bool, + pg_port: u16, + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, + num_cpus: Option, + memory_mb: Option, +) -> Result<(), anyhow::Error> { + if maybe_spec.is_none() && source_connection_string.is_none() { + bail!("spec must be provided for pgdata command"); + } + if maybe_spec.is_some() && source_connection_string.is_some() { + bail!("only one of spec or source_connection_string can be provided"); + } + + let source_connection_string = if let Some(spec) = maybe_spec { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + decode_connstring( + kms_client.as_ref().unwrap(), + &key_id, + spec.source_connstring_ciphertext_base64, + ) + .await? + } + } + } else { + source_connection_string.unwrap() + }; + + let superuser = "cloud_admin"; + let destination_connstring = format!( + "host=localhost port={} user={} dbname=neondb", + pg_port, superuser + ); + + let pgdata_dir = workdir.join("pgdata"); + let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone()); + let nproc = num_cpus.unwrap_or_else(num_cpus::get); + let memory_mb = memory_mb.unwrap_or(256); + proc.start(superuser, pg_port, nproc, memory_mb).await?; + wait_until_ready(destination_connstring.clone(), "neondb".to_string()).await; + + run_dump_restore( + workdir.clone(), + pg_bin_dir, + pg_lib_dir, + source_connection_string, + destination_connstring, + ) + .await?; + + // If interactive mode, wait for Ctrl+C + if interactive { + info!("Running in interactive mode. 
Press Ctrl+C to shut down."); + tokio::signal::ctrl_c().await.context("wait for ctrl-c")?; + } + + proc.shutdown().await?; + + // Only sync if s3_prefix was specified + if let Some(s3_prefix) = maybe_s3_prefix { + info!("upload pgdata"); + aws_s3_sync::upload_dir_recursive( + s3_client.as_ref().unwrap(), + Utf8Path::new(&pgdata_dir), + &s3_prefix.append("/pgdata/"), + ) + .await + .context("sync dump directory to destination")?; + + info!("write status"); + { + let status_dir = workdir.join("status"); + std::fs::create_dir(&status_dir).context("create status directory")?; + let status_file = status_dir.join("pgdata"); + std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) + .context("write status file")?; + aws_s3_sync::upload_dir_recursive( + s3_client.as_ref().unwrap(), + &status_dir, + &s3_prefix.append("/status/"), + ) + .await + .context("sync status directory to destination")?; + } + } + + Ok(()) +} + +async fn cmd_dumprestore( + kms_client: Option, + maybe_spec: Option, + source_connection_string: Option, + destination_connection_string: Option, + workdir: Utf8PathBuf, + pg_bin_dir: Utf8PathBuf, + pg_lib_dir: Utf8PathBuf, +) -> Result<(), anyhow::Error> { + let (source_connstring, destination_connstring) = if let Some(spec) = maybe_spec { + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + let source = decode_connstring( + kms_client.as_ref().unwrap(), + &key_id, + spec.source_connstring_ciphertext_base64, + ) + .await?; + + let dest = if let Some(dest_ciphertext) = + spec.destination_connstring_ciphertext_base64 + { + decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext) + .await? + } else { + bail!("destination connection string must be provided in spec for dump_restore command"); + }; + + (source, dest) + } + } + } else { + ( + source_connection_string.unwrap(), + if let Some(val) = destination_connection_string { + val + } else { + bail!("destination connection string must be provided for dump_restore command"); + }, + ) + }; + + run_dump_restore( + workdir, + pg_bin_dir, + pg_lib_dir, + source_connstring, + destination_connstring, + ) + .await +} + +#[tokio::main] +pub(crate) async fn main() -> anyhow::Result<()> { + utils::logging::init( + utils::logging::LogFormat::Json, + utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + utils::logging::Output::Stdout, + )?; + + info!("starting"); + + let args = Args::parse(); + + // Initialize AWS clients only if s3_prefix is specified + let (s3_client, kms_client) = if args.s3_prefix.is_some() { + let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let s3_client = aws_sdk_s3::Client::new(&config); + let kms = aws_sdk_kms::Client::new(&config); + (Some(s3_client), Some(kms)) + } else { + (None, None) + }; + + let spec: Option = if let Some(s3_prefix) = &args.s3_prefix { + let spec_key = s3_prefix.append("/spec.json"); + let object = s3_client + .as_ref() + .unwrap() + .get_object() + .bucket(&spec_key.bucket) + .key(spec_key.key) + .send() + .await + .context("get spec from s3")? + .body + .collect() + .await + .context("download spec body")?; + serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + } else { + None + }; + + match tokio::fs::create_dir(&args.working_directory).await { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + if !is_directory_empty(&args.working_directory) + .await + .context("check if working directory is empty")? 
+ { + bail!("working directory is not empty"); + } else { + // ok + } + } + Err(e) => return Err(anyhow::Error::new(e).context("create working directory")), + } + + match args.command { + Command::Pgdata { + source_connection_string, + interactive, + pg_port, + num_cpus, + memory_mb, + } => { + cmd_pgdata( + s3_client, + kms_client, + args.s3_prefix, + spec, + source_connection_string, + interactive, + pg_port, + args.working_directory, + args.pg_bin_dir, + args.pg_lib_dir, + num_cpus, + memory_mb, + ) + .await?; + } + Command::DumpRestore { + source_connection_string, + destination_connection_string, + } => { + cmd_dumprestore( + kms_client, + spec, + source_connection_string, + destination_connection_string, + args.working_directory, + args.pg_bin_dir, + args.pg_lib_dir, + ) + .await?; } } - info!("shutdown postgres"); - { - nix::sys::signal::kill( - Pid::from_raw( - i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"), - ), - nix::sys::signal::SIGTERM, - ) - .context("signal postgres to shut down")?; - postgres_proc - .wait() - .await - .context("wait for postgres to shut down")?; - } - - info!("upload pgdata"); - s5cmd::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/")) - .await - .context("sync dump directory to destination")?; - - info!("write status"); - { - let status_dir = working_directory.join("status"); - std::fs::create_dir(&status_dir).context("create status directory")?; - let status_file = status_dir.join("status"); - std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) - .context("write status file")?; - s5cmd::sync(&status_file, &s3_prefix.append("/status/pgdata")) - .await - .context("sync status directory to destination")?; - } - Ok(()) } diff --git a/compute_tools/src/bin/fast_import/aws_s3_sync.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs new file mode 100644 index 0000000000..1be10b36d6 --- /dev/null +++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs @@ -0,0 +1,102 @@ +use camino::{Utf8Path, Utf8PathBuf}; +use tokio::task::JoinSet; +use walkdir::WalkDir; + +use super::s3_uri::S3Uri; + +use tracing::{info, warn}; + +const MAX_PARALLEL_UPLOADS: usize = 10; + +/// Upload all files from 'local' to 'remote' +pub(crate) async fn upload_dir_recursive( + s3_client: &aws_sdk_s3::Client, + local: &Utf8Path, + remote: &S3Uri, +) -> anyhow::Result<()> { + // Recursively scan directory + let mut dirwalker = WalkDir::new(local) + .into_iter() + .map(|entry| { + let entry = entry?; + let file_type = entry.file_type(); + let path = <&Utf8Path>::try_from(entry.path())?.to_path_buf(); + Ok((file_type, path)) + }) + .filter_map(|e: anyhow::Result<(std::fs::FileType, Utf8PathBuf)>| { + match e { + Ok((file_type, path)) if file_type.is_file() => Some(Ok(path)), + Ok((file_type, _path)) if file_type.is_dir() => { + // The WalkDir iterator will recurse into directories, but we don't want + // to do anything with directories as such. There's no concept of uploading + // an empty directory to S3. + None + } + Ok((file_type, path)) if file_type.is_symlink() => { + // huh, didn't expect a symlink. Can't upload that to S3. Warn and skip. + warn!("cannot upload symlink ({})", path); + None + } + Ok((_file_type, path)) => { + // should not happen + warn!("directory entry has unexpected type ({})", path); + None + } + Err(e) => Some(Err(e)), + } + }); + + // Spawn upload tasks for each file, keeping MAX_PARALLEL_UPLOADS active in + // parallel. + let mut joinset = JoinSet::new(); + loop { + // Could we upload more? 
+ while joinset.len() < MAX_PARALLEL_UPLOADS { + if let Some(full_local_path) = dirwalker.next() { + let full_local_path = full_local_path?; + let relative_local_path = full_local_path + .strip_prefix(local) + .expect("all paths start from the walkdir root"); + let remote_path = remote.append(relative_local_path.as_str()); + info!( + "starting upload of {} to {}", + &full_local_path, &remote_path + ); + let upload_task = upload_file(s3_client.clone(), full_local_path, remote_path); + joinset.spawn(upload_task); + } else { + info!("draining upload tasks"); + break; + } + } + + // Wait for an upload to complete + if let Some(res) = joinset.join_next().await { + let _ = res?; + } else { + // all done! + break; + } + } + Ok(()) +} + +pub(crate) async fn upload_file( + s3_client: aws_sdk_s3::Client, + local_path: Utf8PathBuf, + remote: S3Uri, +) -> anyhow::Result<()> { + use aws_smithy_types::byte_stream::ByteStream; + let stream = ByteStream::from_path(&local_path).await?; + + let _result = s3_client + .put_object() + .bucket(remote.bucket) + .key(&remote.key) + .body(stream) + .send() + .await?; + info!("upload of {} to {} finished", &local_path, &remote.key); + + Ok(()) +} diff --git a/compute_tools/src/bin/fast_import/s5cmd.rs b/compute_tools/src/bin/fast_import/s5cmd.rs deleted file mode 100644 index d2d9a79736..0000000000 --- a/compute_tools/src/bin/fast_import/s5cmd.rs +++ /dev/null @@ -1,27 +0,0 @@ -use anyhow::Context; -use camino::Utf8Path; - -use super::s3_uri::S3Uri; - -pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> { - let mut builder = tokio::process::Command::new("s5cmd"); - // s5cmd uses aws-sdk-go v1, hence doesn't support AWS_ENDPOINT_URL - if let Some(val) = std::env::var_os("AWS_ENDPOINT_URL") { - builder.arg("--endpoint-url").arg(val); - } - builder - .arg("sync") - .arg(local.as_str()) - .arg(remote.to_string()); - let st = builder - .spawn() - .context("spawn s5cmd")? - .wait() - .await - .context("wait for s5cmd")?; - if st.success() { - Ok(()) - } else { - Err(anyhow::anyhow!("s5cmd failed")) - } -} diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 72198a9479..28b10ce21c 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -36,11 +36,11 @@ pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { + // We keep a reference to the child process to ensure it stays alive + // while the stream is being consumed. 
When SchemaStream is dropped, + // cmd will be dropped, which triggers kill_on_drop and terminates pg_dump + cmd: tokio::process::Child, + stream: S, + } + + impl Stream for SchemaStream + where + S: Stream> + Unpin, + { + type Item = Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + Stream::poll_next(std::pin::Pin::new(&mut self.stream), cx) + } + } + + let schema_stream = SchemaStream { + cmd, + stream: initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))), + }; + + Ok(schema_stream) } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d72a04f2f9..d323ea3dcd 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -9,13 +9,12 @@ use std::str::FromStr; use std::sync::atomic::AtomicU32; use std::sync::atomic::Ordering; use std::sync::{Arc, Condvar, Mutex, RwLock}; -use std::thread; use std::time::Duration; use std::time::Instant; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use compute_api::spec::{PgIdent, Role}; +use compute_api::spec::{Database, PgIdent, Role}; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; @@ -41,12 +40,14 @@ use crate::local_proxy; use crate::pg_helpers::*; use crate::spec::*; use crate::spec_apply::ApplySpecPhase::{ - CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSuperUser, - DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions, - RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase, + CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon, + CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, + HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, + RunInEachDatabase, }; +use crate::spec_apply::PerDatabasePhase; use crate::spec_apply::PerDatabasePhase::{ - ChangeSchemaPerms, DeleteDBRoleReferences, HandleAnonExtension, + ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension, }; use crate::spec_apply::{apply_operations, MutableApplyContext, DB}; use crate::sync_sk::{check_if_synced, ping_safekeeper}; @@ -57,6 +58,8 @@ pub static PG_PID: AtomicU32 = AtomicU32::new(0); /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { + /// The ID of the compute + pub compute_id: String, // Url type maintains proper escaping pub connstr: url::Url, // We connect to Postgres from many different places, so build configs once @@ -79,8 +82,10 @@ pub struct ComputeNode { /// - we push spec and it does configuration /// - but then it is restarted without any spec again pub live_config_allowed: bool, - /// The port that the compute's HTTP server listens on - pub http_port: u16, + /// The port that the compute's external HTTP server listens on + pub external_http_port: u16, + /// The port that the compute's internal HTTP server listens on + pub internal_http_port: u16, /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. 
/// To allow HTTP API server to serving status requests, while configuration /// is in progress, lock should be held only for short periods of time to do @@ -338,6 +343,15 @@ impl ComputeNode { self.state.lock().unwrap().status } + pub fn get_timeline_id(&self) -> Option { + self.state + .lock() + .unwrap() + .pspec + .as_ref() + .map(|s| s.timeline_id) + } + // Remove `pgdata` directory and create it again with right permissions. fn create_pgdata(&self) -> Result<()> { // Ignore removal error, likely it is a 'No such file or directory (os error 2)'. @@ -535,11 +549,7 @@ impl ComputeNode { pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result> { let start_time = Utc::now(); - // Run actual work with new tokio runtime - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create rt"); + let rt = tokio::runtime::Handle::current(); let result = rt.block_on(self.check_safekeepers_synced_async(compute_state)); // Record runtime @@ -586,9 +596,9 @@ impl ComputeNode { SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst); // Process has exited, so we can join the logs thread. - let _ = logs_handle - .join() - .map_err(|e| tracing::error!("log thread panicked: {:?}", e)); + let _ = tokio::runtime::Handle::current() + .block_on(logs_handle) + .map_err(|e| tracing::error!("log task panicked: {:?}", e)); if !sync_output.status.success() { anyhow::bail!( @@ -623,7 +633,7 @@ impl ComputeNode { config::write_postgres_conf( &pgdata_path.join("postgresql.conf"), &pspec.spec, - self.http_port, + self.internal_http_port, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -773,7 +783,7 @@ impl ComputeNode { pub fn start_postgres( &self, storage_auth_token: Option, - ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { + ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { let pgdata_path = Path::new(&self.pgdata); // Run postgres as a child process. @@ -789,7 +799,7 @@ impl ComputeNode { .expect("cannot start postgres process"); PG_PID.store(pg.id(), Ordering::SeqCst); - // Start a thread to collect logs from stderr. + // Start a task to collect logs from stderr. let stderr = pg.stderr.take().expect("stderr should be captured"); let logs_handle = handle_postgres_logs(stderr); @@ -798,20 +808,28 @@ impl ComputeNode { Ok((pg, logs_handle)) } - /// Do post configuration of the already started Postgres. This function spawns a background thread to + /// Do post configuration of the already started Postgres. This function spawns a background task to /// configure the database after applying the compute spec. Currently, it upgrades the neon extension /// version. In the future, it may upgrade all 3rd-party extensions. 
#[instrument(skip_all)] pub fn post_apply_config(&self) -> Result<()> { - let conf = self.get_conn_conf(Some("compute_ctl:post_apply_config")); - thread::spawn(move || { - let func = || { - let mut client = conf.connect(NoTls)?; + let conf = self.get_tokio_conn_conf(Some("compute_ctl:post_apply_config")); + tokio::spawn(async move { + let res = async { + let (mut client, connection) = conf.connect(NoTls).await?; + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + handle_neon_extension_upgrade(&mut client) + .await .context("handle_neon_extension_upgrade")?; Ok::<_, anyhow::Error>(()) - }; - if let Err(err) = func() { + } + .await; + if let Err(err) = res { error!("error while post_apply_config: {err:#}"); } }); @@ -834,7 +852,7 @@ impl ComputeNode { conf } - async fn get_maintenance_client( + pub async fn get_maintenance_client( conf: &tokio_postgres::Config, ) -> Result { let mut conf = conf.clone(); @@ -908,13 +926,10 @@ impl ComputeNode { conf: Arc, concurrency: usize, ) -> Result<()> { - let rt = tokio::runtime::Builder::new_multi_thread() - .enable_all() - .build()?; - info!("Applying config with max {} concurrency", concurrency); debug!("Config: {:?}", spec); + let rt = tokio::runtime::Handle::current(); rt.block_on(async { // Proceed with post-startup configuration. Note, that order of operations is important. let client = Self::get_maintenance_client(&conf).await?; @@ -927,6 +942,48 @@ impl ComputeNode { .map(|role| (role.name.clone(), role)) .collect::>(); + // Check if we need to drop subscriptions before starting the endpoint. + // + // It is important to do this operation exactly once when endpoint starts on a new branch. + // Otherwise, we may drop not inherited, but newly created subscriptions. + // + // We cannot rely only on spec.drop_subscriptions_before_start flag, + // because if for some reason compute restarts inside VM, + // it will start again with the same spec and flag value. + // + // To handle this, we save the fact of the operation in the database + // in the neon.drop_subscriptions_done table. + // If the table does not exist, we assume that the operation was never performed, so we must do it. + // If table exists, we check if the operation was performed on the current timelilne. + // + let mut drop_subscriptions_done = false; + + if spec.drop_subscriptions_before_start { + let timeline_id = self.get_timeline_id().context("timeline_id must be set")?; + let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id); + + info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id); + + drop_subscriptions_done = match + client.simple_query(&query).await { + Ok(result) => { + matches!(&result[0], postgres::SimpleQueryMessage::Row(_)) + }, + Err(e) => + { + match e.code() { + Some(&SqlState::UNDEFINED_TABLE) => false, + _ => { + // We don't expect any other error here, except for the schema/table not existing + error!("Error checking if drop subscription operation was already performed: {}", e); + return Err(e.into()); + } + } + } + } + }; + + let jwks_roles = Arc::new( spec.as_ref() .local_proxy_config @@ -943,6 +1000,78 @@ impl ComputeNode { dbs: databases, })); + // Apply special pre drop database phase. + // NOTE: we use the code of RunInEachDatabase phase for parallelism + // and connection management, but we don't really run it in *each* database, + // only in databases, we're about to drop. 
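// [Editor's aside, not part of the patch] The drop_subscriptions_done check
// earlier in this hunk boils down to: a row for the current timeline in
// neon.drop_subscriptions_done means the one-time cleanup already ran, while an
// "undefined table" error means it has never run on this timeline. A hedged,
// standalone sketch of that check (client/connection setup is assumed):
use tokio_postgres::{error::SqlState, Client, SimpleQueryMessage};

async fn drop_subscriptions_already_done(
    client: &Client,
    timeline_id: &str,
) -> Result<bool, tokio_postgres::Error> {
    let query = format!(
        "select 1 from neon.drop_subscriptions_done where timeline_id = '{timeline_id}'"
    );
    match client.simple_query(&query).await {
        Ok(messages) => Ok(messages
            .iter()
            .any(|m| matches!(m, SimpleQueryMessage::Row(_)))),
        // The schema/table not existing just means the cleanup never ran here.
        Err(e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => Ok(false),
        Err(e) => Err(e),
    }
}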
+ info!("Applying PerDatabase (pre-dropdb) phase"); + let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); + + // Run the phase for each database that we're about to drop. + let db_processes = spec + .delta_operations + .iter() + .flatten() + .filter_map(move |op| { + if op.action.as_str() == "delete_db" { + Some(op.name.clone()) + } else { + None + } + }) + .map(|dbname| { + let spec = spec.clone(); + let ctx = ctx.clone(); + let jwks_roles = jwks_roles.clone(); + let mut conf = conf.as_ref().clone(); + let concurrency_token = concurrency_token.clone(); + // We only need dbname field for this phase, so set other fields to dummy values + let db = DB::UserDB(Database { + name: dbname.clone(), + owner: "cloud_admin".to_string(), + options: None, + restrict_conn: false, + invalid: false, + }); + + debug!("Applying per-database phases for Database {:?}", &db); + + match &db { + DB::SystemDB => {} + DB::UserDB(db) => { + conf.dbname(db.name.as_str()); + } + } + + let conf = Arc::new(conf); + let fut = Self::apply_spec_sql_db( + spec.clone(), + conf, + ctx.clone(), + jwks_roles.clone(), + concurrency_token.clone(), + db, + [DropLogicalSubscriptions].to_vec(), + ); + + Ok(spawn(fut)) + }) + .collect::>>(); + + for process in db_processes.into_iter() { + let handle = process?; + if let Err(e) = handle.await? { + // Handle the error case where the database does not exist + // We do not check whether the DB exists or not in the deletion phase, + // so we shouldn't be strict about it in pre-deletion cleanup as well. + if e.to_string().contains("does not exist") { + warn!("Error dropping subscription: {}", e); + } else { + return Err(e); + } + }; + } + for phase in [ CreateSuperUser, DropInvalidDatabases, @@ -950,6 +1079,7 @@ impl ComputeNode { CreateAndAlterRoles, RenameAndDeleteDatabases, CreateAndAlterDatabases, + CreateSchemaNeon, ] { info!("Applying phase {:?}", &phase); apply_operations( @@ -962,7 +1092,7 @@ impl ComputeNode { .await?; } - info!("Applying RunInEachDatabase phase"); + info!("Applying RunInEachDatabase2 phase"); let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); let db_processes = spec @@ -990,6 +1120,17 @@ impl ComputeNode { } let conf = Arc::new(conf); + let mut phases = vec![ + DeleteDBRoleReferences, + ChangeSchemaPerms, + HandleAnonExtension, + ]; + + if spec.drop_subscriptions_before_start && !drop_subscriptions_done { + info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); + phases.push(DropLogicalSubscriptions); + } + let fut = Self::apply_spec_sql_db( spec.clone(), conf, @@ -997,6 +1138,7 @@ impl ComputeNode { jwks_roles.clone(), concurrency_token.clone(), db, + phases, ); Ok(spawn(fut)) @@ -1008,12 +1150,20 @@ impl ComputeNode { handle.await??; } - for phase in vec![ + let mut phases = vec![ HandleOtherExtensions, - HandleNeonExtension, + HandleNeonExtension, // This step depends on CreateSchemaNeon CreateAvailabilityCheck, DropRoles, - ] { + ]; + + // This step depends on CreateSchemaNeon + if spec.drop_subscriptions_before_start && !drop_subscriptions_done { + info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set"); + phases.push(FinalizeDropLogicalSubscriptions); + } + + for phase in phases { debug!("Applying phase {:?}", &phase); apply_operations( spec.clone(), @@ -1043,16 +1193,13 @@ impl ComputeNode { jwks_roles: Arc>, concurrency_token: Arc, db: DB, + subphases: Vec, ) -> Result<()> { let _permit = 
concurrency_token.acquire().await?; let mut client_conn = None; - for subphase in [ - DeleteDBRoleReferences, - ChangeSchemaPerms, - HandleAnonExtension, - ] { + for subphase in subphases { apply_operations( spec.clone(), ctx.clone(), @@ -1176,13 +1323,28 @@ impl ComputeNode { } // Run migrations separately to not hold up cold starts - thread::spawn(move || { - let conf = conf.as_ref().clone(); - let mut conf = postgres::config::Config::from(conf); + tokio::spawn(async move { + let mut conf = conf.as_ref().clone(); conf.application_name("compute_ctl:migrations"); - let mut client = conf.connect(NoTls)?; - handle_migrations(&mut client).context("apply_config handle_migrations") + match conf.connect(NoTls).await { + Ok((mut client, connection)) => { + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + if let Err(e) = handle_migrations(&mut client).await { + error!("Failed to run migrations: {}", e); + } + } + Err(e) => { + error!( + "Failed to connect to the compute for running migrations: {}", + e + ); + } + }; }); Ok::<(), anyhow::Error>(()) @@ -1209,16 +1371,11 @@ impl ComputeNode { if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { info!("tuning pgbouncer"); - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create rt"); - - // Spawn a thread to do the tuning, + // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. let pgbouncer_settings = pgbouncer_settings.clone(); - let _handle = thread::spawn(move || { - let res = rt.block_on(tune_pgbouncer(pgbouncer_settings)); + tokio::spawn(async move { + let res = tune_pgbouncer(pgbouncer_settings).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } @@ -1228,41 +1385,42 @@ impl ComputeNode { if let Some(ref local_proxy) = spec.local_proxy_config { info!("configuring local_proxy"); - // Spawn a thread to do the configuration, + // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. let local_proxy = local_proxy.clone(); - let _handle = Some(thread::spawn(move || { + tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); } - })); + }); } // Write new config let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?; + config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?; - let max_concurrent_connections = spec.reconfigure_concurrency; + if !spec.skip_pg_catalog_updates { + let max_concurrent_connections = spec.reconfigure_concurrency; + // Temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are reconfiguring: + // creating new extensions, roles, etc. + config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { + self.pg_reload_conf()?; - // Temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are reconfiguring: - // creating new extensions, roles, etc. 
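// [Editor's aside, not part of the patch] Both the old and new code wrap the
// catalog-apply step in config::with_compute_ctl_tmp_override(pgdata_path,
// "neon.max_cluster_size=-1", ...). A hedged sketch of how such a helper can be
// built (the real one lives in compute_tools/src/config.rs and may differ):
// postgresql.conf ends with `include_if_exists = 'compute_ctl_temp_override.conf'`,
// so creating and then removing that file, around a pg_reload_conf(), toggles the
// temporary settings.
use std::path::Path;

fn with_tmp_override<F>(pgdata: &Path, settings: &str, f: F) -> anyhow::Result<()>
where
    F: FnOnce() -> anyhow::Result<()>,
{
    let override_path = pgdata.join("compute_ctl_temp_override.conf");
    std::fs::write(&override_path, settings)?;

    // Run the caller's work while the override is in place.
    let result = f();

    // Best-effort cleanup; the include_if_exists clause tolerates a missing file.
    let _ = std::fs::remove_file(&override_path);
    result
}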
- config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { - self.pg_reload_conf()?; + if spec.mode == ComputeMode::Primary { + let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); + conf.application_name("apply_config"); + let conf = Arc::new(conf); - if spec.mode == ComputeMode::Primary { - let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); - conf.application_name("apply_config"); - let conf = Arc::new(conf); + let spec = Arc::new(spec.clone()); - let spec = Arc::new(spec.clone()); + self.apply_spec_sql(spec, conf, max_concurrent_connections)?; + } - self.apply_spec_sql(spec, conf, max_concurrent_connections)?; - } - - Ok(()) - })?; + Ok(()) + })?; + } self.pg_reload_conf()?; @@ -1277,7 +1435,9 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute(&self) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { + pub fn start_compute( + &self, + ) -> Result<(std::process::Child, tokio::task::JoinHandle>)> { let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( @@ -1292,16 +1452,11 @@ impl ComputeNode { if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { info!("tuning pgbouncer"); - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create rt"); - - // Spawn a thread to do the tuning, + // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. let pgbouncer_settings = pgbouncer_settings.clone(); - let _handle = thread::spawn(move || { - let res = rt.block_on(tune_pgbouncer(pgbouncer_settings)); + let _handle = tokio::spawn(async move { + let res = tune_pgbouncer(pgbouncer_settings).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } @@ -1311,10 +1466,10 @@ impl ComputeNode { if let Some(local_proxy) = &pspec.spec.local_proxy_config { info!("configuring local_proxy"); - // Spawn a thread to do the configuration, + // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. let local_proxy = local_proxy.clone(); - let _handle = thread::spawn(move || { + let _handle = tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); } @@ -1333,7 +1488,8 @@ impl ComputeNode { extension_server::create_control_files(remote_extensions, &self.pgbin); let library_load_start_time = Utc::now(); - let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?; + let rt = tokio::runtime::Handle::current(); + let remote_ext_metrics = rt.block_on(self.prepare_preload_libraries(&pspec.spec))?; let library_load_time = Utc::now() .signed_duration_since(library_load_start_time) @@ -1375,12 +1531,20 @@ impl ComputeNode { Ok(()) }, )?; + + let postgresql_conf_path = pgdata_path.join("postgresql.conf"); + if config::line_in_file( + &postgresql_conf_path, + "neon.disable_logical_replication_subscribers=false", + )? 
{ + info!("updated postgresql.conf to set neon.disable_logical_replication_subscribers=false"); + } self.pg_reload_conf()?; } self.post_apply_config()?; let conf = self.get_conn_conf(None); - thread::spawn(move || { + tokio::task::spawn_blocking(|| { let res = get_installed_extensions(conf); match res { Ok(extensions) => { @@ -1729,7 +1893,6 @@ LIMIT 100", Ok(ext_version) } - #[tokio::main] pub async fn prepare_preload_libraries( &self, spec: &ComputeSpec, diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index b257c8a68f..e1bdfffa54 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -129,6 +129,13 @@ pub fn write_postgres_conf( writeln!(file, "neon.extension_server_port={}", extension_server_port)?; + if spec.drop_subscriptions_before_start { + writeln!(file, "neon.disable_logical_replication_subscribers=true")?; + } else { + // be explicit about the default value + writeln!(file, "neon.disable_logical_replication_subscribers=false")?; + } + // This is essential to keep this line at the end of the file, // because it is intended to override any settings above. writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?; diff --git a/compute_tools/src/configurator.rs b/compute_tools/src/configurator.rs index a2043529a1..d88f26ca20 100644 --- a/compute_tools/src/configurator.rs +++ b/compute_tools/src/configurator.rs @@ -51,9 +51,12 @@ fn configurator_main_loop(compute: &Arc) { pub fn launch_configurator(compute: &Arc) -> thread::JoinHandle<()> { let compute = Arc::clone(compute); + let runtime = tokio::runtime::Handle::current(); + thread::Builder::new() .name("compute-configurator".into()) .spawn(move || { + let _rt_guard = runtime.enter(); configurator_main_loop(&compute); info!("configurator thread is exited"); }) diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index f13b2308e7..00f46386e7 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -85,6 +85,8 @@ use tracing::info; use tracing::log::warn; use zstd::stream::read::Decoder; +use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; + fn get_pg_config(argument: &str, pgbin: &str) -> String { // gives the result of `pg_config [argument]` // where argument is a flag like `--version` or `--sharedir` @@ -256,23 +258,60 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { let uri = format!("{}/{}", ext_remote_storage, ext_path); - info!("Download extension {:?} from uri {:?}", ext_path, uri); + info!("Download extension {} from uri {}", ext_path, uri); - let resp = reqwest::get(uri).await?; + match do_extension_server_request(&uri).await { + Ok(resp) => { + info!("Successfully downloaded remote extension data {}", ext_path); + REMOTE_EXT_REQUESTS_TOTAL + .with_label_values(&[&StatusCode::OK.to_string()]) + .inc(); + Ok(resp) + } + Err((msg, status)) => { + REMOTE_EXT_REQUESTS_TOTAL + .with_label_values(&[&status]) + .inc(); + bail!(msg); + } + } +} - match resp.status() { +// Do a single remote extensions server request. +// Return result or (error message + stringified status code) in case of any failures. 
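// The configurator.rs hunk above captures the current Tokio runtime handle and enters it
// on a dedicated OS thread, so synchronous code on that thread can still reach the
// runtime. A minimal sketch of that pattern, assuming it is called from code that is
// already running inside a Tokio runtime (Handle::current panics otherwise):
fn spawn_thread_with_runtime_access() -> std::io::Result<std::thread::JoinHandle<()>> {
    let handle = tokio::runtime::Handle::current();
    std::thread::Builder::new()
        .name("example-worker".into())
        .spawn(move || {
            // While the guard is alive, runtime-dependent calls are allowed on this thread.
            let _guard = handle.enter();
            let _task = tokio::spawn(async {
                // background async work goes here
            });
        })
}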
+async fn do_extension_server_request(uri: &str) -> Result { + let resp = reqwest::get(uri).await.map_err(|e| { + ( + format!( + "could not perform remote extensions server request: {:?}", + e + ), + UNKNOWN_HTTP_STATUS.to_string(), + ) + })?; + let status = resp.status(); + + match status { StatusCode::OK => match resp.bytes().await { - Ok(resp) => { - info!("Download extension {:?} completed successfully", ext_path); - Ok(resp) - } - Err(e) => bail!("could not deserialize remote extension response: {}", e), + Ok(resp) => Ok(resp), + Err(e) => Err(( + format!("could not read remote extensions server response: {:?}", e), + // It's fine to return and report error with status as 200 OK, + // because we still failed to read the response. + status.to_string(), + )), }, - StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"), - _ => bail!( - "unexpected remote extension response status code: {}", - resp.status() - ), + StatusCode::SERVICE_UNAVAILABLE => Err(( + "remote extensions server is temporarily unavailable".to_string(), + status.to_string(), + )), + _ => Err(( + format!( + "unexpected remote extensions server response status code: {}", + status + ), + status.to_string(), + )), } } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs deleted file mode 100644 index 7fa6426d8f..0000000000 --- a/compute_tools/src/http/api.rs +++ /dev/null @@ -1,591 +0,0 @@ -use std::convert::Infallible; -use std::net::IpAddr; -use std::net::Ipv6Addr; -use std::net::SocketAddr; -use std::sync::Arc; -use std::thread; - -use crate::catalog::SchemaDumpError; -use crate::catalog::{get_database_schema, get_dbs_and_roles}; -use crate::compute::forward_termination_signal; -use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; -use crate::installed_extensions; -use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest}; -use compute_api::responses::{ - ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError, - SetRoleGrantsResponse, -}; - -use anyhow::Result; -use hyper::header::CONTENT_TYPE; -use hyper::service::{make_service_fn, service_fn}; -use hyper::{Body, Method, Request, Response, Server, StatusCode}; -use metrics::proto::MetricFamily; -use metrics::Encoder; -use metrics::TextEncoder; -use tokio::task; -use tracing::{debug, error, info, warn}; -use tracing_utils::http::OtelName; -use utils::http::request::must_get_query_param; - -fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { - ComputeStatusResponse { - start_time: state.start_time, - tenant: state - .pspec - .as_ref() - .map(|pspec| pspec.tenant_id.to_string()), - timeline: state - .pspec - .as_ref() - .map(|pspec| pspec.timeline_id.to_string()), - status: state.status, - last_active: state.last_active, - error: state.error.clone(), - } -} - -// Service function to handle all available routes. -async fn routes(req: Request, compute: &Arc) -> Response { - // - // NOTE: The URI path is currently included in traces. That's OK because - // it doesn't contain any variable parts or sensitive information. But - // please keep that in mind if you change the routing here. - // - match (req.method(), req.uri().path()) { - // Serialized compute state. 
- (&Method::GET, "/status") => { - debug!("serving /status GET request"); - let state = compute.state.lock().unwrap(); - let status_response = status_response_from_state(&state); - Response::new(Body::from(serde_json::to_string(&status_response).unwrap())) - } - - // Startup metrics in JSON format. Keep /metrics reserved for a possible - // future use for Prometheus metrics format. - (&Method::GET, "/metrics.json") => { - info!("serving /metrics.json GET request"); - let metrics = compute.state.lock().unwrap().metrics.clone(); - Response::new(Body::from(serde_json::to_string(&metrics).unwrap())) - } - - // Prometheus metrics - (&Method::GET, "/metrics") => { - debug!("serving /metrics GET request"); - - // When we call TextEncoder::encode() below, it will immediately - // return an error if a metric family has no metrics, so we need to - // preemptively filter out metric families with no metrics. - let metrics = installed_extensions::collect() - .into_iter() - .filter(|m| !m.get_metric().is_empty()) - .collect::>(); - - let encoder = TextEncoder::new(); - let mut buffer = vec![]; - - if let Err(err) = encoder.encode(&metrics, &mut buffer) { - let msg = format!("error handling /metrics request: {err}"); - error!(msg); - return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR); - } - - match Response::builder() - .status(StatusCode::OK) - .header(CONTENT_TYPE, encoder.format_type()) - .body(Body::from(buffer)) - { - Ok(response) => response, - Err(err) => { - let msg = format!("error handling /metrics request: {err}"); - error!(msg); - render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - // Collect Postgres current usage insights - (&Method::GET, "/insights") => { - info!("serving /insights GET request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!("compute is not running, current status: {:?}", status); - error!(msg); - return Response::new(Body::from(msg)); - } - - let insights = compute.collect_insights().await; - Response::new(Body::from(insights)) - } - - (&Method::POST, "/check_writability") => { - info!("serving /check_writability POST request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for check_writability request: {:?}", - status - ); - error!(msg); - return Response::new(Body::from(msg)); - } - - let res = crate::checker::check_writability(compute).await; - match res { - Ok(_) => Response::new(Body::from("true")), - Err(e) => { - error!("check_writability failed: {}", e); - Response::new(Body::from(e.to_string())) - } - } - } - - (&Method::POST, "/extensions") => { - info!("serving /extensions POST request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for extensions request: {:?}", - status - ); - error!(msg); - return render_json_error(&msg, StatusCode::PRECONDITION_FAILED); - } - - let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); - let request = serde_json::from_slice::(&request).unwrap(); - let res = compute - .install_extension(&request.extension, &request.database, request.version) - .await; - match res { - Ok(version) => render_json(Body::from( - serde_json::to_string(&ExtensionInstallResult { - extension: request.extension, - version, - }) - .unwrap(), - )), - Err(e) => { - error!("install_extension failed: {}", e); - render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - - 
(&Method::GET, "/info") => { - let num_cpus = num_cpus::get_physical(); - info!("serving /info GET request. num_cpus: {}", num_cpus); - Response::new(Body::from( - serde_json::json!({ - "num_cpus": num_cpus, - }) - .to_string(), - )) - } - - // Accept spec in JSON format and request compute configuration. If - // anything goes wrong after we set the compute status to `ConfigurationPending` - // and update compute state with new spec, we basically leave compute - // in the potentially wrong state. That said, it's control-plane's - // responsibility to watch compute state after reconfiguration request - // and to clean restart in case of errors. - (&Method::POST, "/configure") => { - info!("serving /configure POST request"); - match handle_configure_request(req, compute).await { - Ok(msg) => Response::new(Body::from(msg)), - Err((msg, code)) => { - error!("error handling /configure request: {msg}"); - render_json_error(&msg, code) - } - } - } - - (&Method::POST, "/terminate") => { - info!("serving /terminate POST request"); - match handle_terminate_request(compute).await { - Ok(()) => Response::new(Body::empty()), - Err((msg, code)) => { - error!("error handling /terminate request: {msg}"); - render_json_error(&msg, code) - } - } - } - - (&Method::GET, "/dbs_and_roles") => { - info!("serving /dbs_and_roles GET request",); - match get_dbs_and_roles(compute).await { - Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())), - Err(_) => { - render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - - (&Method::GET, "/database_schema") => { - let database = match must_get_query_param(&req, "database") { - Err(e) => return e.into_response(), - Ok(database) => database, - }; - info!("serving /database_schema GET request with database: {database}",); - match get_database_schema(compute, &database).await { - Ok(res) => render_plain(Body::wrap_stream(res)), - Err(SchemaDumpError::DatabaseDoesNotExist) => { - render_json_error("database does not exist", StatusCode::NOT_FOUND) - } - Err(e) => { - error!("can't get schema dump: {}", e); - render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR) - } - } - } - - (&Method::POST, "/grants") => { - info!("serving /grants POST request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for set_role_grants request: {:?}", - status - ); - error!(msg); - return render_json_error(&msg, StatusCode::PRECONDITION_FAILED); - } - - let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); - let request = serde_json::from_slice::(&request).unwrap(); - - let res = compute - .set_role_grants( - &request.database, - &request.schema, - &request.privileges, - &request.role, - ) - .await; - match res { - Ok(()) => render_json(Body::from( - serde_json::to_string(&SetRoleGrantsResponse { - database: request.database, - schema: request.schema, - role: request.role, - privileges: request.privileges, - }) - .unwrap(), - )), - Err(e) => render_json_error( - &format!("could not grant role privileges to the schema: {e}"), - // TODO: can we filter on role/schema not found errors - // and return appropriate error code? 
- StatusCode::INTERNAL_SERVER_ERROR, - ), - } - } - - // get the list of installed extensions - // currently only used in python tests - // TODO: call it from cplane - (&Method::GET, "/installed_extensions") => { - info!("serving /installed_extensions GET request"); - let status = compute.get_status(); - if status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for extensions request: {:?}", - status - ); - error!(msg); - return Response::new(Body::from(msg)); - } - - let conf = compute.get_conn_conf(None); - let res = - task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf)) - .await - .unwrap(); - - match res { - Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())), - Err(e) => render_json_error( - &format!("could not get list of installed extensions: {}", e), - StatusCode::INTERNAL_SERVER_ERROR, - ), - } - } - - // download extension files from remote extension storage on demand - (&Method::POST, route) if route.starts_with("/extension_server/") => { - info!("serving {:?} POST request", route); - info!("req.uri {:?}", req.uri()); - - // don't even try to download extensions - // if no remote storage is configured - if compute.ext_remote_storage.is_none() { - info!("no extensions remote storage configured"); - let mut resp = Response::new(Body::from("no remote storage configured")); - *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; - return resp; - } - - let mut is_library = false; - if let Some(params) = req.uri().query() { - info!("serving {:?} POST request with params: {}", route, params); - if params == "is_library=true" { - is_library = true; - } else { - let mut resp = Response::new(Body::from("Wrong request parameters")); - *resp.status_mut() = StatusCode::BAD_REQUEST; - return resp; - } - } - let filename = route.split('/').last().unwrap().to_string(); - info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}"); - - // get ext_name and path from spec - // don't lock compute_state for too long - let ext = { - let compute_state = compute.state.lock().unwrap(); - let pspec = compute_state.pspec.as_ref().expect("spec must be set"); - let spec = &pspec.spec; - - // debug only - info!("spec: {:?}", spec); - - let remote_extensions = match spec.remote_extensions.as_ref() { - Some(r) => r, - None => { - info!("no remote extensions spec was provided"); - let mut resp = Response::new(Body::from("no remote storage configured")); - *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; - return resp; - } - }; - - remote_extensions.get_ext( - &filename, - is_library, - &compute.build_tag, - &compute.pgversion, - ) - }; - - match ext { - Ok((ext_name, ext_path)) => { - match compute.download_extension(ext_name, ext_path).await { - Ok(_) => Response::new(Body::from("OK")), - Err(e) => { - error!("extension download failed: {}", e); - let mut resp = Response::new(Body::from(e.to_string())); - *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; - resp - } - } - } - Err(e) => { - warn!("extension download failed to find extension: {}", e); - let mut resp = Response::new(Body::from("failed to find file")); - *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; - resp - } - } - } - - // Return the `404 Not Found` for any other routes. 
- _ => { - let mut not_found = Response::new(Body::from("404 Not Found")); - *not_found.status_mut() = StatusCode::NOT_FOUND; - not_found - } - } -} - -async fn handle_configure_request( - req: Request, - compute: &Arc, -) -> Result { - if !compute.live_config_allowed { - return Err(( - "live configuration is not allowed for this compute node".to_string(), - StatusCode::PRECONDITION_FAILED, - )); - } - - let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap(); - let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap(); - if let Ok(request) = serde_json::from_str::(&spec_raw) { - let spec = request.spec; - - let parsed_spec = match ParsedSpec::try_from(spec) { - Ok(ps) => ps, - Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)), - }; - - // XXX: wrap state update under lock in code blocks. Otherwise, - // we will try to `Send` `mut state` into the spawned thread - // bellow, which will cause error: - // ``` - // error: future cannot be sent between threads safely - // ``` - { - let mut state = compute.state.lock().unwrap(); - if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for configuration request: {:?}", - state.status.clone() - ); - return Err((msg, StatusCode::PRECONDITION_FAILED)); - } - state.pspec = Some(parsed_spec); - state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed); - drop(state); - info!("set new spec and notified waiters"); - } - - // Spawn a blocking thread to wait for compute to become Running. - // This is needed to do not block the main pool of workers and - // be able to serve other requests while some particular request - // is waiting for compute to finish configuration. - let c = compute.clone(); - task::spawn_blocking(move || { - let mut state = c.state.lock().unwrap(); - while state.status != ComputeStatus::Running { - state = c.state_changed.wait(state).unwrap(); - info!( - "waiting for compute to become Running, current status: {:?}", - state.status - ); - - if state.status == ComputeStatus::Failed { - let err = state.error.as_ref().map_or("unknown error", |x| x); - let msg = format!("compute configuration failed: {:?}", err); - return Err((msg, StatusCode::INTERNAL_SERVER_ERROR)); - } - } - - Ok(()) - }) - .await - .unwrap()?; - - // Return current compute state if everything went well. 
- let state = compute.state.lock().unwrap().clone(); - let status_response = status_response_from_state(&state); - Ok(serde_json::to_string(&status_response).unwrap()) - } else { - Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST)) - } -} - -fn render_json_error(e: &str, status: StatusCode) -> Response { - let error = GenericAPIError { - error: e.to_string(), - }; - Response::builder() - .status(status) - .header(CONTENT_TYPE, "application/json") - .body(Body::from(serde_json::to_string(&error).unwrap())) - .unwrap() -} - -fn render_json(body: Body) -> Response { - Response::builder() - .header(CONTENT_TYPE, "application/json") - .body(body) - .unwrap() -} - -fn render_plain(body: Body) -> Response { - Response::builder() - .header(CONTENT_TYPE, "text/plain") - .body(body) - .unwrap() -} - -async fn handle_terminate_request(compute: &Arc) -> Result<(), (String, StatusCode)> { - { - let mut state = compute.state.lock().unwrap(); - if state.status == ComputeStatus::Terminated { - return Ok(()); - } - if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running { - let msg = format!( - "invalid compute status for termination request: {}", - state.status - ); - return Err((msg, StatusCode::PRECONDITION_FAILED)); - } - state.set_status(ComputeStatus::TerminationPending, &compute.state_changed); - drop(state); - } - - forward_termination_signal(); - info!("sent signal and notified waiters"); - - // Spawn a blocking thread to wait for compute to become Terminated. - // This is needed to do not block the main pool of workers and - // be able to serve other requests while some particular request - // is waiting for compute to finish configuration. - let c = compute.clone(); - task::spawn_blocking(move || { - let mut state = c.state.lock().unwrap(); - while state.status != ComputeStatus::Terminated { - state = c.state_changed.wait(state).unwrap(); - info!( - "waiting for compute to become {}, current status: {:?}", - ComputeStatus::Terminated, - state.status - ); - } - - Ok(()) - }) - .await - .unwrap()?; - info!("terminated Postgres"); - Ok(()) -} - -// Main Hyper HTTP server function that runs it and blocks waiting on it forever. -#[tokio::main] -async fn serve(port: u16, state: Arc) { - // this usually binds to both IPv4 and IPv6 on linux - // see e.g. https://github.com/rust-lang/rust/pull/34440 - let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port); - - let make_service = make_service_fn(move |_conn| { - let state = state.clone(); - async move { - Ok::<_, Infallible>(service_fn(move |req: Request| { - let state = state.clone(); - async move { - Ok::<_, Infallible>( - // NOTE: We include the URI path in the string. It - // doesn't contain any variable parts or sensitive - // information in this API. - tracing_utils::http::tracing_handler( - req, - |req| routes(req, &state), - OtelName::UriPath, - ) - .await, - ) - } - })) - } - }); - - info!("starting HTTP server on {}", addr); - - let server = Server::bind(&addr).serve(make_service); - - // Run this server forever - if let Err(e) = server.await { - error!("server error: {}", e); - } -} - -/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`. -pub fn launch_http_server(port: u16, state: &Arc) -> Result> { - let state = Arc::clone(state); - - Ok(thread::Builder::new() - .name("http-endpoint".into()) - .spawn(move || serve(port, state))?) 
-} diff --git a/compute_tools/src/http/extract/json.rs b/compute_tools/src/http/extract/json.rs new file mode 100644 index 0000000000..104cc25d5f --- /dev/null +++ b/compute_tools/src/http/extract/json.rs @@ -0,0 +1,44 @@ +use std::ops::{Deref, DerefMut}; + +use axum::extract::{rejection::JsonRejection, FromRequest, Request}; +use compute_api::responses::GenericAPIError; +use http::StatusCode; + +/// Custom `Json` extractor, so that we can format errors into +/// `JsonResponse`. +#[derive(Debug, Clone, Copy, Default)] +pub(crate) struct Json(pub T); + +impl FromRequest for Json +where + axum::Json: FromRequest, + S: Send + Sync, +{ + type Rejection = (StatusCode, axum::Json); + + async fn from_request(req: Request, state: &S) -> Result { + match axum::Json::::from_request(req, state).await { + Ok(value) => Ok(Self(value.0)), + Err(rejection) => Err(( + rejection.status(), + axum::Json(GenericAPIError { + error: rejection.body_text().to_lowercase(), + }), + )), + } + } +} + +impl Deref for Json { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Json { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/compute_tools/src/http/extract/mod.rs b/compute_tools/src/http/extract/mod.rs new file mode 100644 index 0000000000..1b690e444d --- /dev/null +++ b/compute_tools/src/http/extract/mod.rs @@ -0,0 +1,7 @@ +pub(crate) mod json; +pub(crate) mod path; +pub(crate) mod query; + +pub(crate) use json::Json; +pub(crate) use path::Path; +pub(crate) use query::Query; diff --git a/compute_tools/src/http/extract/path.rs b/compute_tools/src/http/extract/path.rs new file mode 100644 index 0000000000..09637a96a4 --- /dev/null +++ b/compute_tools/src/http/extract/path.rs @@ -0,0 +1,44 @@ +use std::ops::{Deref, DerefMut}; + +use axum::extract::{rejection::PathRejection, FromRequestParts}; +use compute_api::responses::GenericAPIError; +use http::{request::Parts, StatusCode}; + +/// Custom `Path` extractor, so that we can format errors into +/// `JsonResponse`. +#[derive(Debug, Clone, Copy, Default)] +pub(crate) struct Path(pub T); + +impl FromRequestParts for Path +where + axum::extract::Path: FromRequestParts, + S: Send + Sync, +{ + type Rejection = (StatusCode, axum::Json); + + async fn from_request_parts(parts: &mut Parts, state: &S) -> Result { + match axum::extract::Path::::from_request_parts(parts, state).await { + Ok(value) => Ok(Self(value.0)), + Err(rejection) => Err(( + rejection.status(), + axum::Json(GenericAPIError { + error: rejection.body_text().to_ascii_lowercase(), + }), + )), + } + } +} + +impl Deref for Path { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Path { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/compute_tools/src/http/extract/query.rs b/compute_tools/src/http/extract/query.rs new file mode 100644 index 0000000000..9dec3642cf --- /dev/null +++ b/compute_tools/src/http/extract/query.rs @@ -0,0 +1,44 @@ +use std::ops::{Deref, DerefMut}; + +use axum::extract::{rejection::QueryRejection, FromRequestParts}; +use compute_api::responses::GenericAPIError; +use http::{request::Parts, StatusCode}; + +/// Custom `Query` extractor, so that we can format errors into +/// `JsonResponse`. 
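// A usage sketch for the custom extractors introduced above: handlers take the
// crate-local Json/Path/Query wrappers instead of the stock axum extractors, so
// rejections (malformed JSON, bad path or query parameters) come back in the same
// GenericAPIError JSON shape as the rest of the API. The request type and route are
// illustrative and assume the crate's extract::Json is in scope:
use axum::{response::IntoResponse, routing::post, Router};
use serde::Deserialize;

#[derive(Deserialize)]
struct EchoRequest {
    message: String,
}

async fn echo(Json(body): Json<EchoRequest>) -> impl IntoResponse {
    // A malformed body never reaches this point; the extractor has already replied
    // with the rejection's status code and an {"error": "..."} body.
    body.message
}

fn example_router() -> Router {
    Router::new().route("/echo", post(echo))
}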
+#[derive(Debug, Clone, Copy, Default)] +pub(crate) struct Query(pub T); + +impl FromRequestParts for Query +where + axum::extract::Query: FromRequestParts, + S: Send + Sync, +{ + type Rejection = (StatusCode, axum::Json); + + async fn from_request_parts(parts: &mut Parts, state: &S) -> Result { + match axum::extract::Query::::from_request_parts(parts, state).await { + Ok(value) => Ok(Self(value.0)), + Err(rejection) => Err(( + rejection.status(), + axum::Json(GenericAPIError { + error: rejection.body_text().to_ascii_lowercase(), + }), + )), + } + } +} + +impl Deref for Query { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Query { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index e5fdf85eed..93eb6ef5b7 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -1 +1,54 @@ -pub mod api; +use axum::{body::Body, response::Response}; +use compute_api::responses::{ComputeStatus, GenericAPIError}; +use http::{header::CONTENT_TYPE, StatusCode}; +use serde::Serialize; +use tracing::error; + +mod extract; +mod routes; +pub mod server; + +/// Convenience response builder for JSON responses +struct JsonResponse; + +impl JsonResponse { + /// Helper for actually creating a response + fn create_response(code: StatusCode, body: impl Serialize) -> Response { + Response::builder() + .status(code) + .header(CONTENT_TYPE.as_str(), "application/json") + .body(Body::from(serde_json::to_string(&body).unwrap())) + .unwrap() + } + + /// Create a successful error response + pub(self) fn success(code: StatusCode, body: impl Serialize) -> Response { + assert!({ + let code = code.as_u16(); + + (200..300).contains(&code) + }); + + Self::create_response(code, body) + } + + /// Create an error response + pub(self) fn error(code: StatusCode, error: impl ToString) -> Response { + assert!(code.as_u16() >= 400); + + let message = error.to_string(); + error!(message); + + Self::create_response(code, &GenericAPIError { error: message }) + } + + /// Create an error response related to the compute being in an invalid state + pub(self) fn invalid_status(status: ComputeStatus) -> Response { + Self::create_response( + StatusCode::PRECONDITION_FAILED, + &GenericAPIError { + error: format!("invalid compute status: {status}"), + }, + ) + } +} diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 7b9a62c545..bbdb7d0917 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -37,7 +37,7 @@ paths: schema: $ref: "#/components/schemas/ComputeMetrics" - /metrics + /metrics: get: tags: - Info @@ -68,35 +68,6 @@ paths: schema: $ref: "#/components/schemas/ComputeInsights" - /installed_extensions: - get: - tags: - - Info - summary: Get installed extensions. - description: "" - operationId: getInstalledExtensions - responses: - 200: - description: List of installed extensions - content: - application/json: - schema: - $ref: "#/components/schemas/InstalledExtensions" - /info: - get: - tags: - - Info - summary: Get info about the compute pod / VM. 
- description: "" - operationId: getInfo - responses: - 200: - description: Info - content: - application/json: - schema: - $ref: "#/components/schemas/Info" - /dbs_and_roles: get: tags: @@ -537,12 +508,14 @@ components: properties: extname: type: string - versions: - type: array + version: + type: string items: type: string n_databases: type: integer + owned_by_superuser: + type: integer SetRoleGrantsRequest: type: object diff --git a/compute_tools/src/http/routes/check_writability.rs b/compute_tools/src/http/routes/check_writability.rs new file mode 100644 index 0000000000..d7feb055e9 --- /dev/null +++ b/compute_tools/src/http/routes/check_writability.rs @@ -0,0 +1,20 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::responses::ComputeStatus; +use http::StatusCode; + +use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse}; + +/// Check that the compute is currently running. +pub(in crate::http) async fn is_writable(State(compute): State>) -> Response { + let status = compute.get_status(); + if status != ComputeStatus::Running { + return JsonResponse::invalid_status(status); + } + + match check_writability(&compute).await { + Ok(_) => JsonResponse::success(StatusCode::OK, true), + Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), + } +} diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs new file mode 100644 index 0000000000..2546cbc344 --- /dev/null +++ b/compute_tools/src/http/routes/configure.rs @@ -0,0 +1,91 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::{ + requests::ConfigurationRequest, + responses::{ComputeStatus, ComputeStatusResponse}, +}; +use http::StatusCode; +use tokio::task; +use tracing::info; + +use crate::{ + compute::{ComputeNode, ParsedSpec}, + http::{extract::Json, JsonResponse}, +}; + +// Accept spec in JSON format and request compute configuration. If anything +// goes wrong after we set the compute status to `ConfigurationPending` and +// update compute state with new spec, we basically leave compute in the +// potentially wrong state. That said, it's control-plane's responsibility to +// watch compute state after reconfiguration request and to clean restart in +// case of errors. +pub(in crate::http) async fn configure( + State(compute): State>, + request: Json, +) -> Response { + if !compute.live_config_allowed { + return JsonResponse::error( + StatusCode::PRECONDITION_FAILED, + "live configuration is not allowed for this compute node".to_string(), + ); + } + + let pspec = match ParsedSpec::try_from(request.spec.clone()) { + Ok(p) => p, + Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e), + }; + + // XXX: wrap state update under lock in a code block. Otherwise, we will try + // to `Send` `mut state` into the spawned thread bellow, which will cause + // the following rustc error: + // + // error: future cannot be sent between threads safely + { + let mut state = compute.state.lock().unwrap(); + if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) { + return JsonResponse::invalid_status(state.status); + } + + state.pspec = Some(pspec); + state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed); + drop(state); + } + + // Spawn a blocking thread to wait for compute to become Running. 
This is + // needed to do not block the main pool of workers and be able to serve + // other requests while some particular request is waiting for compute to + // finish configuration. + let c = compute.clone(); + let completed = task::spawn_blocking(move || { + let mut state = c.state.lock().unwrap(); + while state.status != ComputeStatus::Running { + state = c.state_changed.wait(state).unwrap(); + info!( + "waiting for compute to become {}, current status: {}", + ComputeStatus::Running, + state.status + ); + + if state.status == ComputeStatus::Failed { + let err = state.error.as_ref().map_or("unknown error", |x| x); + let msg = format!("compute configuration failed: {:?}", err); + return Err(msg); + } + } + + Ok(()) + }) + .await + .unwrap(); + + if let Err(e) = completed { + return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e); + } + + // Return current compute state if everything went well. + let state = compute.state.lock().unwrap().clone(); + let body = ComputeStatusResponse::from(&state); + + JsonResponse::success(StatusCode::OK, body) +} diff --git a/compute_tools/src/http/routes/database_schema.rs b/compute_tools/src/http/routes/database_schema.rs new file mode 100644 index 0000000000..fd716272dc --- /dev/null +++ b/compute_tools/src/http/routes/database_schema.rs @@ -0,0 +1,34 @@ +use std::sync::Arc; + +use axum::{body::Body, extract::State, response::Response}; +use http::{header::CONTENT_TYPE, StatusCode}; +use serde::Deserialize; + +use crate::{ + catalog::{get_database_schema, SchemaDumpError}, + compute::ComputeNode, + http::{extract::Query, JsonResponse}, +}; + +#[derive(Debug, Clone, Deserialize)] +pub(in crate::http) struct DatabaseSchemaParams { + database: String, +} + +/// Get a schema dump of the requested database. +pub(in crate::http) async fn get_schema_dump( + params: Query, + State(compute): State>, +) -> Response { + match get_database_schema(&compute, ¶ms.database).await { + Ok(schema) => Response::builder() + .status(StatusCode::OK) + .header(CONTENT_TYPE.as_str(), "application/json") + .body(Body::from_stream(schema)) + .unwrap(), + Err(SchemaDumpError::DatabaseDoesNotExist) => { + JsonResponse::error(StatusCode::NOT_FOUND, SchemaDumpError::DatabaseDoesNotExist) + } + Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), + } +} diff --git a/compute_tools/src/http/routes/dbs_and_roles.rs b/compute_tools/src/http/routes/dbs_and_roles.rs new file mode 100644 index 0000000000..4843c3fab4 --- /dev/null +++ b/compute_tools/src/http/routes/dbs_and_roles.rs @@ -0,0 +1,16 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use http::StatusCode; + +use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse}; + +/// Get the databases and roles from the compute. 
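// The configure handler above (like the terminate handler later in this diff) waits for
// a compute status change by blocking on a std::sync::Condvar inside
// tokio::task::spawn_blocking, which keeps the async worker threads free while a single
// request waits. A minimal sketch of the pattern, with an illustrative
// (Mutex<bool>, Condvar) pair standing in for ComputeNode::state / state_changed:
use std::sync::{Arc, Condvar, Mutex};

async fn wait_until_ready(state: Arc<(Mutex<bool>, Condvar)>) {
    tokio::task::spawn_blocking(move || {
        let (lock, cvar) = &*state;
        let mut ready = lock.lock().unwrap();
        while !*ready {
            // Condvar::wait atomically releases the lock and re-acquires it on wake-up.
            ready = cvar.wait(ready).unwrap();
        }
    })
    .await
    .expect("blocking waiter panicked");
}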
+pub(in crate::http) async fn get_catalog_objects( + State(compute): State>, +) -> Response { + match get_dbs_and_roles(&compute).await { + Ok(catalog_objects) => JsonResponse::success(StatusCode::OK, catalog_objects), + Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), + } +} diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs new file mode 100644 index 0000000000..5cc9b6d277 --- /dev/null +++ b/compute_tools/src/http/routes/extension_server.rs @@ -0,0 +1,68 @@ +use std::sync::Arc; + +use axum::{ + extract::State, + response::{IntoResponse, Response}, +}; +use http::StatusCode; +use serde::Deserialize; + +use crate::{ + compute::ComputeNode, + http::{ + extract::{Path, Query}, + JsonResponse, + }, +}; + +#[derive(Debug, Clone, Deserialize)] +pub(in crate::http) struct ExtensionServerParams { + #[serde(default)] + is_library: bool, +} + +/// Download a remote extension. +pub(in crate::http) async fn download_extension( + Path(filename): Path, + params: Query, + State(compute): State>, +) -> Response { + // Don't even try to download extensions if no remote storage is configured + if compute.ext_remote_storage.is_none() { + return JsonResponse::error( + StatusCode::PRECONDITION_FAILED, + "remote storage is not configured", + ); + } + + let ext = { + let state = compute.state.lock().unwrap(); + let pspec = state.pspec.as_ref().unwrap(); + let spec = &pspec.spec; + + let remote_extensions = match spec.remote_extensions.as_ref() { + Some(r) => r, + None => { + return JsonResponse::error( + StatusCode::CONFLICT, + "information about remote extensions is unavailable", + ); + } + }; + + remote_extensions.get_ext( + &filename, + params.is_library, + &compute.build_tag, + &compute.pgversion, + ) + }; + + match ext { + Ok((ext_name, ext_path)) => match compute.download_extension(ext_name, ext_path).await { + Ok(_) => StatusCode::OK.into_response(), + Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e), + }, + Err(e) => JsonResponse::error(StatusCode::NOT_FOUND, e), + } +} diff --git a/compute_tools/src/http/routes/extensions.rs b/compute_tools/src/http/routes/extensions.rs new file mode 100644 index 0000000000..1fc03b9109 --- /dev/null +++ b/compute_tools/src/http/routes/extensions.rs @@ -0,0 +1,45 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::{ + requests::ExtensionInstallRequest, + responses::{ComputeStatus, ExtensionInstallResponse}, +}; +use http::StatusCode; + +use crate::{ + compute::ComputeNode, + http::{extract::Json, JsonResponse}, +}; + +/// Install a extension. 
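// The download route above is registered on the internal server (see the router setup in
// http/server.rs later in this diff), so only local processes such as the neon extension
// and local_proxy are expected to call it. A hedged sketch of such a call; the port and
// file name are illustrative, and ?is_library=true would be appended when the requested
// object is a shared library rather than extension support files:
async fn request_extension_download() -> reqwest::Result<reqwest::StatusCode> {
    let resp = reqwest::Client::new()
        .post("http://127.0.0.1:3081/extension_server/anon.control")
        .send()
        .await?;
    Ok(resp.status())
}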
+pub(in crate::http) async fn install_extension( + State(compute): State>, + request: Json, +) -> Response { + let status = compute.get_status(); + if status != ComputeStatus::Running { + return JsonResponse::invalid_status(status); + } + + match compute + .install_extension( + &request.extension, + &request.database, + request.version.to_string(), + ) + .await + { + Ok(version) => JsonResponse::success( + StatusCode::CREATED, + Some(ExtensionInstallResponse { + extension: request.extension.clone(), + version, + }), + ), + Err(e) => JsonResponse::error( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to install extension: {e}"), + ), + } +} diff --git a/compute_tools/src/http/routes/failpoints.rs b/compute_tools/src/http/routes/failpoints.rs new file mode 100644 index 0000000000..836417d784 --- /dev/null +++ b/compute_tools/src/http/routes/failpoints.rs @@ -0,0 +1,49 @@ +use axum::response::{IntoResponse, Response}; +use http::StatusCode; +use serde::{Deserialize, Serialize}; +use tracing::info; +use utils::failpoint_support::apply_failpoint; + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} + +use crate::http::{extract::Json, JsonResponse}; + +/// Configure failpoints for testing purposes. +pub(in crate::http) async fn configure_failpoints( + failpoints: Json, +) -> Response { + if !fail::has_failpoints() { + return JsonResponse::error( + StatusCode::PRECONDITION_FAILED, + "Cannot manage failpoints because neon was compiled without failpoints support", + ); + } + + for fp in &*failpoints { + info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = apply_failpoint(&fp.name, &fp.actions); + + if let Err(e) = cfg_result { + return JsonResponse::error( + StatusCode::BAD_REQUEST, + format!("failed to configure failpoints: {e}"), + ); + } + } + + StatusCode::OK.into_response() +} diff --git a/compute_tools/src/http/routes/grants.rs b/compute_tools/src/http/routes/grants.rs new file mode 100644 index 0000000000..3f67f011e5 --- /dev/null +++ b/compute_tools/src/http/routes/grants.rs @@ -0,0 +1,48 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::{ + requests::SetRoleGrantsRequest, + responses::{ComputeStatus, SetRoleGrantsResponse}, +}; +use http::StatusCode; + +use crate::{ + compute::ComputeNode, + http::{extract::Json, JsonResponse}, +}; + +/// Add grants for a role. 
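// The /failpoints route above is only registered when compute_ctl is built with the
// "testing" feature (see the internal router in http/server.rs later in this diff). A
// sketch of a request body a test could send; "compute-migration" refers to the fail
// point added in migration.rs further down, "return(3)" uses the fail crate's fail::cfg
// action syntax, and the migration id 3 is illustrative:
fn example_failpoint_request() -> ConfigureFailpointsRequest {
    vec![FailpointConfig {
        name: "compute-migration".to_string(),
        // With return(3), the fail_point! closure receives Some("3"), so the migration
        // runner fails exactly when it reaches migration id 3.
        actions: "return(3)".to_string(),
    }]
}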
+pub(in crate::http) async fn add_grant( + State(compute): State>, + request: Json, +) -> Response { + let status = compute.get_status(); + if status != ComputeStatus::Running { + return JsonResponse::invalid_status(status); + } + + match compute + .set_role_grants( + &request.database, + &request.schema, + &request.privileges, + &request.role, + ) + .await + { + Ok(()) => JsonResponse::success( + StatusCode::CREATED, + Some(SetRoleGrantsResponse { + database: request.database.clone(), + schema: request.schema.clone(), + role: request.role.clone(), + privileges: request.privileges.clone(), + }), + ), + Err(e) => JsonResponse::error( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to grant role privileges to the schema: {e}"), + ), + } +} diff --git a/compute_tools/src/http/routes/insights.rs b/compute_tools/src/http/routes/insights.rs new file mode 100644 index 0000000000..6b03a461c3 --- /dev/null +++ b/compute_tools/src/http/routes/insights.rs @@ -0,0 +1,18 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use compute_api::responses::ComputeStatus; +use http::StatusCode; + +use crate::{compute::ComputeNode, http::JsonResponse}; + +/// Collect current Postgres usage insights. +pub(in crate::http) async fn get_insights(State(compute): State>) -> Response { + let status = compute.get_status(); + if status != ComputeStatus::Running { + return JsonResponse::invalid_status(status); + } + + let insights = compute.collect_insights().await; + JsonResponse::success(StatusCode::OK, insights) +} diff --git a/compute_tools/src/http/routes/metrics.rs b/compute_tools/src/http/routes/metrics.rs new file mode 100644 index 0000000000..13150a7588 --- /dev/null +++ b/compute_tools/src/http/routes/metrics.rs @@ -0,0 +1,31 @@ +use axum::{body::Body, response::Response}; +use http::header::CONTENT_TYPE; +use http::StatusCode; +use metrics::proto::MetricFamily; +use metrics::{Encoder, TextEncoder}; + +use crate::{http::JsonResponse, metrics::collect}; + +/// Expose Prometheus metrics. +pub(in crate::http) async fn get_metrics() -> Response { + // When we call TextEncoder::encode() below, it will immediately return an + // error if a metric family has no metrics, so we need to preemptively + // filter out metric families with no metrics. + let metrics = collect() + .into_iter() + .filter(|m| !m.get_metric().is_empty()) + .collect::>(); + + let encoder = TextEncoder::new(); + let mut buffer = vec![]; + + if let Err(e) = encoder.encode(&metrics, &mut buffer) { + return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e); + } + + Response::builder() + .status(StatusCode::OK) + .header(CONTENT_TYPE, encoder.format_type()) + .body(Body::from(buffer)) + .unwrap() +} diff --git a/compute_tools/src/http/routes/metrics_json.rs b/compute_tools/src/http/routes/metrics_json.rs new file mode 100644 index 0000000000..0709db5011 --- /dev/null +++ b/compute_tools/src/http/routes/metrics_json.rs @@ -0,0 +1,12 @@ +use std::sync::Arc; + +use axum::{extract::State, response::Response}; +use http::StatusCode; + +use crate::{compute::ComputeNode, http::JsonResponse}; + +/// Get startup metrics. 
+pub(in crate::http) async fn get_metrics(State(compute): State>) -> Response { + let metrics = compute.state.lock().unwrap().metrics.clone(); + JsonResponse::success(StatusCode::OK, metrics) +} diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs new file mode 100644 index 0000000000..a67be7fd5a --- /dev/null +++ b/compute_tools/src/http/routes/mod.rs @@ -0,0 +1,36 @@ +use compute_api::responses::ComputeStatusResponse; + +use crate::compute::ComputeState; + +pub(in crate::http) mod check_writability; +pub(in crate::http) mod configure; +pub(in crate::http) mod database_schema; +pub(in crate::http) mod dbs_and_roles; +pub(in crate::http) mod extension_server; +pub(in crate::http) mod extensions; +pub(in crate::http) mod failpoints; +pub(in crate::http) mod grants; +pub(in crate::http) mod insights; +pub(in crate::http) mod metrics; +pub(in crate::http) mod metrics_json; +pub(in crate::http) mod status; +pub(in crate::http) mod terminate; + +impl From<&ComputeState> for ComputeStatusResponse { + fn from(state: &ComputeState) -> Self { + ComputeStatusResponse { + start_time: state.start_time, + tenant: state + .pspec + .as_ref() + .map(|pspec| pspec.tenant_id.to_string()), + timeline: state + .pspec + .as_ref() + .map(|pspec| pspec.timeline_id.to_string()), + status: state.status, + last_active: state.last_active, + error: state.error.clone(), + } + } +} diff --git a/compute_tools/src/http/routes/status.rs b/compute_tools/src/http/routes/status.rs new file mode 100644 index 0000000000..d64d53a58f --- /dev/null +++ b/compute_tools/src/http/routes/status.rs @@ -0,0 +1,14 @@ +use std::{ops::Deref, sync::Arc}; + +use axum::{extract::State, http::StatusCode, response::Response}; +use compute_api::responses::ComputeStatusResponse; + +use crate::{compute::ComputeNode, http::JsonResponse}; + +/// Retrieve the state of the comute. +pub(in crate::http) async fn get_status(State(compute): State>) -> Response { + let state = compute.state.lock().unwrap(); + let body = ComputeStatusResponse::from(state.deref()); + + JsonResponse::success(StatusCode::OK, body) +} diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs new file mode 100644 index 0000000000..7acd84f236 --- /dev/null +++ b/compute_tools/src/http/routes/terminate.rs @@ -0,0 +1,58 @@ +use std::sync::Arc; + +use axum::{ + extract::State, + response::{IntoResponse, Response}, +}; +use compute_api::responses::ComputeStatus; +use http::StatusCode; +use tokio::task; +use tracing::info; + +use crate::{ + compute::{forward_termination_signal, ComputeNode}, + http::JsonResponse, +}; + +/// Terminate the compute. +pub(in crate::http) async fn terminate(State(compute): State>) -> Response { + { + let mut state = compute.state.lock().unwrap(); + if state.status == ComputeStatus::Terminated { + return StatusCode::CREATED.into_response(); + } + + if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) { + return JsonResponse::invalid_status(state.status); + } + + state.set_status(ComputeStatus::TerminationPending, &compute.state_changed); + drop(state); + } + + forward_termination_signal(); + info!("sent signal and notified waiters"); + + // Spawn a blocking thread to wait for compute to become Terminated. + // This is needed to do not block the main pool of workers and + // be able to serve other requests while some particular request + // is waiting for compute to finish configuration. 
+ let c = compute.clone(); + task::spawn_blocking(move || { + let mut state = c.state.lock().unwrap(); + while state.status != ComputeStatus::Terminated { + state = c.state_changed.wait(state).unwrap(); + info!( + "waiting for compute to become {}, current status: {:?}", + ComputeStatus::Terminated, + state.status + ); + } + }) + .await + .unwrap(); + + info!("terminated Postgres"); + + StatusCode::OK.into_response() +} diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs new file mode 100644 index 0000000000..a523ecd96f --- /dev/null +++ b/compute_tools/src/http/server.rs @@ -0,0 +1,211 @@ +use std::{ + fmt::Display, + net::{IpAddr, Ipv6Addr, SocketAddr}, + sync::Arc, + time::Duration, +}; + +use anyhow::Result; +use axum::{ + extract::Request, + middleware::{self, Next}, + response::{IntoResponse, Response}, + routing::{get, post}, + Router, +}; +use http::StatusCode; +use tokio::net::TcpListener; +use tower::ServiceBuilder; +use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer}; +use tracing::{debug, error, info, Span}; +use uuid::Uuid; + +use super::routes::{ + check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, + grants, insights, metrics, metrics_json, status, terminate, +}; +use crate::compute::ComputeNode; + +const X_REQUEST_ID: &str = "x-request-id"; + +/// `compute_ctl` has two servers: internal and external. The internal server +/// binds to the loopback interface and handles communication from clients on +/// the compute. The external server is what receives communication from the +/// control plane, the metrics scraper, etc. We make the distinction because +/// certain routes in `compute_ctl` only need to be exposed to local processes +/// like Postgres via the neon extension and local_proxy. 
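// Building on the internal/external split described above: each variant gets its own
// port and its own set of routes (see the From<Server> for Router impl and Server::launch
// further down in this file). A hedged sketch of the wiring, with illustrative parameter
// names; the real ports come from compute_ctl's configuration:
fn launch_http_servers(compute: &Arc<ComputeNode>, external_port: u16, internal_port: u16) {
    // The external server is what the control plane and metrics scrapers talk to.
    Server::External(external_port).launch(compute);
    // The internal server handles requests from processes on the compute itself.
    Server::Internal(internal_port).launch(compute);
}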
+#[derive(Clone, Copy, Debug)] +pub enum Server { + Internal(u16), + External(u16), +} + +impl Display for Server { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Server::Internal(_) => f.write_str("internal"), + Server::External(_) => f.write_str("external"), + } + } +} + +impl From for Router> { + fn from(server: Server) -> Self { + let mut router = Router::>::new(); + + router = match server { + Server::Internal(_) => { + router = router + .route( + "/extension_server/{*filename}", + post(extension_server::download_extension), + ) + .route("/extensions", post(extensions::install_extension)) + .route("/grants", post(grants::add_grant)); + + // Add in any testing support + if cfg!(feature = "testing") { + use super::routes::failpoints; + + router = router.route("/failpoints", post(failpoints::configure_failpoints)); + } + + router + } + Server::External(_) => router + .route("/check_writability", post(check_writability::is_writable)) + .route("/configure", post(configure::configure)) + .route("/database_schema", get(database_schema::get_schema_dump)) + .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) + .route("/insights", get(insights::get_insights)) + .route("/metrics", get(metrics::get_metrics)) + .route("/metrics.json", get(metrics_json::get_metrics)) + .route("/status", get(status::get_status)) + .route("/terminate", post(terminate::terminate)), + }; + + router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer( + ServiceBuilder::new() + // Add this middleware since we assume the request ID exists + .layer(middleware::from_fn(maybe_add_request_id_header)) + .layer( + TraceLayer::new_for_http() + .on_request(|request: &http::Request<_>, _span: &Span| { + let request_id = request + .headers() + .get(X_REQUEST_ID) + .unwrap() + .to_str() + .unwrap(); + + match request.uri().path() { + "/metrics" => { + debug!(%request_id, "{} {}", request.method(), request.uri()) + } + _ => info!(%request_id, "{} {}", request.method(), request.uri()), + }; + }) + .on_response( + |response: &http::Response<_>, latency: Duration, _span: &Span| { + let request_id = response + .headers() + .get(X_REQUEST_ID) + .unwrap() + .to_str() + .unwrap(); + + info!( + %request_id, + code = response.status().as_u16(), + latency = latency.as_millis() + ) + }, + ), + ) + .layer(PropagateRequestIdLayer::x_request_id()), + ) + } +} + +impl Server { + async fn handle_404() -> impl IntoResponse { + StatusCode::NOT_FOUND + } + + async fn handle_405() -> impl IntoResponse { + StatusCode::METHOD_NOT_ALLOWED + } + + async fn listener(&self) -> Result { + let addr = SocketAddr::new(self.ip(), self.port()); + let listener = TcpListener::bind(&addr).await?; + + Ok(listener) + } + + fn ip(&self) -> IpAddr { + match self { + // TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners + // allow binding to localhost + Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), + Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED), + } + } + + fn port(self) -> u16 { + match self { + Server::Internal(port) => port, + Server::External(port) => port, + } + } + + async fn serve(self, compute: Arc) { + let listener = self.listener().await.unwrap_or_else(|e| { + // If we can't bind, the compute cannot operate correctly + panic!( + "failed to bind the compute_ctl {} HTTP server to {}: {}", + self, + SocketAddr::new(self.ip(), self.port()), + e + ); + }); + + if tracing::enabled!(tracing::Level::INFO) { + let local_addr = match 
listener.local_addr() { + Ok(local_addr) => local_addr, + Err(_) => SocketAddr::new(self.ip(), self.port()), + }; + + info!( + "compute_ctl {} HTTP server listening at {}", + self, local_addr + ); + } + + let router = Router::from(self).with_state(compute); + + if let Err(e) = axum::serve(listener, router).await { + error!("compute_ctl {} HTTP server error: {}", self, e); + } + } + + pub fn launch(self, compute: &Arc) { + let state = Arc::clone(compute); + + info!("Launching the {} server", self); + + tokio::spawn(self.serve(state)); + } +} + +/// This middleware function allows compute_ctl to generate its own request ID +/// if one isn't supplied. The control plane will always send one as a UUID. The +/// neon Postgres extension on the other hand does not send one. +async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { + let headers = request.headers_mut(); + if headers.get(X_REQUEST_ID).is_none() { + headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); + } + + next.run(request).await +} diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 5f62f08858..173dbf40b0 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,14 +1,10 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; -use metrics::proto::MetricFamily; use std::collections::HashMap; -use std::collections::HashSet; use anyhow::Result; use postgres::{Client, NoTls}; -use metrics::core::Collector; -use metrics::{register_uint_gauge_vec, UIntGaugeVec}; -use once_cell::sync::Lazy; +use crate::metrics::INSTALLED_EXTENSIONS; /// We don't reuse get_existing_dbs() just for code clarity /// and to make database listing query here more explicit. @@ -38,65 +34,68 @@ fn list_dbs(client: &mut Client) -> Result> { /// Connect to every database (see list_dbs above) and get the list of installed extensions. /// /// Same extension can be installed in multiple databases with different versions, -/// we only keep the highest and lowest version across all databases. +/// so we report a separate metric (number of databases where it is installed) +/// for each extension version. pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result { conf.application_name("compute_ctl:get_installed_extensions"); let mut client = conf.connect(NoTls)?; - let databases: Vec = list_dbs(&mut client)?; - let mut extensions_map: HashMap = HashMap::new(); + let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new(); for db in databases.iter() { conf.dbname(db); let mut db_client = conf.connect(NoTls)?; - let extensions: Vec<(String, String)> = db_client + let extensions: Vec<(String, String, i32)> = db_client .query( - "SELECT extname, extversion FROM pg_catalog.pg_extension;", + "SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension", &[], )? 
.iter() - .map(|row| (row.get("extname"), row.get("extversion"))) + .map(|row| { + ( + row.get("extname"), + row.get("extversion"), + row.get("extowner"), + ) + }) .collect(); - for (extname, v) in extensions.iter() { + for (extname, v, extowner) in extensions.iter() { let version = v.to_string(); - // increment the number of databases where the version of extension is installed - INSTALLED_EXTENSIONS - .with_label_values(&[extname, &version]) - .inc(); + // check if the extension is owned by superuser + // 10 is the oid of superuser + let owned_by_superuser = if *extowner == 10 { "1" } else { "0" }; extensions_map - .entry(extname.to_string()) + .entry(( + extname.to_string(), + version.clone(), + owned_by_superuser.to_string(), + )) .and_modify(|e| { - e.versions.insert(version.clone()); // count the number of databases where the extension is installed e.n_databases += 1; }) .or_insert(InstalledExtension { extname: extname.to_string(), - versions: HashSet::from([version.clone()]), + version: version.clone(), n_databases: 1, + owned_by_superuser: owned_by_superuser.to_string(), }); } } - let res = InstalledExtensions { + for (key, ext) in extensions_map.iter() { + let (extname, version, owned_by_superuser) = key; + let n_databases = ext.n_databases as u64; + + INSTALLED_EXTENSIONS + .with_label_values(&[extname, version, owned_by_superuser]) + .set(n_databases); + } + + Ok(InstalledExtensions { extensions: extensions_map.into_values().collect(), - }; - - Ok(res) -} - -static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { - register_uint_gauge_vec!( - "compute_installed_extensions", - "Number of databases where the version of extension is installed", - &["extension_name", "version"] - ) - .expect("failed to define a metric") -}); - -pub fn collect() -> Vec { - INSTALLED_EXTENSIONS.collect() + }) } diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index ee4cf2dfa5..b08df22134 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -3,8 +3,6 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -extern crate hyper0 as hyper; - pub mod checker; pub mod config; pub mod configurator; @@ -18,6 +16,7 @@ pub mod extension_server; pub mod installed_extensions; pub mod local_proxy; pub mod lsn_lease; +pub mod metrics; mod migration; pub mod monitor; pub mod params; diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 00be5c13f9..3749dfc844 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -11,7 +11,7 @@ use tracing_subscriber::prelude::*; /// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See /// `tracing-utils` package description. 
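// With the installed_extensions rewrite above, compute_installed_extensions becomes a
// gauge keyed by (extension_name, version, owned_by_superuser) and set to the number of
// databases containing that exact combination. For an illustrative compute where
// pg_stat_statements 1.10 is installed by the superuser in two databases, the exported
// series would look like:
//
//   compute_installed_extensions{extension_name="pg_stat_statements",version="1.10",owned_by_superuser="1"} 2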
/// -pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { +pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { // Initialize Logging let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); @@ -22,7 +22,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { .with_writer(std::io::stderr); // Initialize OpenTelemetry - let otlp_layer = tracing_utils::init_tracing_without_runtime("compute_ctl"); + let otlp_layer = tracing_utils::init_tracing("compute_ctl").await; // Put it all together tracing_subscriber::registry() diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs new file mode 100644 index 0000000000..870b294d08 --- /dev/null +++ b/compute_tools/src/metrics.rs @@ -0,0 +1,70 @@ +use metrics::core::Collector; +use metrics::proto::MetricFamily; +use metrics::{register_int_counter_vec, register_uint_gauge_vec, IntCounterVec, UIntGaugeVec}; +use once_cell::sync::Lazy; + +pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "compute_installed_extensions", + "Number of databases where the version of extension is installed", + &["extension_name", "version", "owned_by_superuser"] + ) + .expect("failed to define a metric") +}); + +// Normally, any HTTP API request is described by METHOD (e.g. GET, POST, etc.) + PATH, +// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec. +// And it's fair to call it a 'RPC' (Remote Procedure Call). +pub enum CPlaneRequestRPC { + GetSpec, +} + +impl CPlaneRequestRPC { + pub fn as_str(&self) -> &str { + match self { + CPlaneRequestRPC::GetSpec => "GetSpec", + } + } +} + +pub const UNKNOWN_HTTP_STATUS: &str = "unknown"; + +pub(crate) static CPLANE_REQUESTS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_cplane_requests_total", + "Total number of control plane requests made by compute_ctl by status", + &["rpc", "http_status"] + ) + .expect("failed to define a metric") +}); + +/// Total number of failed database migrations. Per-compute, this is actually a boolean metric, +/// either empty or with a single value (1, migration_id) because we stop at the first failure. +/// Yet, the sum over the fleet will provide the total number of failures. +pub(crate) static DB_MIGRATION_FAILED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_db_migration_failed_total", + "Total number of failed database migrations", + &["migration_id"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "compute_ctl_remote_ext_requests_total", + "Total number of requests made by compute_ctl to download extensions from S3 proxy by status", + // Do not use any labels like extension name yet. + // We can add them later if needed. 
+ &["http_status"] + ) + .expect("failed to define a metric") +}); + +pub fn collect() -> Vec { + let mut metrics = INSTALLED_EXTENSIONS.collect(); + metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); + metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); + metrics.extend(DB_MIGRATION_FAILED.collect()); + metrics +} diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs index 22ab145eda..c5e05822c0 100644 --- a/compute_tools/src/migration.rs +++ b/compute_tools/src/migration.rs @@ -1,13 +1,18 @@ use anyhow::{Context, Result}; -use postgres::Client; -use tracing::info; +use fail::fail_point; +use tokio_postgres::{Client, Transaction}; +use tracing::{error, info}; +use crate::metrics::DB_MIGRATION_FAILED; + +/// Runs a series of migrations on a target database pub(crate) struct MigrationRunner<'m> { client: &'m mut Client, migrations: &'m [&'m str], } impl<'m> MigrationRunner<'m> { + /// Create a new migration runner pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self { // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64 assert!(migrations.len() + 1 < i64::MAX as usize); @@ -15,86 +20,128 @@ impl<'m> MigrationRunner<'m> { Self { client, migrations } } - fn get_migration_id(&mut self) -> Result { - let query = "SELECT id FROM neon_migration.migration_id"; + /// Get the current value neon_migration.migration_id + async fn get_migration_id(&mut self) -> Result { let row = self .client - .query_one(query, &[]) - .context("run_migrations get migration_id")?; + .query_one("SELECT id FROM neon_migration.migration_id", &[]) + .await?; Ok(row.get::<&str, i64>("id")) } - fn update_migration_id(&mut self, migration_id: i64) -> Result<()> { - let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id); + /// Update the neon_migration.migration_id value + /// + /// This function has a fail point called compute-migration, which can be + /// used if you would like to fail the application of a series of migrations + /// at some point. + async fn update_migration_id(txn: &mut Transaction<'_>, migration_id: i64) -> Result<()> { + // We use this fail point in order to check that failing in the + // middle of applying a series of migrations fails in an expected + // manner + if cfg!(feature = "testing") { + let fail = (|| { + fail_point!("compute-migration", |fail_migration_id| { + migration_id == fail_migration_id.unwrap().parse::().unwrap() + }); - self.client - .simple_query(&setval) - .context("run_migrations update id")?; + false + })(); - Ok(()) - } - - fn prepare_migrations(&mut self) -> Result<()> { - let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; - self.client.simple_query(query)?; - - let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; - self.client.simple_query(query)?; - - let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; - self.client.simple_query(query)?; - - let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; - self.client.simple_query(query)?; - - let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; - self.client.simple_query(query)?; - - Ok(()) - } - - pub fn run_migrations(mut self) -> Result<()> { - self.prepare_migrations()?; - - let mut current_migration = self.get_migration_id()? as usize; - while current_migration < self.migrations.len() { - macro_rules! 
migration_id { - ($cm:expr) => { - ($cm + 1) as i64 - }; + if fail { + return Err(anyhow::anyhow!(format!( + "migration {} was configured to fail because of a failpoint", + migration_id + ))); } + } + txn.query( + "UPDATE neon_migration.migration_id SET id = $1", + &[&migration_id], + ) + .await + .with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?; + + Ok(()) + } + + /// Prepare the migrations the target database for handling migrations + async fn prepare_database(&mut self) -> Result<()> { + self.client + .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration") + .await?; + self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?; + self.client + .simple_query( + "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING", + ) + .await?; + self.client + .simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin") + .await?; + self.client + .simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC") + .await?; + + Ok(()) + } + + /// Run an individual migration in a separate transaction block. + async fn run_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> { + let mut txn = client + .transaction() + .await + .with_context(|| format!("begin transaction for migration {migration_id}"))?; + + if migration.starts_with("-- SKIP") { + info!("Skipping migration id={}", migration_id); + + // Even though we are skipping the migration, updating the + // migration ID should help keep logic easy to understand when + // trying to understand the state of a cluster. + Self::update_migration_id(&mut txn, migration_id).await?; + } else { + info!("Running migration id={}:\n{}\n", migration_id, migration); + + txn.simple_query(migration) + .await + .with_context(|| format!("apply migration {migration_id}"))?; + + Self::update_migration_id(&mut txn, migration_id).await?; + } + + txn.commit() + .await + .with_context(|| format!("commit transaction for migration {migration_id}"))?; + + Ok(()) + } + + /// Run the configured set of migrations + pub async fn run_migrations(mut self) -> Result<()> { + self.prepare_database() + .await + .context("prepare database to handle migrations")?; + + let mut current_migration = self.get_migration_id().await? 
as usize; + while current_migration < self.migrations.len() { + // The index lags the migration ID by 1, so the current migration + // ID is also the next index + let migration_id = (current_migration + 1) as i64; let migration = self.migrations[current_migration]; - if migration.starts_with("-- SKIP") { - info!("Skipping migration id={}", migration_id!(current_migration)); - } else { - info!( - "Running migration id={}:\n{}\n", - migration_id!(current_migration), - migration - ); - - self.client - .simple_query("BEGIN") - .context("begin migration")?; - - self.client.simple_query(migration).with_context(|| { - format!( - "run_migrations migration id={}", - migration_id!(current_migration) - ) - })?; - - // Migration IDs start at 1 - self.update_migration_id(migration_id!(current_migration))?; - - self.client - .simple_query("COMMIT") - .context("commit migration")?; - - info!("Finished migration id={}", migration_id!(current_migration)); + match Self::run_migration(self.client, migration_id, migration).await { + Ok(_) => { + info!("Finished migration id={}", migration_id); + } + Err(e) => { + error!("Failed to run migration id={}: {:?}", migration_id, e); + DB_MIGRATION_FAILED + .with_label_values(&[migration_id.to_string().as_str()]) + .inc(); + return Err(e); + } } current_migration += 1; diff --git a/compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql new file mode 100644 index 0000000000..0c81cef1c4 --- /dev/null +++ b/compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql @@ -0,0 +1,9 @@ +DO $$ +DECLARE + bypassrls boolean; +BEGIN + SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser'; + IF NOT bypassrls THEN + RAISE EXCEPTION 'neon_superuser cannot bypass RLS'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/tests/0002-alter_roles.sql b/compute_tools/src/migrations/tests/0002-alter_roles.sql new file mode 100644 index 0000000000..433f7b34f7 --- /dev/null +++ b/compute_tools/src/migrations/tests/0002-alter_roles.sql @@ -0,0 +1,25 @@ +DO $$ +DECLARE + role record; +BEGIN + FOR role IN + SELECT rolname AS name, rolinherit AS inherit + FROM pg_roles + WHERE pg_has_role(rolname, 'neon_superuser', 'member') + LOOP + IF NOT role.inherit THEN + RAISE EXCEPTION '% cannot inherit', quote_ident(role.name); + END IF; + END LOOP; + + FOR role IN + SELECT rolname AS name, rolbypassrls AS bypassrls + FROM pg_roles + WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member') + AND NOT starts_with(rolname, 'pg_') + LOOP + IF role.bypassrls THEN + RAISE EXCEPTION '% can bypass RLS', quote_ident(role.name); + END IF; + END LOOP; +END $$; diff --git a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql new file mode 100644 index 0000000000..b164d61295 --- /dev/null +++ b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql @@ -0,0 +1,10 @@ +DO $$ +BEGIN + IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN + RETURN; + END IF; + + IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN + RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql 
b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql new file mode 100644 index 0000000000..acb8dd417d --- /dev/null +++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql @@ -0,0 +1,19 @@ +DO $$ +DECLARE + monitor record; +BEGIN + SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member, + admin_option AS admin + INTO monitor + FROM pg_auth_members + WHERE roleid = 'pg_monitor'::regrole + AND member = 'pg_monitor'::regrole; + + IF NOT monitor.member THEN + RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor'; + END IF; + + IF NOT monitor.admin THEN + RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql new file mode 100644 index 0000000000..f99101bd65 --- /dev/null +++ b/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql @@ -0,0 +1,2 @@ +-- This test was never written because at the time migration tests were added +-- the accompanying migration was already skipped. diff --git a/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql new file mode 100644 index 0000000000..f99101bd65 --- /dev/null +++ b/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql @@ -0,0 +1,2 @@ +-- This test was never written because at the time migration tests were added +-- the accompanying migration was already skipped. diff --git a/compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql new file mode 100644 index 0000000000..f99101bd65 --- /dev/null +++ b/compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql @@ -0,0 +1,2 @@ +-- This test was never written because at the time migration tests were added +-- the accompanying migration was already skipped. diff --git a/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql new file mode 100644 index 0000000000..f99101bd65 --- /dev/null +++ b/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql @@ -0,0 +1,2 @@ +-- This test was never written because at the time migration tests were added +-- the accompanying migration was already skipped. diff --git a/compute_tools/src/migrations/tests/0009-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/tests/0009-revoke_replication_for_previously_allowed_roles.sql new file mode 100644 index 0000000000..f99101bd65 --- /dev/null +++ b/compute_tools/src/migrations/tests/0009-revoke_replication_for_previously_allowed_roles.sql @@ -0,0 +1,2 @@ +-- This test was never written because at the time migration tests were added +-- the accompanying migration was already skipped.
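The `compute-migration` fail point added to `update_migration_id` lets tests force a failure at a specific migration ID. Below is a sketch of how such a fail point can be driven, assuming the `fail` crate with its `failpoints` feature enabled; the `migration_step` helper is illustrative and stands in for the runner above.

```rust
// Illustrative helper: fails only when the configured fail point names this
// migration ID, mirroring the check inside MigrationRunner::update_migration_id.
fn migration_step(migration_id: i64) -> anyhow::Result<()> {
    // With the action `return(3)`, the closure receives Some("3") and its
    // result becomes the early return value of this function.
    fail::fail_point!("compute-migration", |fail_migration_id| {
        let fail_id: i64 = fail_migration_id.unwrap().parse().unwrap();
        if migration_id == fail_id {
            Err(anyhow::anyhow!(
                "migration {migration_id} was configured to fail because of a failpoint"
            ))
        } else {
            Ok(())
        }
    });
    Ok(())
}

fn main() {
    // Programmatic equivalent of FAILPOINTS=compute-migration=return(3)
    // (the env var form requires fail::FailScenario::setup()).
    fail::cfg("compute-migration", "return(3)").unwrap();
    assert!(migration_step(2).is_ok());
    assert!(migration_step(3).is_err());
}
```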
diff --git a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql new file mode 100644 index 0000000000..af7f50e95d --- /dev/null +++ b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql @@ -0,0 +1,13 @@ +DO $$ +DECLARE + can_execute boolean; +BEGIN + SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute')) + INTO can_execute + FROM pg_proc + WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot') + AND pronamespace = 'pg_catalog'::regnamespace; + IF NOT can_execute THEN + RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql new file mode 100644 index 0000000000..e55dcdc3b6 --- /dev/null +++ b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql @@ -0,0 +1,13 @@ +DO $$ +DECLARE + can_execute boolean; +BEGIN + SELECT has_function_privilege('neon_superuser', oid, 'execute') + INTO can_execute + FROM pg_proc + WHERE proname = 'pg_show_replication_origin_status' + AND pronamespace = 'pg_catalog'::regnamespace; + IF NOT can_execute THEN + RAISE EXCEPTION 'neon_superuser cannot execute pg_show_replication_origin_status'; + END IF; +END $$; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index e03b410699..86fcf99085 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -7,7 +7,6 @@ use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; use std::str::FromStr; -use std::thread::JoinHandle; use std::time::{Duration, Instant}; use anyhow::{bail, Result}; @@ -16,6 +15,7 @@ use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::config::Config; use tokio::io::AsyncBufReadExt; +use tokio::task::JoinHandle; use tokio::time::timeout; use tokio_postgres; use tokio_postgres::NoTls; @@ -477,23 +477,13 @@ pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result Ok(()) } -/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs +/// Spawn a task that will read Postgres logs from `stderr`, join multiline logs /// and send them to the logger. In the future we may also want to add context to /// these logs. 
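The `pg_helpers` change that follows replaces the dedicated log-forwarding thread (which built its own current-thread runtime) with a plain tokio task, so the handle becomes a `tokio::task::JoinHandle<Result<()>>` and any error is surfaced to the caller instead of being logged inside the thread. A sketch of how a caller might consume that handle; the error handling shown is illustrative, not the patch's call site.

```rust
// Sketch: the outer Result is the join error (panic/cancellation),
// the inner one is the I/O error from reading Postgres stderr.
async fn wait_for_log_forwarder(handle: tokio::task::JoinHandle<anyhow::Result<()>>) {
    match handle.await {
        Ok(Ok(())) => tracing::info!("postgres log forwarding finished"),
        Ok(Err(e)) => tracing::error!("error while processing postgres logs: {}", e),
        Err(join_err) => tracing::error!("postgres log forwarding task panicked: {}", join_err),
    }
}
```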
-pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> { - std::thread::spawn(move || { - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to build tokio runtime"); - - let res = runtime.block_on(async move { - let stderr = tokio::process::ChildStderr::from_std(stderr)?; - handle_postgres_logs_async(stderr).await - }); - if let Err(e) = res { - tracing::error!("error while processing postgres logs: {}", e); - } +pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle> { + tokio::spawn(async move { + let stderr = tokio::process::ChildStderr::from_std(stderr)?; + handle_postgres_logs_async(stderr).await }) } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index c7d2deb090..6f28bd9733 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,16 +1,19 @@ use anyhow::{anyhow, bail, Result}; -use postgres::Client; use reqwest::StatusCode; use std::fs::File; use std::path::Path; +use tokio_postgres::Client; use tracing::{error, info, instrument, warn}; use crate::config; +use crate::metrics::{CPlaneRequestRPC, CPLANE_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; -use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse}; +use compute_api::responses::{ + ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse, +}; use compute_api::spec::ComputeSpec; // Do control plane request and return response if any. In case of error it @@ -19,7 +22,7 @@ use compute_api::spec::ComputeSpec; fn do_control_plane_request( uri: &str, jwt: &str, -) -> Result { +) -> Result { let resp = reqwest::blocking::Client::new() .get(uri) .header("Authorization", format!("Bearer {}", jwt)) @@ -27,35 +30,42 @@ fn do_control_plane_request( .map_err(|e| { ( true, - format!("could not perform spec request to control plane: {}", e), + format!("could not perform spec request to control plane: {:?}", e), + UNKNOWN_HTTP_STATUS.to_string(), ) })?; - match resp.status() { + let status = resp.status(); + match status { StatusCode::OK => match resp.json::() { Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, - format!("could not deserialize control plane response: {}", e), + format!("could not deserialize control plane response: {:?}", e), + status.to_string(), )), }, - StatusCode::SERVICE_UNAVAILABLE => { - Err((true, "control plane is temporarily unavailable".to_string())) - } + StatusCode::SERVICE_UNAVAILABLE => Err(( + true, + "control plane is temporarily unavailable".to_string(), + status.to_string(), + )), StatusCode::BAD_GATEWAY => { // We have a problem with intermittent 502 errors now // https://github.com/neondatabase/cloud/issues/2353 // It's fine to retry GET request in this case. - Err((true, "control plane request failed with 502".to_string())) + Err(( + true, + "control plane request failed with 502".to_string(), + status.to_string(), + )) } // Another code, likely 500 or 404, means that compute is unknown to the control plane // or some internal failure happened. Doesn't make much sense to retry in this case. 
_ => Err(( false, - format!( - "unexpected control plane response status code: {}", - resp.status() - ), + format!("unexpected control plane response status code: {}", status), + status.to_string(), )), } } @@ -65,14 +75,13 @@ fn do_control_plane_request( pub fn get_spec_from_control_plane( base_uri: &str, compute_id: &str, -) -> Result> { +) -> Result<(Option, ComputeCtlConfig)> { let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec"); let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") { Ok(v) => v, Err(_) => "".to_string(), }; let mut attempt = 1; - let mut spec: Result> = Ok(None); info!("getting spec from control plane: {}", cp_uri); @@ -82,18 +91,29 @@ pub fn get_spec_from_control_plane( // - no spec for compute yet (Empty state) -> return Ok(None) // - got spec -> return Ok(Some(spec)) while attempt < 4 { - spec = match do_control_plane_request(&cp_uri, &jwt) { - Ok(spec_resp) => match spec_resp.status { - ControlPlaneComputeStatus::Empty => Ok(None), - ControlPlaneComputeStatus::Attached => { - if let Some(spec) = spec_resp.spec { - Ok(Some(spec)) - } else { - bail!("compute is attached, but spec is empty") + let result = match do_control_plane_request(&cp_uri, &jwt) { + Ok(spec_resp) => { + CPLANE_REQUESTS_TOTAL + .with_label_values(&[ + CPlaneRequestRPC::GetSpec.as_str(), + &StatusCode::OK.to_string(), + ]) + .inc(); + match spec_resp.status { + ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)), + ControlPlaneComputeStatus::Attached => { + if let Some(spec) = spec_resp.spec { + Ok((Some(spec), spec_resp.compute_ctl_config)) + } else { + bail!("compute is attached, but spec is empty") + } } } - }, - Err((retry, msg)) => { + } + Err((retry, msg, status)) => { + CPLANE_REQUESTS_TOTAL + .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status]) + .inc(); if retry { Err(anyhow!(msg)) } else { @@ -102,10 +122,10 @@ pub fn get_spec_from_control_plane( } }; - if let Err(e) = &spec { + if let Err(e) = &result { error!("attempt {} to get spec failed with: {}", attempt, e); } else { - return spec; + return result; } attempt += 1; @@ -113,7 +133,9 @@ pub fn get_spec_from_control_plane( } // All attempts failed, return error. - spec + Err(anyhow::anyhow!( + "Exhausted all attempts to retrieve the spec from the control plane" + )) } /// Check `pg_hba.conf` and update if needed to allow external connections. @@ -147,17 +169,17 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { } #[instrument(skip_all)] -pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { +pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { info!("handle neon extension upgrade"); let query = "ALTER EXTENSION neon UPDATE"; info!("update neon extension version with query: {}", query); - client.simple_query(query)?; + client.simple_query(query).await?; Ok(()) } #[instrument(skip_all)] -pub fn handle_migrations(client: &mut Client) -> Result<()> { +pub async fn handle_migrations(client: &mut Client) -> Result<()> { info!("handle migrations"); // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
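`get_spec_from_control_plane` above now counts every attempt in `CPLANE_REQUESTS_TOTAL` (labelled by RPC and HTTP status) and, once three attempts have failed, returns a dedicated "Exhausted all attempts" error instead of the last per-attempt result. A stripped-down sketch of that retry shape; it omits the retryable/non-retryable distinction and the metrics of the real code, and `with_retries` is an illustrative name.

```rust
// Usage (hypothetical): let spec = with_retries(|_attempt| fetch_spec_once())?;
fn with_retries<T>(mut attempt_fn: impl FnMut(u32) -> anyhow::Result<T>) -> anyhow::Result<T> {
    let mut attempt = 1;
    while attempt < 4 {
        match attempt_fn(attempt) {
            Ok(v) => return Ok(v),
            Err(e) => tracing::error!("attempt {} to get spec failed with: {}", attempt, e),
        }
        attempt += 1;
    }
    Err(anyhow::anyhow!(
        "Exhausted all attempts to retrieve the spec from the control plane"
    ))
}
```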
@@ -187,7 +209,9 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { ), ]; - MigrationRunner::new(client, &migrations).run_migrations()?; + MigrationRunner::new(client, &migrations) + .run_migrations() + .await?; Ok(()) } @@ -195,7 +219,7 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { /// Connect to the database as superuser and pre-create anon extension /// if it is present in shared_preload_libraries #[instrument(skip_all)] -pub fn handle_extension_anon( +pub async fn handle_extension_anon( spec: &ComputeSpec, db_owner: &str, db_client: &mut Client, @@ -208,7 +232,7 @@ pub fn handle_extension_anon( if !grants_only { // check if extension is already initialized using anon.is_initialized() let query = "SELECT anon.is_initialized()"; - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(rows) => { if !rows.is_empty() { let is_initialized: bool = rows[0].get(0); @@ -230,7 +254,7 @@ pub fn handle_extension_anon( // Users cannot create it themselves, because superuser is required. let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE"; info!("creating anon extension with query: {}", query); - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(_) => {} Err(e) => { error!("anon extension creation failed with error: {}", e); @@ -240,7 +264,7 @@ pub fn handle_extension_anon( // check that extension is installed query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; - let rows = db_client.query(query, &[])?; + let rows = db_client.query(query, &[]).await?; if rows.is_empty() { error!("anon extension is not installed"); return Ok(()); @@ -249,7 +273,7 @@ pub fn handle_extension_anon( // Initialize anon extension // This also requires superuser privileges, so users cannot do it themselves. query = "SELECT anon.init()"; - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(_) => {} Err(e) => { error!("anon.init() failed with error: {}", e); @@ -260,7 +284,7 @@ pub fn handle_extension_anon( // check that extension is installed, if not bail early let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; - match db_client.query(query, &[]) { + match db_client.query(query, &[]).await { Ok(rows) => { if rows.is_empty() { error!("anon extension is not installed"); @@ -275,12 +299,12 @@ pub fn handle_extension_anon( let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; // Grant permissions to db_owner to use anon extension functions let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; // This is needed, because some functions are defined as SECURITY DEFINER. 
// In Postgres SECURITY DEFINER functions are executed with the privileges @@ -295,16 +319,16 @@ pub fn handle_extension_anon( where nsp.nspname = 'anon';", db_owner); info!("change anon extension functions owner to db owner"); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; // affects views as well let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner); info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query)?; + db_client.simple_query(&query).await?; } } diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 7308d5d36e..c4416480d8 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -7,12 +7,12 @@ use std::sync::Arc; use crate::compute::construct_superuser_query; use crate::pg_helpers::{escape_literal, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt}; -use anyhow::{bail, Result}; +use anyhow::Result; use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role}; use futures::future::join_all; use tokio::sync::RwLock; use tokio_postgres::Client; -use tracing::{debug, info_span, Instrument}; +use tracing::{debug, info_span, warn, Instrument}; #[derive(Clone)] pub enum DB { @@ -47,6 +47,12 @@ pub enum PerDatabasePhase { DeleteDBRoleReferences, ChangeSchemaPerms, HandleAnonExtension, + /// This is a shared phase, used for both i) dropping dangling LR subscriptions + /// before dropping the DB, and ii) dropping all subscriptions after creating + /// a fresh branch. + /// N.B. we will skip all DBs that are not present in Postgres, invalid, or + /// have `datallowconn = false` (`restrict_conn`). + DropLogicalSubscriptions, } #[derive(Clone, Debug)] @@ -57,11 +63,13 @@ pub enum ApplySpecPhase { CreateAndAlterRoles, RenameAndDeleteDatabases, CreateAndAlterDatabases, + CreateSchemaNeon, RunInEachDatabase { db: DB, subphase: PerDatabasePhase }, HandleOtherExtensions, HandleNeonExtension, CreateAvailabilityCheck, DropRoles, + FinalizeDropLogicalSubscriptions, } pub struct Operation { @@ -74,7 +82,7 @@ pub struct MutableApplyContext { pub dbs: HashMap, } -/// Appply the operations that belong to the given spec apply phase. +/// Apply the operations that belong to the given spec apply phase. /// /// Commands within a single phase are executed in order of Iterator yield. /// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database @@ -165,7 +173,7 @@ where /// /// In the future we may generate a single stream of changes and then /// sort/merge/batch execution, but for now this is a nice way to improve -/// batching behaviour of the commands. +/// batching behavior of the commands. async fn get_operations<'a>( spec: &'a ComputeSpec, ctx: &'a RwLock, @@ -326,13 +334,12 @@ async fn get_operations<'a>( // Use FORCE to drop database even if there are active connections. // We run this from `cloud_admin`, so it should have enough privileges. + // // NB: there could be other db states, which prevent us from dropping // the database. For example, if db is used by any active subscription // or replication slot. - // TODO: deal with it once we allow logical replication. 
Proper fix should - // involve returning an error code to the control plane, so it could - // figure out that this is a non-retryable error, return it to the user - // and fail operation permanently. + // Such cases are handled in the DropLogicalSubscriptions + // phase. We do all the cleanup before actually dropping the database. let drop_db_query: String = format!( "DROP DATABASE IF EXISTS {} WITH (FORCE)", &op.name.pg_quote() @@ -442,8 +449,70 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } + ApplySpecPhase::CreateSchemaNeon => Ok(Box::new(once(Operation { + query: String::from("CREATE SCHEMA IF NOT EXISTS neon"), + comment: Some(String::from( + "create schema for neon extension and utils tables", + )), + }))), ApplySpecPhase::RunInEachDatabase { db, subphase } => { + // Do some checks that user DB exists and we can access it. + // + // During the phases like DropLogicalSubscriptions, DeleteDBRoleReferences, + // which happen before dropping the DB, the current run could be a retry, + // so it's a valid case when DB is absent already. The case of + // `pg_database.datallowconn = false`/`restrict_conn` is a bit tricky, as + // in theory user can have some dangling objects there, so we will fail at + // the actual drop later. Yet, to fix that in the current code we would need + // to ALTER DATABASE, and then check back, but that even more invasive, so + // that's not what we really want to do here. + // + // For ChangeSchemaPerms, skipping DBs we cannot access is totally fine. + if let DB::UserDB(db) = db { + let databases = &ctx.read().await.dbs; + + let edb = match databases.get(&db.name) { + Some(edb) => edb, + None => { + warn!("skipping RunInEachDatabase phase {:?}, database {} doesn't exist in PostgreSQL", subphase, db.name); + return Ok(Box::new(empty())); + } + }; + + if edb.restrict_conn || edb.invalid { + warn!( + "skipping RunInEachDatabase phase {:?}, database {} is (restrict_conn={}, invalid={})", + subphase, db.name, edb.restrict_conn, edb.invalid + ); + return Ok(Box::new(empty())); + } + } + match subphase { + PerDatabasePhase::DropLogicalSubscriptions => { + match &db { + DB::UserDB(db) => { + let drop_subscription_query: String = format!( + include_str!("sql/drop_subscriptions.sql"), + datname_str = escape_literal(&db.name), + ); + + let operations = vec![Operation { + query: drop_subscription_query, + comment: Some(format!( + "optionally dropping subscriptions for DB {}", + db.name, + )), + }] + .into_iter(); + + Ok(Box::new(operations)) + } + // skip this cleanup for the system databases + // because users can't drop them + DB::SystemDB => Ok(Box::new(empty())), + } + } PerDatabasePhase::DeleteDBRoleReferences => { let ctx = ctx.read().await; @@ -474,7 +543,19 @@ async fn get_operations<'a>( ), comment: None, }, + // Revoke some potentially blocking privileges (Neon-specific currently) + Operation { + query: format!( + include_str!("sql/pre_drop_role_revoke_privileges.sql"), + role_name = quoted, + ), + comment: None, + }, // This now will only drop privileges of the role + // TODO: this is obviously not 100% true because of the above case, + // there could be still some privileges that are not revoked. Maybe this + // only drops privileges that were granted *by this* role, not *to this* role, + // but this has to be checked. 
Operation { query: format!("DROP OWNED BY {}", quoted), comment: None, @@ -486,25 +567,12 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } PerDatabasePhase::ChangeSchemaPerms => { - let ctx = ctx.read().await; - let databases = &ctx.dbs; - let db = match &db { // ignore schema permissions on the system database DB::SystemDB => return Ok(Box::new(empty())), DB::UserDB(db) => db, }; - if databases.get(&db.name).is_none() { - bail!("database {} doesn't exist in PostgreSQL", db.name); - } - - let edb = databases.get(&db.name).unwrap(); - - if edb.restrict_conn || edb.invalid { - return Ok(Box::new(empty())); - } - let operations = vec![ Operation { query: format!( @@ -522,6 +590,7 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } + // TODO: remove this completely https://github.com/neondatabase/cloud/issues/22663 PerDatabasePhase::HandleAnonExtension => { // Only install Anon into user databases let db = match &db { @@ -630,10 +699,6 @@ async fn get_operations<'a>( } ApplySpecPhase::HandleNeonExtension => { let operations = vec![ - Operation { - query: String::from("CREATE SCHEMA IF NOT EXISTS neon"), - comment: Some(String::from("init: add schema for extension")), - }, Operation { query: String::from("CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon"), comment: Some(String::from( @@ -676,5 +741,9 @@ async fn get_operations<'a>( Ok(Box::new(operations)) } + ApplySpecPhase::FinalizeDropLogicalSubscriptions => Ok(Box::new(once(Operation { + query: String::from(include_str!("sql/finalize_drop_subscriptions.sql")), + comment: None, + }))), } } diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql new file mode 100644 index 0000000000..03e8e158fa --- /dev/null +++ b/compute_tools/src/sql/drop_subscriptions.sql @@ -0,0 +1,12 @@ +DO $$ +DECLARE + subname TEXT; +BEGIN + LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE; + FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP + EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname); + EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname); + EXECUTE format('DROP SUBSCRIPTION %I;', subname); + END LOOP; +END; +$$; diff --git a/compute_tools/src/sql/finalize_drop_subscriptions.sql b/compute_tools/src/sql/finalize_drop_subscriptions.sql new file mode 100644 index 0000000000..4bb291876f --- /dev/null +++ b/compute_tools/src/sql/finalize_drop_subscriptions.sql @@ -0,0 +1,21 @@ +DO $$ +BEGIN + IF NOT EXISTS( + SELECT 1 + FROM pg_catalog.pg_tables + WHERE tablename = 'drop_subscriptions_done' + AND schemaname = 'neon' + ) + THEN + CREATE TABLE neon.drop_subscriptions_done + (id serial primary key, timeline_id text); + END IF; + + -- preserve the timeline_id of the last drop_subscriptions run + -- to ensure that the cleanup of a timeline is executed only once. 
+ -- use upsert to avoid the table bloat in case of cascade branching (branch of a branch) + INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id')) + ON CONFLICT (id) DO UPDATE + SET timeline_id = current_setting('neon.timeline_id'); +END +$$ diff --git a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql new file mode 100644 index 0000000000..cdaa7071d3 --- /dev/null +++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql @@ -0,0 +1,28 @@ +SET SESSION ROLE neon_superuser; + +DO $$ +DECLARE + schema TEXT; + revoke_query TEXT; +BEGIN + FOR schema IN + SELECT schema_name + FROM information_schema.schemata + -- So far, we only had issues with 'public' schema. Probably, because we do some additional grants, + -- e.g., make DB owner the owner of 'public' schema automatically (when created via API). + -- See https://github.com/neondatabase/cloud/issues/13582 for the context. + -- Still, keep the loop because i) it efficiently handles the case when there is no 'public' schema, + -- ii) it's easy to add more schemas to the list if needed. + WHERE schema_name IN ('public') + LOOP + revoke_query := format( + 'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;', + schema + ); + + EXECUTE revoke_query; + END LOOP; +END; +$$; + +RESET ROLE; diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index f718102847..162c49ec7c 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -33,6 +33,7 @@ postgres_backend.workspace = true safekeeper_api.workspace = true postgres_connection.workspace = true storage_broker.workspace = true +http-utils.workspace = true utils.workspace = true whoami.workspace = true diff --git a/control_plane/README.md b/control_plane/README.md index 827aba5c1f..aa6f935e27 100644 --- a/control_plane/README.md +++ b/control_plane/README.md @@ -1,6 +1,10 @@ -# Control Plane and Neon Local +# Local Development Control Plane (`neon_local`) -This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. +This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. This is a convenience to invoke +the `neon_local` binary. + +**Note**: this is a dev/test tool -- a minimal control plane suitable for testing +code changes locally, but not suitable for running production systems. ## Example: Start with Postgres 16 diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 94a072e394..c668e68402 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -261,7 +261,13 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", backtrace_setting); // Pass through these environment variables to the command - for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] { + for var in [ + "LLVM_PROFILE_FILE", + "FAILPOINTS", + "RUST_LOG", + "ASAN_OPTIONS", + "UBSAN_OPTIONS", + ] { if let Some(val) = std::env::var_os(var) { filled_cmd = filled_cmd.env(var, val); } @@ -274,6 +280,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { for env_key in [ "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", + "AWS_SESSION_TOKEN", "AWS_PROFILE", // HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions. 
"HOME", diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 1ea443b026..7d908ccae9 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -19,6 +19,7 @@ use control_plane::storage_controller::{ NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, }; use control_plane::{broker, local_env}; +use nix::fcntl::{flock, FlockArg}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, @@ -36,6 +37,8 @@ use safekeeper_api::{ }; use std::borrow::Cow; use std::collections::{BTreeSet, HashMap}; +use std::fs::File; +use std::os::fd::AsRawFd; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; @@ -549,8 +552,10 @@ struct EndpointCreateCmdArgs { lsn: Option, #[clap(long)] pg_port: Option, + #[clap(long, alias = "http-port")] + external_http_port: Option, #[clap(long)] - http_port: Option, + internal_http_port: Option, #[clap(long = "pageserver-id")] endpoint_pageserver_id: Option, @@ -689,6 +694,21 @@ struct TimelineTreeEl { pub children: BTreeSet, } +/// A flock-based guard over the neon_local repository directory +struct RepoLock { + _file: File, +} + +impl RepoLock { + fn new() -> Result { + let repo_dir = File::open(local_env::base_path())?; + let repo_dir_fd = repo_dir.as_raw_fd(); + flock(repo_dir_fd, FlockArg::LockExclusive)?; + + Ok(Self { _file: repo_dir }) + } +} + // Main entry point for the 'neon_local' CLI utility // // This utility helps to manage neon installation. That includes following: @@ -700,9 +720,14 @@ fn main() -> Result<()> { let cli = Cli::parse(); // Check for 'neon init' command first. - let subcommand_result = if let NeonLocalCmd::Init(args) = cli.command { - handle_init(&args).map(|env| Some(Cow::Owned(env))) + let (subcommand_result, _lock) = if let NeonLocalCmd::Init(args) = cli.command { + (handle_init(&args).map(|env| Some(Cow::Owned(env))), None) } else { + // This tool uses a collection of simple files to store its state, and consequently + // it is not generally safe to run multiple commands concurrently. Rather than expect + // all callers to know this, use a lock file to protect against concurrent execution. + let _repo_lock = RepoLock::new().unwrap(); + // all other commands need an existing config let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?; let original_env = env.clone(); @@ -728,11 +753,12 @@ fn main() -> Result<()> { NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env), }; - if &original_env != env { + let subcommand_result = if &original_env != env { subcommand_result.map(|()| Some(Cow::Borrowed(env))) } else { subcommand_result.map(|()| None) - } + }; + (subcommand_result, Some(_repo_lock)) }; match subcommand_result { @@ -861,20 +887,6 @@ fn print_timeline( Ok(()) } -/// Returns a map of timeline IDs to timeline_id@lsn strings. -/// Connects to the pageserver to query this information. -async fn get_timeline_infos( - env: &local_env::LocalEnv, - tenant_shard_id: &TenantShardId, -) -> Result> { - Ok(get_default_pageserver(env) - .timeline_list(tenant_shard_id) - .await? 
- .into_iter() - .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) - .collect()) -} - /// Helper function to get tenant id from an optional --tenant_id option or from the config file fn get_tenant_id( tenant_id_arg: Option, @@ -922,7 +934,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { } else { // User (likely interactive) did not provide a description of the environment, give them the default NeonLocalInitConf { - control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())), + control_plane_api: Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap()), broker: NeonBroker { listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(), }, @@ -1225,12 +1237,6 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller // where shard 0 is attached, and query there. let tenant_shard_id = get_tenant_shard_id(args.tenant_shard_id, env)?; - let timeline_infos = get_timeline_infos(env, &tenant_shard_id) - .await - .unwrap_or_else(|e| { - eprintln!("Failed to load timeline info: {}", e); - HashMap::new() - }); let timeline_name_mappings = env.timeline_name_mappings(); @@ -1259,12 +1265,9 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res lsn.to_string() } _ => { - // -> primary endpoint or hot replica - // Use the LSN at the end of the timeline. - timeline_infos - .get(&endpoint.timeline_id) - .map(|bi| bi.last_record_lsn.to_string()) - .unwrap_or_else(|| "?".to_string()) + // As the LSN here refers to the one that the compute is started with, + // we display nothing as it is a primary/hot standby compute. + "---".to_string() } }; @@ -1329,10 +1332,12 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res tenant_id, timeline_id, args.pg_port, - args.http_port, + args.external_http_port, + args.internal_http_port, args.pg_version, mode, !args.update_catalog, + false, )?; } EndpointCmd::Start(args) => { @@ -1718,18 +1723,15 @@ async fn handle_start_all_impl( broker::start_broker_process(env, &retry_timeout).await }); - // Only start the storage controller if the pageserver is configured to need it - if env.control_plane_api.is_some() { - js.spawn(async move { - let storage_controller = StorageController::from_env(env); - storage_controller - .start(NeonStorageControllerStartArgs::with_default_instance_id( - retry_timeout, - )) - .await - .map_err(|e| e.context("start storage_controller")) - }); - } + js.spawn(async move { + let storage_controller = StorageController::from_env(env); + storage_controller + .start(NeonStorageControllerStartArgs::with_default_instance_id( + retry_timeout, + )) + .await + .map_err(|e| e.context("start storage_controller")) + }); for ps_conf in &env.pageservers { js.spawn(async move { @@ -1774,10 +1776,6 @@ async fn neon_start_status_check( const RETRY_INTERVAL: Duration = Duration::from_millis(100); const NOTICE_AFTER_RETRIES: Duration = Duration::from_secs(5); - if env.control_plane_api.is_none() { - return Ok(()); - } - let storcon = StorageController::from_env(env); let retries = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 35067c95b6..407578abb8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -37,6 +37,8 @@ //! ``` //! 
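As the comment in `main()` above explains, `neon_local` keeps its state in plain files, so every command except `init` now takes an exclusive `flock` on the repository directory before doing anything. A small usage sketch of that advisory-lock pattern; the `.neon` path and the `lock_repo` helper are illustrative.

```rust
use std::fs::File;
use std::os::fd::AsRawFd;

use nix::fcntl::{flock, FlockArg};

// Blocks until any other holder releases the lock; dropping the File
// releases it again, so keeping it alive guards the whole command.
fn lock_repo(repo_dir: &std::path::Path) -> anyhow::Result<File> {
    let file = File::open(repo_dir)?;
    flock(file.as_raw_fd(), FlockArg::LockExclusive)?;
    Ok(file)
}

fn main() -> anyhow::Result<()> {
    // Hypothetical repo path; neon_local derives it from local_env::base_path().
    let _guard = lock_repo(std::path::Path::new(".neon"))?;
    // ... mutate the on-disk state while holding the lock ...
    Ok(())
}
```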
use std::collections::BTreeMap; +use std::net::IpAddr; +use std::net::Ipv4Addr; use std::net::SocketAddr; use std::net::TcpStream; use std::path::PathBuf; @@ -44,8 +46,12 @@ use std::process::Command; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use std::time::SystemTime; +use std::time::UNIX_EPOCH; use anyhow::{anyhow, bail, Context, Result}; +use compute_api::requests::ConfigurationRequest; +use compute_api::responses::ComputeCtlConfig; use compute_api::spec::Database; use compute_api::spec::PgIdent; use compute_api::spec::RemoteExtSpec; @@ -55,6 +61,7 @@ use nix::sys::signal::Signal; use pageserver_api::shard::ShardStripeSize; use reqwest::header::CONTENT_TYPE; use serde::{Deserialize, Serialize}; +use tracing::debug; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; @@ -62,7 +69,7 @@ use crate::local_env::LocalEnv; use crate::postgresql_conf::PostgresConf; use crate::storage_controller::StorageController; -use compute_api::responses::{ComputeState, ComputeStatus}; +use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec}; // contents of a endpoint.json file @@ -73,10 +80,14 @@ pub struct EndpointConf { timeline_id: TimelineId, mode: ComputeMode, pg_port: u16, - http_port: u16, + external_http_port: u16, + internal_http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, + reconfigure_concurrency: usize, + drop_subscriptions_before_start: bool, features: Vec, + cluster: Option, } // @@ -127,7 +138,7 @@ impl ComputeControlPlane { 1 + self .endpoints .values() - .map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port())) + .map(|ep| std::cmp::max(ep.pg_address.port(), ep.external_http_address.port())) .max() .unwrap_or(self.base_port) } @@ -139,17 +150,27 @@ impl ComputeControlPlane { tenant_id: TenantId, timeline_id: TimelineId, pg_port: Option, - http_port: Option, + external_http_port: Option, + internal_http_port: Option, pg_version: u32, mode: ComputeMode, skip_pg_catalog_updates: bool, + drop_subscriptions_before_start: bool, ) -> Result> { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); - let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); + let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1); + let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1); let ep = Arc::new(Endpoint { endpoint_id: endpoint_id.to_owned(), - pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port), - http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port), + pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port), + external_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::UNSPECIFIED), + external_http_port, + ), + internal_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::LOCALHOST), + internal_http_port, + ), env: self.env.clone(), timeline_id, mode, @@ -162,7 +183,10 @@ impl ComputeControlPlane { // with this we basically test a case of waking up an idle compute, where // we also skip catalog updates in the cloud. 
skip_pg_catalog_updates, + drop_subscriptions_before_start, + reconfigure_concurrency: 1, features: vec![], + cluster: None, }); ep.create_endpoint_dir()?; @@ -173,11 +197,15 @@ impl ComputeControlPlane { tenant_id, timeline_id, mode, - http_port, + external_http_port, + internal_http_port, pg_port, pg_version, skip_pg_catalog_updates, + drop_subscriptions_before_start, + reconfigure_concurrency: 1, features: vec![], + cluster: None, })?, )?; std::fs::write( @@ -226,9 +254,10 @@ pub struct Endpoint { pub timeline_id: TimelineId, pub mode: ComputeMode, - // port and address of the Postgres server and `compute_ctl`'s HTTP API + // port and address of the Postgres server and `compute_ctl`'s HTTP APIs pub pg_address: SocketAddr, - pub http_address: SocketAddr, + pub external_http_address: SocketAddr, + pub internal_http_address: SocketAddr, // postgres major version in the format: 14, 15, etc. pg_version: u32, @@ -240,8 +269,12 @@ pub struct Endpoint { // Optimizations skip_pg_catalog_updates: bool, + drop_subscriptions_before_start: bool, + reconfigure_concurrency: usize, // Feature flags features: Vec, + // Cluster settings + cluster: Option, } #[derive(PartialEq, Eq)] @@ -281,9 +314,18 @@ impl Endpoint { let conf: EndpointConf = serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; + debug!("serialized endpoint conf: {:?}", conf); + Ok(Endpoint { - pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port), - http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port), + pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port), + external_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::UNSPECIFIED), + conf.external_http_port, + ), + internal_http_address: SocketAddr::new( + IpAddr::from(Ipv4Addr::LOCALHOST), + conf.internal_http_port, + ), endpoint_id, env: env.clone(), timeline_id: conf.timeline_id, @@ -291,7 +333,10 @@ impl Endpoint { tenant_id: conf.tenant_id, pg_version: conf.pg_version, skip_pg_catalog_updates: conf.skip_pg_catalog_updates, + reconfigure_concurrency: conf.reconfigure_concurrency, + drop_subscriptions_before_start: conf.drop_subscriptions_before_start, features: conf.features, + cluster: conf.cluster, }) } @@ -316,6 +361,10 @@ impl Endpoint { // and can cause errors like 'no unpinned buffers available', see // conf.append("shared_buffers", "1MB"); + // Postgres defaults to effective_io_concurrency=1, which does not exercise the pageserver's + // batching logic. 
Set this to 2 so that we exercise the code a bit without letting + // individual tests do a lot of concurrent work on underpowered test machines + conf.append("effective_io_concurrency", "2"); conf.append("fsync", "off"); conf.append("max_connections", "100"); conf.append("wal_level", "logical"); @@ -574,13 +623,14 @@ impl Endpoint { }; // Create spec file - let spec = ComputeSpec { + let mut spec = ComputeSpec { skip_pg_catalog_updates: self.skip_pg_catalog_updates, format_version: 1.0, operation_uuid: None, features: self.features.clone(), swap_size_bytes: None, disk_quota_bytes: None, + disable_lfc_resizing: None, cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used @@ -606,7 +656,7 @@ impl Endpoint { Vec::new() }, settings: None, - postgresql_conf: Some(postgresql_conf), + postgresql_conf: Some(postgresql_conf.clone()), }, delta_operations: None, tenant_id: Some(self.tenant_id), @@ -619,8 +669,35 @@ impl Endpoint { pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), local_proxy_config: None, - reconfigure_concurrency: 1, + reconfigure_concurrency: self.reconfigure_concurrency, + drop_subscriptions_before_start: self.drop_subscriptions_before_start, }; + + // this strange code is needed to support respec() in tests + if self.cluster.is_some() { + debug!("Cluster is already set in the endpoint spec, using it"); + spec.cluster = self.cluster.clone().unwrap(); + + debug!("spec.cluster {:?}", spec.cluster); + + // fill missing fields again + if create_test_user { + spec.cluster.roles.push(Role { + name: PgIdent::from_str("test").unwrap(), + encrypted_password: None, + options: None, + }); + spec.cluster.databases.push(Database { + name: PgIdent::from_str("neondb").unwrap(), + owner: PgIdent::from_str("test").unwrap(), + options: None, + restrict_conn: false, + invalid: false, + }); + } + spec.cluster.postgresql_conf = Some(postgresql_conf); + } + let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; @@ -638,24 +715,43 @@ impl Endpoint { println!("Also at '{}'", conn_str); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); - cmd.args(["--http-port", &self.http_address.port().to_string()]) - .args(["--pgdata", self.pgdata().to_str().unwrap()]) - .args(["--connstr", &conn_str]) - .args([ - "--spec-path", - self.endpoint_path().join("spec.json").to_str().unwrap(), - ]) - .args([ - "--pgbin", - self.env - .pg_bin_dir(self.pg_version)? - .join("postgres") - .to_str() - .unwrap(), - ]) - .stdin(std::process::Stdio::null()) - .stderr(logfile.try_clone()?) - .stdout(logfile); + cmd.args([ + "--external-http-port", + &self.external_http_address.port().to_string(), + ]) + .args([ + "--internal-http-port", + &self.internal_http_address.port().to_string(), + ]) + .args(["--pgdata", self.pgdata().to_str().unwrap()]) + .args(["--connstr", &conn_str]) + .args([ + "--spec-path", + self.endpoint_path().join("spec.json").to_str().unwrap(), + ]) + .args([ + "--pgbin", + self.env + .pg_bin_dir(self.pg_version)? + .join("postgres") + .to_str() + .unwrap(), + ]) + // TODO: It would be nice if we generated compute IDs with the same + // algorithm as the real control plane. + .args([ + "--compute-id", + &format!( + "compute-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs() + ), + ]) + .stdin(std::process::Stdio::null()) + .stderr(logfile.try_clone()?) 
+ .stdout(logfile); if let Some(remote_ext_config) = remote_ext_config { cmd.args(["--remote-ext-config", remote_ext_config]); @@ -734,7 +830,7 @@ impl Endpoint { } // Call the /status HTTP API - pub async fn get_status(&self) -> Result { + pub async fn get_status(&self) -> Result { let client = reqwest::Client::new(); let response = client @@ -742,8 +838,8 @@ impl Endpoint { reqwest::Method::GET, format!( "http://{}:{}/status", - self.http_address.ip(), - self.http_address.port() + self.external_http_address.ip(), + self.external_http_address.port() ), ) .send() @@ -810,20 +906,23 @@ impl Endpoint { } let client = reqwest::Client::builder() - .timeout(Duration::from_secs(30)) + .timeout(Duration::from_secs(120)) .build() .unwrap(); let response = client .post(format!( "http://{}:{}/configure", - self.http_address.ip(), - self.http_address.port() + self.external_http_address.ip(), + self.external_http_address.port() )) .header(CONTENT_TYPE.as_str(), "application/json") - .body(format!( - "{{\"spec\":{}}}", - serde_json::to_string_pretty(&spec)? - )) + .body( + serde_json::to_string(&ConfigurationRequest { + spec, + compute_ctl_config: ComputeCtlConfig::default(), + }) + .unwrap(), + ) .send() .await?; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 032c88a829..2fe4cd5202 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -76,7 +76,7 @@ pub struct LocalEnv { // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. - pub control_plane_api: Option, + pub control_plane_api: Url, // Control plane upcall API for storage controller. If set, this will be propagated into the // storage controller's configuration. 
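The endpoint's `/configure` call above now serializes a typed `ConfigurationRequest` (the spec plus a default `ComputeCtlConfig`) and posts it to compute_ctl's external HTTP address with a 120-second timeout, instead of hand-assembling the JSON body. A condensed sketch of that call; the `configure` helper and its error handling are illustrative, while the types are the ones the patch imports from `compute_api`.

```rust
use std::net::SocketAddr;
use std::time::Duration;

use compute_api::requests::ConfigurationRequest;
use compute_api::responses::ComputeCtlConfig;
use compute_api::spec::ComputeSpec;

// Condensed sketch of the typed /configure request; response handling is simplified.
async fn configure(external_http_addr: SocketAddr, spec: ComputeSpec) -> anyhow::Result<()> {
    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(120))
        .build()?;

    let response = client
        .post(format!("http://{external_http_addr}/configure"))
        .header("Content-Type", "application/json")
        .body(serde_json::to_string(&ConfigurationRequest {
            spec,
            compute_ctl_config: ComputeCtlConfig::default(),
        })?)
        .send()
        .await?;

    anyhow::ensure!(
        response.status().is_success(),
        "configure failed with HTTP {}",
        response.status()
    );
    Ok(())
}
```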
@@ -133,7 +133,7 @@ pub struct NeonLocalInitConf { pub storage_controller: Option, pub pageservers: Vec, pub safekeepers: Vec, - pub control_plane_api: Option>, + pub control_plane_api: Option, pub control_plane_compute_hook_api: Option>, } @@ -180,7 +180,7 @@ impl NeonStorageControllerConf { const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30); // Very tight heartbeat interval to speed up tests - const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100); + const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(1000); } impl Default for NeonStorageControllerConf { @@ -483,7 +483,6 @@ impl LocalEnv { .iter() .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) .map(|&(_, timeline_id)| timeline_id) - .map(TimelineId::from) } pub fn timeline_name_mappings(&self) -> HashMap { @@ -535,7 +534,7 @@ impl LocalEnv { storage_controller, pageservers, safekeepers, - control_plane_api, + control_plane_api: control_plane_api.unwrap(), control_plane_compute_hook_api, branch_name_mappings, } @@ -638,7 +637,7 @@ impl LocalEnv { storage_controller: self.storage_controller.clone(), pageservers: vec![], // it's skip_serializing anyway safekeepers: self.safekeepers.clone(), - control_plane_api: self.control_plane_api.clone(), + control_plane_api: Some(self.control_plane_api.clone()), control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), branch_name_mappings: self.branch_name_mappings.clone(), }, @@ -768,7 +767,7 @@ impl LocalEnv { storage_controller: storage_controller.unwrap_or_default(), pageservers: pageservers.iter().map(Into::into).collect(), safekeepers, - control_plane_api: control_plane_api.unwrap_or_default(), + control_plane_api: control_plane_api.unwrap(), control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), branch_name_mappings: Default::default(), }; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 1d1455b95b..2bf89b7bfa 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -95,21 +95,19 @@ impl PageServerNode { let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param]; - if let Some(control_plane_api) = &self.env.control_plane_api { - overrides.push(format!( - "control_plane_api='{}'", - control_plane_api.as_str() - )); + overrides.push(format!( + "control_plane_api='{}'", + self.env.control_plane_api.as_str() + )); - // Storage controller uses the same auth as pageserver: if JWT is enabled - // for us, we will also need it to talk to them. - if matches!(conf.http_auth_type, AuthType::NeonJWT) { - let jwt_token = self - .env - .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) - .unwrap(); - overrides.push(format!("control_plane_api_token='{}'", jwt_token)); - } + // Storage controller uses the same auth as pageserver: if JWT is enabled + // for us, we will also need it to talk to them. 
+ if matches!(conf.http_auth_type, AuthType::NeonJWT) { + let jwt_token = self + .env + .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) + .unwrap(); + overrides.push(format!("control_plane_api_token='{}'", jwt_token)); } if !conf.other.contains_key("remote_storage") { @@ -337,29 +335,70 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'checkpoint_distance' as an integer")?, - checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), + checkpoint_timeout: settings + .remove("checkpoint_timeout") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'checkpoint_timeout' as duration")?, compaction_target_size: settings .remove("compaction_target_size") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_target_size' as an integer")?, - compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), + compaction_period: settings + .remove("compaction_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'compaction_period' as duration")?, compaction_threshold: settings .remove("compaction_threshold") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_threshold' as an integer")?, + compaction_upper_limit: settings + .remove("compaction_upper_limit") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_upper_limit' as an integer")?, compaction_algorithm: settings .remove("compaction_algorithm") .map(serde_json::from_str) .transpose() .context("Failed to parse 'compaction_algorithm' json")?, + compaction_l0_first: settings + .remove("compaction_l0_first") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_l0_first' as a bool")?, + compaction_l0_semaphore: settings + .remove("compaction_l0_semaphore") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_l0_semaphore' as a bool")?, + l0_flush_delay_threshold: settings + .remove("l0_flush_delay_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'l0_flush_delay_threshold' as an integer")?, + l0_flush_wait_upload: settings + .remove("l0_flush_wait_upload") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'l0_flush_wait_upload' as a boolean")?, + l0_flush_stall_threshold: settings + .remove("l0_flush_stall_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'l0_flush_stall_threshold' as an integer")?, gc_horizon: settings .remove("gc_horizon") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_horizon' as an integer")?, - gc_period: settings.remove("gc_period").map(|x| x.to_string()), + gc_period: settings.remove("gc_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'gc_period' as duration")?, image_creation_threshold: settings .remove("image_creation_threshold") .map(|x| x.parse::()) @@ -370,13 +409,25 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_check_threshold' as integer")?, - pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), + image_creation_preempt_threshold: settings + .remove("image_creation_preempt_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'image_creation_preempt_threshold' as integer")?, + pitr_interval: settings.remove("pitr_interval") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'pitr_interval' as 
duration")?, walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'walreceiver_connect_timeout' as duration")?, lagging_wal_timeout: settings .remove("lagging_wal_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lagging_wal_timeout' as duration")?, max_lsn_wal_lag: settings .remove("max_lsn_wal_lag") .map(|x| x.parse::()) @@ -394,8 +445,14 @@ impl PageServerNode { .context("Failed to parse 'min_resident_size_override' as integer")?, evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") - .map(|x| x.to_string()), - heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'evictions_low_residence_duration_metric_threshold' as duration")?, + heatmap_period: settings + .remove("heatmap_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'heatmap_period' as duration")?, lazy_slru_download: settings .remove("lazy_slru_download") .map(|x| x.parse::()) @@ -406,10 +463,15 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), + lsn_lease_length: settings.remove("lsn_lease_length") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length' as duration")?, lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length_for_ts' as duration")?, timeline_offloading: settings .remove("timeline_offloading") .map(|x| x.parse::()) @@ -420,6 +482,26 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `wal_receiver_protocol_override` from json")?, + rel_size_v2_enabled: settings + .remove("rel_size_v2_enabled") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'rel_size_v2_enabled' as bool")?, + gc_compaction_enabled: settings + .remove("gc_compaction_enabled") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_compaction_enabled' as bool")?, + gc_compaction_initial_threshold_kb: settings + .remove("gc_compaction_initial_threshold_kb") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_compaction_initial_threshold_kb' as integer")?, + gc_compaction_ratio_percent: settings + .remove("gc_compaction_ratio_percent") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_compaction_ratio_percent' as integer")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -435,7 +517,7 @@ impl PageServerNode { ) -> anyhow::Result<()> { let config = Self::parse_config(settings)?; self.http_client - .tenant_config(&models::TenantConfigRequest { tenant_id, config }) + .set_tenant_config(&models::TenantConfigRequest { tenant_id, config }) .await?; Ok(()) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index f0c3722925..ce7751fb14 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -17,8 +17,10 @@ use camino::Utf8PathBuf; use postgres_connection::PgConnectionConfig; use reqwest::{IntoUrl, Method}; use thiserror::Error; + +use 
http_utils::error::HttpErrorBody; use utils::auth::{Claims, Scope}; -use utils::{http::error::HttpErrorBody, id::NodeId}; +use utils::id::NodeId; use crate::{ background_process, diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index b70bd2e1b5..0fadb9c5fe 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -221,7 +221,17 @@ impl StorageController { "-p", &format!("{}", postgres_port), ]; - let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; + let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); + let envs = [ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]; + let exitcode = Command::new(bin_path) + .args(args) + .envs(envs) + .spawn()? + .wait() + .await?; Ok(exitcode.success()) } @@ -242,6 +252,11 @@ impl StorageController { let pg_bin_dir = self.get_pg_bin_dir().await?; let createdb_path = pg_bin_dir.join("createdb"); + let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); + let envs = [ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]; let output = Command::new(&createdb_path) .args([ "-h", @@ -254,6 +269,7 @@ impl StorageController { &username(), DB_NAME, ]) + .envs(envs) .output() .await .expect("Failed to spawn createdb"); @@ -338,7 +354,7 @@ impl StorageController { .port(), ) } else { - let listen_url = self.env.control_plane_api.clone().unwrap(); + let listen_url = self.env.control_plane_api.clone(); let listen = format!( "{}:{}", @@ -708,7 +724,7 @@ impl StorageController { } else { // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out // for general purpose API access. 
- let listen_url = self.env.control_plane_api.clone().unwrap(); + let listen_url = self.env.control_plane_api.clone(); Url::from_str(&format!( "http://{}:{}/{path}", listen_url.host_str().unwrap(), @@ -823,8 +839,8 @@ impl StorageController { Method::PUT, format!("control/v1/tenant/{tenant_shard_id}/migrate"), Some(TenantShardMigrateRequest { - tenant_shard_id, node_id, + migration_config: None, }), ) .await diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index e879424532..40b86e4110 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,22 +1,28 @@ use futures::StreamExt; -use std::{str::FromStr, time::Duration}; +use std::{ + collections::{HashMap, HashSet}, + str::FromStr, + time::Duration, +}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, - ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, + SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, + ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, + TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, - ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest, - TenantShardSplitResponse, + ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, + TenantShardSplitRequest, TenantShardSplitResponse, }, shard::{ShardStripeSize, TenantShardId}, }; use pageserver_client::mgmt_api::{self}; use reqwest::{Method, StatusCode, Url}; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TimelineId}; use pageserver_api::controller_api::{ NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, @@ -41,6 +47,9 @@ enum Command { listen_http_addr: String, #[arg(long)] listen_http_port: u16, + #[arg(long)] + listen_https_port: Option, + #[arg(long)] availability_zone_id: String, }, @@ -111,14 +120,31 @@ enum Command { #[arg(long)] node: NodeId, }, + /// Migrate the secondary location for a tenant shard to a specific pageserver. + TenantShardMigrateSecondary { + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + node: NodeId, + }, /// Cancel any ongoing reconciliation for this shard TenantShardCancelReconcile { #[arg(long)] tenant_shard_id: TenantShardId, }, - /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure + /// Set the pageserver tenant configuration of a tenant: this is the configuration structure /// that is passed through to pageservers, and does not affect storage controller behavior. - TenantConfig { + /// Any previous tenant configs are overwritten. + SetTenantConfig { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + config: String, + }, + /// Patch the pageserver tenant configuration of a tenant. Any fields with null values in the + /// provided JSON are unset from the tenant config and all fields with non-null values are set. + /// Unspecified fields are not changed. 
+ PatchTenantConfig { #[arg(long)] tenant_id: TenantId, #[arg(long)] @@ -135,6 +161,12 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, + TenantSetPreferredAz { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + preferred_az: Option, + }, /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region. TenantDrop { @@ -201,6 +233,28 @@ enum Command { #[arg(long)] timeout: humantime::Duration, }, + /// List safekeepers known to the storage controller + Safekeepers {}, + /// Set the scheduling policy of the specified safekeeper + SafekeeperScheduling { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + scheduling_policy: SkSchedulingPolicyArg, + }, + /// Downloads any missing heatmap layers for all shard for a given timeline + DownloadHeatmapLayers { + /// Tenant ID or tenant shard ID. When an unsharded tenant ID is specified, + /// the operation is performed on all shards. When a sharded tenant ID is + /// specified, the operation is only performed on the specified shard. + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + timeline_id: TimelineId, + /// Optional: Maximum download concurrency (default is 16) + #[arg(long)] + concurrency: Option, + }, } #[derive(Parser)] @@ -253,6 +307,17 @@ impl FromStr for PlacementPolicyArg { } } +#[derive(Debug, Clone)] +struct SkSchedulingPolicyArg(SkSchedulingPolicy); + +impl FromStr for SkSchedulingPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + SkSchedulingPolicy::from_str(s).map(Self) + } +} + #[derive(Debug, Clone)] struct ShardSchedulingPolicyArg(ShardSchedulingPolicy); @@ -332,6 +397,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + listen_https_port, availability_zone_id, } => { storcon_client @@ -344,6 +410,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + listen_https_port, availability_zone_id: AvailabilityZone(availability_zone_id), }), ) @@ -382,11 +449,12 @@ async fn main() -> anyhow::Result<()> { resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr)); let mut table = comfy_table::Table::new(); - table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); + table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]); for node in resp { table.add_row([ format!("{}", node.id), node.listen_http_addr, + node.availability_zone_id, format!("{:?}", node.scheduling), format!("{:?}", node.availability), ]); @@ -446,33 +514,65 @@ async fn main() -> anyhow::Result<()> { println!("{table}"); } Command::Tenants { node_id: None } => { - let mut resp = storcon_client - .dispatch::<(), Vec>( - Method::GET, - "control/v1/tenant".to_string(), - None, - ) - .await?; - - resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id)); - + // Set up output formatting let mut table = comfy_table::Table::new(); table.set_header([ "TenantId", + "Preferred AZ", "ShardCount", "StripeSize", "Placement", "Scheduling", ]); - for tenant in resp { - let shard_zero = tenant.shards.into_iter().next().unwrap(); - table.add_row([ - format!("{}", tenant.tenant_id), - format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), - format!("{:?}", tenant.stripe_size), - format!("{:?}", tenant.policy), - format!("{:?}", shard_zero.scheduling_policy), - ]); + + // Pagination loop over listing API + let mut start_after = None; + const LIMIT: 
usize = 1000; + loop { + let path = match start_after { + None => format!("control/v1/tenant?limit={LIMIT}"), + Some(start_after) => { + format!("control/v1/tenant?limit={LIMIT}&start_after={start_after}") + } + }; + + let resp = storcon_client + .dispatch::<(), Vec>(Method::GET, path, None) + .await?; + + if resp.is_empty() { + // End of data reached + break; + } + + // Give some visual feedback while we're building up the table (comfy_table doesn't have + // streaming output) + if resp.len() >= LIMIT { + eprint!("."); + } + + start_after = Some(resp.last().unwrap().tenant_id); + + for tenant in resp { + let shard_zero = tenant.shards.into_iter().next().unwrap(); + table.add_row([ + format!("{}", tenant.tenant_id), + shard_zero + .preferred_az_id + .as_ref() + .cloned() + .unwrap_or("".to_string()), + format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), + format!("{:?}", tenant.stripe_size), + format!("{:?}", tenant.policy), + format!("{:?}", shard_zero.scheduling_policy), + ]); + } + } + + // Terminate progress dots + if table.row_count() > LIMIT { + eprint!(""); } println!("{table}"); @@ -528,8 +628,8 @@ async fn main() -> anyhow::Result<()> { node, } => { let req = TenantShardMigrateRequest { - tenant_shard_id, node_id: node, + migration_config: None, }; storcon_client @@ -540,6 +640,23 @@ async fn main() -> anyhow::Result<()> { ) .await?; } + Command::TenantShardMigrateSecondary { + tenant_shard_id, + node, + } => { + let req = TenantShardMigrateRequest { + node_id: node, + migration_config: None, + }; + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"), + Some(req), + ) + .await?; + } Command::TenantShardCancelReconcile { tenant_shard_id } => { storcon_client .dispatch::<(), ()>( @@ -549,11 +666,21 @@ async fn main() -> anyhow::Result<()> { ) .await?; } - Command::TenantConfig { tenant_id, config } => { + Command::SetTenantConfig { tenant_id, config } => { let tenant_conf = serde_json::from_str(&config)?; vps_client - .tenant_config(&TenantConfigRequest { + .set_tenant_config(&TenantConfigRequest { + tenant_id, + config: tenant_conf, + }) + .await?; + } + Command::PatchTenantConfig { tenant_id, config } => { + let tenant_conf = serde_json::from_str(&config)?; + + vps_client + .patch_tenant_config(&TenantConfigPatchRequest { tenant_id, config: tenant_conf, }) @@ -573,6 +700,19 @@ async fn main() -> anyhow::Result<()> { None, ) .await?; + + let nodes = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + let nodes = nodes + .into_iter() + .map(|n| (n.id, n)) + .collect::>(); + println!("Tenant {tenant_id}"); let mut table = comfy_table::Table::new(); table.add_row(["Policy", &format!("{:?}", policy)]); @@ -581,7 +721,14 @@ async fn main() -> anyhow::Result<()> { println!("{table}"); println!("Shards:"); let mut table = comfy_table::Table::new(); - table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); + table.set_header([ + "Shard", + "Attached", + "Attached AZ", + "Secondary", + "Last error", + "status", + ]); for shard in shards { let secondary = shard .node_secondary @@ -604,11 +751,18 @@ async fn main() -> anyhow::Result<()> { } let status = status_parts.join(","); + let attached_node = shard + .node_attached + .as_ref() + .map(|id| nodes.get(id).expect("Shard references nonexistent node")); + table.add_row([ format!("{}", shard.tenant_shard_id), - shard - .node_attached - .map(|n| format!("{}", n)) + attached_node + .map(|n| 
format!("{} ({})", n.listen_http_addr, n.id)) + .unwrap_or(String::new()), + attached_node + .map(|n| n.availability_zone_id.clone()) .unwrap_or(String::new()), secondary, shard.last_error, @@ -617,6 +771,66 @@ async fn main() -> anyhow::Result<()> { } println!("{table}"); } + Command::TenantSetPreferredAz { + tenant_id, + preferred_az, + } => { + // First learn about the tenant's shards + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + + // Learn about nodes to validate the AZ ID + let nodes = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + if let Some(preferred_az) = &preferred_az { + let azs = nodes + .into_iter() + .map(|n| (n.availability_zone_id)) + .collect::>(); + if !azs.contains(preferred_az) { + anyhow::bail!( + "AZ {} not found on any node: known AZs are: {:?}", + preferred_az, + azs + ); + } + } else { + // Make it obvious to the user that since they've omitted an AZ, we're clearing it + eprintln!("Clearing preferred AZ for tenant {}", tenant_id); + } + + // Construct a request that modifies all the tenant's shards + let req = ShardsPreferredAzsRequest { + preferred_az_ids: describe_response + .shards + .into_iter() + .map(|s| { + ( + s.tenant_shard_id, + preferred_az.clone().map(AvailabilityZone), + ) + }) + .collect(), + }; + storcon_client + .dispatch::( + Method::PUT, + "control/v1/preferred_azs".to_string(), + Some(req), + ) + .await?; + } Command::TenantWarmup { tenant_id } => { let describe_response = storcon_client .dispatch::<(), TenantDescribeResponse>( @@ -736,7 +950,7 @@ async fn main() -> anyhow::Result<()> { threshold, } => { vps_client - .tenant_config(&TenantConfigRequest { + .set_tenant_config(&TenantConfigRequest { tenant_id, config: TenantConfig { eviction_policy: Some(EvictionPolicy::LayerAccessThreshold( @@ -745,7 +959,7 @@ async fn main() -> anyhow::Result<()> { threshold: threshold.into(), }, )), - heatmap_period: Some("300s".to_string()), + heatmap_period: Some(Duration::from_secs(300)), ..Default::default() }, }) @@ -893,8 +1107,8 @@ async fn main() -> anyhow::Result<()> { Method::PUT, format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), Some(TenantShardMigrateRequest { - tenant_shard_id: mv.tenant_shard_id, node_id: mv.to, + migration_config: None, }), ) .await @@ -1000,6 +1214,75 @@ async fn main() -> anyhow::Result<()> { "Fill was cancelled for node {node_id}. 
Schedulling policy is now {final_policy:?}" ); } + Command::Safekeepers {} => { + let mut resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/safekeeper".to_string(), + None, + ) + .await?; + + resp.sort_by(|a, b| a.id.cmp(&b.id)); + + let mut table = comfy_table::Table::new(); + table.set_header([ + "Id", + "Version", + "Host", + "Port", + "Http Port", + "AZ Id", + "Scheduling", + ]); + for sk in resp { + table.add_row([ + format!("{}", sk.id), + format!("{}", sk.version), + sk.host, + format!("{}", sk.port), + format!("{}", sk.http_port), + sk.availability_zone_id.clone(), + String::from(sk.scheduling_policy), + ]); + } + println!("{table}"); + } + Command::SafekeeperScheduling { + node_id, + scheduling_policy, + } => { + let scheduling_policy = scheduling_policy.0; + storcon_client + .dispatch::( + Method::POST, + format!("control/v1/safekeeper/{node_id}/scheduling_policy"), + Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }), + ) + .await?; + println!( + "Scheduling policy of {node_id} set to {}", + String::from(scheduling_policy) + ); + } + Command::DownloadHeatmapLayers { + tenant_shard_id, + timeline_id, + concurrency, + } => { + let mut path = format!( + "/v1/tenant/{}/timeline/{}/download_heatmap_layers", + tenant_shard_id, timeline_id, + ); + + if let Some(c) = concurrency { + path = format!("{path}?concurrency={c}"); + } + + storcon_client + .dispatch::<(), ()>(Method::POST, path, None) + .await?; + } } Ok(()) diff --git a/deny.toml b/deny.toml index ff8d71cda5..b551405568 100644 --- a/deny.toml +++ b/deny.toml @@ -32,6 +32,7 @@ reason = "the marvin attack only affects private key decryption, not public key # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] allow = [ + "0BSD", "Apache-2.0", "BSD-2-Clause", "BSD-3-Clause", @@ -41,8 +42,8 @@ allow = [ "MIT", "MPL-2.0", "OpenSSL", - "Unicode-DFS-2016", "Unicode-3.0", + "Zlib", ] confidence-threshold = 0.8 exceptions = [ diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index 05a2cf124c..b5f0f47ceb 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -7,14 +7,12 @@ FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG ARG COMPUTE_IMAGE USER root -RUN apt-get update && \ +RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \ + apt-get update && \ apt-get install -y curl \ jq \ - python3-pip \ netcat-openbsd -#Faker is required for the pg_anon test -RUN case $COMPUTE_IMAGE in compute-node-v17) OPT="--break-system-packages";; *) OPT= ;; esac && pip3 install $OPT Faker #This is required for the pg_hintplan test -RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src +RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw -USER postgres \ No newline at end of file +USER postgres diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 33455e458a..9dbdcce69f 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -20,30 +20,55 @@ while ! nc -z pageserver 6400; do done echo "Page server is ready." 
-echo "Create a tenant and timeline" -generate_id tenant_id -PARAMS=( - -X PUT - -H "Content-Type: application/json" - -d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}" - "http://pageserver:9898/v1/tenant/${tenant_id}/location_config" -) -result=$(curl "${PARAMS[@]}") -echo $result | jq . +cp ${SPEC_FILE_ORG} ${SPEC_FILE} -generate_id timeline_id -PARAMS=( - -sbf - -X POST - -H "Content-Type: application/json" - -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" - "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" -) -result=$(curl "${PARAMS[@]}") -echo $result | jq . + if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then + tenant_id=${TENANT_ID} + timeline_id=${TIMELINE_ID} +else + echo "Check if a tenant present" + PARAMS=( + -X GET + -H "Content-Type: application/json" + "http://pageserver:9898/v1/tenant" + ) + tenant_id=$(curl "${PARAMS[@]}" | jq -r .[0].id) + if [ -z "${tenant_id}" ] || [ "${tenant_id}" = null ]; then + echo "Create a tenant" + generate_id tenant_id + PARAMS=( + -X PUT + -H "Content-Type: application/json" + -d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/location_config" + ) + result=$(curl "${PARAMS[@]}") + echo $result | jq . + fi + + echo "Check if a timeline present" + PARAMS=( + -X GET + -H "Content-Type: application/json" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline" + ) + timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id) + if [ -z "${timeline_id}" ] || [ "${timeline_id}" = null ]; then + generate_id timeline_id + PARAMS=( + -sbf + -X POST + -H "Content-Type: application/json" + -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" + ) + result=$(curl "${PARAMS[@]}") + echo $result | jq . 
+ fi +fi echo "Overwrite tenant id and timeline id in spec file" -sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE} +sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE} sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} cat ${SPEC_FILE} @@ -52,4 +77,5 @@ echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ + --compute-id "compute-$RANDOM" \ -S ${SPEC_FILE} diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json index 8e582e74e1..0308cab451 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json +++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json @@ -132,11 +132,6 @@ "name": "cron.database", "value": "postgres", "vartype": "string" - }, - { - "name": "session_preload_libraries", - "value": "anon", - "vartype": "string" } ] }, diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 6e15fdbe0d..489d60f38c 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -149,11 +149,13 @@ services: args: - REPOSITORY=${REPOSITORY:-neondatabase} - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16} - - TAG=${TAG:-latest} - - http_proxy=$http_proxy - - https_proxy=$https_proxy + - TAG=${COMPUTE_TAG:-${TAG:-latest}} + - http_proxy=${http_proxy:-} + - https_proxy=${https_proxy:-} environment: - PG_VERSION=${PG_VERSION:-16} + - TENANT_ID=${TENANT_ID:-} + - TIMELINE_ID=${TIMELINE_ID:-} #- RUST_BACKTRACE=1 # Mount the test files directly, for faster editing cycle. volumes: @@ -185,6 +187,8 @@ services: neon-test-extensions: profiles: ["test-extensions"] image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest} + environment: + - PGPASSWORD=cloud_admin entrypoint: - "/bin/bash" - "-c" diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index c97dfaa901..0f03d600a3 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -18,14 +18,10 @@ cd $(dirname $0) COMPUTE_CONTAINER_NAME=docker-compose-compute-1 TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1 PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres" -: ${http_proxy:=} -: ${https_proxy:=} -export http_proxy https_proxy cleanup() { echo "show container information" docker ps - docker compose --profile test-extensions -f $COMPOSE_FILE logs echo "stop containers..." docker compose --profile test-extensions -f $COMPOSE_FILE down } @@ -35,13 +31,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do echo "clean up containers if exists" cleanup PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version)) - # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option - if [ $pg_version -eq 17 ]; then - SPEC_PATH="compute_wrapper/var/db/postgres/specs" - mv $SPEC_PATH/spec.json $SPEC_PATH/spec.bak - jq 'del(.cluster.settings[] | select (.name == "session_preload_libraries"))' $SPEC_PATH/spec.bak > $SPEC_PATH/spec.json - fi - PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d + PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --quiet-pull --build -d echo "wait until the compute is ready. timeout after 60s. 
" cnt=0 @@ -50,7 +40,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do cnt=`expr $cnt + 3` if [ $cnt -gt 60 ]; then echo "timeout before the compute is ready." - cleanup exit 1 fi if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then @@ -62,52 +51,37 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do done if [ $pg_version -ge 16 ]; then - echo Enabling trust connection - docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' " - echo Adding postgres role - docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN" # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail # It cannot be moved to Dockerfile now because the database directory is created after the start of the container echo Adding dummy config docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf - # This block is required for the pg_anon extension test. - # The test assumes that it is running on the same host with the postgres engine. - # In our case it's not true, that's why we are copying files to the compute node + # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment TMPDIR=$(mktemp -d) - # Add support for pg_anon for pg_v16 - if [ $pg_version -ne 17 ]; then - docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data - echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv - docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data - rm -rf $TMPDIR - fi - TMPDIR=$(mktemp -d) - # The following block does the same for the pg_hintplan test docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ rm -rf $TMPDIR + # The following block does the same for the contrib/file_fdw test + TMPDIR=$(mktemp -d) + docker cp $TEST_CONTAINER_NAME:/postgres/contrib/file_fdw/data $TMPDIR/data + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/postgres/contrib/file_fdw/data + rm -rf $TMPDIR + # Apply patches + cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)" # We are running tests now - if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ - $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt - then - cleanup - else - FAILED=$(tail -1 testout.txt) - for d in $FAILED - do - mkdir $d - docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true - docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true - cat $d/regression.out $d/regression.diffs || true + rm -f testout.txt testout_contrib.txt + docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \ + $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 + docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ + $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee 
testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 + if [ $EXT_SUCCESS -eq 0 ] || [ $CONTRIB_SUCCESS -eq 0 ]; then + CONTRIB_FAILED= + FAILED= + [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}') + [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}') + for d in $FAILED $CONTRIB_FAILED; do + docker exec $TEST_CONTAINER_NAME bash -c 'for file in $(find '"$d"' -name regression.diffs -o -name regression.out); do cat $file; done' || [ $? -eq 1 ] done - rm -rf $FAILED - cleanup exit 1 fi fi - cleanup - # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option - if [ $pg_version -eq 17 ]; then - mv $SPEC_PATH/spec.bak $SPEC_PATH/spec.json - fi done diff --git a/docker-compose/ext-src/hll-src/test-upgrade.sh b/docker-compose/ext-src/hll-src/test-upgrade.sh new file mode 100755 index 0000000000..f9e9aedcb2 --- /dev/null +++ b/docker-compose/ext-src/hll-src/test-upgrade.sh @@ -0,0 +1,5 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress +${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression add_agg agg_oob auto_sparse card_op cast_shape copy_binary cumulative_add_cardinality_correction cumulative_add_comprehensive_promotion cumulative_add_sparse_edge cumulative_add_sparse_random cumulative_add_sparse_step cumulative_union_comprehensive cumulative_union_explicit_explicit cumulative_union_explicit_promotion cumulative_union_probabilistic_probabilistic cumulative_union_sparse_full_representation cumulative_union_sparse_promotion cumulative_union_sparse_sparse disable_hashagg equal explicit_thresh hash hash_any meta_func murmur_bigint murmur_bytea nosparse notequal scalar_oob storedproc transaction typmod typmod_insert union_op \ No newline at end of file diff --git a/docker-compose/ext-src/hypopg-src/test-upgrade.patch b/docker-compose/ext-src/hypopg-src/test-upgrade.patch new file mode 100644 index 0000000000..71fe26b164 --- /dev/null +++ b/docker-compose/ext-src/hypopg-src/test-upgrade.patch @@ -0,0 +1,27 @@ +diff --git a/expected/hypopg.out b/expected/hypopg.out +index 90121d0..859260b 100644 +--- a/expected/hypopg.out ++++ b/expected/hypopg.out +@@ -11,7 +11,8 @@ BEGIN + END; + $_$ + LANGUAGE plpgsql; +-CREATE EXTENSION hypopg; ++CREATE EXTENSION IF NOT EXISTS hypopg; ++NOTICE: extension "hypopg" already exists, skipping + CREATE TABLE hypo (id integer, val text, "Id2" bigint); + INSERT INTO hypo SELECT i, 'line ' || i + FROM generate_series(1,100000) f(i); +diff --git a/test/sql/hypopg.sql b/test/sql/hypopg.sql +index 99722b0..8d6bacb 100644 +--- a/test/sql/hypopg.sql ++++ b/test/sql/hypopg.sql +@@ -12,7 +12,7 @@ END; + $_$ + LANGUAGE plpgsql; + +-CREATE EXTENSION hypopg; ++CREATE EXTENSION IF NOT EXISTS hypopg; + + CREATE TABLE hypo (id integer, val text, "Id2" bigint); + diff --git a/docker-compose/ext-src/hypopg-src/test-upgrade.sh b/docker-compose/ext-src/hypopg-src/test-upgrade.sh new file mode 100755 index 0000000000..066ac3329e --- /dev/null +++ b/docker-compose/ext-src/hypopg-src/test-upgrade.sh @@ -0,0 +1,6 @@ +#!/bin/sh +set -ex +cd "$(dirname ${0})" +patch -p1 /dev/null || break +LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u) +for d in ${LIST}; do + [ -d "${d}" ] || continue + if ! 
psql -w -c "select 1" >/dev/null; then + FAILED="${d} ${FAILED}" + break + fi + if [ -f "${d}/neon-test.sh" ]; then + "${d}/neon-test.sh" || FAILED="${d} ${FAILED}" + else USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" + fi done [ -z "${FAILED}" ] && exit 0 echo "${FAILED}" diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh new file mode 100755 index 0000000000..c2168c47af --- /dev/null +++ b/docker-compose/test_extensions_upgrade.sh @@ -0,0 +1,105 @@ +#!/bin/bash +set -eux -o pipefail +cd "$(dirname "${0}")" +# Takes a variable name as argument. The result is stored in that variable. +generate_id() { + local -n resvar=$1 + printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM +} +if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEWTAG}" ]; then + echo OLDTAG and NEWTAG must be defined + exit 1 +fi +export PG_VERSION=${PG_VERSION:-16} +export PG_TEST_VERSION=${PG_VERSION} +function wait_for_ready { + TIME=0 + while ! docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do + ((TIME += 1 )) + sleep 1 + done + if [ ${TIME} -gt 300 ]; then + echo Time is out. + exit 2 + fi +} +function create_extensions() { + for ext in ${1}; do + docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE" + done +} +EXTENSIONS='[ +{"extname": "plv8", "extdir": "plv8-src"}, +{"extname": "vector", "extdir": "pgvector-src"}, +{"extname": "unit", "extdir": "postgresql-unit-src"}, +{"extname": "hypopg", "extdir": "hypopg-src"}, +{"extname": "rum", "extdir": "rum-src"}, +{"extname": "ip4r", "extdir": "ip4r-src"}, +{"extname": "prefix", "extdir": "prefix-src"}, +{"extname": "hll", "extdir": "hll-src"}, +{"extname": "pg_cron", "extdir": "pg_cron-src"}, +{"extname": "pg_uuidv7", "extdir": "pg_uuidv7-src"}, +{"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"}, +{"extname": "semver", "extdir": "pg_semver-src"}, +{"extname": "pg_ivm", "extdir": "pg_ivm-src"}, +{"extname": "pgjwt", "extdir": "pgjwt-src"}, +{"extname": "pgtap", "extdir": "pgtap-src"}, +{"extname": "pg_repack", "extdir": "pg_repack-src"} +]' +EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -) +TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d +wait_for_ready +docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" +docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" +create_extensions "${EXTNAMES}" +query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')" +new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query") +docker compose --profile test-extensions down +TAG=${OLDTAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate +wait_for_ready +docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression" +docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression" +docker compose exec neon-test-extensions psql -c "CREATE DATABASE pgtap_regression" +docker compose exec neon-test-extensions psql -d pgtap_regression -c "CREATE EXTENSION pgtap" +create_extensions "${EXTNAMES}" +if [ "${FORCE_ALL_UPGRADE_TESTS:-false}" = true ]; then + exts="${EXTNAMES}" +else + query="select pge.extname from pg_extension pge join 
(select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion" + exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query") +fi +if [ -z "${exts}" ]; then + echo "No extensions were upgraded" +else + tenant_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.tenant_id") + timeline_id=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") + for ext in ${exts}; do + echo Testing ${ext}... + EXTDIR=$(echo ${EXTENSIONS} | jq -r '.[] | select(.extname=="'${ext}'") | .extdir') + generate_id new_timeline_id + PARAMS=( + -sbf + -X POST + -H "Content-Type: application/json" + -d "{\"new_timeline_id\": \"${new_timeline_id}\", \"pg_version\": ${PG_VERSION}, \"ancestor_timeline_id\": \"${timeline_id}\"}" + "http://127.0.0.1:9898/v1/tenant/${tenant_id}/timeline/" + ) + result=$(curl "${PARAMS[@]}") + echo $result | jq . + TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} TAG=${OLDTAG} docker compose down compute compute_is_ready + COMPUTE_TAG=${NEWTAG} TAG=${OLDTAG} TENANT_ID=${tenant_id} TIMELINE_ID=${new_timeline_id} docker compose up --quiet-pull -d --build compute compute_is_ready + wait_for_ready + TID=$(docker compose exec neon-test-extensions psql -Aqt -c "SHOW neon.timeline_id") + if [ ${TID} != ${new_timeline_id} ]; then + echo Timeline mismatch + exit 1 + fi + docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}" + if ! docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh; then + docker compose exec neon-test-extensions cat /ext-src/${EXTDIR}/regression.diffs + exit 1 + fi + docker compose exec neon-test-extensions psql -d contrib_regression -c "alter extension ${ext} update" + docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}" + done +fi diff --git a/docs/docker.md b/docs/docker.md index 0914a00082..ae74c2b2ab 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -7,15 +7,11 @@ Currently we build two main images: - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). - [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile). -And additional intermediate image: - -- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools. - ## Build pipeline We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs -1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14) +1. `neondatabase/compute-node-v17` (and -16, -v15, -v14) 2. `neondatabase/neon` diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md index 239ec58186..9b320c7285 100644 --- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -81,7 +81,7 @@ configuration generation in them is less than its current one. 
Namely, it refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In response it sends its current configuration generation to let walproposer know. -Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` +Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` accepting `Configuration`. Safekeeper switches to the given conf it is higher than its current one and ignores it otherwise. In any case it replies with ``` @@ -103,7 +103,7 @@ currently and tries to communicate with all of them. However, the list does not define consensus members. Instead, on start walproposer tracks highest configuration it receives from `AcceptorGreeting`s. Once it assembles greetings from majority of `sk_set` and majority of `new_sk_set` (if it is present), it -establishes this configuration as its own and moves to voting. +establishes this configuration as its own and moves to voting. It should stop talking to safekeepers not listed in the configuration at this point, though it is not unsafe to continue doing so. @@ -119,7 +119,7 @@ refusal to accept due to configuration change) it simply restarts. The following algorithm can be executed anywhere having access to configuration storage and safekeepers. It is safe to interrupt / restart it and run multiple instances of it concurrently, though likely one of them won't make -progress then. It accepts `desired_set: Vec` as input. +progress then. It accepts `desired_set: Vec` as input. Algorithm will refuse to make the change if it encounters previous interrupted change attempt, but in this case it will try to finish it. @@ -140,7 +140,7 @@ storage are reachable. safe. Failed CAS aborts the procedure. 4) Call `PUT` `configuration` on safekeepers from the current set, delivering them `joint_conf`. Collecting responses from majority is required - to proceed. If any response returned generation higher than + to proceed. If any response returned generation higher than `joint_conf.generation`, abort (another switch raced us). Otherwise, choose max `` among responses and establish it as (in memory) `sync_position`. Also choose max `term` and establish it as (in @@ -149,49 +149,49 @@ storage are reachable. without ack from the new set. Similarly, we'll bump term on new majority to `sync_term` so that two computes with the same term are never elected. 4) Initialize timeline on safekeeper(s) from `new_sk_set` where it - doesn't exist yet by doing `pull_timeline` from the majority of the + doesn't exist yet by doing `pull_timeline` from the majority of the current set. Doing that on majority of `new_sk_set` is enough to proceed, but it is reasonable to ensure that all `new_sk_set` members are initialized -- if some of them are down why are we migrating there? -5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. +5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. Success on majority is enough. 6) Repeatedly call `PUT` `configuration` on safekeepers from the new set, delivering them `joint_conf` and collecting their positions. This will - switch them to the `joint_conf` which generally won't be needed + switch them to the `joint_conf` which generally won't be needed because `pull_timeline` already includes it and plus additionally would be broadcast by compute. More importantly, we may proceed to the next step - only when `` on the majority of the new set reached - `sync_position`. 
Similarly, on the happy path no waiting is not needed because + only when `` on the majority of the new set reached + `sync_position`. Similarly, on the happy path no waiting is not needed because `pull_timeline` already includes it. However, we should double check to be safe. For example, timeline could have been created earlier e.g. - manually or after try-to-migrate, abort, try-to-migrate-again sequence. -7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new - safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration + manually or after try-to-migrate, abort, try-to-migrate-again sequence. +7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new + safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration storage under one more CAS. 8) Call `PUT` `configuration` on safekeepers from the new set, - delivering them `new_conf`. It is enough to deliver it to the majority + delivering them `new_conf`. It is enough to deliver it to the majority of the new set; the rest can be updated by compute. I haven't put huge effort to make the description above very precise, because it is natural language prone to interpretations anyway. Instead I'd like to make TLA+ spec of it. -Description above focuses on safety. To make the flow practical and live, here a few more +Description above focuses on safety. To make the flow practical and live, here a few more considerations. -1) It makes sense to ping new set to ensure it we are migrating to live node(s) before +1) It makes sense to ping new set to ensure it we are migrating to live node(s) before step 3. -2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed +2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed it is safe to rollback to the old conf with one more CAS. -3) On step 4 timeline might be already created on members of the new set for various reasons; +3) On step 4 timeline might be already created on members of the new set for various reasons; the simplest is the procedure restart. There are more complicated scenarious like mentioned - in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving - generations, so seems simpler to treat existing timeline as success. However, this also + in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving + generations, so seems simpler to treat existing timeline as success. However, this also has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in the step 5 is never reached until compute is (re)awaken up to synchronize new member(s). I don't think we'll observe this in practice, but can add waking up compute if needed. 4) In the end timeline should be locally deleted on the safekeeper(s) which are in the old set but not in the new one, unless they are unreachable. To be - safe this also should be done under generation number (deletion proceeds only if + safe this also should be done under generation number (deletion proceeds only if current configuration is <= than one in request and safekeeper is not memeber of it). 5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`, jump to step 7, using it as `new_conf`. @@ -202,53 +202,93 @@ The procedure ought to be driven from somewhere. 
Obvious candidates are control plane and storage_controller; and as each of them already has db we don't want yet another storage. I propose to manage safekeepers in storage_controller because 1) since it is in rust it simplifies simulation testing (more on this -below) 2) it already manages pageservers. +below) 2) it already manages pageservers. This assumes that migration will be fully usable only after we migrate all tenants/timelines to storage_controller. It is discussible whether we want also to manage pageserver attachments for all of these, but likely we do. -This requires us to define storcon <-> cplane interface. +This requires us to define storcon <-> cplane interface and changes. -### storage_controller <-> control plane interface +### storage_controller <-> control plane interface and changes First of all, control plane should [change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829) storing safekeepers per timeline instead of per tenant because we can't migrate -tenants atomically. +tenants atomically. The important question is how updated configuration is delivered from storage_controller to control plane to provide it to computes. As always, there are two options, pull and push. Let's do it the same push as with pageserver `/notify-attach` because 1) it keeps storage_controller out of critical compute -start path 2) provides easier upgrade: there won't be such a thing as 'timeline -managed by control plane / storcon', cplane just takes the value out of its db -when needed 3) uniformity. It makes storage_controller responsible for retrying notifying -control plane until it succeeds. +start path 2) uniformity. It makes storage_controller responsible for retrying +notifying control plane until it succeeds. -So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and -updates it in the db if the provided conf generation is higher (the cplane db -should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it -should update db which makes the call successful, and then try to schedule -`apply_config` if possible, it is ok if not. storage_controller -should rate limit calling the endpoint, but likely this won't be needed, as migration +It is not needed for the control plane to fully know the `Configuration`. It is +enough for it to only to be aware of the list of safekeepers in the latest +configuration to supply it to compute, plus associated generation number to +protect from stale update requests and to also pass it to compute. + +So, cplane `/notify-safekeepers` for the timeline can accept JSON like +``` +{ + tenant_id: String, + timeline_id: String, + generation: u32, + safekeepers: Vec, +} +``` +where `SafekeeperId` is +``` +{ + node_id: u64, + host: String +} +``` +In principle `host` is redundant, but may be useful for observability. + +The request updates list of safekeepers in the db if the provided conf +generation is higher (the cplane db should also store generations for this). +Similarly to +[`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), +it should update db which makes the call successful, and then try to schedule +`apply_config` if possible, it is ok if not. storage_controller should rate +limit calling the endpoint, but likely this won't be needed, as migration throughput is limited by `pull_timeline`. 
Timeline (branch) creation in cplane should call storage_controller POST
 `tenant/:tenant_id/timeline` like it currently does for sharded tenants.
-Response should be augmented with `safekeeper_conf: Configuration`. The call
-should be retried until succeeds.
+Response should be augmented with `safekeepers_generation` and `safekeepers`
+fields as described in `/notify-safekeepers` above. Initially (currently)
+these fields may be absent; in this case cplane chooses safekeepers on its own
+like it currently does. The call should be retried until it succeeds.
 
 Timeline deletion and tenant deletion in cplane should call appropriate
 storage_controller endpoints like it currently does for sharded tenants. The
 calls should be retried until they succeed.
 
+When compute receives the safekeepers list from control plane it needs to know the
+generation to check whether it should be updated (note that compute may get the
+safekeeper list from either cplane or safekeepers). Currently the `neon.safekeepers`
+GUC is just a comma-separated list of `host:port`. Let's prefix it with
+`g#<generation>:` to this end, so it will look like
+```
+g#42:safekeeper-0.eu-central-1.aws.neon.tech:6401,safekeeper-2.eu-central-1.aws.neon.tech:6401,safekeeper-1.eu-central-1.aws.neon.tech:6401
+```
+
+To summarize, the list of cplane changes:
+- per tenant -> per timeline safekeepers management and addition of an int `safekeeper_generation` field.
+- `/notify-safekeepers` endpoint.
+- Branch creation call may return a list of safekeepers and when it is
+  present cplane should adopt it instead of choosing on its own like it does currently.
+- `neon.safekeepers` GUC should be prefixed with `g#<generation>:`.
+
 ### storage_controller implementation
 
-Current 'load everything on startup and keep in memory' easy design is fine.
-Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16
-byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so
-10^6 of timelines shouldn't take more than 100MB.
+If desired, we may continue using the current 'load everything on startup and keep
+in memory' approach: a single timeline shouldn't take more than 100 bytes (it's 16
+byte tenant_id, 16 byte timeline_id, int generation, vec of ~3 safekeeper ids
+plus some flags), so 10^6 timelines shouldn't take more than 100MB.
 
 Similar to pageserver attachment Intents storage_controller would have in-memory
 `MigrationRequest` (or its absence) for each timeline and pool of tasks trying
@@ -256,7 +296,7 @@ to make these request reality; this ensures one instance of storage_controller
 won't do several migrations on the same timeline concurrently. In the first
 version it is simpler to have more manual control and no retries, i.e. migration
 failure removes the request. Later we can build retries and automatic
-scheduling/migration. `MigrationRequest` is
+scheduling/migration around. `MigrationRequest` is
 ```
 enum MigrationRequest {
     To(Vec),
@@ -273,9 +313,9 @@ similarly, in the first version it is ok to trigger it manually).
 #### Schema
 
 `safekeepers` table mirroring current `nodes` should be added, except that for
-`scheduling_policy` field (seems like `status` is a better name for it): it is enough
-to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3)
-`decomissioned`.
+`scheduling_policy`: it is enough to have, at least in the beginning, only 3
+values: 1) `active` 2) `paused` (initially only means not to assign new timelines there)
+3) `decommissioned` (node is removed).
 
 `timelines` table:
 ```
 table!
{ timelines (tenant_id, timeline_id) {
         timeline_id -> Varchar,
         tenant_id -> Varchar,
+        start_lsn -> pg_lsn,
         generation -> Int4,
         sk_set -> Array<Int8>, // list of safekeeper ids
-        new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
+        new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
         cplane_notified_generation -> Int4,
+        deleted_at -> Nullable<Timestamptz>,
     }
 }
 ```
+`start_lsn` is needed to create the timeline on safekeepers properly, see below. We
+might also want to add ancestor_timeline_id to preserve the hierarchy, but for
+this RFC it is not needed.
+
 #### API
 
 Node management is similar to pageserver:
-1) POST `/control/v1/safekeepers` upserts safekeeper.
+1) POST `/control/v1/safekeepers` inserts safekeeper.
 2) GET `/control/v1/safekeepers` lists safekeepers.
 3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
 4) PUT `/control/v1/safekeepers/:node_id/status` changes status to e.g.
@@ -305,25 +351,15 @@ Node management is similar to pageserver:
 
 Safekeeper deploy scripts should register safekeeper at storage_controller as
 they currently do with cplane, under the same id.
 
-Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline`
-would 1) choose initial set of safekeepers; 2) write to the db initial
-`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in
-case of conflict; 3) create timeline on the majority of safekeepers (already
-created is ok).
+Timeline creation/deletion will work through the already existing POST and DELETE
+`tenant/:tenant_id/timeline`. Cplane is expected to retry both until they
+succeed. See the next section on the implementation details.
 
-We don't want to block timeline creation when one safekeeper is down. Currently
-this is solved by compute implicitly creating timeline on any safekeeper it is
-connected to. This creates ugly timeline state on safekeeper when timeline is
-created, but start LSN is not defined yet. It would be nice to remove this; to
-do that, controller can in the background retry to create timeline on
-safekeeper(s) which missed that during initial creation call. It can do that
-through `pull_timeline` from majority so it doesn't need to remember
-`parent_lsn` in its db.
-
-Timeline deletion removes the row from the db and forwards deletion to the
-current configuration members. Without additional actions deletions might leak,
-see below on this; initially let's ignore these, reporting to cplane success if
-at least one safekeeper deleted the timeline (this will remove s3 data).
+We don't want to block timeline creation/deletion when one safekeeper is down.
+Currently this is worked around by compute implicitly creating the timeline on any
+safekeeper it is connected to. This creates an ugly timeline state on the safekeeper
+when the timeline is created but the start LSN is not defined yet. The next section
+describes dealing with this.
 
 Tenant deletion repeats timeline deletion for all timelines.
 
@@ -355,26 +391,6 @@ Similar call should be added for the tenant.
 
 It would be great to have some way of subscribing to the results (apart from
 looking at logs/metrics).
 
-Migration is executed as described above. One subtlety is that (local) deletion on
-source safekeeper might fail, which is not a problem if we are going to
-decomission the node but leaves garbage otherwise. I'd propose in the first version
-1) Don't attempt deletion at all if node status is `offline`.
-2) If it failed, just issue warning.
-And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and -remove garbage timelines for manual use. It will 1) list all timelines on the -safekeeper 2) compare each one against configuration storage: if timeline -doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can -be deleted under generation number if node is not member of current generation. - -Automating this is untrivial; we'd need to register all potential missing -deletions in the same transaction -which switches configurations. Similarly when timeline is fully deleted to -prevent cplane operation from blocking when some safekeeper is not available -deletion should be also registered. - -One more task pool should infinitely retry notifying control plane about changed -safekeeper sets. - 3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return current in memory state of the timeline and pending `MigrationRequest`, if any. @@ -383,12 +399,153 @@ safekeeper sets. migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS (incrementing generation as always). +#### API implementation and reconciliation + +For timeline creation/deletion we want to preserve the basic assumption that +unreachable minority (1 sk of 3) doesn't block their completion, but eventually +we want to finish creation/deletion on nodes which missed it (unless they are +removed). Similarly for migration; it may and should finish even though excluded +members missed their exclusion. And of course e.g. such pending exclusion on +node C after migration ABC -> ABD must not prevent next migration ABD -> ABE. As +another example, if some node missed timeline creation it clearly must not block +migration from it. Hence it is natural to have per safekeeper background +reconciler which retries these ops until they succeed. There are 3 possible +operation types, and the type is defined by timeline state (membership +configuration and whether it is deleted) and safekeeper id: we may need to +create timeline on sk (node added), locally delete it (node excluded, somewhat +similar to detach) or globally delete it (timeline is deleted). + +Next, on storage controller restart in principle these pending operations can be +figured out by comparing safekeepers state against storcon state. But it seems +better to me to materialize them in the database; it is not expensive, avoids +these startup scans which themselves can fail etc and makes it very easy to see +outstanding work directly at the source of truth -- the db. So we can add table +`safekeeper_timeline_pending_ops` +``` +table! { + // timeline_id, sk_id is primary key + safekeeper_timeline_pending_ops (sk_id, tenant_id, timeline_id) { + sk_id -> int8, + tenant_id -> Varchar, + timeline_id -> Varchar, + generation -> Int4, + op_type -> Varchar, + } +} +``` + +`op_type` can be `include` (seed from peers and ensure generation is up to +date), `exclude` (remove locally) and `delete`. Field is actually not strictly +needed as it can be computed from current configuration, but gives more explicit +observability. + +`generation` is necessary there because after op is done reconciler must remove +it and not remove another row with higher gen which in theory might appear. + +Any insert of row should overwrite (remove) all rows with the same sk and +timeline id but lower `generation` as next op makes previous obsolete. Insertion +of `op_type` `delete` overwrites all rows. 
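+
+To make the "computed from current configuration" remark above concrete, here
+is an illustrative sketch (hypothetical types, not actual storage_controller
+code) of deriving the pending op for a given safekeeper:
+
+```rust
+#[derive(Debug, PartialEq)]
+enum PendingOp {
+    Include, // seed from peers and ensure generation is up to date
+    Exclude, // remove the timeline locally
+    Delete,  // timeline is deleted globally
+}
+
+/// Hypothetical in-memory view of a timeline's membership configuration.
+struct TimelineConf {
+    sk_set: Vec<u64>,
+    new_sk_set: Option<Vec<u64>>,
+    deleted: bool,
+}
+
+fn pending_op(conf: &TimelineConf, sk_id: u64) -> PendingOp {
+    if conf.deleted {
+        return PendingOp::Delete;
+    }
+    let is_member = conf.sk_set.contains(&sk_id)
+        || conf.new_sk_set.as_ref().is_some_and(|set| set.contains(&sk_id));
+    if is_member {
+        PendingOp::Include
+    } else {
+        PendingOp::Exclude
+    }
+}
+```
+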
+
+About `exclude`: rather than adding an explicit safekeeper http endpoint, it is
+reasonable to reuse the membership switch endpoint: if the safekeeper is not a member
+of the configuration it locally removes the timeline on the switch. In this case
+404 should also be considered an 'ok' answer by the caller.
+
+So, the main loop of the per-sk reconciler reads `safekeeper_timeline_pending_ops`
+joined with the timeline configuration to get the current conf (with generation `n`)
+for the safekeeper and does the jobs, infinitely retrying failures:
+1) If node is member (`include`):
+   - Check if timeline exists on it, if not, call pull_timeline on it from
+     other members
+   - Call switch configuration to the current
+2) If node is not member (`exclude`):
+   - Call switch configuration to the current, 404 is ok.
+3) If timeline is deleted (`delete`), call delete.
+
+In cases 1 and 2 remove `safekeeper_timeline_pending_ops` for the sk and
+timeline with generation <= `n` if `op_type` is not `delete`.
+In case 3 also remove the `safekeeper_timeline_pending_ops`
+entry + remove the `timelines` entry if there is nothing left in `safekeeper_timeline_pending_ops` for the timeline.
+
+Let's consider in detail how APIs can be implemented from this angle.
+
+Timeline creation. It is assumed that cplane retries it until success, so all
+actions must be idempotent. Now, a tricky point here is the timeline start LSN. For
+the initial (tenant creation) call cplane doesn't know it. However, setting
+start_lsn on safekeepers during creation is a good thing -- it provides a
+guarantee that walproposer can always find a common point in the WAL histories of
+the safekeeper and its own, and so absence of it would be a clear sign of
+corruption. The following sequence works:
+1) Create timeline (or observe that it exists) on pageserver,
+   figuring out last_record_lsn in response.
+2) Choose safekeepers and insert (ON CONFLICT DO NOTHING) the timeline row into the
+   db. Note that last_record_lsn returned on the previous step is movable as it
+   changes once ingestion starts, so the insert must not overwrite it (as well as other
+   fields like membership conf). On the contrary, start_lsn used in the next
+   step must be set to the value in the db (see the sketch below).
+   cplane_notified_generation can be set
+   to 1 (initial generation) in insert to avoid notifying cplane about the initial
+   conf as cplane will receive it in the timeline creation request anyway.
+3) Issue timeline creation calls to at least a majority of safekeepers. Using
+   a majority here is not necessary but handy because it guarantees that any live
+   majority will have at least one sk with the created timeline and so the
+   reconciliation task can use pull_timeline shared with migration instead of
+   a create-timeline special init case. Of course, if the timeline already exists the call is
+   ignored.
+4) For the minority of safekeepers which could have missed creation, insert
+   entries to `safekeeper_timeline_pending_ops`. We won't miss this insertion
+   because the response to cplane is sent only after it has happened, and cplane
+   retries the call until a 200 response.
+
+   There is a small question of how the request handler (timeline creation in this
+   case) would interact with the per sk reconciler. As always I prefer to do the
+   simplest possible thing and here it seems to be just waking it up so it
+   re-reads the db for work to do. Passing work in memory is faster, but
+   that shouldn't matter, and the path to scan the db for work will exist anyway,
+   simpler to reuse it.
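+
+The sketch referenced in step 2 above: the timeline row is inserted only if absent,
+and the start_lsn later passed to safekeeper creation always comes from the
+database, never from a fresh pageserver response. The types below are
+hypothetical (a `HashMap` stands in for the `timelines` table).
+
+```rust
+use std::collections::HashMap;
+
+type TimelineId = u128; // stand-in for the real id type
+type Lsn = u64;
+
+struct TimelineRow {
+    start_lsn: Lsn,
+    // membership conf, cplane_notified_generation, ... omitted
+}
+
+/// Insert-if-absent and return the persisted start_lsn, i.e. the
+/// `INSERT ... ON CONFLICT DO NOTHING` + read-back analogue.
+fn creation_start_lsn(
+    db: &mut HashMap<TimelineId, TimelineRow>,
+    timeline_id: TimelineId,
+    pageserver_last_record_lsn: Lsn,
+) -> Lsn {
+    db.entry(timeline_id)
+        .or_insert(TimelineRow {
+            start_lsn: pageserver_last_record_lsn,
+        })
+        .start_lsn
+}
+```
+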
+ +For pg version / wal segment size: while we may persist them in `timelines` +table, it is not necessary as initial creation at step 3 can take them from +pageserver or cplane creation call and later pull_timeline will carry them +around. + +Timeline migration. +1) CAS to the db to create joint conf, and in the same transaction create + `safekeeper_timeline_pending_ops` `include` entries to initialize new members + as well as deliver this conf to current ones; poke per sk reconcilers to work + on it. Also any conf change should also poke cplane notifier task(s). +2) Once it becomes possible per alg description above, get out of joint conf + with another CAS. Task should get wakeups from per sk reconcilers because + conf switch is required for advancement; however retries should be sleep + based as well as LSN advancement might be needed, though in happy path + it isn't. To see whether further transition is possible on wakup migration + executor polls safekeepers per the algorithm. CAS creating new conf with only + new members should again insert entries to `safekeeper_timeline_pending_ops` + to switch them there, as well as `exclude` rows to remove timeline from + old members. + +Timeline deletion: just set `deleted_at` on the timeline row and insert +`safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by +per sk reconcilers. + +When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops` +for it must be cleared in the same transaction. + +One more task pool should infinitely retry notifying control plane about changed +safekeeper sets (trying making `cplane_notified_generation` equal `generation`). + #### Dealing with multiple instances of storage_controller Operations described above executed concurrently might create some errors but do not prevent progress, so while we normally don't want to run multiple instances of storage_controller it is fine to have it temporarily, e.g. during redeploy. +To harden against some controller instance creating some work in +`safekeeper_timeline_pending_ops` and then disappearing without anyone pickup up +the job per sk reconcilers apart from explicit wakups should scan for work +periodically. It is possible to remove that though if all db updates are +protected with leadership token/term -- then such scans are needed only after +leadership is acquired. + Any interactions with db update in-memory controller state, e.g. if migration request failed because different one is in progress, controller remembers that and tries to finish it. @@ -412,8 +569,8 @@ There should be following layers of tests: 3) Since simulation testing injects at relatively high level points (not syscalls), it omits some code, in particular `pull_timeline`. Thus it is better to have basic tests covering whole system as well. Extended version of - `test_restarts_under_load` would do: start background load and do migration - under it, then restart endpoint and check that no reported commits + `test_restarts_under_load` would do: start background load and do migration + under it, then restart endpoint and check that no reported commits had been lost. I'd also add one more creating classic network split scenario, with one compute talking to AC and another to BD while migration from nodes ABC to ABD happens. 
@@ -422,35 +579,51 @@ There should be following layers of tests: ## Order of implementation and rollout -Note that +Note that - Control plane parts and integration with it is fully independent from everything else (tests would use simulation and neon_local). +- It is reasonable to make compute <-> safekeepers protocol change + independent of enabling generations. - There is a lot of infra work making storage_controller aware of timelines and safekeepers and its impl/rollout should be separate from migration itself. -- Initially walproposer can just stop working while it observers joint configuration. +- Initially walproposer can just stop working while it observes joint configuration. Such window would be typically very short anyway. +- Obviously we want to test the whole thing thoroughly on staging and only then + gradually enable in prod. -To rollout smoothly, both walproposer and safekeeper should have flag -`configurations_enabled`; when set to false, they would work as currently, i.e. -walproposer is able to commit on whatever safekeeper set it is provided. Until -all timelines are managed by storcon we'd need to use current script to migrate -and update/drop entries in the storage_controller database if it has any. +Let's have the following implementation bits for gradual rollout: +- compute gets `neon.safekeepers_proto_version` flag. + Initially both compute and safekeepers will be able to talk both + versions so that we can delay force restart of them and for + simplicity of rollback in case it is needed. +- storcon gets `-set-safekeepers` config option disabled by + default. Timeline creation request chooses safekeepers + (and returns them in response to cplane) only when it is set to + true. +- control_plane [see above](storage_controller-<->-control-plane interface-and-changes) + prefixes `neon.safekeepers` GUC with generation number. When it is 0 + (or prefix not present at all), walproposer behaves as currently, committing on + the provided safekeeper list -- generations are disabled. + If it is non 0 it follows this RFC rules. +- We provide a script for manual migration to storage controller. + It selects timeline(s) from control plane (specified or all of them) db + and calls special import endpoint on storage controller which is very + similar to timeline creation: it inserts into the db, sets + configuration to initial on the safekeepers, calls cplane + `notify-safekeepers`. -Safekeepers would need to be able to talk both current and new protocol version -with compute to reduce number of computes restarted in prod once v2 protocol is -deployed (though before completely switching we'd need to force this). - -Let's have the following rollout order: -- storage_controller becomes aware of safekeepers; -- storage_controller gets timeline creation for new timelines and deletion requests, but - doesn't manage all timelines yet. Migration can be tested on these new timelines. - To keep control plane and storage_controller databases in sync while control - plane still chooses the safekeepers initially (until all timelines are imported - it can choose better), `TimelineCreateRequest` can get optional safekeepers - field with safekeepers chosen by cplane. -- Then we can import all existing timelines from control plane to - storage_controller and gradually enable configurations region by region. +Then the rollout for a region would be: +- Current situation: safekeepers are choosen by control_plane. +- We manually migrate some timelines, test moving them around. 
+- Then we enable `--set-safekeepers` so that all new timelines + are on storage controller. +- Finally migrate all existing timelines using the script (no + compute should be speaking old proto version at this point). +Until all timelines are managed by storcon we'd need to use current ad hoc +script to migrate if needed. To keep state clean, all storage controller managed +timelines must be migrated before that, or controller db and configurations +state of safekeepers dropped manually. Very rough implementation order: - Add concept of configurations to safekeepers (including control file), @@ -458,10 +631,10 @@ Very rough implementation order: - Implement walproposer changes, including protocol. - Implement storconn part. Use it in neon_local (and pytest). - Make cplane store safekeepers per timeline instead of per tenant. -- Implement cplane/storcon integration. Route branch creation/deletion +- Implement cplane/storcon integration. Route branch creation/deletion through storcon. Then we can test migration of new branches. -- Finally import existing branches. Then we can drop cplane - safekeeper selection code. Gradually enable configurations at +- Finally import existing branches. Then we can drop cplane + safekeeper selection code. Gradually enable configurations at computes and safekeepers. Before that, all computes must talk only v3 protocol version. @@ -489,7 +662,7 @@ Aurora does this but similarly I don't think this is needed. We should use Compute <-> safekeeper protocol change to include other (long yearned) modifications: -- send data in network order to make arm work. +- send data in network order without putting whole structs to be arch independent - remove term_start_lsn from AppendRequest - add horizon to TermHistory - add to ProposerGreeting number of connection from this wp to sk diff --git a/docs/rfcs/040-profiling.md b/docs/rfcs/040-profiling.md new file mode 100644 index 0000000000..8da9e50774 --- /dev/null +++ b/docs/rfcs/040-profiling.md @@ -0,0 +1,247 @@ +# CPU and Memory Profiling + +Created 2025-01-12 by Erik Grinaker. + +See also [internal user guide](https://www.notion.so/neondatabase/Storage-CPU-Memory-Profiling-14bf189e004780228ec7d04442742324?pvs=4). + +## Summary + +This document proposes a standard cross-team pattern for CPU and memory profiling across +applications and languages, using the [pprof](https://github.com/google/pprof) profile format. + +It enables both ad hoc profiles via HTTP endpoints, and continuous profiling across the fleet via +[Grafana Cloud Profiles](https://grafana.com/docs/grafana-cloud/monitor-applications/profiles/). +Continuous profiling incurs an overhead of about 0.1% CPU usage and 3% slower heap allocations. + +## Motivation + +CPU and memory profiles are crucial observability tools for understanding performance issues, +resource exhaustion, and resource costs. They allow answering questions like: + +* Why is this process using 100% CPU? +* How do I make this go faster? +* Why did this process run out of memory? +* Why are we paying for all these CPU cores and memory chips? + +Go has [first-class support](https://pkg.go.dev/net/http/pprof) for profiling included in its +standard library, using the [pprof](https://github.com/google/pprof) profile format and associated +tooling. + +This is not the case for Rust and C, where obtaining profiles can be rather cumbersome. 
It requires +installing and running additional tools like `perf` as root on production nodes, with analysis tools +that can be hard to use and often don't give good results. This is not only annoying, but can also +significantly affect the resolution time of production incidents. + +This proposal will: + +* Provide CPU and heap profiles in pprof format via HTTP API. +* Record continuous profiles in Grafana for aggregate historical analysis. +* Make it easy for anyone to see a flamegraph in less than one minute. +* Be reasonably consistent across teams and services (Rust, Go, C). + +## Non Goals (For Now) + +* [Additional profile types](https://grafana.com/docs/pyroscope/next/configure-client/profile-types/) + like mutexes, locks, goroutines, etc. +* [Runtime trace integration](https://grafana.com/docs/pyroscope/next/configure-client/trace-span-profiles/). +* [Profile-guided optimization](https://en.wikipedia.org/wiki/Profile-guided_optimization). + +## Using Profiles + +Ready-to-use profiles can be obtained using e.g. `curl`. For Rust services: + +``` +$ curl localhost:9898/profile/cpu >profile.pb.gz +``` + +pprof profiles can be explored using the [`pprof`](https://github.com/google/pprof) web UI, which +provides flamegraphs, call graphs, plain text listings, and more: + +``` +$ pprof -http :6060 +``` + +Some endpoints (e.g. Rust-based ones) can also generate flamegraph SVGs directly: + +``` +$ curl localhost:9898/profile/cpu?format=svg >profile.svg +$ open profile.svg +``` + +Continuous profiles are available in Grafana under Explore → Profiles → Explore Profiles +(currently only in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)). + +## API Requirements + +* HTTP endpoints that return a profile in pprof format (with symbols). + * CPU: records a profile over the request time interval (`seconds` query parameter). + * Memory: returns the current in-use heap allocations. +* Unauthenticated, as it should not expose user data or pose a denial-of-service risk. +* Default sample frequency should not impact service (maximum 5% CPU overhead). +* Linux-compatibility. + +Nice to have: + +* Return flamegraph SVG directly from the HTTP endpoint if requested. +* Configurable sample frequency for CPU profiles. +* Historical heap allocations, by count and bytes. +* macOS-compatiblity. + +## Rust Profiling + +[`libs/utils/src/http/endpoint.rs`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs) +contains ready-to-use HTTP endpoints for CPU and memory profiling: +[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338) and [`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416). + +### CPU + +CPU profiles are provided by [pprof-rs](https://github.com/tikv/pprof-rs) via +[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338). +Expose it unauthenticated at `/profile/cpu`. + +Parameters: + +* `format`: profile output format (`pprof` or `svg`; default `pprof`). +* `seconds`: duration to collect profile over, in seconds (default `5`). +* `frequency`: how often to sample thread stacks, in Hz (default `99`). +* `force`: if `true`, cancel a running profile and start a new one (default `false`). + +Works on Linux and macOS. 
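+
+For illustration, exposing these handlers on a service's router might look
+roughly like the sketch below. Only the handler names, the `/profile/cpu` and
+`/profile/heap` paths, and the `ApiError` type come from the code referenced
+above; the helper function and the exact router wiring are assumptions about
+how a given service builds its HTTP API.
+
+```rust
+use hyper::Body;
+use routerify::RouterBuilder;
+use utils::http::endpoint::{profile_cpu_handler, profile_heap_handler};
+use utils::http::error::ApiError;
+
+/// Add the (unauthenticated) profiling endpoints to an existing router.
+fn add_profiling_routes(router: RouterBuilder<Body, ApiError>) -> RouterBuilder<Body, ApiError> {
+    router
+        .get("/profile/cpu", profile_cpu_handler)
+        .get("/profile/heap", profile_heap_handler)
+}
+```
+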
+ +### Memory + +Use the jemalloc allocator via [`tikv-jemallocator`](https://github.com/tikv/jemallocator), +and enable profiling with samples every 2 MB allocated: + +```rust +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; +``` + +pprof profiles are generated by +[`jemalloc-pprof`](https://github.com/polarsignals/rust-jemalloc-pprof) via +[`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416). +Expose it unauthenticated at `/profile/heap`. + +Parameters: + +* `format`: profile output format (`pprof`, `svg`, or `jemalloc`; default `pprof`). + +Works on Linux only, due to [jemalloc limitations](https://github.com/jemalloc/jemalloc/issues/26). + +## Go Profiling + +The Go standard library includes pprof profiling via HTTP API in +[`net/http/pprof`](https://pkg.go.dev/net/http/pprof). Expose it unauthenticated at +`/debug/pprof`. + +Works on Linux and macOS. + +### CPU + +Via `/debug/pprof/profile`. Parameters: + +* `debug`: profile output format (`0` is pprof, `1` or above is plaintext; default `0`). +* `seconds`: duration to collect profile over, in seconds (default `30`). + +Does not support a frequency parameter (see [#57488](https://github.com/golang/go/issues/57488)), +and defaults to 100 Hz. A lower frequency can be hardcoded via `SetCPUProfileRate`, but the default +is likely ok (estimated 1% overhead). + +### Memory + +Via `/debug/pprof/heap`. Parameters: + +* `seconds`: take a delta profile over the given duration, in seconds (default `0`). +* `gc`: if `1`, garbage collect before taking profile. + +## C Profiling + +[gperftools](https://github.com/gperftools/gperftools) provides in-process CPU and heap profiling +with pprof output. + +However, continuous profiling of PostgreSQL is expensive (many computes), and has limited value +since we don't own the internals anyway. + +Ad hoc profiling might still be useful, but the compute team considers existing tooling sufficient, +so this is not a priority at the moment. + +## Grafana Continuous Profiling + +[Grafana Alloy](https://grafana.com/docs/alloy/latest/) continually scrapes CPU and memory profiles +across the fleet, and archives them as time series. This can be used to analyze resource usage over +time, either in aggregate or zoomed in to specific events and nodes. + +Profiles are retained for 30 days. Profile ingestion volume for CPU+heap at 60-second intervals +is about 0.5 GB/node/day, or about $0.25/node/day = $7.5/node/month ($0.50/GB). + +It is currently enabled in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer) +for Pageserver and Safekeeper. + +### Scraping + +* CPU profiling: 59 seconds at 19 Hz every 60 seconds. +* Heap profiling: heap snapshot with 2 MB frequency every 60 seconds. + +There are two main approaches that can be taken for CPU profiles: + +* Continuous low-frequency profiles (e.g. 19 Hz for 60 seconds every 60 seconds). +* Occasional high-frequency profiles (e.g. 99 Hz for 5 seconds every 60 seconds). + +We choose continuous low-frequency profiles where possible. This has a fixed low overhead, instead +of a spiky high overhead. It likely also gives a more representative view of resource usage. 
+However, a 19 Hz rate gives a minimum resolution of 52.6 ms per sample, which may be larger than the +actual runtime of small functions. Note that Go does not support a frequency parameter, so we must +use a fixed frequency for all profiles via `SetCPUProfileRate()` (default 100 Hz). + +Only one CPU profile can be taken at a time. With continuous profiling, one will always be running. +To allow also taking an ad hoc CPU profile, the Rust endpoint supports a `force` query parameter to +cancel a running profile and start a new one. + +### Overhead + +With Rust: + +* CPU profiles at 19 Hz frequency: 0.1% overhead. +* Heap profiles at 2 MB frequency: 3% allocation overhead. +* Profile call/encoding/symbolization: 20 ms every 60 seconds, or 0.03% of 1 CPU (for Pageserver). +* Profile symbolization caches: 125 MB memory, or 0.4% of 32 GB (for Pageserver). + +Benchmarks with pprof-rs showed that the CPU time for taking a stack trace of a 40-frame stack was +11 µs using the `frame-pointer` feature, and 1.4 µs using `libunwind` with DWARF. `libunwind` saw +frequent seg faults, so we use `frame-pointer` and build binaries with frame pointers (negligible +overhead). + +CPU profiles work by installing an `ITIMER_PROF` for the process, which triggers a `SIGPROF` signal +after a given amount of cumulative CPU time across all CPUs. The signal handler will run for one +of the currently executing threads and take a stack trace. Thus, a 19 Hz profile will take 1 stack +trace every 52.6 ms CPU time -- assuming 11 µs for a stack trace, this is 0.02% overhead, but +likely 0.1% in practice (given e.g. context switches). + +Heap profiles work by probabilistically taking a stack trace on allocations, adjusted for the +allocation size. A 1 MB allocation takes about 15 µs in benchmarks, and a stack trace about 1 µs, +so we can estimate that a 2 MB sampling frequency has about 3% allocation overhead -- this is +consistent with benchmarks. This is significantly larger than CPU profiles, but mitigated by the +fact that performance-sensitive code will avoid allocations as far as possible. + +Profile symbolization uses in-memory caches for symbol lookups. These take about 125 MB for +Pageserver. + +## Alternatives Considered + +* eBPF profiles. + * Don't require instrumenting the binary. + * Use less resources. + * Can profile in kernel space too. + * Supported by Grafana. + * Less information about stack frames and spans. + * Limited tooling for local analysis. + * Does not support heap profiles. + * Does not work on macOS. + +* [Polar Signals](https://www.polarsignals.com) instead of Grafana. + * We already use Grafana for everything else. Appears good enough. diff --git a/docs/rfcs/041-sharded-ingest.md b/docs/rfcs/041-sharded-ingest.md new file mode 100644 index 0000000000..47b314891c --- /dev/null +++ b/docs/rfcs/041-sharded-ingest.md @@ -0,0 +1,255 @@ +# +Created on Aug 2024 +Implemented on Jan 2025 + +## Summary + +Data in large tenants is split up between multiple pageservers according to key hashes, as +introduced in the [sharding RFC](031-sharding-static.md) and [shard splitting RFC](032-shard-splitting.md). + +Whereas currently we send all WAL to all pageserver shards, and each shard filters out the data that it needs, +in this RFC we add a mechanism to filter the WAL on the safekeeper, so that each shard receives +only the data it needs. 
+ +This will place some extra CPU load on the safekeepers, in exchange for reducing the network bandwidth +for ingesting WAL back to scaling as O(1) with shard count, rather than O(N_shards). + +## Motivation + +1. Large databases require higher shard counts. Whereas currently we run with up to 8 shards for tenants +with a few TB of storage, the next order of magnitude capacity increase will require tens of shards, such +that sending all WAL to all shards is impractical in terms of bandwidth. +2. For contemporary database sizes (~2TB), the pageserver is the bottleneck for ingest: since each + shard has to decode and process the whole WAL, sharding doesn't fully relieve this bottleneck. To achieve significantly higher ingest speeds, we need to filter the WAL earlier so that each pageserver + only has to process relevant parts. + +## Non Goals (if relevant) + +We do not seek to introduce multiple WALs per timeline, or to share the work of handling a timeline's +WAL across safekeepers (beyond simple 3x replication). This RFC may be thought of as an incremental +move of the ingestion bottleneck up the stack: instead of high write rates bottlenecking on the +pageserver, they will bottleneck on the safekeeper. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +Safekeeper, pageserver. + +There will be no control plane or storage controller coordination needed, as pageservers will directly +indicate their sharding parameters to the safekeeper when subscribing for WAL. + +## Proposed implementation + +Terminology: +- "Data pages" refers to postgres relation blocks, and SLRU blocks. +- "Metadata pages" refers to everything else the pageserver stores, such as relation sizes and + directories of relations. + +### Phase 1: Refactor ingest + +Currently, pageserver ingest code is structured approximately as follows: +1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network + socket +2. `WalIngest::ingest_record` to translate the record into a series of page-level modifications +3. `DatadirModification` accumulates page updates from several `ingest_record` calls, and when + its `commit()` method is called, flushes these into a Timeline's open `InMemoryLayer`. + +This process currently assumes access to a pageserver `Timeline` throughout `ingest_record` and +from `DatadirModification`, which is used to do read-modify-write cycles on metadata pages +such as relation sizes and the master DBDIR page. It also assumes that records are ingested +strictly one after the other: they cannot be ingested in parallel because each record assumes +that earlier records' changes have already been applied to `Timeline`. + +This code will be refactored to disentangle the simple, fast decode of relation page writes +from the more complex logic for updating internal metadata. An intermediate representation +called `InterpretedWalRecords` will be introduced. This is similar to the internal state of +a `DatadirModification`, but does not require access to a Timeline. Instead of storing +metadata updates as materialized writes to pages, it will accumulate these as abstract operations, +for example rather than including a write to a relation size key, this structure will include +an operation that indicates "Update relation _foo_'s size to the max of its current value and +_bar_", such that these may be applied later to a real Timeline. 
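+
+To make the abstract-operation idea concrete, a minimal sketch (the names below
+are illustrative, not the actual pageserver types):
+
+```rust
+use std::collections::HashMap;
+
+type RelTag = u32; // stand-in for the real relation identifier
+
+/// Metadata updates accumulated without access to a Timeline.
+enum MetadataOp {
+    /// "Update relation foo's size to the max of its current value and bar."
+    UpdateRelSizeAtLeast { rel: RelTag, nblocks: u32 },
+}
+
+/// Applied later against real Timeline state; a HashMap stands in here.
+fn apply(rel_sizes: &mut HashMap<RelTag, u32>, op: &MetadataOp) {
+    match op {
+        MetadataOp::UpdateRelSizeAtLeast { rel, nblocks } => {
+            let size = rel_sizes.entry(*rel).or_insert(0);
+            *size = (*size).max(*nblocks);
+        }
+    }
+}
+```
+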
+ +The `DatadirModification` will be aware of the `EphemeralFile` format, so that as it accumulates +simple page writes of relation blocks, it can write them directly into a buffer in the serialized +format. This will avoid the need to later deserialize/reserialize this data when passing the +structure between safekeeper and pageserver. + +The new pipeline will be: +1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network +2. A `InterpretedWalRecords` is generated from the incoming WAL records. This does not + require a reference to a Timeline. +3. The logic that is current spread between `WalIngest` and `DatadirModification` for updating + metadata will be refactored to consume the metadata operations from the `InterpretedWalRecords` + and turn them into literal writes to metadata pages. This part must be done sequentially. +4. The resulting buffer of metadata page writes is combined with the buffer of relation block + writes, and written into the `InMemoryLayer`. + +Implemented in: +1. https://github.com/neondatabase/neon/pull/9472 +2. https://github.com/neondatabase/neon/pull/9504 +3. https://github.com/neondatabase/neon/pull/9524 + +### Phase 2: Decode & filter on safekeeper + +In the previous phase, the ingest code was modified to be able to do most of its work without access to +a Timeline: this first stage of ingest simply converts a series of binary wal records into +a buffer of relation/SLRU page writes, and a buffer of abstract metadata writes. + +The modified ingest code may be transplanted from pageserver to safekeeper (probably via a +shared crate). The safekeeper->pageserver network protocol is modified to: + - in subscription requests, send the `ShardIdentity` from the pageserver to the safekeeper + - in responses, transmit a `InterpretedWalRecords` instead of a raw `WalRecord`. + - use the `ShardIdentity` to filter the `ProcessedWalIngest` to relevant content for + the subscribing shard before transmitting it. + +The overall behavior of the pageserver->safekeeper interaction remains the same, in terms of +consistent LSN feedback, and connection management. Only the payload of the subscriptions +changes, to express an LSN range of WAL as a filtered `ProcessedWalIngest` instead of the +raw data. + +The ingest code on the pageserver can now skip the part where it does the first phase of +processing, as it will receive pre-processed, compressed data off the wire. + +Note that `InterpretedWalRecord` batches multiple `InterpretedWalRecord(s)` in the same network +message. Safekeeper reads WAL in chunks of 16 blocks and then decodes as many Postgres WAL records +as possible. Each Postgres WAL record maps to one `InterpretedWalRecord` for potentially multiple shards. +Hence, the size of the batch is given by the number of Postgres WAL records that fit in 16 blocks. + +The protocol needs to support evolution. Protobuf was chosen here with the view that, in the future, +we may migrate it to GRPC altogether + +Implemented in: +1. https://github.com/neondatabase/neon/pull/9746 +2. https://github.com/neondatabase/neon/pull/9821 + +### Phase 3: Fan out interpreted WAL + +In the previous phase, the initial processing of WAL was moved to the safekeeper, but it is still +done once for each shard: this will generate O(N_shards) CPU work on the safekeeper (especially +when considering converting to Protobuf format and compression). + +To avoid this, we fan-out WAL from one (tenant, timeline, shard) to all other shards subscribed on +the same safekeeper. 
Under normal operation, the WAL will be read from disk, decoded and interpreted
+_only_ once per (safekeeper, timeline).
+
+When the first shard of a sharded timeline subscribes to a given safekeeper, a task is spawned
+for the WAL reader (`InterpretedWalReader`). This task reads WAL, decodes, interprets it and sends
+it to the sender (`InterpretedWalSender`). The sender is a future that is polled from the connection
+task. When further shards subscribe on the safekeeper they will attach themselves to the existing WAL reader.
+There are two cases to consider:
+1. The shard's requested `start_lsn` is ahead of the current position of the WAL reader. In this case, the shard
+will start receiving data when the reader reaches that LSN. The intuition here is that there's little to gain
+by letting shards "front-run" since compute backpressure is based on the laggard LSN.
+2. The shard's requested `start_lsn` is below the current position of the WAL reader. In this case, the WAL reader
+gets reset to this requested position (same intuition). Special care is taken such that advanced shards do not receive
+interpreted WAL records below their current position.
+
+The approach above implies that there is at most one WAL reader per (tenant, timeline) on a given safekeeper at any point in time.
+If this turns out to be operationally problematic, there's a trick we can deploy: `--max-delta-for-fanout` is an optional safekeeper
+argument that controls the max absolute delta between a new shard and the current WAL position of the WAL reader. If the absolute
+delta is above that value, a new reader is spawned. Note that there's currently no concurrency control on the number of WAL readers,
+so it's recommended to use large values to avoid pushing CPU utilisation too high.
+
+Unsharded tenants do not spawn a separate task for the interpreted WAL reader since there's no benefit to it. Instead they poll
+the reader and sender concurrently from the connection task.
+
+Shard splits are interesting here because it is the only case when the same shard might have two subscriptions at the same time.
+This is handled by giving readers a unique identifier. Both shards will receive the same data while respecting their requested start
+position.
+
+Implemented in:
+1. https://github.com/neondatabase/neon/pull/10190
+
+## Deployment
+
+Each phase shall be deployed independently. Special care should be taken around protocol changes.
+
+## Observability Tips
+
+* The safekeeper logs the protocol requested by the pageserver
+along with the pageserver ID, tenant, timeline and shard: `starting streaming from`.
+* There are metrics for the number of WAL readers:
+  * `safekeeper_wal_readers{kind="task", target=~"pageserver.*"}` gives the number of WAL reader tasks for each SK
+  * `safekeeper_wal_readers{kind="future", target=~"pageserver.*"}` gives the number of WAL readers polled inline by each SK
+  * `safekeeper_interpreted_wal_reader_tasks` gives the number of WAL reader tasks per tenant, timeline
+* Interesting log lines for the fan-out reader:
+  * `Spawning interpreted`: the first shard creates the interpreted WAL reader
+  * `Fanning out`: a subsequent shard attaches itself to an interpreted WAL reader
+  * `Aborting interpreted`: all senders have finished and the reader task is being aborted
+
+## Future Optimizations
+
+This section describes some improvement areas which may be revisited in the future.
+
+### Buffering of Interpreted WAL
+
+The interpreted WAL reader may buffer interpreted WAL records in user space to help with serving
+subscriptions that are lagging behind the current position of the reader.
+
+Counterpoints:
+* Safekeepers serve many thousands of timelines and allocating a buffer for each might be wasteful,
+especially given that it would go unused on the happy path.
+* WAL is buffered in the kernel page cache. Usually we'd only pay the CPU cost of decoding and interpreting.
+
+### Tweaking the Pageserver Safekeeper Selection Algorithm
+
+We could make the pageserver aware of which safekeepers already host shards for the timeline along
+with their current WAL positions. The pageserver should then prefer safekeepers that are in the same
+AZ _and_ already have a shard with a position close to the desired start position.
+
+We currently run one safekeeper per AZ, so the point is moot until that changes.
+
+### Pipelining first ingest phase
+
+The first ingest phase is a stateless transformation of a binary WAL record into a pre-processed
+output per shard. To put multiple CPUs to work, we may pipeline this processing up to some defined buffer
+depth.
+
+## Alternatives considered
+
+### Give safekeepers enough state to fully decode WAL
+
+In this RFC, we only do the first phase of ingest on the safekeeper, because this is
+the phase that is stateless. Subsequent changes then happen on the pageserver, with
+access to the `Timeline` state.
+
+We could do more work on the safekeeper if we transmitted metadata state to the safekeeper
+when subscribing to the WAL: for example, by telling the safekeeper all the relation sizes,
+so that it could then generate all the metadata writes for relation sizes.
+
+We avoid doing this for several reasons:
+1. Complexity: it's a more invasive protocol change
+2. Decoupling: having the safekeeper understand the `ProcessedWalIngest` already somewhat
+   infects it with knowledge of the pageserver, but this is mainly an abstract structure
+   that describes postgres writes. However, if we taught the safekeeper about the exact
+   way that the pageserver deals with metadata keys, this would be a much tighter coupling.
+3. Load: once the WAL has been processed to the point that it can be split between shards,
+   it is preferable to share out work on the remaining shards rather than adding extra CPU
+   load to the safekeeper.
+
+### Do pre-processing on the compute instead of the safekeeper
+
+Since our first stage of ingest is stateless, it could be done at any stage in the pipeline,
+all the way up to the compute.
+
+We choose not to do this, because it is useful for the safekeeper to store the raw WAL rather
+than just the preprocessed WAL:
+- The safekeeper still needs to be able to serve raw WAL back to postgres for e.g. physical replication
+- It simplifies our paxos implementation to have the offset in the write log be literally
+  the same as the LSN
+- Raw WAL must have a stable protocol since we might have to re-ingest it at arbitrary points in the future.
+  Storing raw WAL gives us more flexibility to evolve the pageserver/safekeeper protocol.
+ +### Do wal pre-processing on shard 0 or a separate service, send it to other shards from there + +If we wanted to keep the safekeepers as entirely pure stores of raw WAL bytes, then +we could do the initial decode and shard-splitting in some other location: +- Shard 0 could subscribe to the full WAL and then send writes to other shards +- A new intermediate service between the safekeeper and pageserver could do the splitting. + +So why not? +- Extra network hop from shard 0 to the final destination shard +- Clearly there is more infrastructure involved here compared with doing it inline on the safekeeper. +- Safekeepers already have very light CPU load: typical cloud instances shapes with appropriate + disks for the safekeepers effectively have "free" CPU resources. +- Doing extra work on shard 0 would complicate scheduling of shards on pageservers, because + shard 0 would have significantly higher CPU load under write workloads than other shards. diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index c0ec40a6c2..c11a1b6688 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true chrono.workspace = true +jsonwebtoken.workspace = true serde.workspace = true serde_json.workspace = true regex.workspace = true diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index fc3757d981..0c256cae2e 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,18 +1,20 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. use crate::{ privilege::Privilege, + responses::ComputeCtlConfig, spec::{ComputeSpec, ExtVersion, PgIdent}, }; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; /// Request of the /configure API /// /// We now pass only `spec` in the configuration request, but later we can /// extend it and something like `restart: bool` or something else. So put /// `spec` into a struct initially to be more flexible in the future. -#[derive(Deserialize, Debug)] +#[derive(Debug, Deserialize, Serialize)] pub struct ConfigurationRequest { pub spec: ComputeSpec, + pub compute_ctl_config: ComputeCtlConfig, } #[derive(Deserialize, Debug)] diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 79234be720..a6248019d9 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -1,9 +1,9 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. 
-use std::collections::HashSet; use std::fmt::Display; use chrono::{DateTime, Utc}; +use jsonwebtoken::jwk::JwkSet; use serde::{Deserialize, Serialize, Serializer}; use crate::{ @@ -16,6 +16,12 @@ pub struct GenericAPIError { pub error: String, } +#[derive(Debug, Clone, Serialize)] +pub struct ExtensionInstallResponse { + pub extension: PgIdent, + pub version: ExtVersion, +} + /// Response of the /status API #[derive(Serialize, Debug, Deserialize)] #[serde(rename_all = "snake_case")] @@ -29,16 +35,6 @@ pub struct ComputeStatusResponse { pub error: Option, } -#[derive(Deserialize, Serialize)] -#[serde(rename_all = "snake_case")] -pub struct ComputeState { - pub status: ComputeStatus, - /// Timestamp of the last Postgres activity - #[serde(serialize_with = "rfc3339_serialize")] - pub last_active: Option>, - pub error: Option, -} - #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)] #[serde(rename_all = "snake_case")] pub enum ComputeStatus { @@ -79,7 +75,7 @@ impl Display for ComputeStatus { } } -fn rfc3339_serialize(x: &Option>, s: S) -> Result +pub fn rfc3339_serialize(x: &Option>, s: S) -> Result where S: Serializer, { @@ -140,13 +136,27 @@ pub struct CatalogObjects { pub databases: Vec, } +#[derive(Debug, Deserialize, Serialize)] +pub struct ComputeCtlConfig { + pub jwks: JwkSet, +} + +impl Default for ComputeCtlConfig { + fn default() -> Self { + Self { + jwks: JwkSet { + keys: Vec::default(), + }, + } + } +} + /// Response of the `/computes/{compute_id}/spec` control-plane API. -/// This is not actually a compute API response, so consider moving -/// to a different place. #[derive(Deserialize, Debug)] pub struct ControlPlaneSpecResponse { pub spec: Option, pub status: ControlPlaneComputeStatus, + pub compute_ctl_config: ComputeCtlConfig, } #[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)] @@ -163,8 +173,9 @@ pub enum ControlPlaneComputeStatus { #[derive(Clone, Debug, Default, Serialize)] pub struct InstalledExtension { pub extname: String, - pub versions: HashSet, + pub version: String, pub n_databases: u32, // Number of databases using this extension + pub owned_by_superuser: String, } #[derive(Clone, Debug, Default, Serialize)] diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 6d9c353cda..8fffae92fb 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -67,6 +67,15 @@ pub struct ComputeSpec { #[serde(default)] pub disk_quota_bytes: Option, + /// Disables the vm-monitor behavior that resizes LFC on upscale/downscale, instead relying on + /// the initial size of LFC. + /// + /// This is intended for use when the LFC size is being overridden from the default but + /// autoscaling is still enabled, and we don't want the vm-monitor to interfere with the custom + /// LFC sizing. + #[serde(default)] + pub disable_lfc_resizing: Option, + /// Expected cluster state at the end of transition process. pub cluster: Cluster, pub delta_operations: Option>, @@ -129,6 +138,13 @@ pub struct ComputeSpec { /// enough spare connections for reconfiguration process to succeed. #[serde(default = "default_reconfigure_concurrency")] pub reconfigure_concurrency: usize, + + /// If set to true, the compute_ctl will drop all subscriptions before starting the + /// compute. This is needed when we start an endpoint on a branch, so that child + /// would not compete with parent branch subscriptions + /// over the same replication content from publisher. 
+ #[serde(default)] // Default false + pub drop_subscriptions_before_start: bool, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. @@ -188,14 +204,16 @@ impl RemoteExtSpec { // Check if extension is present in public or custom. // If not, then it is not allowed to be used by this compute. - if let Some(public_extensions) = &self.public_extensions { - if !public_extensions.contains(&real_ext_name.to_string()) { - if let Some(custom_extensions) = &self.custom_extensions { - if !custom_extensions.contains(&real_ext_name.to_string()) { - return Err(anyhow::anyhow!("extension {} is not found", real_ext_name)); - } - } - } + if !self + .public_extensions + .as_ref() + .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name)) + && !self + .custom_extensions + .as_ref() + .is_some_and(|exts| exts.iter().any(|e| e == real_ext_name)) + { + return Err(anyhow::anyhow!("extension {} is not found", real_ext_name)); } match self.extension_data.get(real_ext_name) { @@ -234,7 +252,7 @@ pub enum ComputeMode { Replica, } -#[derive(Clone, Debug, Default, Deserialize, Serialize)] +#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] pub struct Cluster { pub cluster_id: Option, pub name: Option, @@ -265,7 +283,7 @@ pub struct DeltaOp { /// Rust representation of Postgres role info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct Role { pub name: PgIdent, pub encrypted_password: Option, @@ -274,7 +292,7 @@ pub struct Role { /// Rust representation of Postgres database info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct Database { pub name: PgIdent, pub owner: PgIdent, @@ -290,7 +308,7 @@ pub struct Database { /// Common type representing both SQL statement params with or without value, /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config /// options like `wal_level = logical`. 
-#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] pub struct GenericOption { pub name: String, pub value: Option, @@ -324,6 +342,102 @@ mod tests { use super::*; use std::fs::File; + #[test] + fn allow_installing_remote_extensions() { + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": null, + "custom_extensions": null, + "library_index": {}, + "extension_data": {}, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": null, + "library_index": {}, + "extension_data": {}, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": [], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect_err("Extension should not be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": [], + "custom_extensions": ["ext"], + "library_index": { + "ext": "ext" + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect("Extension should be found"); + + let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({ + "public_extensions": ["ext"], + "custom_extensions": [], + "library_index": { + "extlib": "ext", + }, + "extension_data": { + "ext": { + "control_data": { + "ext.control": "" + }, + "archive_path": "" + } + }, + })) + .unwrap(); + + rspec + .get_ext("ext", false, "latest", "v17") + .expect("Extension should be found"); + + // test library index for the case when library name + // doesn't match the extension name + rspec + .get_ext("extlib", true, "latest", "v17") + .expect("Library should be found"); + } + #[test] fn parse_spec_file() { let file = File::open("tests/cluster_spec.json").unwrap(); diff --git a/libs/desim/src/time.rs b/libs/desim/src/time.rs index 7bb71db95c..7ce605bda8 100644 --- a/libs/desim/src/time.rs +++ b/libs/desim/src/time.rs @@ -91,7 +91,7 @@ impl Timing { /// Return true if there is a ready event. fn is_event_ready(&self, queue: &mut BinaryHeap) -> bool { - queue.peek().map_or(false, |x| x.time <= self.now()) + queue.peek().is_some_and(|x| x.time <= self.now()) } /// Clear all pending events. 
diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml new file mode 100644 index 0000000000..d72e4bd012 --- /dev/null +++ b/libs/http-utils/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "http-utils" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +backtrace.workspace = true +bytes.workspace = true +inferno.workspace = true +fail.workspace = true +flate2.workspace = true +hyper0.workspace = true +itertools.workspace = true +jemalloc_pprof.workspace = true +once_cell.workspace = true +pprof.workspace = true +regex.workspace = true +routerify.workspace = true +serde.workspace = true +serde_json.workspace = true +serde_path_to_error.workspace = true +thiserror.workspace = true +tracing.workspace = true +tokio.workspace = true +tokio-util.workspace = true +url.workspace = true +uuid.workspace = true + +# to use tokio channels as streams, this is faster to compile than async_stream +# why is it only here? no other crate should use it, streams are rarely needed. +tokio-stream = { version = "0.1.14" } + +metrics.workspace = true +utils.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/utils/src/http/endpoint.rs b/libs/http-utils/src/endpoint.rs similarity index 85% rename from libs/utils/src/http/endpoint.rs rename to libs/http-utils/src/endpoint.rs index d975b63677..be97b341d1 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -1,28 +1,30 @@ -use crate::auth::{AuthError, Claims, SwappableJwtAuth}; -use crate::http::error::{api_error_handler, route_error_handler, ApiError}; -use crate::http::request::{get_query_param, parse_query_param}; +use crate::error::{api_error_handler, route_error_handler, ApiError}; +use crate::pprof; +use crate::request::{get_query_param, parse_query_param}; +use ::pprof::protos::Message as _; +use ::pprof::ProfilerGuardBuilder; use anyhow::{anyhow, Context}; +use bytes::{Bytes, BytesMut}; use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION}; use hyper::http::HeaderValue; use hyper::Method; use hyper::{header::CONTENT_TYPE, Body, Request, Response}; use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; +use regex::Regex; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; +use tokio::sync::{mpsc, Mutex, Notify}; +use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; +use utils::auth::{AuthError, Claims, SwappableJwtAuth}; use std::future::Future; use std::io::Write as _; use std::str::FromStr; use std::time::Duration; -use bytes::{Bytes, BytesMut}; -use pprof::protos::Message as _; -use tokio::sync::{mpsc, Mutex}; -use tokio_stream::wrappers::ReceiverStream; - static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { register_int_counter!( "libmetrics_metric_handler_requests_total", @@ -348,33 +350,53 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A }; let seconds = match parse_query_param(&req, "seconds")? { None => 5, - Some(seconds @ 1..=30) => seconds, - Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))), + Some(seconds @ 1..=60) => seconds, + Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))), }; let frequency_hz = match parse_query_param(&req, "frequency")? { None => 99, Some(1001..) 
=> return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))), Some(frequency) => frequency, }; - - // Only allow one profiler at a time. - static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); - let _lock = PROFILE_LOCK - .try_lock() - .map_err(|_| ApiError::Conflict("profiler already running".into()))?; + let force: bool = parse_query_param(&req, "force")?.unwrap_or_default(); // Take the profile. - let report = tokio::task::spawn_blocking(move || { - let guard = pprof::ProfilerGuardBuilder::default() + static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); + static PROFILE_CANCEL: Lazy = Lazy::new(Notify::new); + + let report = { + // Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a + // Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting + // for a lock(), to avoid races where the notify isn't currently awaited. + let _lock = loop { + match PROFILE_LOCK.try_lock() { + Ok(lock) => break lock, + Err(_) if force => PROFILE_CANCEL.notify_waiters(), + Err(_) => { + return Err(ApiError::Conflict( + "profiler already running (use ?force=true to cancel it)".into(), + )) + } + } + tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait + }; + + let guard = ProfilerGuardBuilder::default() .frequency(frequency_hz) .blocklist(&["libc", "libgcc", "pthread", "vdso"]) - .build()?; - std::thread::sleep(Duration::from_secs(seconds)); - guard.report().build() - }) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?; + .build() + .map_err(|err| ApiError::InternalServerError(err.into()))?; + + tokio::select! { + _ = tokio::time::sleep(Duration::from_secs(seconds)) => {}, + _ = PROFILE_CANCEL.notified() => {}, + }; + + guard + .report() + .build() + .map_err(|err| ApiError::InternalServerError(err.into()))? + }; // Return the report in the requested format. match format { @@ -415,6 +437,7 @@ pub async fn profile_heap_handler(req: Request) -> Result, enum Format { Jemalloc, Pprof, + Svg, } // Parameters. @@ -422,9 +445,24 @@ pub async fn profile_heap_handler(req: Request) -> Result, None => Format::Pprof, Some("jemalloc") => Format::Jemalloc, Some("pprof") => Format::Pprof, + Some("svg") => Format::Svg, Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), }; + // Functions and mappings to strip when symbolizing pprof profiles. If true, + // also remove child frames. + static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { + vec![ + (Regex::new("^__rust").unwrap(), false), + (Regex::new("^_start$").unwrap(), false), + (Regex::new("^irallocx_prof").unwrap(), true), + (Regex::new("^prof_alloc_prep").unwrap(), true), + (Regex::new("^std::rt::lang_start").unwrap(), false), + (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), + ] + }); + const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"]; + // Obtain profiler handle. let mut prof_ctl = jemalloc_pprof::PROF_CTL .as_ref() @@ -457,10 +495,19 @@ pub async fn profile_heap_handler(req: Request) -> Result, } Format::Pprof => { - let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof()) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(ApiError::InternalServerError)?; + let data = tokio::task::spawn_blocking(move || { + let bytes = prof_ctl.dump_pprof()?; + // Symbolize the profile. 
+ // TODO: consider moving this upstream to jemalloc_pprof and avoiding the + // serialization roundtrip. + let profile = pprof::decode(&bytes)?; + let profile = pprof::symbolize(profile)?; + let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); + pprof::encode(&profile) + }) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; Response::builder() .status(200) .header(CONTENT_TYPE, "application/octet-stream") @@ -468,6 +515,27 @@ pub async fn profile_heap_handler(req: Request) -> Result, .body(Body::from(data)) .map_err(|err| ApiError::InternalServerError(err.into())) } + + Format::Svg => { + let body = tokio::task::spawn_blocking(move || { + let bytes = prof_ctl.dump_pprof()?; + let profile = pprof::decode(&bytes)?; + let profile = pprof::symbolize(profile)?; + let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS); + let mut opts = inferno::flamegraph::Options::default(); + opts.title = "Heap inuse".to_string(); + opts.count_name = "bytes".to_string(); + pprof::flamegraph(profile, &mut opts) + }) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; + Response::builder() + .status(200) + .header(CONTENT_TYPE, "image/svg+xml") + .body(Body::from(body)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } } } @@ -650,9 +718,9 @@ pub fn check_permission_with( #[cfg(test)] mod tests { use super::*; - use futures::future::poll_fn; use hyper::service::Service; use routerify::RequestServiceBuilder; + use std::future::poll_fn; use std::net::{IpAddr, SocketAddr}; #[tokio::test] diff --git a/libs/utils/src/http/error.rs b/libs/http-utils/src/error.rs similarity index 93% rename from libs/utils/src/http/error.rs rename to libs/http-utils/src/error.rs index 02fc9e3b99..746305caec 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/http-utils/src/error.rs @@ -5,6 +5,8 @@ use std::error::Error as StdError; use thiserror::Error; use tracing::{error, info, warn}; +use utils::auth::AuthError; + #[derive(Debug, Error)] pub enum ApiError { #[error("Bad request: {0:#?}")] @@ -96,6 +98,15 @@ impl ApiError { } } +impl From for ApiError { + fn from(_value: AuthError) -> Self { + // Don't pass on the value of the AuthError as a precautionary measure. + // Being intentionally vague in public error communication hurts debugability + // but it is more secure. + ApiError::Forbidden("JWT authentication error".to_string()) + } +} + #[derive(Serialize, Deserialize)] pub struct HttpErrorBody { pub msg: String, diff --git a/libs/http-utils/src/failpoints.rs b/libs/http-utils/src/failpoints.rs new file mode 100644 index 0000000000..8a1e0c8cf0 --- /dev/null +++ b/libs/http-utils/src/failpoints.rs @@ -0,0 +1,50 @@ +use crate::error::ApiError; +use crate::json::{json_request, json_response}; + +use hyper::{Body, Request, Response, StatusCode}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; + +use utils::failpoint_support::apply_failpoint; + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} + +/// Configure failpoints through http. 
+pub async fn failpoints_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Cannot manage failpoints because neon was compiled without failpoints support" + ))); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + tracing::info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = apply_failpoint(&fp.name, &fp.actions); + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} diff --git a/libs/utils/src/http/json.rs b/libs/http-utils/src/json.rs similarity index 100% rename from libs/utils/src/http/json.rs rename to libs/http-utils/src/json.rs diff --git a/libs/utils/src/http/mod.rs b/libs/http-utils/src/lib.rs similarity index 82% rename from libs/utils/src/http/mod.rs rename to libs/http-utils/src/lib.rs index 74ed6bb5b2..ae6a27aaa8 100644 --- a/libs/utils/src/http/mod.rs +++ b/libs/http-utils/src/lib.rs @@ -1,8 +1,12 @@ pub mod endpoint; pub mod error; +pub mod failpoints; pub mod json; +pub mod pprof; pub mod request; +extern crate hyper0 as hyper; + /// Current fast way to apply simple http routing in various Neon binaries. /// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed. pub use routerify::{ext::RequestExt, RouterBuilder, RouterService}; diff --git a/libs/http-utils/src/pprof.rs b/libs/http-utils/src/pprof.rs new file mode 100644 index 0000000000..fe1cc10838 --- /dev/null +++ b/libs/http-utils/src/pprof.rs @@ -0,0 +1,238 @@ +use anyhow::bail; +use flate2::write::{GzDecoder, GzEncoder}; +use flate2::Compression; +use itertools::Itertools as _; +use pprof::protos::{Function, Line, Location, Message as _, Profile}; +use regex::Regex; + +use std::borrow::Cow; +use std::collections::{HashMap, HashSet}; +use std::ffi::c_void; +use std::io::Write as _; + +/// Decodes a gzip-compressed Protobuf-encoded pprof profile. +pub fn decode(bytes: &[u8]) -> anyhow::Result { + let mut gz = GzDecoder::new(Vec::new()); + gz.write_all(bytes)?; + Ok(Profile::parse_from_bytes(&gz.finish()?)?) +} + +/// Encodes a pprof profile as gzip-compressed Protobuf. +pub fn encode(profile: &Profile) -> anyhow::Result> { + let mut gz = GzEncoder::new(Vec::new(), Compression::default()); + profile.write_to_writer(&mut gz)?; + Ok(gz.finish()?) +} + +/// Symbolizes a pprof profile using the current binary. +pub fn symbolize(mut profile: Profile) -> anyhow::Result { + if !profile.function.is_empty() { + return Ok(profile); // already symbolized + } + + // Collect function names. + let mut functions: HashMap = HashMap::new(); + let mut strings: HashMap = profile + .string_table + .into_iter() + .enumerate() + .map(|(i, s)| (s, i as i64)) + .collect(); + + // Helper to look up or register a string. + let mut string_id = |s: &str| -> i64 { + // Don't use .entry() to avoid unnecessary allocations. + if let Some(id) = strings.get(s) { + return *id; + } + let id = strings.len() as i64; + strings.insert(s.to_string(), id); + id + }; + + for loc in &mut profile.location { + if !loc.line.is_empty() { + continue; + } + + // Resolve the line and function for each location. 
+ backtrace::resolve(loc.address as *mut c_void, |symbol| { + let Some(symbol_name) = symbol.name() else { + return; + }; + + let function_name = format!("{symbol_name:#}"); + let functions_len = functions.len(); + let function_id = functions + .entry(function_name) + .or_insert_with_key(|function_name| { + let function_id = functions_len as u64 + 1; + let system_name = String::from_utf8_lossy(symbol_name.as_bytes()); + let filename = symbol + .filename() + .map(|path| path.to_string_lossy()) + .unwrap_or(Cow::Borrowed("")); + Function { + id: function_id, + name: string_id(function_name), + system_name: string_id(&system_name), + filename: string_id(&filename), + ..Default::default() + } + }) + .id; + loc.line.push(Line { + function_id, + line: symbol.lineno().unwrap_or(0) as i64, + ..Default::default() + }); + }); + } + + // Store the resolved functions, and mark the mapping as resolved. + profile.function = functions.into_values().sorted_by_key(|f| f.id).collect(); + profile.string_table = strings + .into_iter() + .sorted_by_key(|(_, i)| *i) + .map(|(s, _)| s) + .collect(); + + for mapping in &mut profile.mapping { + mapping.has_functions = true; + mapping.has_filenames = true; + } + + Ok(profile) +} + +/// Strips locations (stack frames) matching the given mappings (substring) or function names +/// (regex). The function bool specifies whether child frames should be stripped as well. +/// +/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all +/// string references. +pub fn strip_locations( + mut profile: Profile, + mappings: &[&str], + functions: &[(Regex, bool)], +) -> Profile { + // Strip mappings. + let mut strip_mappings: HashSet = HashSet::new(); + + profile.mapping.retain(|mapping| { + let Some(name) = profile.string_table.get(mapping.filename as usize) else { + return true; + }; + if mappings.iter().any(|substr| name.contains(substr)) { + strip_mappings.insert(mapping.id); + return false; + } + true + }); + + // Strip functions. + let mut strip_functions: HashMap = HashMap::new(); + + profile.function.retain(|function| { + let Some(name) = profile.string_table.get(function.name as usize) else { + return true; + }; + for (regex, strip_children) in functions { + if regex.is_match(name) { + strip_functions.insert(function.id, *strip_children); + return false; + } + } + true + }); + + // Strip locations. The bool specifies whether child frames should be stripped too. + let mut strip_locations: HashMap = HashMap::new(); + + profile.location.retain(|location| { + for line in &location.line { + if let Some(strip_children) = strip_functions.get(&line.function_id) { + strip_locations.insert(location.id, *strip_children); + return false; + } + } + if strip_mappings.contains(&location.mapping_id) { + strip_locations.insert(location.id, false); + return false; + } + true + }); + + // Strip sample locations. + for sample in &mut profile.sample { + // First, find the uppermost function with child removal and truncate the stack. + if let Some(truncate) = sample + .location_id + .iter() + .rposition(|id| strip_locations.get(id) == Some(&true)) + { + sample.location_id.drain(..=truncate); + } + // Next, strip any individual frames without child removal. + sample + .location_id + .retain(|id| !strip_locations.contains_key(id)); + } + + profile +} + +/// Generates an SVG flamegraph from a symbolized pprof profile. 
+pub fn flamegraph( + profile: Profile, + opts: &mut inferno::flamegraph::Options, +) -> anyhow::Result> { + if profile.mapping.iter().any(|m| !m.has_functions) { + bail!("profile not symbolized"); + } + + // Index locations, functions, and strings. + let locations: HashMap = + profile.location.into_iter().map(|l| (l.id, l)).collect(); + let functions: HashMap = + profile.function.into_iter().map(|f| (f.id, f)).collect(); + let strings = profile.string_table; + + // Resolve stacks as function names, and sum sample values per stack. Also reverse the stack, + // since inferno expects it bottom-up. + let mut stacks: HashMap, i64> = HashMap::new(); + for sample in profile.sample { + let mut stack = Vec::with_capacity(sample.location_id.len()); + for location in sample.location_id.into_iter().rev() { + let Some(location) = locations.get(&location) else { + bail!("missing location {location}"); + }; + for line in location.line.iter().rev() { + let Some(function) = functions.get(&line.function_id) else { + bail!("missing function {}", line.function_id); + }; + let Some(name) = strings.get(function.name as usize) else { + bail!("missing string {}", function.name); + }; + stack.push(name.as_str()); + } + } + let Some(&value) = sample.value.first() else { + bail!("missing value"); + }; + *stacks.entry(stack).or_default() += value; + } + + // Construct stack lines for inferno. + let lines = stacks + .into_iter() + .map(|(stack, value)| (stack.into_iter().join(";"), value)) + .map(|(stack, value)| format!("{stack} {value}")) + .sorted() + .collect_vec(); + + // Construct the flamegraph. + let mut bytes = Vec::new(); + let lines = lines.iter().map(|line| line.as_str()); + inferno::flamegraph::from_lines(opts, lines, &mut bytes)?; + Ok(bytes) +} diff --git a/libs/utils/src/http/request.rs b/libs/http-utils/src/request.rs similarity index 100% rename from libs/utils/src/http/request.rs rename to libs/http-utils/src/request.rs diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 79da05da6c..87dfdfb5ec 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "pageserver_api" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 09cfbc55fd..039cc1319e 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -9,19 +9,18 @@ pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +use std::collections::HashMap; +use std::num::{NonZeroU64, NonZeroUsize}; +use std::str::FromStr; +use std::time::Duration; + use postgres_backend::AuthType; use remote_storage::RemoteStorageConfig; use serde_with::serde_as; -use std::{ - collections::HashMap, - num::{NonZeroU64, NonZeroUsize}, - str::FromStr, - time::Duration, -}; -use utils::{logging::LogFormat, postgres_client::PostgresClientProtocol}; +use utils::logging::LogFormat; +use utils::postgres_client::PostgresClientProtocol; -use crate::models::ImageCompressionAlgorithm; -use crate::models::LsnLease; +use crate::models::{ImageCompressionAlgorithm, LsnLease}; // Certain metadata (e.g. externally-addressable name, AZ) is delivered // as a separate structure. 
This information is not neeed by the pageserver @@ -120,6 +119,10 @@ pub struct ConfigToml { pub no_sync: Option, pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: PageServicePipeliningConfig, + pub get_vectored_concurrent_io: GetVectoredConcurrentIo, + pub enable_read_path_debugging: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub validate_wal_contiguity: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -158,6 +161,25 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy { Tasks, } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case")] +#[serde(deny_unknown_fields)] +pub enum GetVectoredConcurrentIo { + /// The read path is fully sequential: layers are visited + /// one after the other and IOs are issued and waited upon + /// from the same task that traverses the layers. + Sequential, + /// The read path still traverses layers sequentially, and + /// index blocks will be read into the PS PageCache from + /// that task, with waiting. + /// But data IOs are dispatched and waited upon from a sidecar + /// task so that the traversing task can continue to traverse + /// layers while the IOs are in flight. + /// If the PS PageCache miss rate is low, this improves + /// throughput dramatically. + SidecarTask, +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -234,9 +256,31 @@ pub struct TenantConfigToml { // Duration::ZERO means automatic compaction is disabled. #[serde(with = "humantime_serde")] pub compaction_period: Duration, - // Level0 delta layer threshold for compaction. + /// Level0 delta layer threshold for compaction. pub compaction_threshold: usize, + /// Controls the amount of L0 included in a single compaction iteration. + /// The unit is `checkpoint_distance`, i.e., a size. + /// We add L0s to the set of layers to compact until their cumulative + /// size exceeds `compaction_upper_limit * checkpoint_distance`. + pub compaction_upper_limit: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, + /// If true, compact down L0 across all tenant timelines before doing regular compaction. + pub compaction_l0_first: bool, + /// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only + /// has an effect if `compaction_l0_first` is `true`. + pub compaction_l0_semaphore: bool, + /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, + /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer + /// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification + /// blowing up. Should be >compaction_threshold. 0 to disable. Disabled by default. + pub l0_flush_delay_threshold: Option, + /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold + /// to avoid deadlock. 0 to disable. Disabled by default. + pub l0_flush_stall_threshold: Option, + /// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next + /// layer. This is a temporary backpressure mechanism which should be removed once + /// l0_flush_{delay,stall}_threshold is fully enabled. + pub l0_flush_wait_upload: bool, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. 
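The delay/stall thresholds above amount to a small decision function on the current L0 layer count. A sketch of that reading, with hypothetical names; the real logic lives in the pageserver flush path and shapes the delay rather than just flagging it:

    /// Illustration of the L0 flush backpressure described in the field docs above.
    #[derive(Debug, PartialEq)]
    enum FlushBackpressure {
        None,
        Delay, // flushes take ~2x as long and ephemeral layer rolls wait for flushes
        Stall, // flushes are held back until compaction catches up
    }

    fn flush_backpressure(
        l0_count: usize,
        l0_flush_delay_threshold: Option<usize>,
        l0_flush_stall_threshold: Option<usize>,
    ) -> FlushBackpressure {
        // A threshold of 0 (or an unset one) disables the mechanism.
        let hit = |threshold: Option<usize>| {
            threshold.filter(|&t| t > 0).is_some_and(|t| l0_count >= t)
        };
        if hit(l0_flush_stall_threshold) {
            FlushBackpressure::Stall
        } else if hit(l0_flush_delay_threshold) {
            FlushBackpressure::Delay
        } else {
            FlushBackpressure::None
        }
    }

    fn main() {
        // Hypothetical thresholds, chosen > compaction_threshold as the docs require.
        assert_eq!(flush_backpressure(12, Some(30), Some(50)), FlushBackpressure::None);
        assert_eq!(flush_backpressure(35, Some(30), Some(50)), FlushBackpressure::Delay);
        assert_eq!(flush_backpressure(60, Some(30), Some(50)), FlushBackpressure::Stall);
        assert_eq!(flush_backpressure(60, Some(0), None), FlushBackpressure::None); // disabled
    }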
@@ -286,6 +330,10 @@ pub struct TenantConfigToml { // Expressed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, + // How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction. + // Set to 0 to disable preemption. + pub image_creation_preempt_threshold: usize, + /// The length for an explicit LSN lease request. /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. #[serde(with = "humantime_serde")] @@ -301,13 +349,27 @@ pub struct TenantConfigToml { pub timeline_offloading: bool, pub wal_receiver_protocol_override: Option, + + /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into + /// `index_part.json`, and it cannot be reversed. + pub rel_size_v2_enabled: bool, + + // gc-compaction related configs + /// Enable automatic gc-compaction trigger on this tenant. + pub gc_compaction_enabled: bool, + /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold, + /// gc-compaction will be triggered. + pub gc_compaction_initial_threshold_kb: u64, + /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN) + /// is above this ratio, gc-compaction will be triggered. + pub gc_compaction_ratio_percent: u64, } pub mod defaults { - use crate::models::ImageCompressionAlgorithm; - pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; + use crate::models::ImageCompressionAlgorithm; + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; @@ -435,7 +497,7 @@ impl Default for ConfigToml { NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), image_compression: (DEFAULT_IMAGE_COMPRESSION), - timeline_offloading: false, + timeline_offloading: true, ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, @@ -450,6 +512,17 @@ impl Default for ConfigToml { execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, }) }, + get_vectored_concurrent_io: if !cfg!(test) { + GetVectoredConcurrentIo::Sequential + } else { + GetVectoredConcurrentIo::SidecarTask + }, + enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") { + Some(true) + } else { + None + }, + validate_wal_contiguity: None, } } } @@ -472,9 +545,20 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + + // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's + // 3/4*16=12 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could + // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So + // with this config, we can get a maximum peak compaction usage of 12 GB. + pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20; + pub const DEFAULT_COMPACTION_L0_FIRST: bool = false; + pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true; + pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; + pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true; + pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
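The two gc-compaction knobs combine roughly as follows. This is only a reading of the doc comments above, with the simplifying assumption that the size below the gc-horizon includes the size below the L2 LSN; the real trigger lives in the pageserver's compaction scheduler:

    /// Hypothetical sketch of the gc-compaction trigger; all sizes in KB.
    fn should_trigger_gc_compaction(
        size_below_gc_horizon_kb: u64, // total size of layers below the gc-horizon
        size_below_l2_kb: u64,         // size of data below the L2 LSN
        initial_threshold_kb: u64,     // gc_compaction_initial_threshold_kb
        ratio_percent: u64,            // gc_compaction_ratio_percent
    ) -> bool {
        // Gate 1: enough data has accumulated below the gc-horizon at all.
        if size_below_gc_horizon_kb < initial_threshold_kb {
            return false;
        }
        // Gate 2: the layers between the L2 LSN and the gc-horizon are large relative
        // to the data below the L2 LSN (expressed in percent).
        let between_l2_and_horizon_kb = size_below_gc_horizon_kb.saturating_sub(size_below_l2_kb);
        size_below_l2_kb > 0 && between_l2_and_horizon_kb * 100 >= ratio_percent * size_below_l2_kb
    }

    fn main() {
        // With the defaults above (5 GB initial threshold, 100% ratio):
        assert!(should_trigger_gc_compaction(12 * 1024 * 1024, 5 * 1024 * 1024, 5 * 1024 * 1024, 100));
        assert!(!should_trigger_gc_compaction(1024 * 1024, 512 * 1024, 5 * 1024 * 1024, 100));
    }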
@@ -483,6 +567,10 @@ pub mod tenant_conf_defaults { // Relevant: https://github.com/neondatabase/neon/issues/3394 pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; + // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image + // layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we + // want to enable this feature. + pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; @@ -494,6 +582,9 @@ pub mod tenant_conf_defaults { // By default ingest enough WAL for two new L0 layers before checking if new image // image layers should be created. pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; + pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false; + pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB + pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; } impl Default for TenantConfigToml { @@ -507,9 +598,15 @@ impl Default for TenantConfigToml { compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period"), compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + compaction_upper_limit: DEFAULT_COMPACTION_UPPER_LIMIT, compaction_algorithm: crate::models::CompactionAlgorithmSettings { kind: DEFAULT_COMPACTION_ALGORITHM, }, + compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST, + compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE, + l0_flush_delay_threshold: None, + l0_flush_stall_threshold: None, + l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), @@ -534,10 +631,15 @@ impl Default for TenantConfigToml { lazy_slru_download: false, timeline_get_throttle: crate::models::ThrottleConfig::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD, lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, - timeline_offloading: false, + timeline_offloading: true, wal_receiver_protocol_override: None, + rel_size_v2_enabled: false, + gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, + gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, + gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, } } } diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 6839ef69f5..2cfe1a85f9 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -9,11 +9,8 @@ use std::time::{Duration, Instant}; use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId}; -use crate::models::PageserverUtilization; -use crate::{ - models::{ShardParameters, TenantConfig}, - shard::{ShardStripeSize, TenantShardId}, -}; +use crate::models::{PageserverUtilization, ShardParameters, TenantConfig}; +use crate::shard::{ShardStripeSize, TenantShardId}; #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] @@ -57,6 +54,7 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, 
pub availability_zone_id: AvailabilityZone, } @@ -75,7 +73,7 @@ pub struct TenantPolicyRequest { pub scheduling: Option, } -#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)] pub struct AvailabilityZone(pub String); impl Display for AvailabilityZone { @@ -87,7 +85,7 @@ impl Display for AvailabilityZone { #[derive(Serialize, Deserialize)] pub struct ShardsPreferredAzsRequest { #[serde(flatten)] - pub preferred_az_ids: HashMap, + pub preferred_az_ids: HashMap>, } #[derive(Serialize, Deserialize)] @@ -105,6 +103,7 @@ pub struct TenantLocateResponseShard { pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, } #[derive(Serialize, Deserialize)] @@ -144,8 +143,11 @@ pub struct NodeDescribeResponse { pub availability: NodeAvailabilityWrapper, pub scheduling: NodeSchedulingPolicy, + pub availability_zone_id: String, + pub listen_http_addr: String, pub listen_http_port: u16, + pub listen_https_port: Option, pub listen_pg_addr: String, pub listen_pg_port: u16, @@ -179,8 +181,19 @@ pub struct TenantDescribeResponseShard { /// specifies some constraints, e.g. asking it to get off particular node(s) #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateRequest { - pub tenant_shard_id: TenantShardId, pub node_id: NodeId, + #[serde(default)] + pub migration_config: Option, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct MigrationConfig { + #[serde(default)] + #[serde(with = "humantime_serde")] + pub secondary_warmup_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub secondary_download_request_timeout: Option, } #[derive(Serialize, Clone, Debug)] @@ -320,6 +333,42 @@ impl From for String { } } +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] +pub enum SkSchedulingPolicy { + Active, + Pause, + Decomissioned, +} + +impl FromStr for SkSchedulingPolicy { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + Ok(match s { + "active" => Self::Active, + "pause" => Self::Pause, + "decomissioned" => Self::Decomissioned, + _ => { + return Err(anyhow::anyhow!( + "Unknown scheduling policy '{s}', try active,pause,decomissioned" + )); + } + }) + } +} + +impl From for String { + fn from(value: SkSchedulingPolicy) -> String { + use SkSchedulingPolicy::*; + match value { + Active => "active", + Pause => "pause", + Decomissioned => "decomissioned", + } + .to_string() + } +} + /// Controls how tenant shards are mapped to locations on pageservers, e.g. whether /// to create secondary locations. #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] @@ -336,6 +385,16 @@ pub enum PlacementPolicy { Detached, } +impl PlacementPolicy { + pub fn want_secondaries(&self) -> usize { + match self { + PlacementPolicy::Attached(secondary_count) => *secondary_count, + PlacementPolicy::Secondary => 1, + PlacementPolicy::Detached => 0, + } + } +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateResponse {} @@ -372,11 +431,33 @@ pub struct MetadataHealthListOutdatedResponse { pub health_records: Vec, } +/// Publicly exposed safekeeper description +#[derive(Serialize, Deserialize, Clone)] +pub struct SafekeeperDescribeResponse { + pub id: NodeId, + pub region_id: String, + /// 1 is special, it means just created (not currently posted to storcon). + /// Zero or negative is not really expected. + /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag. 
+ pub version: i64, + pub host: String, + pub port: i32, + pub http_port: i32, + pub availability_zone_id: String, + pub scheduling_policy: SkSchedulingPolicy, +} + +#[derive(Serialize, Deserialize, Clone)] +pub struct SafekeeperSchedulingPolicyRequest { + pub scheduling_policy: SkSchedulingPolicy, +} + #[cfg(test)] mod test { - use super::*; use serde_json; + use super::*; + /// Check stability of PlacementPolicy's serialization #[test] fn placement_policy_encoding() -> anyhow::Result<()> { diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 37dff6fe46..8836e7ec87 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,10 +1,13 @@ -use anyhow::{bail, Result}; -use byteorder::{ByteOrder, BE}; +use std::fmt; +use std::ops::Range; + +use anyhow::{Result, bail}; +use byteorder::{BE, ByteOrder}; +use bytes::Bytes; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::Oid; -use postgres_ffi::RepOriginId; +use postgres_ffi::{Oid, RepOriginId}; use serde::{Deserialize, Serialize}; -use std::{fmt, ops::Range}; +use utils::const_assert; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -24,7 +27,9 @@ pub struct Key { /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as /// a struct of fields. -#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] +#[derive( + Clone, Copy, Default, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug, +)] pub struct CompactKey(i128); /// The storage key size. @@ -47,6 +52,64 @@ pub const AUX_KEY_PREFIX: u8 = 0x62; /// The key prefix of ReplOrigin keys. pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63; +/// The key prefix of db directory keys. +pub const DB_DIR_KEY_PREFIX: u8 = 0x64; + +/// The key prefix of rel directory keys. +pub const REL_DIR_KEY_PREFIX: u8 = 0x65; + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub enum RelDirExists { + Exists, + Removed, +} + +#[derive(Debug)] +pub struct DecodeError; + +impl fmt::Display for DecodeError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "invalid marker") + } +} + +impl std::error::Error for DecodeError {} + +impl RelDirExists { + /// The value of the rel directory keys that indicates the existence of a relation. + const REL_EXISTS_MARKER: Bytes = Bytes::from_static(b"r"); + + pub fn encode(&self) -> Bytes { + match self { + Self::Exists => Self::REL_EXISTS_MARKER.clone(), + Self::Removed => SPARSE_TOMBSTONE_MARKER.clone(), + } + } + + pub fn decode_option(data: Option>) -> Result { + match data { + Some(marker) if marker.as_ref() == Self::REL_EXISTS_MARKER => Ok(Self::Exists), + // Any other marker is invalid + Some(_) => Err(DecodeError), + None => Ok(Self::Removed), + } + } + + pub fn decode(data: impl AsRef<[u8]>) -> Result { + let data = data.as_ref(); + if data == Self::REL_EXISTS_MARKER { + Ok(Self::Exists) + } else if data == SPARSE_TOMBSTONE_MARKER { + Ok(Self::Removed) + } else { + Err(DecodeError) + } + } +} + +/// A tombstone in the sparse keyspace, which is an empty buffer. +pub const SPARSE_TOMBSTONE_MARKER: Bytes = Bytes::from_static(b""); + /// Check if the key falls in the range of metadata keys. 
pub const fn is_metadata_key_slice(key: &[u8]) -> bool { key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX @@ -108,6 +171,24 @@ impl Key { } } + pub fn rel_dir_sparse_key_range() -> Range { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REL_DIR_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + /// This function checks more extensively what keys we can take on the write path. /// If a key beginning with 00 does not have a global/default tablespace OID, it /// will be rejected on the write path. @@ -438,6 +519,36 @@ pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { } } +#[inline(always)] +pub fn rel_tag_sparse_key(spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> Key { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: relnode, + field5: forknum, + field6: 1, + } +} + +pub fn rel_tag_sparse_key_range(spcnode: Oid, dbnode: Oid) -> Range { + Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REL_DIR_KEY_PREFIX, + field2: spcnode, + field3: dbnode, + field4: u32::MAX, + field5: u8::MAX, + field6: u32::MAX, + } // it's fine to exclude the last key b/c we only use field6 == 1 +} + #[inline(always)] pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { Key { @@ -565,6 +676,10 @@ impl Key { && self.field5 == 0 && self.field6 == u32::MAX } + + pub fn is_slru_dir_key(&self) -> bool { + slru_dir_kind(self).is_some() + } } #[inline(always)] @@ -702,7 +817,7 @@ pub fn repl_origin_key_range() -> Range { /// Non inherited range for vectored get. pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. -pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); +pub const SPARSE_RANGE: Range = Key::metadata_key_range(); impl Key { // AUX_FILES currently stores only data for logical replication (slots etc), and @@ -710,7 +825,42 @@ impl Key { // switch (and generally it likely should be optional), so ignore these. #[inline(always)] pub fn is_inherited_key(self) -> bool { - !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self) + if self.is_sparse() { + self.is_inherited_sparse_key() + } else { + !NON_INHERITED_RANGE.contains(&self) + } + } + + #[inline(always)] + pub fn is_sparse(self) -> bool { + self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX + } + + /// Check if the key belongs to the inherited keyspace. 
+ fn is_inherited_sparse_key(self) -> bool { + debug_assert!(self.is_sparse()); + self.field1 == RELATION_SIZE_PREFIX + } + + pub const fn sparse_non_inherited_keyspace() -> Range { + // The two keys are adjacent; if we will have non-adjacent keys in the future, we should return a keyspace + const_assert!(AUX_KEY_PREFIX + 1 == REPL_ORIGIN_KEY_PREFIX); + Key { + field1: AUX_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REPL_ORIGIN_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } } #[inline(always)] @@ -805,25 +955,22 @@ impl std::str::FromStr for Key { mod tests { use std::str::FromStr; - use crate::key::is_metadata_key_slice; - use crate::key::Key; - - use rand::Rng; - use rand::SeedableRng; + use rand::{Rng, SeedableRng}; use super::AUX_KEY_PREFIX; + use crate::key::{Key, is_metadata_key_slice}; #[test] fn display_fromstr_bijection() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); let key = Key { - field1: rng.gen(), - field2: rng.gen(), - field3: rng.gen(), - field4: rng.gen(), - field5: rng.gen(), - field6: rng.gen(), + field1: rng.r#gen(), + field2: rng.r#gen(), + field3: rng.r#gen(), + field4: rng.r#gen(), + field5: rng.r#gen(), + field6: rng.r#gen(), }; assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index c55b9e9484..e505f23e49 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,11 +1,10 @@ -use postgres_ffi::BLCKSZ; use std::ops::Range; -use crate::{ - key::Key, - shard::{ShardCount, ShardIdentity}, -}; use itertools::Itertools; +use postgres_ffi::BLCKSZ; + +use crate::key::Key; +use crate::shard::{ShardCount, ShardIdentity}; /// /// Represents a set of Keys, in a compact form. @@ -609,15 +608,13 @@ pub fn singleton_range(key: Key) -> Range { #[cfg(test)] mod tests { + use std::fmt::Write; + use rand::{RngCore, SeedableRng}; - use crate::{ - models::ShardParameters, - shard::{ShardCount, ShardNumber}, - }; - use super::*; - use std::fmt::Write; + use crate::models::ShardParameters; + use crate::shard::{ShardCount, ShardNumber}; // Helper function to create a key range.
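A test-style sketch of how the new sparse rel-directory markers and keys round-trip, assuming the items above are reachable via `pageserver_api::key` (the test name is illustrative):

    #[test]
    fn rel_dir_marker_roundtrip() {
        use pageserver_api::key::{rel_tag_sparse_key, rel_tag_sparse_key_range, RelDirExists};

        // The presence marker is the single byte "r"; the tombstone is an empty value.
        assert_eq!(&RelDirExists::Exists.encode()[..], b"r");
        assert!(RelDirExists::Removed.encode().is_empty());
        assert_eq!(RelDirExists::decode(b"r").unwrap(), RelDirExists::Exists);
        assert_eq!(RelDirExists::decode(b"").unwrap(), RelDirExists::Removed);
        assert!(RelDirExists::decode(b"bogus").is_err());

        // Every per-relation key falls inside the per-database sparse range.
        let key = rel_tag_sparse_key(1663, 16384, 16385, 0);
        assert!(rel_tag_sparse_key_range(1663, 16384).contains(&key));
    }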
// diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 5488f7b2c2..ea565e7769 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -2,37 +2,30 @@ pub mod detach_ancestor; pub mod partitioning; pub mod utilization; -#[cfg(feature = "testing")] -use camino::Utf8PathBuf; -pub use utilization::PageserverUtilization; - -use std::{ - collections::HashMap, - fmt::Display, - io::{BufRead, Read}, - num::{NonZeroU32, NonZeroU64, NonZeroUsize}, - str::FromStr, - time::{Duration, SystemTime}, -}; +use core::ops::Range; +use std::collections::HashMap; +use std::fmt::Display; +use std::io::{BufRead, Read}; +use std::num::{NonZeroU32, NonZeroU64, NonZeroUsize}; +use std::str::FromStr; +use std::time::{Duration, SystemTime}; use byteorder::{BigEndian, ReadBytesExt}; -use postgres_ffi::BLCKSZ; -use serde::{Deserialize, Serialize}; -use serde_with::serde_as; -use utils::{ - completion, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, - postgres_client::PostgresClientProtocol, - serde_system_time, -}; - -use crate::{ - reltag::RelTag, - shard::{ShardCount, ShardStripeSize, TenantShardId}, -}; -use anyhow::bail; use bytes::{Buf, BufMut, Bytes, BytesMut}; +#[cfg(feature = "testing")] +use camino::Utf8PathBuf; +use postgres_ffi::BLCKSZ; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use serde_with::serde_as; +pub use utilization::PageserverUtilization; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::postgres_client::PostgresClientProtocol; +use utils::{completion, serde_system_time}; + +use crate::key::{CompactKey, Key}; +use crate::reltag::RelTag; +use crate::shard::{ShardCount, ShardStripeSize, TenantShardId}; /// The state of a tenant in this pageserver. 
/// @@ -210,6 +203,70 @@ pub enum TimelineState { Broken { reason: String, backtrace: String }, } +#[serde_with::serde_as] +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct CompactLsnRange { + pub start: Lsn, + pub end: Lsn, +} + +#[serde_with::serde_as] +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct CompactKeyRange { + #[serde_as(as = "serde_with::DisplayFromStr")] + pub start: Key, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub end: Key, +} + +impl From> for CompactLsnRange { + fn from(range: Range) -> Self { + Self { + start: range.start, + end: range.end, + } + } +} + +impl From> for CompactKeyRange { + fn from(range: Range) -> Self { + Self { + start: range.start, + end: range.end, + } + } +} + +impl From for Range { + fn from(range: CompactLsnRange) -> Self { + range.start..range.end + } +} + +impl From for Range { + fn from(range: CompactKeyRange) -> Self { + range.start..range.end + } +} + +impl CompactLsnRange { + pub fn above(lsn: Lsn) -> Self { + Self { + start: lsn, + end: Lsn::MAX, + } + } +} + +#[derive(Debug, Clone, Serialize)] +pub struct CompactInfoResponse { + pub compact_key_range: Option, + pub compact_lsn_range: Option, + pub sub_compaction: bool, + pub running: bool, + pub job_id: usize, +} + #[derive(Serialize, Deserialize, Clone)] pub struct TimelineCreateRequest { pub new_timeline_id: TimelineId, @@ -267,7 +324,8 @@ pub struct ImportPgdataIdempotencyKey(pub String); impl ImportPgdataIdempotencyKey { pub fn random() -> Self { - use rand::{distributions::Alphanumeric, Rng}; + use rand::Rng; + use rand::distributions::Alphanumeric; Self( rand::thread_rng() .sample_iter(&Alphanumeric) @@ -325,35 +383,374 @@ impl Default for ShardParameters { } } +#[derive(Debug, Default, Clone, Eq, PartialEq)] +pub enum FieldPatch { + Upsert(T), + Remove, + #[default] + Noop, +} + +impl FieldPatch { + fn is_noop(&self) -> bool { + matches!(self, FieldPatch::Noop) + } + + pub fn apply(self, target: &mut Option) { + match self { + Self::Upsert(v) => *target = Some(v), + Self::Remove => *target = None, + Self::Noop => {} + } + } + + pub fn map Result>(self, map: F) -> Result, E> { + match self { + Self::Upsert(v) => Ok(FieldPatch::::Upsert(map(v)?)), + Self::Remove => Ok(FieldPatch::::Remove), + Self::Noop => Ok(FieldPatch::::Noop), + } + } +} + +impl<'de, T: Deserialize<'de>> Deserialize<'de> for FieldPatch { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + Option::deserialize(deserializer).map(|opt| match opt { + None => FieldPatch::Remove, + Some(val) => FieldPatch::Upsert(val), + }) + } +} + +impl Serialize for FieldPatch { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self { + FieldPatch::Upsert(val) => serializer.serialize_some(val), + FieldPatch::Remove => serializer.serialize_none(), + FieldPatch::Noop => unreachable!(), + } + } +} + +#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] +#[serde(default)] +pub struct TenantConfigPatch { + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub checkpoint_distance: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub checkpoint_timeout: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_target_size: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_period: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_threshold: FieldPatch, + #[serde(skip_serializing_if 
= "FieldPatch::is_noop")] + pub compaction_upper_limit: FieldPatch, + // defer parsing compaction_algorithm, like eviction_policy + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_algorithm: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_l0_first: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_l0_semaphore: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub l0_flush_delay_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub l0_flush_stall_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub l0_flush_wait_upload: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_horizon: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_period: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub image_creation_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub pitr_interval: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub walreceiver_connect_timeout: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub lagging_wal_timeout: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub max_lsn_wal_lag: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub eviction_policy: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub min_resident_size_override: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub evictions_low_residence_duration_metric_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub heatmap_period: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub lazy_slru_download: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub timeline_get_throttle: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub image_layer_creation_check_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub image_creation_preempt_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub lsn_lease_length: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub lsn_lease_length_for_ts: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub timeline_offloading: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub wal_receiver_protocol_override: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub rel_size_v2_enabled: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_enabled: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_initial_threshold_kb: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_ratio_percent: FieldPatch, +} + /// An alternative representation of `pageserver::tenant::TenantConf` with /// simpler types. 
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] pub struct TenantConfig { pub checkpoint_distance: Option, - pub checkpoint_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub checkpoint_timeout: Option, pub compaction_target_size: Option, - pub compaction_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub compaction_period: Option, pub compaction_threshold: Option, + pub compaction_upper_limit: Option, // defer parsing compaction_algorithm, like eviction_policy pub compaction_algorithm: Option, + pub compaction_l0_first: Option, + pub compaction_l0_semaphore: Option, + pub l0_flush_delay_threshold: Option, + pub l0_flush_stall_threshold: Option, + pub l0_flush_wait_upload: Option, pub gc_horizon: Option, - pub gc_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub gc_period: Option, pub image_creation_threshold: Option, - pub pitr_interval: Option, - pub walreceiver_connect_timeout: Option, - pub lagging_wal_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub pitr_interval: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, pub eviction_policy: Option, pub min_resident_size_override: Option, - pub evictions_low_residence_duration_metric_threshold: Option, - pub heatmap_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub heatmap_period: Option, pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, - pub lsn_lease_length: Option, - pub lsn_lease_length_for_ts: Option, + pub image_creation_preempt_threshold: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, pub wal_receiver_protocol_override: Option, + pub rel_size_v2_enabled: Option, + pub gc_compaction_enabled: Option, + pub gc_compaction_initial_threshold_kb: Option, + pub gc_compaction_ratio_percent: Option, +} + +impl TenantConfig { + pub fn apply_patch( + self, + patch: TenantConfigPatch, + ) -> Result { + let Self { + mut checkpoint_distance, + mut checkpoint_timeout, + mut compaction_target_size, + mut compaction_period, + mut compaction_threshold, + mut compaction_upper_limit, + mut compaction_algorithm, + mut compaction_l0_first, + mut compaction_l0_semaphore, + mut l0_flush_delay_threshold, + mut l0_flush_stall_threshold, + mut l0_flush_wait_upload, + mut gc_horizon, + mut gc_period, + mut image_creation_threshold, + mut pitr_interval, + mut walreceiver_connect_timeout, + mut lagging_wal_timeout, + mut max_lsn_wal_lag, + mut eviction_policy, + mut min_resident_size_override, + mut evictions_low_residence_duration_metric_threshold, + mut heatmap_period, + mut lazy_slru_download, + mut timeline_get_throttle, + mut image_layer_creation_check_threshold, + mut image_creation_preempt_threshold, + mut lsn_lease_length, + mut lsn_lease_length_for_ts, + mut timeline_offloading, + mut wal_receiver_protocol_override, + mut rel_size_v2_enabled, + mut gc_compaction_enabled, + mut gc_compaction_initial_threshold_kb, + mut gc_compaction_ratio_percent, + } = 
self; + + patch.checkpoint_distance.apply(&mut checkpoint_distance); + patch + .checkpoint_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut checkpoint_timeout); + patch + .compaction_target_size + .apply(&mut compaction_target_size); + patch + .compaction_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut compaction_period); + patch.compaction_threshold.apply(&mut compaction_threshold); + patch + .compaction_upper_limit + .apply(&mut compaction_upper_limit); + patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch.compaction_l0_first.apply(&mut compaction_l0_first); + patch + .compaction_l0_semaphore + .apply(&mut compaction_l0_semaphore); + patch + .l0_flush_delay_threshold + .apply(&mut l0_flush_delay_threshold); + patch + .l0_flush_stall_threshold + .apply(&mut l0_flush_stall_threshold); + patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); + patch.gc_horizon.apply(&mut gc_horizon); + patch + .gc_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut gc_period); + patch + .image_creation_threshold + .apply(&mut image_creation_threshold); + patch + .pitr_interval + .map(|v| humantime::parse_duration(&v))? + .apply(&mut pitr_interval); + patch + .walreceiver_connect_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut walreceiver_connect_timeout); + patch + .lagging_wal_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lagging_wal_timeout); + patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag); + patch.eviction_policy.apply(&mut eviction_policy); + patch + .min_resident_size_override + .apply(&mut min_resident_size_override); + patch + .evictions_low_residence_duration_metric_threshold + .map(|v| humantime::parse_duration(&v))? + .apply(&mut evictions_low_residence_duration_metric_threshold); + patch + .heatmap_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut heatmap_period); + patch.lazy_slru_download.apply(&mut lazy_slru_download); + patch + .timeline_get_throttle + .apply(&mut timeline_get_throttle); + patch + .image_layer_creation_check_threshold + .apply(&mut image_layer_creation_check_threshold); + patch + .image_creation_preempt_threshold + .apply(&mut image_creation_preempt_threshold); + patch + .lsn_lease_length + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lsn_lease_length); + patch + .lsn_lease_length_for_ts + .map(|v| humantime::parse_duration(&v))? 
+ .apply(&mut lsn_lease_length_for_ts); + patch.timeline_offloading.apply(&mut timeline_offloading); + patch + .wal_receiver_protocol_override + .apply(&mut wal_receiver_protocol_override); + patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled); + patch + .gc_compaction_enabled + .apply(&mut gc_compaction_enabled); + patch + .gc_compaction_initial_threshold_kb + .apply(&mut gc_compaction_initial_threshold_kb); + patch + .gc_compaction_ratio_percent + .apply(&mut gc_compaction_ratio_percent); + + Ok(Self { + checkpoint_distance, + checkpoint_timeout, + compaction_target_size, + compaction_period, + compaction_threshold, + compaction_upper_limit, + compaction_algorithm, + compaction_l0_first, + compaction_l0_semaphore, + l0_flush_delay_threshold, + l0_flush_stall_threshold, + l0_flush_wait_upload, + gc_horizon, + gc_period, + image_creation_threshold, + pitr_interval, + walreceiver_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + eviction_policy, + min_resident_size_override, + evictions_low_residence_duration_metric_threshold, + heatmap_period, + lazy_slru_download, + timeline_get_throttle, + image_layer_creation_check_threshold, + image_creation_preempt_threshold, + lsn_lease_length, + lsn_lease_length_for_ts, + timeline_offloading, + wal_receiver_protocol_override, + rel_size_v2_enabled, + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + }) + } } /// The policy for the aux file storage. @@ -686,6 +1083,21 @@ impl TenantConfigRequest { } } +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantConfigPatchRequest { + pub tenant_id: TenantId, + #[serde(flatten)] + pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantWaitLsnRequest { + #[serde(flatten)] + pub timelines: HashMap, + pub timeout: Duration, +} + /// See [`TenantState::attachment_status`] and the OpenAPI docs for context. #[derive(Serialize, Deserialize, Clone)] #[serde(tag = "slug", content = "data", rename_all = "snake_case")] @@ -708,8 +1120,7 @@ pub struct TenantInfo { /// Opaque explanation if gc is being blocked. /// - /// Only looked up for the individual tenant detail, not the listing. This is purely for - /// debugging, not included in openapi. + /// Only looked up for the individual tenant detail, not the listing. #[serde(skip_serializing_if = "Option::is_none")] pub gc_blocking: Option, } @@ -764,7 +1175,26 @@ pub struct TimelineInfo { pub ancestor_lsn: Option, pub last_record_lsn: Lsn, pub prev_record_lsn: Option, + + /// Legacy field for compat with control plane. Synonym of `min_readable_lsn`. + /// TODO: remove once control plane no longer reads it. pub latest_gc_cutoff_lsn: Lsn, + + /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients. + /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead, + /// as it is easier to reason about. + #[serde(default)] + pub applied_gc_cutoff_lsn: Lsn, + + /// The upper bound of data which is either already GC'ed, or elegible to be GC'ed at any time based on PITR interval. + /// This LSN represents the "end of history" for this timeline, and callers should use it to figure out the oldest + /// LSN at which it is legal to create a branch or ephemeral endpoint. 
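A quick illustration of the `TenantConfig` serialization change above: the `Duration`-valued fields now round-trip as human-readable strings via `humantime_serde`, and `apply_patch` parses the string values arriving in a patch with `humantime::parse_duration`. The following is a minimal sketch under those assumptions; the `Sketch` struct is hypothetical and stands in for a single field of the real `TenantConfig`.

```rust
use std::time::Duration;

use serde::{Deserialize, Serialize};

// Hypothetical stand-in for one humantime-serialized TenantConfig field.
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Sketch {
    #[serde(default)]
    #[serde(with = "humantime_serde")]
    checkpoint_timeout: Option<Duration>,
}

fn main() {
    // "10m"-style strings deserialize into Durations...
    let s: Sketch = serde_json::from_str(r#"{"checkpoint_timeout":"10m"}"#).unwrap();
    assert_eq!(s.checkpoint_timeout, Some(Duration::from_secs(600)));

    // ...and #[serde(default)] keeps an absent field as None instead of erroring.
    let s: Sketch = serde_json::from_str("{}").unwrap();
    assert_eq!(s.checkpoint_timeout, None);

    // Patch values are plain strings and are parsed explicitly, as apply_patch does.
    assert_eq!(
        humantime::parse_duration("10m").unwrap(),
        Duration::from_secs(600)
    );
}
```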
+ /// + /// Note that holders of valid LSN leases may be able to create branches and read pages earlier + /// than this LSN, but new leases may not be taken out earlier than this LSN. + #[serde(default)] + pub min_readable_lsn: Lsn, + pub disk_consistent_lsn: Lsn, /// The LSN that we have succesfully uploaded to remote storage @@ -1116,6 +1546,8 @@ pub enum PagestreamFeMessage { GetPage(PagestreamGetPageRequest), DbSize(PagestreamDbSizeRequest), GetSlruSegment(PagestreamGetSlruSegmentRequest), + #[cfg(feature = "testing")] + Test(PagestreamTestRequest), } // Wrapped in libpq CopyData @@ -1127,6 +1559,22 @@ pub enum PagestreamBeMessage { Error(PagestreamErrorResponse), DbSize(PagestreamDbSizeResponse), GetSlruSegment(PagestreamGetSlruSegmentResponse), + #[cfg(feature = "testing")] + Test(PagestreamTestResponse), +} + +// Keep in sync with `pagestore_client.h` +#[repr(u8)] +enum PagestreamFeMessageTag { + Exists = 0, + Nblocks = 1, + GetPage = 2, + DbSize = 3, + GetSlruSegment = 4, + /* future tags above this line */ + /// For testing purposes, not available in production. + #[cfg(feature = "testing")] + Test = 99, } // Keep in sync with `pagestore_client.h` @@ -1138,7 +1586,28 @@ enum PagestreamBeMessageTag { Error = 103, DbSize = 104, GetSlruSegment = 105, + /* future tags above this line */ + /// For testing purposes, not available in production. + #[cfg(feature = "testing")] + Test = 199, } + +impl TryFrom for PagestreamFeMessageTag { + type Error = u8; + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(PagestreamFeMessageTag::Exists), + 1 => Ok(PagestreamFeMessageTag::Nblocks), + 2 => Ok(PagestreamFeMessageTag::GetPage), + 3 => Ok(PagestreamFeMessageTag::DbSize), + 4 => Ok(PagestreamFeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 99 => Ok(PagestreamFeMessageTag::Test), + _ => Err(value), + } + } +} + impl TryFrom for PagestreamBeMessageTag { type Error = u8; fn try_from(value: u8) -> Result { @@ -1149,6 +1618,8 @@ impl TryFrom for PagestreamBeMessageTag { 103 => Ok(PagestreamBeMessageTag::Error), 104 => Ok(PagestreamBeMessageTag::DbSize), 105 => Ok(PagestreamBeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 199 => Ok(PagestreamBeMessageTag::Test), _ => Err(value), } } @@ -1178,78 +1649,108 @@ impl TryFrom for PagestreamBeMessageTag { // interface allows sending both LSNs, and let the pageserver do the right thing. There was no // difference in the responses between V1 and V2. // -#[derive(Clone, Copy)] +// V3 version of protocol adds request ID to all requests. This request ID is also included in response +// as well as other fields from requests, which allows to verify that we receive response for our request. +// We copy fields from request to response to make checking more reliable: request ID is formed from process ID +// and local counter, so in principle there can be duplicated requests IDs if process PID is reused. 
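To make the V2 → V3 change described above concrete, here is a minimal, self-contained sketch of the shared header framing. This is not the crate's actual code: `RequestHeader` stands in for `PagestreamRequest`, and raw `u64`s stand in for `Lsn`; the real messages append their payload (RelTag, block number, and so on) after the header, as the `serialize`/`parse` implementations further below show.

```rust
use bytes::{Buf, BufMut, Bytes, BytesMut};

// Stand-in for PagestreamRequest; plain u64s instead of Lsn keep this self-contained.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
struct RequestHeader {
    reqid: u64,
    request_lsn: u64,
    not_modified_since: u64,
}

// V3 framing: tag byte, then the common header, then the per-request payload.
fn encode_header(tag: u8, hdr: &RequestHeader, buf: &mut BytesMut) {
    buf.put_u8(tag);
    buf.put_u64(hdr.reqid); // new in V3
    buf.put_u64(hdr.request_lsn);
    buf.put_u64(hdr.not_modified_since);
}

fn decode_header(buf: &mut Bytes) -> (u8, RequestHeader) {
    let tag = buf.get_u8();
    let hdr = RequestHeader {
        reqid: buf.get_u64(),
        request_lsn: buf.get_u64(),
        not_modified_since: buf.get_u64(),
    };
    (tag, hdr)
}

fn main() {
    let hdr = RequestHeader { reqid: 42, request_lsn: 0x1000, not_modified_since: 0x800 };
    let mut buf = BytesMut::new();
    encode_header(2 /* GetPage */, &hdr, &mut buf);
    // A V3 response echoes the same header back, so the client can verify that a
    // response really belongs to the request it sent.
    assert_eq!(decode_header(&mut buf.freeze()), (2, hdr));
}
```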
+// +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub enum PagestreamProtocolVersion { V2, + V3, } -#[derive(Debug, PartialEq, Eq)] +pub type RequestId = u64; + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub struct PagestreamRequest { + pub reqid: RequestId, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamExistsRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub rel: RelTag, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamNblocksRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub rel: RelTag, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamGetPageRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub rel: RelTag, pub blkno: u32, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamDbSizeRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub dbnode: u32, } -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct PagestreamGetSlruSegmentRequest { - pub request_lsn: Lsn, - pub not_modified_since: Lsn, + pub hdr: PagestreamRequest, pub kind: u8, pub segno: u32, } #[derive(Debug)] pub struct PagestreamExistsResponse { + pub req: PagestreamExistsRequest, pub exists: bool, } #[derive(Debug)] pub struct PagestreamNblocksResponse { + pub req: PagestreamNblocksRequest, pub n_blocks: u32, } #[derive(Debug)] pub struct PagestreamGetPageResponse { + pub req: PagestreamGetPageRequest, pub page: Bytes, } #[derive(Debug)] pub struct PagestreamGetSlruSegmentResponse { + pub req: PagestreamGetSlruSegmentRequest, pub segment: Bytes, } #[derive(Debug)] pub struct PagestreamErrorResponse { + pub req: PagestreamRequest, pub message: String, } #[derive(Debug)] pub struct PagestreamDbSizeResponse { + pub req: PagestreamDbSizeRequest, pub db_size: i64, } +#[cfg(feature = "testing")] +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct PagestreamTestRequest { + pub hdr: PagestreamRequest, + pub batch_key: u64, + pub message: String, +} + +#[cfg(feature = "testing")] +#[derive(Debug)] +pub struct PagestreamTestResponse { + pub req: PagestreamTestRequest, +} + // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields // that require pageserver-internal types. It is sufficient to get the total size. #[derive(Serialize, Deserialize, Debug)] @@ -1263,15 +1764,16 @@ pub struct TenantHistorySize { impl PagestreamFeMessage { /// Serialize a compute -> pageserver message. This is currently only used in testing - /// tools. Always uses protocol version 2. + /// tools. Always uses protocol version 3. 
pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); match self { Self::Exists(req) => { - bytes.put_u8(0); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u8(PagestreamFeMessageTag::Exists as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -1279,9 +1781,10 @@ impl PagestreamFeMessage { } Self::Nblocks(req) => { - bytes.put_u8(1); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -1289,9 +1792,10 @@ impl PagestreamFeMessage { } Self::GetPage(req) => { - bytes.put_u8(2); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u8(PagestreamFeMessageTag::GetPage as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -1300,123 +1804,278 @@ impl PagestreamFeMessage { } Self::DbSize(req) => { - bytes.put_u8(3); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u8(PagestreamFeMessageTag::DbSize as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u32(req.dbnode); } Self::GetSlruSegment(req) => { - bytes.put_u8(4); - bytes.put_u64(req.request_lsn.0); - bytes.put_u64(req.not_modified_since.0); + bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } + #[cfg(feature = "testing")] + Self::Test(req) => { + bytes.put_u8(PagestreamFeMessageTag::Test as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); + bytes.put_u64(req.batch_key); + let message = req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } bytes.into() } - pub fn parse(body: &mut R) -> anyhow::Result { + pub fn parse( + body: &mut R, + protocol_version: PagestreamProtocolVersion, + ) -> anyhow::Result { // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: consider using protobuf or serde bincode for less error prone // serialization. 
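+        // The wire format differs by protocol version: V2 requests carry only the two
+        // LSNs after the tag byte (a zero request id is synthesized for them), while V3
+        // requests carry an explicit request id followed by the two LSNs.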
let msg_tag = body.read_u8()?; + let (reqid, request_lsn, not_modified_since) = match protocol_version { + PagestreamProtocolVersion::V2 => ( + 0, + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + PagestreamProtocolVersion::V3 => ( + body.read_u64::()?, + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + }; - // these two fields are the same for every request type - let request_lsn = Lsn::from(body.read_u64::()?); - let not_modified_since = Lsn::from(body.read_u64::()?); - - match msg_tag { - 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - request_lsn, - not_modified_since, - rel: RelTag { - spcnode: body.read_u32::()?, + match PagestreamFeMessageTag::try_from(msg_tag) + .map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? + { + PagestreamFeMessageTag::Exists => { + Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::Nblocks => { + Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::GetPage => { + Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + blkno: body.read_u32::()?, + })) + } + PagestreamFeMessageTag::DbSize => { + Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })), - 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - request_lsn, - not_modified_since, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })), - 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - request_lsn, - not_modified_since, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - blkno: body.read_u32::()?, - })), - 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - request_lsn, - not_modified_since, - dbnode: body.read_u32::()?, - })), - 4 => Ok(PagestreamFeMessage::GetSlruSegment( + })) + } + PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { - request_lsn, - not_modified_since, + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, kind: body.read_u8()?, segno: body.read_u32::()?, }, )), - _ => bail!("unknown smgr message tag: {:?}", msg_tag), + #[cfg(feature = "testing")] + PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key: body.read_u64::()?, + message: { + let len = body.read_u64::()?; + let mut buf = vec![0; len as usize]; + body.read_exact(&mut buf)?; + String::from_utf8(buf)? 
+ }, + })), } } } impl PagestreamBeMessage { - pub fn serialize(&self) -> Bytes { + pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes { let mut bytes = BytesMut::new(); use PagestreamBeMessageTag as Tag; - match self { - Self::Exists(resp) => { - bytes.put_u8(Tag::Exists as u8); - bytes.put_u8(resp.exists as u8); - } + match protocol_version { + PagestreamProtocolVersion::V2 => { + match self { + Self::Exists(resp) => { + bytes.put_u8(Tag::Exists as u8); + bytes.put_u8(resp.exists as u8); + } - Self::Nblocks(resp) => { - bytes.put_u8(Tag::Nblocks as u8); - bytes.put_u32(resp.n_blocks); - } + Self::Nblocks(resp) => { + bytes.put_u8(Tag::Nblocks as u8); + bytes.put_u32(resp.n_blocks); + } - Self::GetPage(resp) => { - bytes.put_u8(Tag::GetPage as u8); - bytes.put(&resp.page[..]); - } + Self::GetPage(resp) => { + bytes.put_u8(Tag::GetPage as u8); + bytes.put(&resp.page[..]) + } - Self::Error(resp) => { - bytes.put_u8(Tag::Error as u8); - bytes.put(resp.message.as_bytes()); - bytes.put_u8(0); // null terminator - } - Self::DbSize(resp) => { - bytes.put_u8(Tag::DbSize as u8); - bytes.put_i64(resp.db_size); - } + Self::Error(resp) => { + bytes.put_u8(Tag::Error as u8); + bytes.put(resp.message.as_bytes()); + bytes.put_u8(0); // null terminator + } + Self::DbSize(resp) => { + bytes.put_u8(Tag::DbSize as u8); + bytes.put_i64(resp.db_size); + } - Self::GetSlruSegment(resp) => { - bytes.put_u8(Tag::GetSlruSegment as u8); - bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); - bytes.put(&resp.segment[..]); + Self::GetSlruSegment(resp) => { + bytes.put_u8(Tag::GetSlruSegment as u8); + bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put(&resp.segment[..]); + } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } + } + } + PagestreamProtocolVersion::V3 => { + match self { + Self::Exists(resp) => { + bytes.put_u8(Tag::Exists as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.rel.spcnode); + bytes.put_u32(resp.req.rel.dbnode); + bytes.put_u32(resp.req.rel.relnode); + bytes.put_u8(resp.req.rel.forknum); + bytes.put_u8(resp.exists as u8); + } + + Self::Nblocks(resp) => { + bytes.put_u8(Tag::Nblocks as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.rel.spcnode); + bytes.put_u32(resp.req.rel.dbnode); + bytes.put_u32(resp.req.rel.relnode); + bytes.put_u8(resp.req.rel.forknum); + bytes.put_u32(resp.n_blocks); + } + + Self::GetPage(resp) => { + bytes.put_u8(Tag::GetPage as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.rel.spcnode); + bytes.put_u32(resp.req.rel.dbnode); + bytes.put_u32(resp.req.rel.relnode); + bytes.put_u8(resp.req.rel.forknum); + bytes.put_u32(resp.req.blkno); + bytes.put(&resp.page[..]) + } + + Self::Error(resp) => { + bytes.put_u8(Tag::Error as u8); + bytes.put_u64(resp.req.reqid); + bytes.put_u64(resp.req.request_lsn.0); + bytes.put_u64(resp.req.not_modified_since.0); + bytes.put(resp.message.as_bytes()); + bytes.put_u8(0); // null terminator + } + Self::DbSize(resp) => { + bytes.put_u8(Tag::DbSize as 
u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u32(resp.req.dbnode); + bytes.put_i64(resp.db_size); + } + + Self::GetSlruSegment(resp) => { + bytes.put_u8(Tag::GetSlruSegment as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u8(resp.req.kind); + bytes.put_u32(resp.req.segno); + bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put(&resp.segment[..]); + } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } + } } } - bytes.into() } @@ -1428,41 +2087,156 @@ impl PagestreamBeMessage { let ok = match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? { Tag::Exists => { - let exists = buf.read_u8()?; + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let rel = RelTag { + spcnode: buf.read_u32::()?, + dbnode: buf.read_u32::()?, + relnode: buf.read_u32::()?, + forknum: buf.read_u8()?, + }; + let exists = buf.read_u8()? != 0; Self::Exists(PagestreamExistsResponse { - exists: exists != 0, + req: PagestreamExistsRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel, + }, + exists, }) } Tag::Nblocks => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let rel = RelTag { + spcnode: buf.read_u32::()?, + dbnode: buf.read_u32::()?, + relnode: buf.read_u32::()?, + forknum: buf.read_u8()?, + }; let n_blocks = buf.read_u32::()?; - Self::Nblocks(PagestreamNblocksResponse { n_blocks }) + Self::Nblocks(PagestreamNblocksResponse { + req: PagestreamNblocksRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel, + }, + n_blocks, + }) } Tag::GetPage => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let rel = RelTag { + spcnode: buf.read_u32::()?, + dbnode: buf.read_u32::()?, + relnode: buf.read_u32::()?, + forknum: buf.read_u8()?, + }; + let blkno = buf.read_u32::()?; let mut page = vec![0; 8192]; // TODO: use MaybeUninit buf.read_exact(&mut page)?; - PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() }) + Self::GetPage(PagestreamGetPageResponse { + req: PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel, + blkno, + }, + page: page.into(), + }) } Tag::Error => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); let mut msg = Vec::new(); buf.read_until(0, &mut msg)?; let cstring = std::ffi::CString::from_vec_with_nul(msg)?; let rust_str = cstring.to_str()?; - PagestreamBeMessage::Error(PagestreamErrorResponse { + Self::Error(PagestreamErrorResponse { + req: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, message: rust_str.to_owned(), }) } Tag::DbSize => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let 
not_modified_since = Lsn(buf.read_u64::()?); + let dbnode = buf.read_u32::()?; let db_size = buf.read_i64::()?; - Self::DbSize(PagestreamDbSizeResponse { db_size }) + Self::DbSize(PagestreamDbSizeResponse { + req: PagestreamDbSizeRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + dbnode, + }, + db_size, + }) } Tag::GetSlruSegment => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let kind = buf.read_u8()?; + let segno = buf.read_u32::()?; let n_blocks = buf.read_u32::()?; let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize]; buf.read_exact(&mut segment)?; Self::GetSlruSegment(PagestreamGetSlruSegmentResponse { + req: PagestreamGetSlruSegmentRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + kind, + segno, + }, segment: segment.into(), }) } + #[cfg(feature = "testing")] + Tag::Test => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let batch_key = buf.read_u64::()?; + let len = buf.read_u64::()?; + let mut msg = vec![0; len as usize]; + buf.read_exact(&mut msg)?; + let message = String::from_utf8(msg)?; + Self::Test(PagestreamTestResponse { + req: PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key, + message, + }, + }) + } }; let remaining = buf.into_inner(); if !remaining.is_empty() { @@ -1482,15 +2256,35 @@ impl PagestreamBeMessage { Self::Error(_) => "Error", Self::DbSize(_) => "DbSize", Self::GetSlruSegment(_) => "GetSlruSegment", + #[cfg(feature = "testing")] + Self::Test(_) => "Test", + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct PageTraceEvent { + pub key: CompactKey, + pub effective_lsn: Lsn, + pub time: SystemTime, +} + +impl Default for PageTraceEvent { + fn default() -> Self { + Self { + key: Default::default(), + effective_lsn: Default::default(), + time: std::time::UNIX_EPOCH, } } } #[cfg(test)] mod tests { - use serde_json::json; use std::str::FromStr; + use serde_json::json; + use super::*; #[test] @@ -1498,8 +2292,11 @@ mod tests { // Test serialization/deserialization of PagestreamFeMessage let messages = vec![ PagestreamFeMessage::Exists(PagestreamExistsRequest { - request_lsn: Lsn(4), - not_modified_since: Lsn(3), + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(3), + }, rel: RelTag { forknum: 1, spcnode: 2, @@ -1508,8 +2305,11 @@ mod tests { }, }), PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - request_lsn: Lsn(4), - not_modified_since: Lsn(4), + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(4), + }, rel: RelTag { forknum: 1, spcnode: 2, @@ -1518,8 +2318,11 @@ mod tests { }, }), PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - request_lsn: Lsn(4), - not_modified_since: Lsn(3), + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(3), + }, rel: RelTag { forknum: 1, spcnode: 2, @@ -1529,14 +2332,19 @@ mod tests { blkno: 7, }), PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - request_lsn: Lsn(4), - not_modified_since: Lsn(3), + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(4), + not_modified_since: Lsn(3), + }, dbnode: 7, }), ]; for msg in messages { let bytes = msg.serialize(); - let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap(); + let reconstructed = + 
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3) + .unwrap(); assert!(msg == reconstructed); } } @@ -1699,4 +2507,45 @@ mod tests { ); } } + + #[test] + fn test_tenant_config_patch_request_serde() { + let patch_request = TenantConfigPatchRequest { + tenant_id: TenantId::from_str("17c6d121946a61e5ab0fe5a2fd4d8215").unwrap(), + config: TenantConfigPatch { + checkpoint_distance: FieldPatch::Upsert(42), + gc_horizon: FieldPatch::Remove, + compaction_threshold: FieldPatch::Noop, + ..TenantConfigPatch::default() + }, + }; + + let json = serde_json::to_string(&patch_request).unwrap(); + + let expected = r#"{"tenant_id":"17c6d121946a61e5ab0fe5a2fd4d8215","checkpoint_distance":42,"gc_horizon":null}"#; + assert_eq!(json, expected); + + let decoded: TenantConfigPatchRequest = serde_json::from_str(&json).unwrap(); + assert_eq!(decoded.tenant_id, patch_request.tenant_id); + assert_eq!(decoded.config, patch_request.config); + + // Now apply the patch to a config to demonstrate semantics + + let base = TenantConfig { + checkpoint_distance: Some(28), + gc_horizon: Some(100), + compaction_target_size: Some(1024), + ..Default::default() + }; + + let expected = TenantConfig { + checkpoint_distance: Some(42), + gc_horizon: None, + ..base.clone() + }; + + let patched = base.apply_patch(decoded.config).unwrap(); + + assert_eq!(patched, expected); + } } diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index 641aa51989..69c240ff3c 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,5 +1,7 @@ use std::time::SystemTime; -use utils::{serde_percent::Percent, serde_system_time}; + +use utils::serde_percent::Percent; +use utils::serde_system_time; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. @@ -131,12 +133,12 @@ impl PageserverUtilization { /// Test helper pub mod test_utilization { - use super::PageserverUtilization; use std::time::SystemTime; - use utils::{ - serde_percent::Percent, - serde_system_time::{self}, - }; + + use utils::serde_percent::Percent; + use utils::serde_system_time::{self}; + + use super::PageserverUtilization; // Parameters of the imaginary node used for test utilization instances const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024; diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs index bb62b35d36..fda504a26e 100644 --- a/libs/pageserver_api/src/record.rs +++ b/libs/pageserver_api/src/record.rs @@ -1,7 +1,7 @@ //! This module defines the WAL record format used within the pageserver. 
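Stepping back to the `test_tenant_config_patch_request_serde` test above: the tri-state patch semantics it exercises (value present → upsert, explicit `null` → remove, key absent → no-op) can be illustrated with a small stand-in type. This is not the crate's `FieldPatch` implementation, only a hedged sketch of the same JSON mapping.

```rust
use serde::Deserialize;

// Minimal stand-in for FieldPatch: present value upserts, explicit null removes,
// an absent key is a no-op.
#[derive(Debug, Default)]
enum Patch<T> {
    Upsert(T),
    Remove,
    #[default]
    Noop,
}

impl<T> Patch<T> {
    fn apply(self, target: &mut Option<T>) {
        match self {
            Patch::Upsert(v) => *target = Some(v),
            Patch::Remove => *target = None,
            Patch::Noop => {}
        }
    }
}

impl<'de, T: Deserialize<'de>> Deserialize<'de> for Patch<T> {
    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
        // Only present keys reach this impl: null => Remove, a value => Upsert.
        // Absent keys never get here and stay at the Noop default.
        Ok(match Option::<T>::deserialize(d)? {
            Some(v) => Patch::Upsert(v),
            None => Patch::Remove,
        })
    }
}

#[derive(Debug, Default, Deserialize)]
struct ConfigPatch {
    #[serde(default)]
    checkpoint_distance: Patch<u64>,
    #[serde(default)]
    gc_horizon: Patch<u64>,
    #[serde(default)]
    compaction_threshold: Patch<usize>,
}

fn main() {
    let patch: ConfigPatch =
        serde_json::from_str(r#"{"checkpoint_distance":42,"gc_horizon":null}"#).unwrap();
    let (mut cd, mut gc, mut ct) = (Some(28u64), Some(100u64), Some(10usize));
    patch.checkpoint_distance.apply(&mut cd);
    patch.gc_horizon.apply(&mut gc);
    patch.compaction_threshold.apply(&mut ct);
    assert_eq!((cd, gc, ct), (Some(42), None, Some(10)));
}
```

The real `FieldPatch` lives elsewhere in `pageserver_api`; as the diff shows, `TenantConfigPatchRequest` additionally flattens the patch next to `tenant_id` and rejects unknown fields via `deny_unknown_fields`.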
use bytes::Bytes; -use postgres_ffi::walrecord::{describe_postgres_wal_record, MultiXactMember}; +use postgres_ffi::walrecord::{MultiXactMember, describe_postgres_wal_record}; use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use utils::bin_ser::DeserializeError; diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 09d1fae221..473a44dbf9 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -1,10 +1,10 @@ -use serde::{Deserialize, Serialize}; use std::cmp::Ordering; use std::fmt; -use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; -use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM}; use postgres_ffi::Oid; +use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; +use postgres_ffi::relfile_utils::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name}; +use serde::{Deserialize, Serialize}; /// /// Relation data file segment id throughout the Postgres cluster. diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index cf0cd3a46b..eca04b1f3d 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -31,12 +31,15 @@ //! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), //! and their slugs are 0004, 0104, 0204, and 0304. -use crate::{key::Key, models::ShardParameters}; -use postgres_ffi::relfile_utils::INIT_FORKNUM; -use serde::{Deserialize, Serialize}; +use std::hash::{Hash, Hasher}; #[doc(inline)] pub use ::utils::shard::*; +use postgres_ffi::relfile_utils::INIT_FORKNUM; +use serde::{Deserialize, Serialize}; + +use crate::key::Key; +use crate::models::ShardParameters; /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], /// and to check whether that [`ShardNumber`] is the same as the current shard. @@ -48,6 +51,23 @@ pub struct ShardIdentity { layout: ShardLayout, } +/// Hash implementation +/// +/// The stripe size cannot change dynamically, so it can be ignored for efficiency reasons. +impl Hash for ShardIdentity { + fn hash(&self, state: &mut H) { + let ShardIdentity { + number, + count, + stripe_size: _, + layout: _, + } = self; + + number.0.hash(state); + count.0.hash(state); + } +} + /// Stripe size in number of pages #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); @@ -59,7 +79,7 @@ impl Default for ShardStripeSize { } /// Layout version: for future upgrades where we might change how the key->shard mapping works -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)] pub struct ShardLayout(u8); const LAYOUT_V1: ShardLayout = ShardLayout(1); @@ -173,7 +193,11 @@ impl ShardIdentity { /// Return true if the key should be stored on all shards, not just one. 
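One note on the manual `Hash` impl for `ShardIdentity` added above, before the `is_key_global` change that follows: skipping `stripe_size` and `layout` is sound because the `Hash`/`Eq` contract only requires equal values to hash equally; identities differing only in stripe size may collide, which the contract allows. A small self-contained sketch of that property, using a hypothetical `ShardIdentityLike` rather than the real type:

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

#[derive(PartialEq, Eq)]
struct ShardIdentityLike {
    number: u8,
    count: u8,
    stripe_size: u32, // deliberately not hashed
}

impl Hash for ShardIdentityLike {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.number.hash(state);
        self.count.hash(state);
    }
}

fn hash_of<T: Hash>(t: &T) -> u64 {
    let mut h = DefaultHasher::new();
    t.hash(&mut h);
    h.finish()
}

fn main() {
    let a = ShardIdentityLike { number: 1, count: 4, stripe_size: 32768 };
    let b = ShardIdentityLike { number: 1, count: 4, stripe_size: 65536 };
    // Unequal values are allowed to collide; only equal values must hash equally.
    assert!(a != b);
    assert_eq!(hash_of(&a), hash_of(&b));
}
```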
pub fn is_key_global(&self, key: &Key) -> bool { - if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() { + if key.is_slru_block_key() + || key.is_slru_segment_size_key() + || key.is_aux_file_key() + || key.is_slru_dir_key() + { // Special keys that are only stored on shard 0 false } else if key.is_rel_block_key() { @@ -314,7 +338,8 @@ pub fn describe( mod tests { use std::str::FromStr; - use utils::{id::TenantId, Hex}; + use utils::Hex; + use utils::id::TenantId; use super::*; diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 2e88836bd0..647d01c3c2 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -6,9 +6,9 @@ use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::{ - controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId, -}; +use crate::controller_api::NodeRegisterRequest; +use crate::models::LocationConfigMode; +use crate::shard::TenantShardId; /// Upcall message sent by the pageserver to the configured `control_plane_api` on /// startup. @@ -30,7 +30,7 @@ fn default_mode() -> LocationConfigMode { pub struct ReAttachResponseTenant { pub id: TenantShardId, /// Mandatory if LocationConfigMode is None or set to an Attached* mode - pub gen: Option, + pub r#gen: Option, /// Default value only for backward compat: this field should be set #[serde(default = "default_mode")] @@ -44,7 +44,7 @@ pub struct ReAttachResponse { #[derive(Serialize, Deserialize)] pub struct ValidateRequestTenant { pub id: TenantShardId, - pub gen: u32, + pub r#gen: u32, } #[derive(Serialize, Deserialize)] diff --git a/libs/pageserver_api/src/value.rs b/libs/pageserver_api/src/value.rs index 1f8ed30a9a..883d903ff3 100644 --- a/libs/pageserver_api/src/value.rs +++ b/libs/pageserver_api/src/value.rs @@ -7,10 +7,11 @@ //! Note that the [`Value`] type is used for the permananent storage format, so any //! changes to it must be backwards compatible. -use crate::record::NeonWalRecord; use bytes::Bytes; use serde::{Deserialize, Serialize}; +use crate::record::NeonWalRecord; + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value @@ -83,11 +84,11 @@ impl ValueBytes { #[cfg(test)] mod test { - use super::*; - use bytes::Bytes; use utils::bin_ser::BeSer; + use super::*; + macro_rules! 
roundtrip { ($orig:expr, $expected:expr) => {{ let orig: Value = $orig; diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 8c024375c1..f74b229ac4 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -9,6 +9,8 @@ use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::io::ErrorKind; use std::net::SocketAddr; +use std::os::fd::AsRawFd; +use std::os::fd::RawFd; use std::pin::Pin; use std::sync::Arc; use std::task::{ready, Poll}; @@ -268,6 +270,7 @@ impl MaybeWriteOnly { } pub struct PostgresBackend { + pub socket_fd: RawFd, framed: MaybeWriteOnly, pub state: ProtoState, @@ -293,9 +296,11 @@ impl PostgresBackend { tls_config: Option>, ) -> io::Result { let peer_addr = socket.peer_addr()?; + let socket_fd = socket.as_raw_fd(); let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { + socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, @@ -307,6 +312,7 @@ impl PostgresBackend { impl PostgresBackend { pub fn new_from_io( + socket_fd: RawFd, socket: IO, peer_addr: SocketAddr, auth_type: AuthType, @@ -315,6 +321,7 @@ impl PostgresBackend { let stream = MaybeTlsStream::Unencrypted(socket); Ok(Self { + socket_fd, framed: MaybeWriteOnly::Full(Framed::new(stream)), state: ProtoState::Initialization, auth_type, diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml index 19027d13ff..462fb4a533 100644 --- a/libs/postgres_connection/Cargo.toml +++ b/libs/postgres_connection/Cargo.toml @@ -7,7 +7,6 @@ license.workspace = true [dependencies] anyhow.workspace = true itertools.workspace = true -postgres.workspace = true tokio-postgres.workspace = true url.workspace = true diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index ddf9f7b610..e3d31c6cfc 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -171,10 +171,10 @@ impl PgConnectionConfig { tokio_postgres::Client, tokio_postgres::Connection, ), - postgres::Error, + tokio_postgres::Error, > { self.to_tokio_postgres_config() - .connect(postgres::NoTls) + .connect(tokio_postgres::NoTls) .await } } diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index e1f5443cbe..b7a376841d 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -9,9 +9,11 @@ regex.workspace = true bytes.workspace = true anyhow.workspace = true crc32c.workspace = true +criterion.workspace = true once_cell.workspace = true log.workspace = true memoffset.workspace = true +pprof.workspace = true thiserror.workspace = true serde.workspace = true utils.workspace = true @@ -24,3 +26,7 @@ postgres.workspace = true [build-dependencies] anyhow.workspace = true bindgen.workspace = true + +[[bench]] +name = "waldecoder" +harness = false diff --git a/libs/postgres_ffi/benches/README.md b/libs/postgres_ffi/benches/README.md new file mode 100644 index 0000000000..00a8980174 --- /dev/null +++ b/libs/postgres_ffi/benches/README.md @@ -0,0 +1,26 @@ +## Benchmarks + +To run benchmarks: + +```sh +# All benchmarks. +cargo bench --package postgres_ffi + +# Specific file. +cargo bench --package postgres_ffi --bench waldecoder + +# Specific benchmark. +cargo bench --package postgres_ffi --bench waldecoder complete_record/size=1024 + +# List available benchmarks. +cargo bench --package postgres_ffi --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. 
+# Output in target/criterion/*/profile/flamegraph.svg. +cargo bench --package postgres_ffi --bench waldecoder complete_record/size=1024 -- --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. \ No newline at end of file diff --git a/libs/postgres_ffi/benches/waldecoder.rs b/libs/postgres_ffi/benches/waldecoder.rs new file mode 100644 index 0000000000..c8cf0d322a --- /dev/null +++ b/libs/postgres_ffi/benches/waldecoder.rs @@ -0,0 +1,49 @@ +use std::ffi::CStr; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use postgres_ffi::v17::wal_generator::LogicalMessageGenerator; +use postgres_ffi::v17::waldecoder_handler::WalStreamDecoderHandler; +use postgres_ffi::waldecoder::WalStreamDecoder; +use pprof::criterion::{Output, PProfProfiler}; +use utils::lsn::Lsn; + +const KB: usize = 1024; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_complete_record, +); +criterion_main!(benches); + +/// Benchmarks WalStreamDecoder::complete_record() for a logical message of varying size. +fn bench_complete_record(c: &mut Criterion) { + let mut g = c.benchmark_group("complete_record"); + for size in [64, KB, 8 * KB, 128 * KB] { + // Kind of weird to change the group throughput per benchmark, but it's the only way + // to vary it per benchmark. It works. + g.throughput(criterion::Throughput::Bytes(size as u64)); + g.bench_function(format!("size={size}"), |b| run_bench(b, size).unwrap()); + } + + fn run_bench(b: &mut Bencher, size: usize) -> anyhow::Result<()> { + const PREFIX: &CStr = c""; + let value_size = LogicalMessageGenerator::make_value_size(size, PREFIX); + let value = vec![1; value_size]; + + let mut decoder = WalStreamDecoder::new(Lsn(0), 170000); + let msg = LogicalMessageGenerator::new(PREFIX, &value) + .next() + .unwrap() + .encode(Lsn(0)); + assert_eq!(msg.len(), size); + + b.iter(|| { + let msg = msg.clone(); // Bytes::clone() is cheap + decoder.complete_record(msg).unwrap(); + }); + + Ok(()) + } +} diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0239b56d9c..301bc2f16e 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -278,7 +278,7 @@ pub fn generate_pg_control( checkpoint_bytes: &[u8], lsn: Lsn, pg_version: u32, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { dispatch_pgversion!( pg_version, pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), diff --git a/libs/postgres_ffi/src/wal_generator.rs b/libs/postgres_ffi/src/wal_generator.rs index dc679eea33..a72b035e17 100644 --- a/libs/postgres_ffi/src/wal_generator.rs +++ b/libs/postgres_ffi/src/wal_generator.rs @@ -106,11 +106,11 @@ impl WalGenerator { const TIMELINE_ID: u32 = 1; /// Creates a new WAL generator with the given record generator. - pub fn new(record_generator: R) -> WalGenerator { + pub fn new(record_generator: R, start_lsn: Lsn) -> WalGenerator { Self { record_generator, - lsn: Lsn(0), - prev_lsn: Lsn(0), + lsn: start_lsn, + prev_lsn: start_lsn, } } @@ -231,6 +231,22 @@ impl LogicalMessageGenerator { }; [&header.encode(), prefix, message].concat().into() } + + /// Computes how large a value must be to get a record of the given size. 
Convenience method to + /// construct records of pre-determined size. Panics if the record size is too small. + pub fn make_value_size(record_size: usize, prefix: &CStr) -> usize { + let xlog_header_size = XLOG_SIZE_OF_XLOG_RECORD; + let lm_header_size = size_of::(); + let prefix_size = prefix.to_bytes_with_nul().len(); + let data_header_size = match record_size - xlog_header_size - 2 { + 0..=255 => 2, + 256..=258 => panic!("impossible record_size {record_size}"), + 259.. => 5, + }; + record_size + .checked_sub(xlog_header_size + lm_header_size + prefix_size + data_header_size) + .expect("record_size too small") + } } impl Iterator for LogicalMessageGenerator { diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs index b32106632a..fce37e2fdd 100644 --- a/libs/postgres_ffi/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError; use utils::lsn::Lsn; #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlMultiXactCreate { pub mid: MultiXactId, /* new MultiXact's ID */ @@ -46,7 +46,7 @@ impl XlMultiXactCreate { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlMultiXactTruncate { pub oldest_multi_db: Oid, /* to-be-truncated range of multixact offsets */ @@ -72,7 +72,7 @@ impl XlMultiXactTruncate { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlRelmapUpdate { pub dbid: Oid, /* database ID, or 0 for shared map */ pub tsid: Oid, /* database's tablespace, or pg_global */ @@ -90,7 +90,7 @@ impl XlRelmapUpdate { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlReploriginDrop { pub node_id: RepOriginId, } @@ -104,7 +104,7 @@ impl XlReploriginDrop { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlReploriginSet { pub remote_lsn: Lsn, pub node_id: RepOriginId, @@ -911,7 +911,7 @@ impl XlSmgrCreate { } #[repr(C)] -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlSmgrTruncate { pub blkno: BlockNumber, pub rnode: RelFileNode, @@ -984,7 +984,7 @@ impl XlDropDatabase { /// xl_xact_parsed_abort structs in PostgreSQL, but we use the same /// struct for commits and aborts. /// -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlXactParsedRecord { pub xid: TransactionId, pub info: u8, diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 852b20eace..14fb1f2a1f 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -124,23 +124,59 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { } } +/// Generate a pg_control file, for a basebackup for starting up Postgres at the given LSN +/// +/// 'pg_control_bytes' and 'checkpoint_bytes' are the contents of those keys persisted in +/// the pageserver. They use the same format as the PostgreSQL control file and the +/// checkpoint record, but see walingest.rs for how exactly they are kept up to date. +/// 'lsn' is the LSN at which we're starting up. +/// +/// Returns: +/// - pg_control file contents +/// - system_identifier, extracted from the persisted information +/// - true, if we're starting up from a "clean shutdown", i.e. 
if there was a shutdown +/// checkpoint at the given LSN pub fn generate_pg_control( pg_control_bytes: &[u8], checkpoint_bytes: &[u8], lsn: Lsn, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { let mut pg_control = ControlFileData::decode(pg_control_bytes)?; let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?; // Generate new pg_control needed for bootstrap + // + // NB: In the checkpoint struct that we persist in the pageserver, we have a different + // convention for the 'redo' field than in PostgreSQL: On a shutdown checkpoint, + // 'redo' points the *end* of the checkpoint WAL record. On PostgreSQL, it points to + // the beginning. Furthermore, on an online checkpoint, 'redo' is set to 0. + // + // We didn't always have this convention however, and old persisted records will have + // old REDO values that point to some old LSN. + // + // The upshot is that if 'redo' is equal to the "current" LSN, there was a shutdown + // checkpoint record at that point in WAL, with no new WAL records after it. That case + // can be treated as starting from a clean shutdown. All other cases are treated as + // non-clean shutdown. In Neon, we don't do WAL replay at startup in either case, so + // that distinction doesn't matter very much. As of this writing, it only affects + // whether the persisted pg_stats information can be used or not. + // + // In the Checkpoint struct in the returned pg_control file, the redo pointer is + // always set to the LSN we're starting at, to hint that no WAL replay is required. + // (There's some neon-specific code in Postgres startup to make that work, though. + // Just setting the redo pointer is not sufficient.) + let was_shutdown = Lsn(checkpoint.redo) == lsn; checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; - //save new values in pg_control + // We use DBState_DB_SHUTDOWNED even if it was not a clean shutdown. The + // neon-specific code at postgres startup ignores the state stored in the control + // file, similar to archive recovery in standalone PostgreSQL. Similarly, the + // checkPoint pointer is ignored, so just set it to 0. pg_control.checkPoint = 0; pg_control.checkPointCopy = checkpoint; pg_control.state = DBState_DB_SHUTDOWNED; - Ok((pg_control.encode(), pg_control.system_identifier)) + Ok((pg_control.encode(), pg_control.system_identifier, was_shutdown)) } pub fn get_current_timestamp() -> TimestampTz { diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 9524a5149b..77dff4ac99 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -76,7 +76,15 @@ impl Conf { let mut cmd = Command::new(path); cmd.env_clear() .env("LD_LIBRARY_PATH", self.pg_lib_dir()?) - .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?); + .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?) 
+ .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ); Ok(cmd) } diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 9eb3f0e95a..4a33dbe25b 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -81,7 +81,7 @@ fn test_end_of_wal(test_name: &str) { continue; } let mut f = File::options().write(true).open(file.path()).unwrap(); - const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; + static ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; f.write_all( &ZEROS[0..min( WAL_SEGMENT_SIZE, diff --git a/libs/postgres_initdb/src/lib.rs b/libs/postgres_initdb/src/lib.rs index 2f072354fb..ed54696861 100644 --- a/libs/postgres_initdb/src/lib.rs +++ b/libs/postgres_initdb/src/lib.rs @@ -64,6 +64,14 @@ pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> { .env_clear() .env("LD_LIBRARY_PATH", library_search_path) .env("DYLD_LIBRARY_PATH", library_search_path) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) .stdin(std::process::Stdio::null()) // stdout invocation produces the same output every time, we don't need it .stdout(std::process::Stdio::null()) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 94714359a3..f99128b76a 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -44,7 +44,7 @@ pub struct ProtocolVersion(u32); impl ProtocolVersion { pub const fn new(major: u16, minor: u16) -> Self { - Self((major as u32) << 16 | minor as u32) + Self(((major as u32) << 16) | minor as u32) } pub const fn minor(self) -> u16 { self.0 as u16 @@ -182,6 +182,13 @@ pub struct CancelKeyData { pub cancel_key: i32, } +pub fn id_to_cancel_key(id: u64) -> CancelKeyData { + CancelKeyData { + backend_pid: (id >> 32) as i32, + cancel_key: (id & 0xffffffff) as i32, + } +} + impl fmt::Display for CancelKeyData { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let hi = (self.backend_pid as u64) << 32; diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml index f71c1599c7..7ebb05eec1 100644 --- a/libs/proxy/postgres-protocol2/Cargo.toml +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-protocol2" version = "0.1.0" -edition = "2018" +edition = "2024" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index f2200a40ce..27e05e24ec 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -1,14 +1,12 @@ //! SASL-based authentication support. 
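The new `id_to_cancel_key` helper in `pq_proto` above simply splits a `u64` into the two `i32` halves of a `CancelKeyData`. The sketch below shows that the packing is lossless; `cancel_key_to_id` is a hypothetical inverse added here purely for illustration and is not part of the diff.

```rust
#[derive(Debug, PartialEq)]
struct CancelKeyData {
    backend_pid: i32,
    cancel_key: i32,
}

// Same packing as the helper added in pq_proto: high 32 bits -> backend_pid,
// low 32 bits -> cancel_key (both reinterpreted as i32).
fn id_to_cancel_key(id: u64) -> CancelKeyData {
    CancelKeyData {
        backend_pid: (id >> 32) as i32,
        cancel_key: (id & 0xffffffff) as i32,
    }
}

// Hypothetical inverse (illustration only): reassemble the id from the two halves.
fn cancel_key_to_id(key: &CancelKeyData) -> u64 {
    ((key.backend_pid as u32 as u64) << 32) | (key.cancel_key as u32 as u64)
}

fn main() {
    let id: u64 = 0xDEAD_BEEF_0BAD_F00D;
    let key = id_to_cancel_key(id);
    assert_eq!(cancel_key_to_id(&key), id);
}
```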
+use std::fmt::Write; +use std::{io, iter, mem, str}; + use hmac::{Hmac, Mac}; use rand::{self, Rng}; use sha2::digest::FixedOutput; use sha2::{Digest, Sha256}; -use std::fmt::Write; -use std::io; -use std::iter; -use std::mem; -use std::str; use tokio::task::yield_now; const NONCE_LENGTH: usize = 24; @@ -493,11 +491,9 @@ mod test { let nonce = "9IZ2O01zb9IgiIZ1WJ/zgpJB"; let client_first = "n,,n=,r=9IZ2O01zb9IgiIZ1WJ/zgpJB"; - let server_first = - "r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,s=fs3IXBy7U7+IvVjZ,i\ + let server_first = "r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,s=fs3IXBy7U7+IvVjZ,i\ =4096"; - let client_final = - "c=biws,r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,p=AmNKosjJzS3\ + let client_final = "c=biws,r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,p=AmNKosjJzS3\ 1NTlQYNs5BTeQjdHdk7lOflDo5re2an8="; let server_final = "v=U+ppxD5XUKtradnv8e2MkeupiA8FU87Sg8CXzXHDAzw="; diff --git a/libs/proxy/postgres-protocol2/src/lib.rs b/libs/proxy/postgres-protocol2/src/lib.rs index 947f2f835d..afbd1e92bd 100644 --- a/libs/proxy/postgres-protocol2/src/lib.rs +++ b/libs/proxy/postgres-protocol2/src/lib.rs @@ -9,12 +9,12 @@ //! //! This library assumes that the `client_encoding` backend parameter has been //! set to `UTF8`. It will most likely not behave properly if that is not the case. -#![doc(html_root_url = "https://docs.rs/postgres-protocol/0.6")] -#![warn(missing_docs, rust_2018_idioms, clippy::all)] +#![warn(missing_docs, clippy::all)] + +use std::io; use byteorder::{BigEndian, ByteOrder}; use bytes::{BufMut, BytesMut}; -use std::io; pub mod authentication; pub mod escape; diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index 097964f9c1..d7eaef9509 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -1,13 +1,13 @@ #![allow(missing_docs)] +use std::io::{self, Read}; +use std::ops::Range; +use std::{cmp, str}; + use byteorder::{BigEndian, ByteOrder, ReadBytesExt}; use bytes::{Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use memchr::memchr; -use std::cmp; -use std::io::{self, Read}; -use std::ops::Range; -use std::str; use crate::Oid; diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs index bc6168f337..b447290ea8 100644 --- a/libs/proxy/postgres-protocol2/src/message/frontend.rs +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -1,14 +1,13 @@ //! Frontend message serialization. #![allow(missing_docs)] +use std::error::Error; +use std::{io, marker}; + use byteorder::{BigEndian, ByteOrder}; use bytes::{Buf, BufMut, BytesMut}; -use std::convert::TryFrom; -use std::error::Error; -use std::io; -use std::marker; -use crate::{write_nullable, FromUsize, IsNull, Oid}; +use crate::{FromUsize, IsNull, Oid, write_nullable}; #[inline] fn write_body(buf: &mut BytesMut, f: F) -> Result<(), E> diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs index 38eb31dfcf..4cd9bfb060 100644 --- a/libs/proxy/postgres-protocol2/src/password/mod.rs +++ b/libs/proxy/postgres-protocol2/src/password/mod.rs @@ -6,12 +6,13 @@ //! side. This is good because it ensures the cleartext password won't //! end up in logs pg_stat displays, etc. 
-use crate::authentication::sasl; use hmac::{Hmac, Mac}; use rand::RngCore; use sha2::digest::FixedOutput; use sha2::{Digest, Sha256}; +use crate::authentication::sasl; + #[cfg(test)] mod test; diff --git a/libs/proxy/postgres-protocol2/src/types/mod.rs b/libs/proxy/postgres-protocol2/src/types/mod.rs index 78131c05bf..6a9b334bcb 100644 --- a/libs/proxy/postgres-protocol2/src/types/mod.rs +++ b/libs/proxy/postgres-protocol2/src/types/mod.rs @@ -1,11 +1,12 @@ //! Conversions to and from Postgres's binary format for various types. -use byteorder::{BigEndian, ReadBytesExt}; -use bytes::{BufMut, BytesMut}; -use fallible_iterator::FallibleIterator; use std::boxed::Box as StdBox; use std::error::Error; use std::str; +use byteorder::{BigEndian, ReadBytesExt}; +use bytes::{BufMut, BytesMut}; +use fallible_iterator::FallibleIterator; + use crate::Oid; #[cfg(test)] diff --git a/libs/proxy/postgres-types2/Cargo.toml b/libs/proxy/postgres-types2/Cargo.toml index 58cfb5571f..25ad23ba35 100644 --- a/libs/proxy/postgres-types2/Cargo.toml +++ b/libs/proxy/postgres-types2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-types2" version = "0.1.0" -edition = "2018" +edition = "2024" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index 18ba032151..0ccd8c295f 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -2,22 +2,20 @@ //! //! This crate is used by the `tokio-postgres` and `postgres` crates. You normally don't need to depend directly on it //! unless you want to define your own `ToSql` or `FromSql` definitions. -#![doc(html_root_url = "https://docs.rs/postgres-types/0.2")] -#![warn(clippy::all, rust_2018_idioms, missing_docs)] +#![warn(clippy::all, missing_docs)] -use fallible_iterator::FallibleIterator; -use postgres_protocol2::types; use std::any::type_name; use std::error::Error; use std::fmt; use std::sync::Arc; -use crate::type_gen::{Inner, Other}; - +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; #[doc(inline)] pub use postgres_protocol2::Oid; +use postgres_protocol2::types; -use bytes::BytesMut; +use crate::type_gen::{Inner, Other}; /// Generates a simple implementation of `ToSql::accepts` which accepts the /// types passed to it. 
diff --git a/libs/proxy/postgres-types2/src/private.rs b/libs/proxy/postgres-types2/src/private.rs index 774f9a301c..188b982812 100644 --- a/libs/proxy/postgres-types2/src/private.rs +++ b/libs/proxy/postgres-types2/src/private.rs @@ -1,7 +1,9 @@ -use crate::{FromSql, Type}; -pub use bytes::BytesMut; use std::error::Error; +pub use bytes::BytesMut; + +use crate::{FromSql, Type}; + pub fn read_be_i32(buf: &mut &[u8]) -> Result> { if buf.len() < 4 { return Err("invalid buffer size".into()); diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index 7130c1b726..540876742f 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -1,21 +1,19 @@ [package] name = "tokio-postgres2" version = "0.1.0" -edition = "2018" +edition = "2024" license = "MIT/Apache-2.0" [dependencies] -async-trait.workspace = true bytes.workspace = true -byteorder.workspace = true fallible-iterator.workspace = true futures-util = { workspace = true, features = ["sink"] } log = "0.4" parking_lot.workspace = true -percent-encoding = "2.0" pin-project-lite.workspace = true phf = "0.11" postgres-protocol2 = { path = "../postgres-protocol2" } postgres-types2 = { path = "../postgres-types2" } tokio = { workspace = true, features = ["io-util", "time", "net"] } tokio-util = { workspace = true, features = ["codec"] } +serde = { workspace = true, features = ["derive"] } diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs index cddbf16336..b65fb571e6 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -1,10 +1,11 @@ +use std::io; + use tokio::net::TcpStream; use crate::client::SocketConfig; use crate::config::{Host, SslMode}; use crate::tls::MakeTlsConnect; -use crate::{cancel_query_raw, connect_socket, Error}; -use std::io; +use crate::{Error, cancel_query_raw, connect_socket}; pub(crate) async fn cancel_query( config: Option, @@ -22,7 +23,7 @@ where return Err(Error::connect(io::Error::new( io::ErrorKind::InvalidInput, "unknown host", - ))) + ))); } }; diff --git a/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs index 8c08296435..c720214e9b 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs @@ -1,10 +1,11 @@ -use crate::config::SslMode; -use crate::tls::TlsConnect; -use crate::{connect_tls, Error}; use bytes::BytesMut; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use crate::config::SslMode; +use crate::tls::TlsConnect; +use crate::{Error, connect_tls}; + pub async fn cancel_query_raw( stream: S, mode: SslMode, diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs index a10e8bf5c3..f6526395ee 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_token.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -1,14 +1,15 @@ -use crate::config::SslMode; -use crate::tls::TlsConnect; - -use crate::{cancel_query, client::SocketConfig, tls::MakeTlsConnect}; -use crate::{cancel_query_raw, Error}; +use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpStream; +use crate::client::SocketConfig; +use crate::config::SslMode; +use crate::tls::{MakeTlsConnect, TlsConnect}; +use crate::{Error, cancel_query, cancel_query_raw}; + /// The capability to request cancellation of 
in-progress queries on a /// connection. -#[derive(Clone)] +#[derive(Clone, Serialize, Deserialize)] pub struct CancelToken { pub socket_config: Option, pub ssl_mode: SslMode, diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index a7cd53afc3..39b1db75da 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -1,30 +1,28 @@ -use crate::codec::{BackendMessages, FrontendMessage}; - -use crate::config::Host; -use crate::config::SslMode; -use crate::connection::{Request, RequestMessages}; - -use crate::query::RowStream; -use crate::simple_query::SimpleQueryStream; - -use crate::types::{Oid, ToSql, Type}; - -use crate::{ - prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, - SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder, -}; -use bytes::BytesMut; -use fallible_iterator::FallibleIterator; -use futures_util::{future, ready, TryStreamExt}; -use parking_lot::Mutex; -use postgres_protocol2::message::{backend::Message, frontend}; use std::collections::HashMap; use std::fmt; use std::sync::Arc; use std::task::{Context, Poll}; +use std::time::Duration; + +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{TryStreamExt, future, ready}; +use parking_lot::Mutex; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use serde::{Deserialize, Serialize}; use tokio::sync::mpsc; -use std::time::Duration; +use crate::codec::{BackendMessages, FrontendMessage}; +use crate::config::{Host, SslMode}; +use crate::connection::{Request, RequestMessages}; +use crate::query::RowStream; +use crate::simple_query::SimpleQueryStream; +use crate::types::{Oid, ToSql, Type}; +use crate::{ + CancelToken, Error, ReadyForQueryStatus, Row, SimpleQueryMessage, Statement, Transaction, + TransactionBuilder, query, simple_query, slice_iter, +}; pub struct Responses { receiver: mpsc::Receiver, @@ -53,18 +51,18 @@ impl Responses { } /// A cache of type info and prepared statements for fetching type info -/// (corresponding to the queries in the [prepare] module). +/// (corresponding to the queries in the [crate::prepare] module). #[derive(Default)] struct CachedTypeInfo { /// A statement for basic information for a type from its - /// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its + /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its /// fallback). typeinfo: Option, /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY). + /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY). typeinfo_composite: Option, /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY) (or + /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or /// its fallback). typeinfo_enum: Option, @@ -137,7 +135,7 @@ impl InnerClient { } } -#[derive(Clone)] +#[derive(Clone, Serialize, Deserialize)] pub struct SocketConfig { pub host: Host, pub port: u16, @@ -189,26 +187,6 @@ impl Client { &self.inner } - /// Creates a new prepared statement. - /// - /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc), - /// which are set when executed. 
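Deriving Serialize and Deserialize for CancelToken (and, below, for SocketConfig, SslMode and Host) presumably lets the proxy persist a token or hand it to another process and still cancel the query later. A minimal round-trip sketch; serde_json is an illustration-only assumption, any serde format would do.

    use tokio_postgres2::CancelToken;

    fn roundtrip_token(token: &CancelToken) -> serde_json::Result<CancelToken> {
        // JSON is used here purely for illustration; the derive works with any serde format.
        let encoded = serde_json::to_vec(token)?;
        serde_json::from_slice(&encoded)
    }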
Prepared statements can only be used with the connection that created them. - pub async fn prepare(&self, query: &str) -> Result { - self.prepare_typed(query, &[]).await - } - - /// Like `prepare`, but allows the types of query parameters to be explicitly specified. - /// - /// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be - /// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`. - pub async fn prepare_typed( - &self, - query: &str, - parameter_types: &[Type], - ) -> Result { - prepare::prepare(&self.inner, query, parameter_types).await - } - /// Executes a statement, returning a vector of the resulting rows. /// /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list @@ -221,14 +199,11 @@ impl Client { /// # Panics /// /// Panics if the number of parameters provided does not match the number expected. - pub async fn query( + pub async fn query( &self, - statement: &T, + statement: Statement, params: &[&(dyn ToSql + Sync)], - ) -> Result, Error> - where - T: ?Sized + ToStatement, - { + ) -> Result, Error> { self.query_raw(statement, slice_iter(params)) .await? .try_collect() @@ -249,13 +224,15 @@ impl Client { /// Panics if the number of parameters provided does not match the number expected. /// /// [`query`]: #method.query - pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result + pub async fn query_raw<'a, I>( + &self, + statement: Statement, + params: I, + ) -> Result where - T: ?Sized + ToStatement, I: IntoIterator, I::IntoIter: ExactSizeIterator, { - let statement = statement.__convert().into_statement(self).await?; query::query(&self.inner, statement, params).await } @@ -270,55 +247,6 @@ impl Client { query::query_txt(&self.inner, statement, params).await } - /// Executes a statement, returning the number of rows modified. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. - pub async fn execute( - &self, - statement: &T, - params: &[&(dyn ToSql + Sync)], - ) -> Result - where - T: ?Sized + ToStatement, - { - self.execute_raw(statement, slice_iter(params)).await - } - - /// The maximally flexible version of [`execute`]. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. 
- /// - /// [`execute`]: #method.execute - pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result - where - T: ?Sized + ToStatement, - I: IntoIterator, - I::IntoIter: ExactSizeIterator, - { - let statement = statement.__convert().into_statement(self).await?; - query::execute(self.inner(), statement, params).await - } - /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows. /// /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs index 0ec46198ce..f1fd9b47b3 100644 --- a/libs/proxy/tokio-postgres2/src/codec.rs +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -1,8 +1,9 @@ +use std::io; + use bytes::{Buf, Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend; use postgres_protocol2::message::frontend::CopyData; -use std::io; use tokio_util::codec::{Decoder, Encoder}; pub enum FrontendMessage { diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 11a361a81b..4c25491b67 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -1,22 +1,21 @@ //! Connection configuration. -use crate::connect::connect; -use crate::connect_raw::connect_raw; -use crate::connect_raw::RawConnection; -use crate::tls::MakeTlsConnect; -use crate::tls::TlsConnect; -use crate::{Client, Connection, Error}; -use postgres_protocol2::message::frontend::StartupMessageParams; -use std::fmt; -use std::str; use std::time::Duration; -use tokio::io::{AsyncRead, AsyncWrite}; +use std::{fmt, str}; pub use postgres_protocol2::authentication::sasl::ScramKeys; +use postgres_protocol2::message::frontend::StartupMessageParams; +use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpStream; +use crate::connect::connect; +use crate::connect_raw::{RawConnection, connect_raw}; +use crate::tls::{MakeTlsConnect, TlsConnect}; +use crate::{Client, Connection, Error}; + /// TLS configuration. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] #[non_exhaustive] pub enum SslMode { /// Do not use TLS. @@ -50,7 +49,7 @@ pub enum ReplicationMode { } /// A host specification. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum Host { /// A TCP hostname. 
Tcp(String), diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index e0cb69748d..d2bd0dfbcd 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,3 +1,7 @@ +use postgres_protocol2::message::backend::Message; +use tokio::net::TcpStream; +use tokio::sync::mpsc; + use crate::client::SocketConfig; use crate::codec::BackendMessage; use crate::config::Host; @@ -5,9 +9,6 @@ use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; use crate::tls::{MakeTlsConnect, TlsConnect}; use crate::{Client, Config, Connection, Error, RawConnection}; -use postgres_protocol2::message::backend::Message; -use tokio::net::TcpStream; -use tokio::sync::mpsc; pub async fn connect( mut tls: T, diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 66db85e07d..20dc538cf2 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -1,22 +1,24 @@ +use std::collections::HashMap; +use std::io; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{Sink, SinkExt, Stream, TryStreamExt, ready}; +use postgres_protocol2::authentication::sasl; +use postgres_protocol2::authentication::sasl::ScramSha256; +use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; +use postgres_protocol2::message::frontend; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_util::codec::Framed; + +use crate::Error; use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; use crate::config::{self, AuthKeys, Config}; use crate::connect_tls::connect_tls; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::{TlsConnect, TlsStream}; -use crate::Error; -use bytes::BytesMut; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt}; -use postgres_protocol2::authentication::sasl; -use postgres_protocol2::authentication::sasl::ScramSha256; -use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; -use postgres_protocol2::message::frontend; -use std::collections::HashMap; -use std::io; -use std::pin::Pin; -use std::task::{Context, Poll}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_util::codec::Framed; pub struct StartupStream { inner: Framed, PostgresCodec>, @@ -158,7 +160,7 @@ where | Some(Message::AuthenticationSspi) => { return Err(Error::authentication( "unsupported authentication method".into(), - )) + )); } Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), Some(_) => return Err(Error::unexpected_message()), diff --git a/libs/proxy/tokio-postgres2/src/connect_socket.rs b/libs/proxy/tokio-postgres2/src/connect_socket.rs index 336a13317f..15411f7ef3 100644 --- a/libs/proxy/tokio-postgres2/src/connect_socket.rs +++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs @@ -1,11 +1,13 @@ -use crate::config::Host; -use crate::Error; use std::future::Future; use std::io; use std::time::Duration; + use tokio::net::{self, TcpStream}; use tokio::time; +use crate::Error; +use crate::config::Host; + pub(crate) async fn connect_socket( host: &Host, port: u16, diff --git a/libs/proxy/tokio-postgres2/src/connect_tls.rs b/libs/proxy/tokio-postgres2/src/connect_tls.rs index 64b0b68abc..4dc929a9e2 100644 --- a/libs/proxy/tokio-postgres2/src/connect_tls.rs +++ 
b/libs/proxy/tokio-postgres2/src/connect_tls.rs @@ -1,12 +1,13 @@ -use crate::config::SslMode; -use crate::maybe_tls_stream::MaybeTlsStream; -use crate::tls::private::ForcePrivateApi; -use crate::tls::TlsConnect; -use crate::Error; use bytes::BytesMut; use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; +use crate::Error; +use crate::config::SslMode; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::tls::TlsConnect; +use crate::tls::private::ForcePrivateApi; + pub async fn connect_tls( mut stream: S, mode: SslMode, @@ -19,7 +20,7 @@ where match mode { SslMode::Disable => return Ok(MaybeTlsStream::Raw(stream)), SslMode::Prefer if !tls.can_connect(ForcePrivateApi) => { - return Ok(MaybeTlsStream::Raw(stream)) + return Ok(MaybeTlsStream::Raw(stream)); } SslMode::Prefer | SslMode::Require => {} } diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs index 0aa5c77e22..60e39b3b44 100644 --- a/libs/proxy/tokio-postgres2/src/connection.rs +++ b/libs/proxy/tokio-postgres2/src/connection.rs @@ -1,22 +1,24 @@ -use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; -use crate::error::DbError; -use crate::maybe_tls_stream::MaybeTlsStream; -use crate::{AsyncMessage, Error, Notification}; -use bytes::BytesMut; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Sink, Stream}; -use log::{info, trace}; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; use std::collections::{HashMap, VecDeque}; use std::future::Future; use std::pin::Pin; use std::task::{Context, Poll}; + +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{Sink, Stream, ready}; +use log::{info, trace}; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc; use tokio_util::codec::Framed; use tokio_util::sync::PollSender; +use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; +use crate::error::DbError; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::{AsyncMessage, Error, Notification}; + pub enum RequestMessages { Single(FrontendMessage), } @@ -33,10 +35,14 @@ pub struct Response { #[derive(PartialEq, Debug)] enum State { Active, - Terminating, Closing, } +enum WriteReady { + Terminating, + WaitingOnRead, +} + /// A connection to a PostgreSQL database. /// /// This is one half of what is returned when a new connection is established. It performs the actual IO with the @@ -51,7 +57,6 @@ pub struct Connection { /// HACK: we need this in the Neon Proxy to forward params. pub parameters: HashMap, receiver: mpsc::UnboundedReceiver, - pending_request: Option, pending_responses: VecDeque, responses: VecDeque, state: State, @@ -72,7 +77,6 @@ where stream, parameters, receiver, - pending_request: None, pending_responses, responses: VecDeque::new(), state: State::Active, @@ -93,26 +97,23 @@ where .map(|o| o.map(|r| r.map_err(Error::io))) } - fn poll_read(&mut self, cx: &mut Context<'_>) -> Result, Error> { - if self.state != State::Active { - trace!("poll_read: done"); - return Ok(None); - } - + /// Read and process messages from the connection to postgres. + /// client <- postgres + fn poll_read(&mut self, cx: &mut Context<'_>) -> Poll> { loop { let message = match self.poll_response(cx)? 
{ Poll::Ready(Some(message)) => message, - Poll::Ready(None) => return Err(Error::closed()), + Poll::Ready(None) => return Poll::Ready(Err(Error::closed())), Poll::Pending => { trace!("poll_read: waiting on response"); - return Ok(None); + return Poll::Pending; } }; let (mut messages, request_complete) = match message { BackendMessage::Async(Message::NoticeResponse(body)) => { let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?; - return Ok(Some(AsyncMessage::Notice(error))); + return Poll::Ready(Ok(AsyncMessage::Notice(error))); } BackendMessage::Async(Message::NotificationResponse(body)) => { let notification = Notification { @@ -120,7 +121,7 @@ where channel: body.channel().map_err(Error::parse)?.to_string(), payload: body.message().map_err(Error::parse)?.to_string(), }; - return Ok(Some(AsyncMessage::Notification(notification))); + return Poll::Ready(Ok(AsyncMessage::Notification(notification))); } BackendMessage::Async(Message::ParameterStatus(body)) => { self.parameters.insert( @@ -139,8 +140,10 @@ where let mut response = match self.responses.pop_front() { Some(response) => response, None => match messages.next().map_err(Error::parse)? { - Some(Message::ErrorResponse(error)) => return Err(Error::db(error)), - _ => return Err(Error::unexpected_message()), + Some(Message::ErrorResponse(error)) => { + return Poll::Ready(Err(Error::db(error))); + } + _ => return Poll::Ready(Err(Error::unexpected_message())), }, }; @@ -164,18 +167,14 @@ where request_complete, }); trace!("poll_read: waiting on sender"); - return Ok(None); + return Poll::Pending; } } } } + /// Fetch the next client request and enqueue the response sender. fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll> { - if let Some(messages) = self.pending_request.take() { - trace!("retrying pending request"); - return Poll::Ready(Some(messages)); - } - if self.receiver.is_closed() { return Poll::Ready(None); } @@ -193,74 +192,80 @@ where } } - fn poll_write(&mut self, cx: &mut Context<'_>) -> Result { + /// Process client requests and write them to the postgres connection, flushing if necessary. + /// client -> postgres + fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll> { loop { - if self.state == State::Closing { - trace!("poll_write: done"); - return Ok(false); - } - if Pin::new(&mut self.stream) .poll_ready(cx) .map_err(Error::io)? .is_pending() { trace!("poll_write: waiting on socket"); - return Ok(false); + + // poll_ready is self-flushing. + return Poll::Pending; } - let request = match self.poll_request(cx) { - Poll::Ready(Some(request)) => request, - Poll::Ready(None) if self.responses.is_empty() && self.state == State::Active => { + match self.poll_request(cx) { + // send the message to postgres + Poll::Ready(Some(RequestMessages::Single(request))) => { + Pin::new(&mut self.stream) + .start_send(request) + .map_err(Error::io)?; + } + // No more messages from the client, and no more responses to wait for. 
+ // Send a terminate message to postgres + Poll::Ready(None) if self.responses.is_empty() => { trace!("poll_write: at eof, terminating"); - self.state = State::Terminating; let mut request = BytesMut::new(); frontend::terminate(&mut request); - RequestMessages::Single(FrontendMessage::Raw(request.freeze())) + let request = FrontendMessage::Raw(request.freeze()); + + Pin::new(&mut self.stream) + .start_send(request) + .map_err(Error::io)?; + + trace!("poll_write: sent eof, closing"); + trace!("poll_write: done"); + return Poll::Ready(Ok(WriteReady::Terminating)); } + // No more messages from the client, but there are still some responses to wait for. Poll::Ready(None) => { trace!( "poll_write: at eof, pending responses {}", self.responses.len() ); - return Ok(true); + ready!(self.poll_flush(cx))?; + return Poll::Ready(Ok(WriteReady::WaitingOnRead)); } + // Still waiting for a message from the client. Poll::Pending => { trace!("poll_write: waiting on request"); - return Ok(true); - } - }; - - match request { - RequestMessages::Single(request) => { - Pin::new(&mut self.stream) - .start_send(request) - .map_err(Error::io)?; - if self.state == State::Terminating { - trace!("poll_write: sent eof, closing"); - self.state = State::Closing; - } + ready!(self.poll_flush(cx))?; + return Poll::Pending; } } } } - fn poll_flush(&mut self, cx: &mut Context<'_>) -> Result<(), Error> { + fn poll_flush(&mut self, cx: &mut Context<'_>) -> Poll> { match Pin::new(&mut self.stream) .poll_flush(cx) .map_err(Error::io)? { - Poll::Ready(()) => trace!("poll_flush: flushed"), - Poll::Pending => trace!("poll_flush: waiting on socket"), + Poll::Ready(()) => { + trace!("poll_flush: flushed"); + Poll::Ready(Ok(())) + } + Poll::Pending => { + trace!("poll_flush: waiting on socket"); + Poll::Pending + } } - Ok(()) } fn poll_shutdown(&mut self, cx: &mut Context<'_>) -> Poll> { - if self.state != State::Closing { - return Poll::Pending; - } - match Pin::new(&mut self.stream) .poll_close(cx) .map_err(Error::io)? @@ -289,18 +294,30 @@ where &mut self, cx: &mut Context<'_>, ) -> Poll>> { - let message = self.poll_read(cx)?; - let want_flush = self.poll_write(cx)?; - if want_flush { - self.poll_flush(cx)?; + if self.state != State::Closing { + // if the state is still active, try read from and write to postgres. + let message = self.poll_read(cx)?; + let closing = self.poll_write(cx)?; + if let Poll::Ready(WriteReady::Terminating) = closing { + self.state = State::Closing; + } + + if let Poll::Ready(message) = message { + return Poll::Ready(Some(Ok(message))); + } + + // poll_read returned Pending. + // poll_write returned Pending or Ready(WriteReady::WaitingOnRead). + // if poll_write returned Ready(WriteReady::WaitingOnRead), then we are waiting to read more data from postgres. + if self.state != State::Closing { + return Poll::Pending; + } } - match message { - Some(message) => Poll::Ready(Some(Ok(message))), - None => match self.poll_shutdown(cx) { - Poll::Ready(Ok(())) => Poll::Ready(None), - Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))), - Poll::Pending => Poll::Pending, - }, + + match self.poll_shutdown(cx) { + Poll::Ready(Ok(())) => Poll::Ready(None), + Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))), + Poll::Pending => Poll::Pending, } } } diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs index 922c348525..b12e76e5bf 100644 --- a/libs/proxy/tokio-postgres2/src/error/mod.rs +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -1,10 +1,10 @@ //! Errors. 
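The poll_message rewrite in connection.rs above folds the old Terminating state into poll_write's return value. A standalone model of that transition, using stand-in enums rather than the crate's real types:

    #[derive(Clone, Copy, PartialEq)]
    enum State {
        Active,
        Closing,
    }

    enum WriteReady {
        Terminating,
        WaitingOnRead,
    }

    fn next_state(current: State, write_result: Option<WriteReady>) -> State {
        match (current, write_result) {
            // The terminate message was sent to postgres: stop polling reads/writes
            // and start shutting the stream down.
            (State::Active, Some(WriteReady::Terminating)) => State::Closing,
            // Otherwise: still reading/writing, or already closing.
            (state, _) => state,
        }
    }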
+use std::error::{self, Error as _Error}; +use std::{fmt, io}; + use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody}; -use std::error::{self, Error as _Error}; -use std::fmt; -use std::io; pub use self::sqlstate::*; diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 768213f8ed..31c3d8fa3e 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -1,8 +1,10 @@ +#![allow(async_fn_in_trait)] + +use postgres_protocol2::Oid; + use crate::query::RowStream; use crate::types::Type; use crate::{Client, Error, Transaction}; -use async_trait::async_trait; -use postgres_protocol2::Oid; mod private { pub trait Sealed {} @@ -11,7 +13,6 @@ mod private { /// A trait allowing abstraction over connections and transactions. /// /// This trait is "sealed", and cannot be implemented outside of this crate. -#[async_trait] pub trait GenericClient: private::Sealed { /// Like `Client::query_raw_txt`. async fn query_raw_txt(&self, statement: &str, params: I) -> Result @@ -26,7 +27,6 @@ pub trait GenericClient: private::Sealed { impl private::Sealed for Client {} -#[async_trait] impl GenericClient for Client { async fn query_raw_txt(&self, statement: &str, params: I) -> Result where @@ -39,14 +39,12 @@ impl GenericClient for Client { /// Query for type information async fn get_type(&self, oid: Oid) -> Result { - self.get_type(oid).await + crate::prepare::get_type(self.inner(), oid).await } } impl private::Sealed for Transaction<'_> {} -#[async_trait] -#[allow(clippy::needless_lifetimes)] impl GenericClient for Transaction<'_> { async fn query_raw_txt(&self, statement: &str, params: I) -> Result where diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 901ed0c96c..c8ebba5487 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -1,5 +1,7 @@ //! An asynchronous, pipelined, PostgreSQL client. -#![warn(rust_2018_idioms, clippy::all)] +#![warn(clippy::all)] + +use postgres_protocol2::message::backend::ReadyForQueryBody; pub use crate::cancel_token::CancelToken; pub use crate::client::{Client, SocketConfig}; @@ -14,11 +16,9 @@ pub use crate::row::{Row, SimpleQueryRow}; pub use crate::simple_query::SimpleQueryStream; pub use crate::statement::{Column, Statement}; pub use crate::tls::NoTls; -pub use crate::to_statement::ToStatement; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; use crate::types::ToSql; -use postgres_protocol2::message::backend::ReadyForQueryBody; /// After executing a query, the connection will be in one of these states #[derive(Clone, Copy, Debug, PartialEq)] @@ -65,7 +65,6 @@ pub mod row; mod simple_query; mod statement; pub mod tls; -mod to_statement; mod transaction; mod transaction_builder; pub mod types; diff --git a/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs index 9a7e248997..4aa838613e 100644 --- a/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs +++ b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs @@ -1,12 +1,14 @@ //! MaybeTlsStream. //! //! Represents a stream that may or may not be encrypted with TLS. 
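With #[async_trait] removed from GenericClient above, the trait's methods are native async fns, so generic callers simply await them. A sketch under the assumption that GenericClient, Error and the types module keep their usual crate-root re-exports and that Type::name() exists as in upstream postgres-types:

    use tokio_postgres2::types::Oid;
    use tokio_postgres2::{Error, GenericClient};

    async fn type_name_of<C: GenericClient>(client: &C, oid: Oid) -> Result<String, Error> {
        // For a plain Client this now routes through crate::prepare::get_type internally.
        let ty = client.get_type(oid).await?;
        Ok(ty.name().to_string())
    }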
-use crate::tls::{ChannelBinding, TlsStream}; use std::io; use std::pin::Pin; use std::task::{Context, Poll}; + use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use crate::tls::{ChannelBinding, TlsStream}; + /// A stream that may or may not be encrypted with TLS. pub enum MaybeTlsStream { /// An unencrypted stream. diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index da0c755c5b..b36d2e5f74 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -1,20 +1,19 @@ -use crate::client::InnerClient; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; -use crate::error::SqlState; -use crate::types::{Field, Kind, Oid, Type}; -use crate::{query, slice_iter}; -use crate::{Column, Error, Statement}; +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; + use bytes::Bytes; use fallible_iterator::FallibleIterator; -use futures_util::{pin_mut, TryStreamExt}; +use futures_util::{TryStreamExt, pin_mut}; use log::debug; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; -use std::future::Future; -use std::pin::Pin; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; + +use crate::client::InnerClient; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::types::{Field, Kind, Oid, Type}; +use crate::{Column, Error, Statement, query, slice_iter}; pub(crate) const TYPEINFO_QUERY: &str = "\ SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid @@ -24,14 +23,6 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid WHERE t.oid = $1 "; -// Range types weren't added until Postgres 9.2, so pg_range may not exist -const TYPEINFO_FALLBACK_QUERY: &str = "\ -SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid -FROM pg_catalog.pg_type t -INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid -WHERE t.oid = $1 -"; - const TYPEINFO_ENUM_QUERY: &str = "\ SELECT enumlabel FROM pg_catalog.pg_enum @@ -39,14 +30,6 @@ WHERE enumtypid = $1 ORDER BY enumsortorder "; -// Postgres 9.0 didn't have enumsortorder -const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\ -SELECT enumlabel -FROM pg_catalog.pg_enum -WHERE enumtypid = $1 -ORDER BY oid -"; - pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\ SELECT attname, atttypid FROM pg_catalog.pg_attribute @@ -56,15 +39,13 @@ AND attnum > 0 ORDER BY attnum "; -static NEXT_ID: AtomicUsize = AtomicUsize::new(0); - pub async fn prepare( client: &Arc, + name: &'static str, query: &str, types: &[Type], ) -> Result { - let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst)); - let buf = encode(client, &name, query, types)?; + let buf = encode(client, name, query, types)?; let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; match responses.next().await? { @@ -105,10 +86,11 @@ pub async fn prepare( fn prepare_rec<'a>( client: &'a Arc, + name: &'static str, query: &'a str, types: &'a [Type], ) -> Pin> + 'a + Send>> { - Box::pin(prepare(client, query, types)) + Box::pin(prepare(client, name, query, types)) } fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { @@ -192,13 +174,8 @@ async fn typeinfo_statement(client: &Arc) -> Result stmt, - Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => { - prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await? 
- } - Err(e) => return Err(e), - }; + let typeinfo = "neon_proxy_typeinfo"; + let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?; client.set_typeinfo(&stmt); Ok(stmt) @@ -219,13 +196,8 @@ async fn typeinfo_enum_statement(client: &Arc) -> Result stmt, - Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => { - prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await? - } - Err(e) => return Err(e), - }; + let typeinfo = "neon_proxy_typeinfo_enum"; + let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?; client.set_typeinfo_enum(&stmt); Ok(stmt) @@ -255,7 +227,8 @@ async fn typeinfo_composite_statement(client: &Arc) -> Result(&'a [&'a (dyn ToSql + Sync)]); impl fmt::Debug for BorrowToSqlParamsDebug<'_> { @@ -157,49 +159,6 @@ where }) } -pub async fn execute<'a, I>( - client: &InnerClient, - statement: Statement, - params: I, -) -> Result -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - let buf = if log_enabled!(Level::Debug) { - let params = params.into_iter().collect::>(); - debug!( - "executing statement {} with parameters: {:?}", - statement.name(), - BorrowToSqlParamsDebug(params.as_slice()), - ); - encode(client, &statement, params)? - } else { - encode(client, &statement, params)? - }; - let mut responses = start(client, buf).await?; - - let mut rows = 0; - loop { - match responses.next().await? { - Message::DataRow(_) => {} - Message::CommandComplete(body) => { - rows = body - .tag() - .map_err(Error::parse)? - .rsplit(' ') - .next() - .unwrap() - .parse() - .unwrap_or(0); - } - Message::EmptyQueryResponse => rows = 0, - Message::ReadyForQuery(_) => return Ok(rows), - _ => return Err(Error::unexpected_message()), - } - } -} - async fn start(client: &InnerClient, buf: Bytes) -> Result { let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; @@ -300,7 +259,7 @@ impl Stream for RowStream { this.statement.clone(), body, *this.output_format, - )?))) + )?))); } Message::EmptyQueryResponse | Message::PortalSuspended => {} Message::CommandComplete(body) => { diff --git a/libs/proxy/tokio-postgres2/src/row.rs b/libs/proxy/tokio-postgres2/src/row.rs index 10e130707d..5fc955eef4 100644 --- a/libs/proxy/tokio-postgres2/src/row.rs +++ b/libs/proxy/tokio-postgres2/src/row.rs @@ -1,17 +1,18 @@ //! Rows. 
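Since the typeinfo statements now carry the fixed names shown above instead of generated s{N} names, they are easy to spot on a live connection. A hypothetical debugging query (not part of the change) that lists them via pg_prepared_statements:

    /// Hypothetical helper, for debugging only: lists the proxy's typeinfo statements
    /// currently prepared on this connection.
    const LIST_PROXY_TYPEINFO_STATEMENTS: &str =
        "SELECT name FROM pg_prepared_statements \
         WHERE name IN ('neon_proxy_typeinfo', 'neon_proxy_typeinfo_enum')";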
+use std::ops::Range; +use std::sync::Arc; +use std::{fmt, str}; + +use fallible_iterator::FallibleIterator; +use postgres_protocol2::message::backend::DataRowBody; +use postgres_types2::{Format, WrongFormat}; + use crate::row::sealed::{AsName, Sealed}; use crate::simple_query::SimpleColumn; use crate::statement::Column; use crate::types::{FromSql, Type, WrongType}; use crate::{Error, Statement}; -use fallible_iterator::FallibleIterator; -use postgres_protocol2::message::backend::DataRowBody; -use postgres_types2::{Format, WrongFormat}; -use std::fmt; -use std::ops::Range; -use std::str; -use std::sync::Arc; mod sealed { pub trait Sealed {} diff --git a/libs/proxy/tokio-postgres2/src/simple_query.rs b/libs/proxy/tokio-postgres2/src/simple_query.rs index fb2550377b..f13d63983f 100644 --- a/libs/proxy/tokio-postgres2/src/simple_query.rs +++ b/libs/proxy/tokio-postgres2/src/simple_query.rs @@ -1,19 +1,21 @@ -use crate::client::{InnerClient, Responses}; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; -use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; -use bytes::Bytes; -use fallible_iterator::FallibleIterator; -use futures_util::{ready, Stream}; -use log::debug; -use pin_project_lite::pin_project; -use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; use std::marker::PhantomPinned; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; +use bytes::Bytes; +use fallible_iterator::FallibleIterator; +use futures_util::{Stream, ready}; +use log::debug; +use pin_project_lite::pin_project; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; + +use crate::client::{InnerClient, Responses}; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; + /// Information about a column of a single query row. 
#[derive(Debug)] pub struct SimpleColumn { diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs index 22e160fc05..e4828db712 100644 --- a/libs/proxy/tokio-postgres2/src/statement.rs +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -1,19 +1,18 @@ +use std::fmt; +use std::sync::{Arc, Weak}; + +use postgres_protocol2::Oid; +use postgres_protocol2::message::backend::Field; +use postgres_protocol2::message::frontend; + use crate::client::InnerClient; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; use crate::types::Type; -use postgres_protocol2::{ - message::{backend::Field, frontend}, - Oid, -}; -use std::{ - fmt, - sync::{Arc, Weak}, -}; struct StatementInner { client: Weak, - name: String, + name: &'static str, params: Vec, columns: Vec, } @@ -22,7 +21,7 @@ impl Drop for StatementInner { fn drop(&mut self) { if let Some(client) = self.client.upgrade() { let buf = client.with_buf(|buf| { - frontend::close(b'S', &self.name, buf).unwrap(); + frontend::close(b'S', self.name, buf).unwrap(); frontend::sync(buf); buf.split().freeze() }); @@ -40,7 +39,7 @@ pub struct Statement(Arc); impl Statement { pub(crate) fn new( inner: &Arc, - name: String, + name: &'static str, params: Vec, columns: Vec, ) -> Statement { @@ -55,14 +54,14 @@ impl Statement { pub(crate) fn new_anonymous(params: Vec, columns: Vec) -> Statement { Statement(Arc::new(StatementInner { client: Weak::new(), - name: String::new(), + name: "", params, columns, })) } pub(crate) fn name(&self) -> &str { - &self.0.name + self.0.name } /// Returns the expected types of the statement's parameters. diff --git a/libs/proxy/tokio-postgres2/src/tls.rs b/libs/proxy/tokio-postgres2/src/tls.rs index dc8140719f..41b51368ff 100644 --- a/libs/proxy/tokio-postgres2/src/tls.rs +++ b/libs/proxy/tokio-postgres2/src/tls.rs @@ -5,6 +5,7 @@ use std::future::Future; use std::pin::Pin; use std::task::{Context, Poll}; use std::{fmt, io}; + use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; pub(crate) mod private { diff --git a/libs/proxy/tokio-postgres2/src/to_statement.rs b/libs/proxy/tokio-postgres2/src/to_statement.rs deleted file mode 100644 index 427f77dd79..0000000000 --- a/libs/proxy/tokio-postgres2/src/to_statement.rs +++ /dev/null @@ -1,57 +0,0 @@ -use crate::to_statement::private::{Sealed, ToStatementType}; -use crate::Statement; - -mod private { - use crate::{Client, Error, Statement}; - - pub trait Sealed {} - - pub enum ToStatementType<'a> { - Statement(&'a Statement), - Query(&'a str), - } - - impl<'a> ToStatementType<'a> { - pub async fn into_statement(self, client: &Client) -> Result { - match self { - ToStatementType::Statement(s) => Ok(s.clone()), - ToStatementType::Query(s) => client.prepare(s).await, - } - } - } -} - -/// A trait abstracting over prepared and unprepared statements. -/// -/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which -/// was prepared previously. -/// -/// This trait is "sealed" and cannot be implemented by anything outside this crate. 
-pub trait ToStatement: Sealed { - #[doc(hidden)] - fn __convert(&self) -> ToStatementType<'_>; -} - -impl ToStatement for Statement { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Statement(self) - } -} - -impl Sealed for Statement {} - -impl ToStatement for str { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Query(self) - } -} - -impl Sealed for str {} - -impl ToStatement for String { - fn __convert(&self) -> ToStatementType<'_> { - ToStatementType::Query(self) - } -} - -impl Sealed for String {} diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs index 03a57e4947..eecbfc5873 100644 --- a/libs/proxy/tokio-postgres2/src/transaction.rs +++ b/libs/proxy/tokio-postgres2/src/transaction.rs @@ -1,8 +1,9 @@ +use postgres_protocol2::message::frontend; + use crate::codec::FrontendMessage; use crate::connection::RequestMessages; use crate::query::RowStream; use crate::{CancelToken, Client, Error, ReadyForQueryStatus}; -use postgres_protocol2::message::frontend; /// A representation of a PostgreSQL database transaction. /// diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 1816825bda..7bdf340f74 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "remote_storage" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] @@ -18,6 +18,7 @@ camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true hyper = { workspace = true, features = ["client"] } futures.workspace = true +reqwest.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 32c51bc2ad..dee61a410d 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -2,30 +2,26 @@ use std::borrow::Cow; use std::collections::HashMap; -use std::env; use std::fmt::Display; -use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; -use std::time::Duration; -use std::time::SystemTime; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; +use std::{env, io}; -use super::REMOTE_STORAGE_PREFIX_SEPARATOR; -use anyhow::Context; -use anyhow::Result; +use anyhow::{Context, Result}; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; -use azure_core::{Continuable, RetryOptions}; +use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions}; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; -use azure_storage_blobs::prelude::ClientBuilder; -use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; +use azure_storage_blobs::blob::operations::GetBlobBuilder; +use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; use bytes::Bytes; +use futures::FutureExt; use futures::future::Either; use futures::stream::Stream; -use futures::FutureExt; -use futures_util::StreamExt; -use futures_util::TryStreamExt; +use futures_util::{StreamExt, TryStreamExt}; use http_types::{StatusCode, Url}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -33,12 +29,13 @@ use tracing::debug; use utils::backoff; use utils::backoff::exponential_backoff_duration_seconds; -use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; -use 
crate::DownloadKind; +use super::REMOTE_STORAGE_PREFIX_SEPARATOR; +use crate::config::AzureConfig; +use crate::error::Cancelled; +use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests}; use crate::{ - config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, - DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata, - TimeTravelError, TimeoutOrCancel, + ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode, + ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, }; pub struct AzureBlobStorage { @@ -80,8 +77,13 @@ impl AzureBlobStorage { StorageCredentials::token_credential(token_credential) }; - // we have an outer retry - let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none()); + let builder = ClientBuilder::new(account, credentials) + // we have an outer retry + .retry(RetryOptions::none()) + // Customize transport to configure connection pooling + .transport(TransportOptions::new(Self::reqwest_client( + azure_config.conn_pool_size, + ))); let client = builder.container_client(azure_config.container_name.to_owned()); @@ -106,6 +108,14 @@ impl AzureBlobStorage { }) } + fn reqwest_client(conn_pool_size: usize) -> Arc { + let client = reqwest::ClientBuilder::new() + .pool_max_idle_per_host(conn_pool_size) + .build() + .expect("failed to build `reqwest` client"); + Arc::new(client) + } + pub fn relative_path_to_name(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); let path_string = path.get_path().as_str(); @@ -361,7 +371,8 @@ impl RemoteStorage for AzureBlobStorage { let next_item = next_item?; - if timeout_try_cnt >= 2 { + // Log a warning if we saw two timeouts in a row before a successful request + if timeout_try_cnt > 2 { tracing::warn!("Azure Blob Storage list timed out and succeeded after {} tries", timeout_try_cnt); } timeout_try_cnt = 1; @@ -544,9 +555,9 @@ impl RemoteStorage for AzureBlobStorage { .await } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Delete; diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index f6ef31077c..52978be5b4 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -1,13 +1,15 @@ -use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; +use std::fmt::Debug; +use std::num::NonZeroUsize; +use std::str::FromStr; +use std::time::Duration; use aws_sdk_s3::types::StorageClass; use camino::Utf8PathBuf; - use serde::{Deserialize, Serialize}; use crate::{ DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT, - DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; /// External backup storage configuration, enough for creating a client for that storage. @@ -43,6 +45,17 @@ impl RemoteStorageKind { } } +impl RemoteStorageConfig { + /// Helper to fetch the configured concurrency limit. + pub fn concurrency_limit(&self) -> usize { + match &self.storage { + RemoteStorageKind::LocalFs { ..
} => DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, + RemoteStorageKind::AwsS3(c) => c.concurrency_limit.into(), + RemoteStorageKind::AzureContainer(c) => c.concurrency_limit.into(), + } + } +} + fn default_timeout() -> Duration { RemoteStorageConfig::DEFAULT_TIMEOUT } @@ -114,6 +127,18 @@ fn default_max_keys_per_list_response() -> Option { DEFAULT_MAX_KEYS_PER_LIST_RESPONSE } +fn default_azure_conn_pool_size() -> usize { + // By default, the Azure SDK does no connection pooling, due to historic reports of hard-to-reproduce issues + // (https://github.com/hyperium/hyper/issues/2312) + // + // However, using connection pooling is important to avoid exhausting client ports when + // doing huge numbers of requests (https://github.com/neondatabase/cloud/issues/20971) + // + // We therefore enable a modest pool size by default: this may be configured to zero if + // issues like the alleged upstream hyper issue appear. + 8 +} + impl Debug for S3Config { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("S3Config") @@ -146,6 +171,8 @@ pub struct AzureConfig { pub concurrency_limit: NonZeroUsize, #[serde(default = "default_max_keys_per_list_response")] pub max_keys_per_list_response: Option, + #[serde(default = "default_azure_conn_pool_size")] + pub conn_pool_size: usize, } fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize { @@ -302,6 +329,7 @@ timeout = '5s'"; container_region = 'westeurope' upload_storage_class = 'INTELLIGENT_TIERING' timeout = '7s' + conn_pool_size = 8 "; let config = parse(toml).unwrap(); @@ -316,6 +344,7 @@ timeout = '5s'"; prefix_in_container: None, concurrency_limit: default_remote_storage_azure_concurrency_limit(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + conn_pool_size: 8, }), timeout: Duration::from_secs(7), small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 2a3468f986..6eb5570d9b 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -18,40 +18,35 @@ mod s3_bucket; mod simulate_failures; mod support; -use std::{ - collections::HashMap, - fmt::Debug, - num::NonZeroU32, - ops::Bound, - pin::{pin, Pin}, - sync::Arc, - time::SystemTime, -}; +use std::collections::HashMap; +use std::fmt::Debug; +use std::num::NonZeroU32; +use std::ops::Bound; +use std::pin::{Pin, pin}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use camino::{Utf8Path, Utf8PathBuf}; - +/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. 
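The new conn_pool_size knob is an ordinary public field on AzureConfig, defaulting to 8 as explained above and settable to 0 to disable pooling. A small sketch of reading it back out of a parsed RemoteStorageConfig:

    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};

    /// Returns the Azure connection pool size, or None for the S3 and local-fs backends.
    fn azure_conn_pool_size(config: &RemoteStorageConfig) -> Option<usize> {
        match &config.storage {
            RemoteStorageKind::AzureContainer(azure) => Some(azure.conn_pool_size),
            _ => None,
        }
    }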
+pub use azure_core::Etag; use bytes::Bytes; -use futures::{stream::Stream, StreamExt}; +use camino::{Utf8Path, Utf8PathBuf}; +pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; +use futures::StreamExt; +use futures::stream::Stream; use itertools::Itertools as _; +use s3_bucket::RequestKind; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::info; -pub use self::{ - azure_blob::AzureBlobStorage, local_fs::LocalFs, s3_bucket::S3Bucket, - simulate_failures::UnreliableWrapper, -}; -use s3_bucket::RequestKind; - +pub use self::azure_blob::AzureBlobStorage; +pub use self::local_fs::LocalFs; +pub use self::s3_bucket::S3Bucket; +pub use self::simulate_failures::UnreliableWrapper; pub use crate::config::{AzureConfig, RemoteStorageConfig, RemoteStorageKind, S3Config}; -/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. -pub use azure_core::Etag; - -pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; - /// Default concurrency limit for S3 operations /// /// Currently, sync happens with AWS S3, that has two limits on requests per second: @@ -65,6 +60,12 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; /// Here, a limit of max 20k concurrent connections was noted. /// pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100; +/// Set this limit analogously to the S3 limit. +/// +/// The local filesystem backend doesn't enforce a concurrency limit itself, but this also bounds +/// the upload queue concurrency. Some tests create thousands of uploads, which slows down the +/// quadratic scheduling of the upload queue, and there is no point spawning so many Tokio tasks. +pub const DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. /// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; @@ -341,9 +342,9 @@ pub trait RemoteStorage: Send + Sync + 'static { /// If the operation fails because of timeout or cancellation, the root cause of the error will be /// set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went /// through. 
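The concurrency_limit() helper added to RemoteStorageConfig gives callers one number per backend, including the new local-fs limit above. A hedged sketch of the intended caller-side use, bounding an upload queue with a Tokio semaphore:

    use std::sync::Arc;

    use remote_storage::RemoteStorageConfig;
    use tokio::sync::Semaphore;

    /// Size an upload-queue semaphore from the configured backend limit.
    fn upload_semaphore(config: &RemoteStorageConfig) -> Arc<Semaphore> {
        Arc::new(Semaphore::new(config.concurrency_limit()))
    }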
- async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()>; @@ -634,8 +635,13 @@ impl GenericRemoteStorage { let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "".into()); let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); - info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", - s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); + info!( + "Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", + s3_config.bucket_name, + s3_config.bucket_region, + s3_config.prefix_in_bucket, + s3_config.endpoint + ); Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?)) } RemoteStorageKind::AzureContainer(azure_config) => { @@ -643,8 +649,12 @@ impl GenericRemoteStorage { .storage_account .as_deref() .unwrap_or(""); - info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", - azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); + info!( + "Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", + azure_config.container_name, + azure_config.container_region, + azure_config.prefix_in_container + ); Self::AzureBlob(Arc::new(AzureBlobStorage::new( azure_config, timeout, diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 1a2d421c66..f03d6ac8ee 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -4,31 +4,26 @@ //! This storage used in tests, but can also be used in cases when a certain persistent //! volume is mounted to the local FS. 
-use std::{ - collections::HashSet, - io::ErrorKind, - num::NonZeroU32, - time::{Duration, SystemTime, UNIX_EPOCH}, -}; +use std::collections::HashSet; +use std::io::ErrorKind; +use std::num::NonZeroU32; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use anyhow::{bail, ensure, Context}; +use anyhow::{Context, bail, ensure}; use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use futures::stream::Stream; -use tokio::{ - fs, - io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, -}; -use tokio_util::{io::ReaderStream, sync::CancellationToken}; +use tokio::fs; +use tokio::io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use tokio_util::io::ReaderStream; +use tokio_util::sync::CancellationToken; use utils::crashsafe::path_with_suffix_extension; -use crate::{ - Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, - TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, -}; - use super::{RemoteStorage, StorageMetadata}; -use crate::Etag; +use crate::{ + Download, DownloadError, DownloadOpts, Etag, Listing, ListingMode, ListingObject, + REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, TimeTravelError, TimeoutOrCancel, +}; const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; @@ -91,7 +86,8 @@ impl LocalFs { #[cfg(test)] async fn list_all(&self) -> anyhow::Result> { - use std::{future::Future, pin::Pin}; + use std::future::Future; + use std::pin::Pin; fn get_all_files<'a, P>( directory_path: P, ) -> Pin>> + Send + Sync + 'a>> @@ -284,7 +280,9 @@ impl LocalFs { })?; if bytes_read < from_size_bytes { - bail!("Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes"); + bail!( + "Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes" + ); } // Check if there is any extra data after the given size. 
let mut from = buffer_to_read.into_inner(); @@ -562,9 +560,9 @@ impl RemoteStorage for LocalFs { } } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { for path in paths { @@ -642,10 +640,13 @@ fn mock_etag(meta: &std::fs::Metadata) -> Etag { #[cfg(test)] mod fs_tests { - use super::*; + use std::collections::HashMap; + use std::io::Write; + use std::ops::Bound; use camino_tempfile::tempdir; - use std::{collections::HashMap, io::Write, ops::Bound}; + + use super::*; async fn read_and_check_metadata( storage: &LocalFs, @@ -736,9 +737,14 @@ mod fs_tests { ); let non_existing_path = RemotePath::new(Utf8Path::new("somewhere/else"))?; - match storage.download(&non_existing_path, &DownloadOpts::default(), &cancel).await { + match storage + .download(&non_existing_path, &DownloadOpts::default(), &cancel) + .await + { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys - other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), + other => panic!( + "Should get a NotFound error when downloading non-existing storage files, but got: {other:?}" + ), } Ok(()) } diff --git a/libs/remote_storage/src/metrics.rs b/libs/remote_storage/src/metrics.rs index 48c121fbc8..81e68e9a29 100644 --- a/libs/remote_storage/src/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -1,5 +1,5 @@ use metrics::{ - register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter, + Histogram, IntCounter, register_histogram_vec, register_int_counter, register_int_counter_vec, }; use once_cell::sync::Lazy; @@ -16,8 +16,8 @@ pub(crate) enum RequestKind { Head = 6, } -use scopeguard::ScopeGuard; use RequestKind::*; +use scopeguard::ScopeGuard; impl RequestKind { const fn as_str(&self) -> &'static str { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 2891f92d07..ba7ce9e1e7 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,56 +4,50 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. 
-use std::{ - borrow::Cow, - collections::HashMap, - num::NonZeroU32, - pin::Pin, - sync::Arc, - task::{Context, Poll}, - time::{Duration, SystemTime}, -}; +use std::borrow::Cow; +use std::collections::HashMap; +use std::num::NonZeroU32; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use std::time::{Duration, SystemTime}; -use anyhow::{anyhow, Context as _}; -use aws_config::{ - default_provider::credentials::DefaultCredentialsChain, - retry::{RetryConfigBuilder, RetryMode}, - BehaviorVersion, -}; -use aws_sdk_s3::{ - config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, - error::SdkError, - operation::{get_object::GetObjectError, head_object::HeadObjectError}, - types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, - Client, -}; +use anyhow::{Context as _, anyhow}; +use aws_config::BehaviorVersion; +use aws_config::default_provider::credentials::DefaultCredentialsChain; +use aws_config::retry::{RetryConfigBuilder, RetryMode}; +use aws_sdk_s3::Client; +use aws_sdk_s3::config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}; +use aws_sdk_s3::error::SdkError; +use aws_sdk_s3::operation::get_object::GetObjectError; +use aws_sdk_s3::operation::head_object::HeadObjectError; +use aws_sdk_s3::types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}; use aws_smithy_async::rt::sleep::TokioSleep; -use http_body_util::StreamBody; -use http_types::StatusCode; - -use aws_smithy_types::{body::SdkBody, DateTime}; -use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; +use aws_smithy_types::DateTime; +use aws_smithy_types::body::SdkBody; +use aws_smithy_types::byte_stream::ByteStream; +use aws_smithy_types::date_time::ConversionError; use bytes::Bytes; use futures::stream::Stream; use futures_util::StreamExt; +use http_body_util::StreamBody; +use http_types::StatusCode; use hyper::body::Frame; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff; use super::StorageMetadata; -use crate::{ - config::S3Config, - error::Cancelled, - metrics::{start_counting_cancelled_wait, start_measuring_requests}, - support::PermitCarrying, - ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, - RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE_S3, - REMOTE_STORAGE_PREFIX_SEPARATOR, -}; - -use crate::metrics::AttemptOutcome; +use crate::config::S3Config; +use crate::error::Cancelled; pub(super) use crate::metrics::RequestKind; +use crate::metrics::{AttemptOutcome, start_counting_cancelled_wait, start_measuring_requests}; +use crate::support::PermitCarrying; +use crate::{ + ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, + MAX_KEYS_PER_DELETE_S3, REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, RemoteStorage, + TimeTravelError, TimeoutOrCancel, +}; /// AWS S3 storage. pub struct S3Bucket { @@ -813,9 +807,9 @@ impl RemoteStorage for S3Bucket { .await } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Delete; @@ -958,8 +952,10 @@ impl RemoteStorage for S3Bucket { version_id, key, .. 
} = &vd; if version_id == "null" { - return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \ - indicating either disabled versioning, or legacy objects with null version id values"))); + return Err(TimeTravelError::Other(anyhow!( + "Received ListVersions response for key={key} with version_id='null', \ + indicating either disabled versioning, or legacy objects with null version id values" + ))); } tracing::trace!( "Parsing version key={key} version_id={version_id} kind={:?}", @@ -1126,9 +1122,10 @@ impl VerOrDelete { #[cfg(test)] mod tests { - use camino::Utf8Path; use std::num::NonZeroUsize; + use camino::Utf8Path; + use crate::{RemotePath, S3Bucket, S3Config}; #[tokio::test] diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 51833c1fe6..f56be873c4 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -1,14 +1,15 @@ //! This module provides a wrapper around a real RemoteStorage implementation that //! causes the first N attempts at each upload or download operatio to fail. For //! testing purposes. -use bytes::Bytes; -use futures::stream::Stream; -use futures::StreamExt; use std::collections::HashMap; +use std::collections::hash_map::Entry; use std::num::NonZeroU32; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use std::time::SystemTime; -use std::{collections::hash_map::Entry, sync::Arc}; + +use bytes::Bytes; +use futures::StreamExt; +use futures::stream::Stream; use tokio_util::sync::CancellationToken; use crate::{ @@ -181,9 +182,9 @@ impl RemoteStorage for UnreliableWrapper { self.delete_inner(path, true, cancel).await } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?; diff --git a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs index 1ed9ed9305..07da38cf77 100644 --- a/libs/remote_storage/src/support.rs +++ b/libs/remote_storage/src/support.rs @@ -1,9 +1,7 @@ -use std::{ - future::Future, - pin::Pin, - task::{Context, Poll}, - time::Duration, -}; +use std::future::Future; +use std::pin::Pin; +use std::task::{Context, Poll}; +use std::time::Duration; use bytes::Bytes; use futures_util::Stream; @@ -114,9 +112,10 @@ pub(crate) fn cancel_or_timeout( #[cfg(test)] mod tests { + use futures::stream::StreamExt; + use super::*; use crate::DownloadError; - use futures::stream::StreamExt; #[tokio::test(start_paused = true)] async fn cancelled_download_stream() { diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index d5da1d48e9..6a78ddc01e 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,19 +1,20 @@ +use std::collections::HashSet; +use std::num::NonZeroU32; +use std::ops::Bound; +use std::sync::Arc; + use anyhow::Context; use camino::Utf8Path; use futures::StreamExt; use remote_storage::{DownloadError, DownloadOpts, ListingMode, ListingObject, RemotePath}; -use std::ops::Bound; -use std::sync::Arc; -use std::{collections::HashSet, num::NonZeroU32}; use test_context::test_context; use tokio_util::sync::CancellationToken; use tracing::debug; -use crate::common::{download_to_vec, upload_stream, wrap_stream}; - use super::{ MaybeEnabledStorage, MaybeEnabledStorageWithSimpleTestBlobs, 
MaybeEnabledStorageWithTestBlobs, }; +use crate::common::{download_to_vec, upload_stream, wrap_stream}; /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries. /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. @@ -62,7 +63,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .into_iter() .collect::>(); assert_eq!( - root_remote_prefixes, HashSet::from([base_prefix.clone()]), + root_remote_prefixes, + HashSet::from([base_prefix.clone()]), "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}" ); @@ -84,7 +86,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .difference(&nested_remote_prefixes) .collect::>(); assert_eq!( - remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), + 0, "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); @@ -119,7 +122,8 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a .difference(&nested_remote_prefixes_combined) .collect::>(); assert_eq!( - remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), + 0, "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", ); diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 92d579fec8..31c9ca3200 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -1,9 +1,9 @@ +use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; -use std::time::UNIX_EPOCH; -use std::{collections::HashSet, time::Duration}; +use std::time::{Duration, UNIX_EPOCH}; use anyhow::Context; use remote_storage::{ @@ -208,7 +208,7 @@ async fn create_azure_client( .as_millis(); // because nanos can be the same for two threads so can millis, add randomness - let random = rand::thread_rng().gen::(); + let random = rand::thread_rng().r#gen::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AzureContainer(AzureConfig { @@ -218,6 +218,7 @@ async fn create_azure_client( prefix_in_container: Some(format!("test_{millis}_{random:08x}/")), concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, + conn_pool_size: 8, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index e60ec18c93..6996bb27ae 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -1,13 +1,12 @@ +use std::collections::HashSet; use std::env; use std::fmt::{Debug, Display}; use std::future::Future; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; -use std::time::{Duration, UNIX_EPOCH}; -use std::{collections::HashSet, time::SystemTime}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; -use crate::common::{download_to_vec, upload_stream}; use anyhow::Context; use camino::Utf8Path; 
use futures_util::StreamExt; @@ -15,12 +14,13 @@ use remote_storage::{ DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, }; -use test_context::test_context; -use test_context::AsyncTestContext; +use test_context::{AsyncTestContext, test_context}; use tokio::io::AsyncBufReadExt; use tokio_util::sync::CancellationToken; use tracing::info; +use crate::common::{download_to_vec, upload_stream}; + mod common; #[path = "common/tests.rs"] @@ -128,8 +128,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: let t0_hwt = t0 + half_wt; let t1_hwt = t1 - half_wt; if !(t0_hwt..=t1_hwt).contains(&last_modified) { - panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \ - This likely means a large lock discrepancy between S3 and the local clock."); + panic!( + "last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \ + This likely means a large lock discrepancy between S3 and the local clock." + ); } } @@ -383,7 +385,7 @@ async fn create_s3_client( .as_millis(); // because nanos can be the same for two threads so can millis, add randomness - let random = rand::thread_rng().gen::(); + let random = rand::thread_rng().r#gen::(); let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AwsS3(S3Config { diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 14811232d3..d9d080e8fe 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -1,10 +1,16 @@ [package] name = "safekeeper_api" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] -serde.workspace = true +anyhow.workspace = true const_format.workspace = true +serde.workspace = true +serde_json.workspace = true +postgres_ffi.workspace = true +pq_proto.workspace = true +tokio.workspace = true utils.workspace = true +pageserver_api.workspace = true diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs index 63c2c51188..fa86523ad7 100644 --- a/libs/safekeeper_api/src/lib.rs +++ b/libs/safekeeper_api/src/lib.rs @@ -1,10 +1,30 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] use const_format::formatcp; +use pq_proto::SystemId; +use serde::{Deserialize, Serialize}; +pub mod membership; /// Public API types pub mod models; +/// Consensus logical timestamp. Note: it is a part of sk control file. +pub type Term = u64; +/// With this term timeline is created initially. It +/// is a normal term except wp is never elected with it. +pub const INITIAL_TERM: Term = 0; + +/// Information about Postgres. Safekeeper gets it once and then verifies all +/// further connections from computes match. Note: it is a part of sk control +/// file. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ServerInfo { + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + pub wal_seg_size: u32, +} + pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs new file mode 100644 index 0000000000..4ccdd491b0 --- /dev/null +++ b/libs/safekeeper_api/src/membership.rs @@ -0,0 +1,192 @@ +//! Types defining safekeeper membership, see +//! rfcs/035-safekeeper-dynamic-membership-change.md +//! for details. 
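As orientation for the new module, here is an illustrative sketch (not part of the diff) of how the membership types introduced below compose; the hostnames and node ids are made up:

```rust
use safekeeper_api::membership::{Configuration, INITIAL_GENERATION, MemberSet, SafekeeperId};
use utils::id::NodeId;

fn example_configuration() -> anyhow::Result<Configuration> {
    // MemberSet::new rejects duplicate node ids.
    let members = MemberSet::new(vec![
        SafekeeperId { id: NodeId(1), host: "sk-1.local".into(), pg_port: 5454 },
        SafekeeperId { id: NodeId(2), host: "sk-2.local".into(), pg_port: 5454 },
    ])?;
    Ok(Configuration {
        generation: INITIAL_GENERATION,
        members,
        // Some(next_set) here would make this a joint configuration during a switch.
        new_members: None,
    })
}
```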
+ +use std::collections::HashSet; +use std::fmt::Display; + +use anyhow; +use anyhow::bail; +use serde::{Deserialize, Serialize}; +use utils::id::NodeId; + +/// 1 is the first valid generation, 0 is used as +/// a placeholder before we fully migrate to generations. +pub const INVALID_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(0); +pub const INITIAL_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(1); + +/// Number uniquely identifying safekeeper configuration. +/// Note: it is a part of sk control file. +/// +/// Like tenant generations, but for safekeepers. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub struct SafekeeperGeneration(u32); + +impl SafekeeperGeneration { + pub const fn new(v: u32) -> Self { + Self(v) + } + + #[track_caller] + pub fn previous(&self) -> Option { + Some(Self(self.0.checked_sub(1)?)) + } + + #[track_caller] + pub fn next(&self) -> Self { + Self(self.0 + 1) + } + + pub fn into_inner(self) -> u32 { + self.0 + } +} + +impl Display for SafekeeperGeneration { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +/// Membership is defined by ids so e.g. walproposer uses them to figure out +/// quorums, but we also carry host and port to give wp idea where to connect. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafekeeperId { + pub id: NodeId, + pub host: String, + /// We include here only port for computes -- that is, pg protocol tenant + /// only port, or wide pg protocol port if the former is not configured. + pub pg_port: u16, +} + +impl Display for SafekeeperId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[id={}, ep={}:{}]", self.id, self.host, self.pg_port) + } +} + +/// Set of safekeepers. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(transparent)] +pub struct MemberSet { + pub m: Vec, +} + +impl MemberSet { + pub fn empty() -> Self { + MemberSet { m: Vec::new() } + } + + pub fn new(members: Vec) -> anyhow::Result { + let hs: HashSet = HashSet::from_iter(members.iter().map(|sk| sk.id)); + if hs.len() != members.len() { + bail!("duplicate safekeeper id in the set {:?}", members); + } + Ok(MemberSet { m: members }) + } + + pub fn contains(&self, sk: &SafekeeperId) -> bool { + self.m.iter().any(|m| m.id == sk.id) + } + + pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> { + if self.contains(&sk) { + bail!(format!( + "sk {} is already member of the set {}", + sk.id, self + )); + } + self.m.push(sk); + Ok(()) + } +} + +impl Display for MemberSet { + /// Display as a comma separated list of members. + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let sks_str = self.m.iter().map(|sk| sk.to_string()).collect::>(); + write!(f, "({})", sks_str.join(", ")) + } +} + +/// Safekeeper membership configuration. +/// Note: it is a part of both control file and http API. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct Configuration { + /// Unique id. + pub generation: SafekeeperGeneration, + /// Current members of the configuration. + pub members: MemberSet, + /// Some means it is a joint conf. + pub new_members: Option, +} + +impl Configuration { + /// Used for pre-generations timelines, will be removed eventually. 
+ pub fn empty() -> Self { + Configuration { + generation: INVALID_GENERATION, + members: MemberSet::empty(), + new_members: None, + } + } +} + +impl Display for Configuration { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "gen={}, members={}, new_members={}", + self.generation, + self.members, + self.new_members + .as_ref() + .map(ToString::to_string) + .unwrap_or(String::from("none")) + ) + } +} + +#[cfg(test)] +mod tests { + use utils::id::NodeId; + + use super::{MemberSet, SafekeeperId}; + + #[test] + fn test_member_set() { + let mut members = MemberSet::empty(); + members + .add(SafekeeperId { + id: NodeId(42), + host: String::from("lala.org"), + pg_port: 5432, + }) + .unwrap(); + + members + .add(SafekeeperId { + id: NodeId(42), + host: String::from("lala.org"), + pg_port: 5432, + }) + .expect_err("duplicate must not be allowed"); + + members + .add(SafekeeperId { + id: NodeId(43), + host: String::from("bubu.org"), + pg_port: 5432, + }) + .unwrap(); + + println!("members: {}", members); + + let j = serde_json::to_string(&members).expect("failed to serialize"); + println!("members json: {}", j); + assert_eq!( + j, + r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"# + ); + } +} diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 28666d197a..2f2aeaa429 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -1,21 +1,224 @@ -use serde::{Deserialize, Serialize}; +//! Types used in safekeeper http API. Many of them are also reused internally. -use utils::{ - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, -}; +use std::net::SocketAddr; + +use pageserver_api::shard::ShardIdentity; +use postgres_ffi::TimestampTz; +use serde::{Deserialize, Serialize}; +use tokio::time::Instant; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; + +use crate::membership::Configuration; +use crate::{ServerInfo, Term}; + +#[derive(Debug, Serialize)] +pub struct SafekeeperStatus { + pub id: NodeId, +} #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, - pub peer_ids: Option>, + pub mconf: Configuration, pub pg_version: u32, pub system_id: Option, + // By default WAL_SEGMENT_SIZE pub wal_seg_size: Option, + pub start_lsn: Lsn, + // Normal creation should omit this field (start_lsn initializes all LSNs). + // However, we allow specifying custom value higher than start_lsn for + // manual recovery case, see test_s3_wal_replay. + pub commit_lsn: Option, +} + +/// Same as TermLsn, but serializes LSN using display serializer +/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct TermSwitchApiEntry { + pub term: Term, + pub lsn: Lsn, +} + +/// Augment AcceptorState with last_log_term for convenience +#[derive(Debug, Serialize, Deserialize)] +pub struct AcceptorStateStatus { + pub term: Term, + pub epoch: Term, // aka last_log_term, old `epoch` name is left for compatibility + pub term_history: Vec, +} + +/// Things safekeeper should know about timeline state on peers. +/// Used as both model and internally. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PeerInfo { + pub sk_id: NodeId, + pub term: Term, + /// Term of the last entry. + pub last_log_term: Term, + /// LSN of the last record. 
+ pub flush_lsn: Lsn, pub commit_lsn: Lsn, - // If not passed, it is assigned to the beginning of commit_lsn segment. - pub local_start_lsn: Option, + /// Since which LSN safekeeper has WAL. + pub local_start_lsn: Lsn, + /// When info was received. Serde annotations are not very useful but make + /// the code compile -- we don't rely on this field externally. + #[serde(skip)] + #[serde(default = "Instant::now")] + pub ts: Instant, + pub pg_connstr: String, + pub http_connstr: String, +} + +pub type FullTransactionId = u64; + +/// Hot standby feedback received from replica +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct HotStandbyFeedback { + pub ts: TimestampTz, + pub xmin: FullTransactionId, + pub catalog_xmin: FullTransactionId, +} + +pub const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0; + +impl HotStandbyFeedback { + pub fn empty() -> HotStandbyFeedback { + HotStandbyFeedback { + ts: 0, + xmin: 0, + catalog_xmin: 0, + } + } +} + +/// Standby status update +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct StandbyReply { + pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby. + pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby. + pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby. + pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01. + pub reply_requested: bool, +} + +impl StandbyReply { + pub fn empty() -> Self { + StandbyReply { + write_lsn: Lsn::INVALID, + flush_lsn: Lsn::INVALID, + apply_lsn: Lsn::INVALID, + reply_ts: 0, + reply_requested: false, + } + } +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct StandbyFeedback { + pub reply: StandbyReply, + pub hs_feedback: HotStandbyFeedback, +} + +impl StandbyFeedback { + pub fn empty() -> Self { + StandbyFeedback { + reply: StandbyReply::empty(), + hs_feedback: HotStandbyFeedback::empty(), + } + } +} + +/// Receiver is either pageserver or regular standby, which have different +/// feedbacks. +/// Used as both model and internally. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub enum ReplicationFeedback { + Pageserver(PageserverFeedback), + Standby(StandbyFeedback), +} + +/// Uniquely identifies a WAL service connection. Logged in spans for +/// observability. +pub type ConnectionId = u32; + +/// Serialize is used only for json'ing in API response. Also used internally. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum WalSenderState { + Vanilla(VanillaWalSenderState), + Interpreted(InterpretedWalSenderState), +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VanillaWalSenderState { + pub ttid: TenantTimelineId, + pub addr: SocketAddr, + pub conn_id: ConnectionId, + // postgres application_name + pub appname: Option, + pub feedback: ReplicationFeedback, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct InterpretedWalSenderState { + pub ttid: TenantTimelineId, + pub shard: ShardIdentity, + pub addr: SocketAddr, + pub conn_id: ConnectionId, + // postgres application_name + pub appname: Option, + pub feedback: ReplicationFeedback, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalReceiverState { + /// None means it is recovery initiated by us (this safekeeper). + pub conn_id: Option, + pub status: WalReceiverStatus, +} + +/// Walreceiver status. 
Currently only whether it passed voting stage and +/// started receiving the stream, but it is easy to add more if needed. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum WalReceiverStatus { + Voting, + Streaming, +} + +/// Info about timeline on safekeeper ready for reporting. +#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineStatus { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub mconf: Configuration, + pub acceptor_state: AcceptorStateStatus, + pub pg_info: ServerInfo, + pub flush_lsn: Lsn, + pub timeline_start_lsn: Lsn, + pub local_start_lsn: Lsn, + pub commit_lsn: Lsn, + pub backup_lsn: Lsn, + pub peer_horizon_lsn: Lsn, + pub remote_consistent_lsn: Lsn, + pub peers: Vec, + pub walsenders: Vec, + pub walreceivers: Vec, +} + +/// Request to switch membership configuration. +#[derive(Serialize, Deserialize)] +#[serde(transparent)] +pub struct TimelineMembershipSwitchRequest { + pub mconf: Configuration, +} + +/// In response both previous and current configuration are sent. +#[derive(Serialize, Deserialize)] +pub struct TimelineMembershipSwitchResponse { + pub previous_conf: Configuration, + pub current_conf: Configuration, } fn lsn_invalid() -> Lsn { @@ -73,3 +276,23 @@ pub struct TimelineTermBumpResponse { pub previous_term: u64, pub current_term: u64, } + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct SafekeeperUtilization { + pub timeline_count: u64, +} + +/// pull_timeline request body. +#[derive(Debug, Deserialize, Serialize)] +pub struct PullTimelineRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub http_hosts: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct PullTimelineResponse { + // Donor safekeeper host + pub safekeeper_host: String, + // TODO: add more fields? +} diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index c4aad53cdb..818d759eac 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -38,7 +38,6 @@ pub mod http; use opentelemetry::trace::TracerProvider; use opentelemetry::KeyValue; -use opentelemetry_sdk::Resource; use tracing::Subscriber; use tracing_subscriber::registry::LookupSpan; use tracing_subscriber::Layer; @@ -121,7 +120,10 @@ where S: Subscriber + for<'span> LookupSpan<'span>, { // Sets up exporter from the OTEL_EXPORTER_* environment variables. 
- let exporter = opentelemetry_otlp::new_exporter().http(); + let exporter = opentelemetry_otlp::SpanExporter::builder() + .with_http() + .build() + .expect("could not initialize opentelemetry exporter"); // TODO: opentelemetry::global::set_error_handler() with custom handler that // bypasses default tracing layers, but logs regular looking log @@ -132,17 +134,13 @@ where opentelemetry_sdk::propagation::TraceContextPropagator::new(), ); - let tracer = opentelemetry_otlp::new_pipeline() - .tracing() - .with_exporter(exporter) - .with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource( - Resource::new(vec![KeyValue::new( - opentelemetry_semantic_conventions::resource::SERVICE_NAME, - service_name, - )]), - )) - .install_batch(opentelemetry_sdk::runtime::Tokio) - .expect("could not initialize opentelemetry exporter") + let tracer = opentelemetry_sdk::trace::TracerProvider::builder() + .with_batch_exporter(exporter, opentelemetry_sdk::runtime::Tokio) + .with_resource(opentelemetry_sdk::Resource::new(vec![KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name, + )])) + .build() .tracer("global"); tracing_opentelemetry::layer().with_tracer(tracer) diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 66500fb141..5020d82adf 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -15,6 +15,7 @@ arc-swap.workspace = true sentry.workspace = true async-compression.workspace = true anyhow.workspace = true +backtrace.workspace = true bincode.workspace = true bytes.workspace = true camino.workspace = true @@ -23,17 +24,13 @@ diatomic-waker.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true -hyper0 = { workspace = true, features = ["full"] } fail.workspace = true -futures = { workspace = true} -jemalloc_pprof.workspace = true +futures = { workspace = true } jsonwebtoken.workspace = true -nix.workspace = true +nix = { workspace = true, features = ["ioctl"] } once_cell.workspace = true pin-project-lite.workspace = true -pprof.workspace = true regex.workspace = true -routerify.workspace = true serde.workspace = true serde_with.workspace = true serde_json.workspace = true @@ -50,8 +47,6 @@ rand.workspace = true scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true -url.workspace = true -uuid.workspace = true walkdir.workspace = true pq_proto.workspace = true @@ -60,18 +55,13 @@ metrics.workspace = true const_format.workspace = true -# to use tokio channels as streams, this is faster to compile than async_stream -# why is it only here? no other crate should use it, streams are rarely needed. -tokio-stream = { version = "0.1.14" } - -serde_path_to_error.workspace = true - [dev-dependencies] byteorder.workspace = true bytes.workspace = true criterion.workspace = true hex-literal.workspace = true camino-tempfile.workspace = true +pprof.workspace = true serde_assert.workspace = true tokio = { workspace = true, features = ["test-util"] } diff --git a/libs/utils/benches/README.md b/libs/utils/benches/README.md new file mode 100644 index 0000000000..5afbe3cf2b --- /dev/null +++ b/libs/utils/benches/README.md @@ -0,0 +1,26 @@ +## Utils Benchmarks + +To run benchmarks: + +```sh +# All benchmarks. +cargo bench --package utils + +# Specific file. +cargo bench --package utils --bench benchmarks + +# Specific benchmark. +cargo bench --package utils --bench benchmarks log_slow/enabled=true + +# List available benchmarks. 
+cargo bench --package utils --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +cargo bench --package utils --bench benchmarks log_slow/enabled=true --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. \ No newline at end of file diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 44eb36387c..348e27ac47 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,5 +1,18 @@ -use criterion::{criterion_group, criterion_main, Criterion}; +use std::time::Duration; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use pprof::criterion::{Output, PProfProfiler}; use utils::id; +use utils::logging::log_slow; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_id_stringify, + bench_log_slow, +); +criterion_main!(benches); pub fn bench_id_stringify(c: &mut Criterion) { // Can only use public methods. @@ -16,5 +29,31 @@ pub fn bench_id_stringify(c: &mut Criterion) { }); } -criterion_group!(benches, bench_id_stringify); -criterion_main!(benches); +pub fn bench_log_slow(c: &mut Criterion) { + for enabled in [false, true] { + c.bench_function(&format!("log_slow/enabled={enabled}"), |b| { + run_bench(b, enabled).unwrap() + }); + } + + // The actual benchmark. + fn run_bench(b: &mut Bencher, enabled: bool) -> anyhow::Result<()> { + const THRESHOLD: Duration = Duration::from_secs(1); + + // Use a multi-threaded runtime to avoid thread parking overhead when yielding. + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + + // Test both with and without log_slow, since we're essentially measuring Tokio scheduling + // performance too. Use a simple noop future that yields once, to avoid any scheduler fast + // paths for a ready future. + if enabled { + b.iter(|| runtime.block_on(log_slow("ready", THRESHOLD, tokio::task::yield_now()))); + } else { + b.iter(|| runtime.block_on(tokio::task::yield_now())); + } + + Ok(()) + } +} diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index a8615c2337..f394d4c58d 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -39,7 +39,7 @@ function initdb_with_args { ;; esac - eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}" + eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib ASAN_OPTIONS="${ASAN_OPTIONS-}" UBSAN_OPTIONS="${UBSAN_OPTIONS-}" "${cmd[*]}" } rm -fr "$DATA_DIR" diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index f7acc61ac1..4bfd0ab055 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -10,7 +10,7 @@ use jsonwebtoken::{ }; use serde::{Deserialize, Serialize}; -use crate::{http::error::ApiError, id::TenantId}; +use crate::id::TenantId; /// Algorithm to use. We require EdDSA. const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; @@ -90,15 +90,6 @@ impl Display for AuthError { } } -impl From for ApiError { - fn from(_value: AuthError) -> Self { - // Don't pass on the value of the AuthError as a precautionary measure. 
- // Being intentionally vague in public error communication hurts debugability - // but it is more secure. - ApiError::Forbidden("JWT authentication error".to_string()) - } -} - pub struct JwtAuth { decoding_keys: Vec, validation: Validation, diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs index 096c7e5854..e6503fe377 100644 --- a/libs/utils/src/backoff.rs +++ b/libs/utils/src/backoff.rs @@ -1,4 +1,5 @@ use std::fmt::{Debug, Display}; +use std::time::Duration; use futures::Future; use tokio_util::sync::CancellationToken; @@ -29,6 +30,11 @@ pub async fn exponential_backoff( } } +pub fn exponential_backoff_duration(n: u32, base_increment: f64, max_seconds: f64) -> Duration { + let seconds = exponential_backoff_duration_seconds(n, base_increment, max_seconds); + Duration::from_secs_f64(seconds) +} + pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { if n == 0 { 0.0 diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 42b45eeea0..4d173d0726 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -286,6 +286,11 @@ mod tests { const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7]; const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff]; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] + struct NewTypeStruct(u32); + const NT1: NewTypeStruct = NewTypeStruct(414243); + const NT1_INNER: u32 = 414243; + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct LongMsg { pub tag: u8, @@ -408,4 +413,42 @@ mod tests { let msg2 = LongMsg::des(&encoded).unwrap(); assert_eq!(msg, msg2); } + + #[test] + /// Ensure that newtype wrappers around u32 don't change the serialization format + fn be_nt() { + use super::BeSer; + + assert_eq!(NT1.serialized_size().unwrap(), 4); + + let msg = NT1; + + let encoded = msg.ser().unwrap(); + let expected = hex_literal::hex!("0006 5223"); + assert_eq!(encoded, expected); + + assert_eq!(encoded, NT1_INNER.ser().unwrap()); + + let msg2 = NewTypeStruct::des(&encoded).unwrap(); + assert_eq!(msg, msg2); + } + + #[test] + /// Ensure that newtype wrappers around u32 don't change the serialization format + fn le_nt() { + use super::LeSer; + + assert_eq!(NT1.serialized_size().unwrap(), 4); + + let msg = NT1; + + let encoded = msg.ser().unwrap(); + let expected = hex_literal::hex!("2352 0600"); + assert_eq!(encoded, expected); + + assert_eq!(encoded, NT1_INNER.ser().unwrap()); + + let msg2 = NewTypeStruct::des(&encoded).unwrap(); + assert_eq!(msg, msg2); + } } diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs index b3e326bfd0..a1bcec9229 100644 --- a/libs/utils/src/env.rs +++ b/libs/utils/src/env.rs @@ -2,6 +2,7 @@ use std::{fmt::Display, str::FromStr}; +/// For types `V` that implement [`FromStr`]. pub fn var(varname: &str) -> Option where V: FromStr, @@ -10,7 +11,9 @@ where match std::env::var(varname) { Ok(s) => Some( s.parse() - .map_err(|e| format!("failed to parse env var {varname}: {e:#}")) + .map_err(|e| { + format!("failed to parse env var {varname} using FromStr::parse: {e:#}") + }) .unwrap(), ), Err(std::env::VarError::NotPresent) => None, @@ -19,3 +22,24 @@ where } } } + +/// For types `V` that implement [`serde::de::DeserializeOwned`]. 
+pub fn var_serde_json_string(varname: &str) -> Option +where + V: serde::de::DeserializeOwned, +{ + match std::env::var(varname) { + Ok(s) => Some({ + let value = serde_json::Value::String(s); + serde_json::from_value(value) + .map_err(|e| { + format!("failed to parse env var {varname} as a serde_json json string: {e:#}") + }) + .unwrap() + }), + Err(std::env::VarError::NotPresent) => None, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {varname} is not unicode") + } + } +} diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index 870684b399..fc998ad9a9 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -1,41 +1,58 @@ //! Failpoint support code shared between pageserver and safekeepers. -use crate::http::{ - error::ApiError, - json::{json_request, json_response}, -}; -use hyper::{Body, Request, Response, StatusCode}; -use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; -use tracing::*; -/// Declare a failpoint that can use the `pause` failpoint action. +/// Declare a failpoint that can use to `pause` failpoint action. /// We don't want to block the executor thread, hence, spawn_blocking + await. +/// +/// Optionally pass a cancellation token, and this failpoint will drop out of +/// its pause when the cancellation token fires. This is useful for testing +/// cases where we would like to block something, but test its clean shutdown behavior. +/// The macro evaluates to a Result in that case, where Ok(()) is the case +/// where the failpoint was not paused, and Err() is the case where cancellation +/// token fired while evaluating the failpoint. +/// +/// Remember to unpause the failpoint in the test; until that happens, one of the +/// limited number of spawn_blocking thread pool threads is leaked. #[macro_export] macro_rules! pausable_failpoint { - ($name:literal) => { + ($name:literal) => {{ if cfg!(feature = "testing") { - tokio::task::spawn_blocking({ - let current = tracing::Span::current(); + let cancel = ::tokio_util::sync::CancellationToken::new(); + let _ = $crate::pausable_failpoint!($name, &cancel); + } + }}; + ($name:literal, $cancel:expr) => {{ + if cfg!(feature = "testing") { + let failpoint_fut = ::tokio::task::spawn_blocking({ + let current = ::tracing::Span::current(); move || { let _entered = current.entered(); - tracing::info!("at failpoint {}", $name); - fail::fail_point!($name); + ::tracing::info!("at failpoint {}", $name); + ::fail::fail_point!($name); + } + }); + let cancel_fut = async move { + $cancel.cancelled().await; + }; + ::tokio::select! { + res = failpoint_fut => { + res.expect("spawn_blocking"); + // continue with execution + Ok(()) + }, + _ = cancel_fut => { + Err(()) } - }) - .await - .expect("spawn_blocking"); - } - }; - ($name:literal, $cond:expr) => { - if cfg!(feature = "testing") { - if $cond { - pausable_failpoint!($name) } + } else { + Ok(()) } - }; + }}; } +pub use pausable_failpoint; + /// use with fail::cfg("$name", "return(2000)") /// /// The effect is similar to a "sleep(2000)" action, i.e. 
we sleep for the @@ -160,45 +177,3 @@ fn exit_failpoint() { tracing::info!("Exit requested by failpoint"); std::process::exit(1); } - -pub type ConfigureFailpointsRequest = Vec; - -/// Information for configuring a single fail point -#[derive(Debug, Serialize, Deserialize)] -pub struct FailpointConfig { - /// Name of the fail point - pub name: String, - /// List of actions to take, using the format described in `fail::cfg` - /// - /// We also support `actions = "exit"` to cause the fail point to immediately exit. - pub actions: String, -} - -/// Configure failpoints through http. -pub async fn failpoints_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - if !fail::has_failpoints() { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Cannot manage failpoints because storage was compiled without failpoints support" - ))); - } - - let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; - for fp in failpoints { - info!("cfg failpoint: {} {}", fp.name, fp.actions); - - // We recognize one extra "action" that's not natively recognized - // by the failpoints crate: exit, to immediately kill the process - let cfg_result = apply_failpoint(&fp.name, &fp.actions); - - if let Err(err_msg) = cfg_result { - return Err(ApiError::BadRequest(anyhow::anyhow!( - "Failed to configure failpoints: {err_msg}" - ))); - } - } - - json_response(StatusCode::OK, ()) -} diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 5970836033..44565ee6a2 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -112,9 +112,9 @@ impl Serialize for Generation { // We should never be asked to serialize a None. Structures // that include an optional generation should convert None to an // Option::None - Err(serde::ser::Error::custom( - "Tried to serialize invalid generation ({self})", - )) + Err(serde::ser::Error::custom(format!( + "Tried to serialize invalid generation ({self:?})" + ))) } } } diff --git a/libs/utils/src/guard_arc_swap.rs b/libs/utils/src/guard_arc_swap.rs new file mode 100644 index 0000000000..cec5202460 --- /dev/null +++ b/libs/utils/src/guard_arc_swap.rs @@ -0,0 +1,54 @@ +//! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes +//! don't block reads. + +use arc_swap::ArcSwap; +use std::sync::Arc; +use tokio::sync::TryLockError; + +pub struct GuardArcSwap { + inner: ArcSwap, + guard: tokio::sync::Mutex<()>, +} + +pub struct Guard<'a, T> { + _guard: tokio::sync::MutexGuard<'a, ()>, + inner: &'a ArcSwap, +} + +impl GuardArcSwap { + pub fn new(inner: T) -> Self { + Self { + inner: ArcSwap::new(Arc::new(inner)), + guard: tokio::sync::Mutex::new(()), + } + } + + pub fn read(&self) -> Arc { + self.inner.load_full() + } + + pub async fn write_guard(&self) -> Guard<'_, T> { + Guard { + _guard: self.guard.lock().await, + inner: &self.inner, + } + } + + pub fn try_write_guard(&self) -> Result, TryLockError> { + let guard = self.guard.try_lock()?; + Ok(Guard { + _guard: guard, + inner: &self.inner, + }) + } +} + +impl Guard<'_, T> { + pub fn read(&self) -> Arc { + self.inner.load_full() + } + + pub fn write(&mut self, value: T) { + self.inner.store(Arc::new(value)); + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index d9b82b20da..9389a27bf3 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -2,8 +2,6 @@ //! between other crates in this repository. 
#![deny(clippy::undocumented_unsafe_blocks)] -extern crate hyper0 as hyper; - pub mod backoff; /// `Lsn` type implements common tasks on Log Sequence Numbers @@ -33,9 +31,6 @@ pub mod shard; mod hex; pub use hex::Hex; -// http endpoint utils -pub mod http; - // definition of the Generation type for pageserver attachment APIs pub mod generation; @@ -94,6 +89,13 @@ pub mod toml_edit_ext; pub mod circuit_breaker; +pub mod try_rcu; + +pub mod guard_arc_swap; + +#[cfg(target_os = "linux")] +pub mod linux_socket_ioctl; + // Re-export used in macro. Avoids adding git-version as dep in target crates. #[doc(hidden)] pub use git_version; diff --git a/libs/utils/src/linux_socket_ioctl.rs b/libs/utils/src/linux_socket_ioctl.rs new file mode 100644 index 0000000000..5ae0e86af8 --- /dev/null +++ b/libs/utils/src/linux_socket_ioctl.rs @@ -0,0 +1,35 @@ +//! Linux-specific socket ioctls. +//! +//! + +use std::{ + io, + mem::MaybeUninit, + os::{fd::RawFd, raw::c_int}, +}; + +use nix::libc::{FIONREAD, TIOCOUTQ}; + +unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result { + let mut inq: MaybeUninit = MaybeUninit::uninit(); + let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr()); + if err == 0 { + Ok(inq.assume_init()) + } else { + Err(io::Error::last_os_error()) + } +} + +/// # Safety +/// +/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. +pub unsafe fn inq(socket_fd: RawFd) -> io::Result { + do_ioctl(socket_fd, FIONREAD) +} + +/// # Safety +/// +/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor. +pub unsafe fn outq(socket_fd: RawFd) -> io::Result { + do_ioctl(socket_fd, TIOCOUTQ) +} diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index e205d60d74..2c36942f43 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -1,9 +1,34 @@ +use std::future::Future; use std::str::FromStr; +use std::time::Duration; use anyhow::Context; use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, VariantNames}; +use tokio::time::Instant; +use tracing::info; + +/// Logs a critical error, similarly to `tracing::error!`. This will: +/// +/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace. +/// * Trigger a pageable alert (via the metric below). +/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error". +/// * In debug builds, panic the process. +/// +/// When including errors in the message, please use {err:?} to include the error cause and original +/// backtrace. +#[macro_export] +macro_rules! critical { + ($($arg:tt)*) => {{ + if cfg!(debug_assertions) { + panic!($($arg)*); + } + $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); + let backtrace = std::backtrace::Backtrace::capture(); + tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*)); + }}; +} #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] #[strum(serialize_all = "snake_case")] @@ -25,7 +50,10 @@ impl LogFormat { } } -struct TracingEventCountMetric { +pub struct TracingEventCountMetric { + /// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro, + /// and also emit it as a regular error. These are thus double-counted, but that seems fine. 
+ critical: IntCounter, error: IntCounter, warn: IntCounter, info: IntCounter, @@ -33,7 +61,7 @@ struct TracingEventCountMetric { trace: IntCounter, } -static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| { +pub static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| { let vec = metrics::register_int_counter_vec!( "libmetrics_tracing_event_count", "Number of tracing events, by level", @@ -46,6 +74,7 @@ static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| impl TracingEventCountMetric { fn new(vec: IntCounterVec) -> Self { Self { + critical: vec.with_label_values(&["critical"]), error: vec.with_label_values(&["error"]), warn: vec.with_label_values(&["warn"]), info: vec.with_label_values(&["info"]), @@ -54,6 +83,11 @@ impl TracingEventCountMetric { } } + // Allow public access from `critical!` macro. + pub fn inc_critical(&self) { + self.critical.inc(); + } + fn inc_for_level(&self, level: tracing::Level) { let counter = match level { tracing::Level::ERROR => &self.error, @@ -288,6 +322,43 @@ impl std::fmt::Debug for SecretString { } } +/// Logs a periodic message if a future is slow to complete. +/// +/// This is performance-sensitive as it's used on the GetPage read path. +/// +/// TODO: consider upgrading this to a warning, but currently it fires too often. +#[inline] +pub async fn log_slow(name: &str, threshold: Duration, f: impl Future) -> O { + // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and + // won't fit on the stack. + let mut f = Box::pin(f); + + let started = Instant::now(); + let mut attempt = 1; + + loop { + // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common + // case where the timeout doesn't fire. + let deadline = started + attempt * threshold; + if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await { + // NB: we check if we exceeded the threshold even if the timeout never fired, because + // scheduling or execution delays may cause the future to succeed even if it exceeds the + // timeout. This costs an extra unconditional clock reading, but seems worth it to avoid + // false negatives. 
+ let elapsed = started.elapsed(); + if elapsed >= threshold { + info!("slow {name} completed after {:.3}s", elapsed.as_secs_f64()); + } + return output; + } + + let elapsed = started.elapsed().as_secs_f64(); + info!("slow {name} still running after {elapsed:.3}s",); + + attempt += 1; + } +} + #[cfg(test)] mod tests { use metrics::{core::Opts, IntCounterVec}; diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index f188165600..c874fa30ff 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -260,7 +260,7 @@ impl FromStr for Lsn { { let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?; let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?; - Ok(Lsn((left_num as u64) << 32 | right_num as u64)) + Ok(Lsn(((left_num as u64) << 32) | right_num as u64)) } else { Err(LsnParseError) } diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index 6352ea9f92..d98284f969 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -117,6 +117,10 @@ impl TenantShardId { ) } + pub fn range(&self) -> RangeInclusive { + RangeInclusive::new(*self, *self) + } + pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { ShardSlug(self) } diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 16ec563fa7..0a1ed81621 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -64,6 +64,12 @@ pub struct GateGuard { gate: Arc, } +impl GateGuard { + pub fn try_clone(&self) -> Result { + Gate::enter_impl(self.gate.clone()) + } +} + impl Drop for GateGuard { fn drop(&mut self) { if self.gate.closing.load(Ordering::Relaxed) { @@ -107,11 +113,11 @@ impl Gate { /// to avoid blocking close() indefinitely: typically types that contain a Gate will /// also contain a CancellationToken. pub fn enter(&self) -> Result { - let permit = self - .inner - .sem - .try_acquire() - .map_err(|_| GateError::GateClosed)?; + Self::enter_impl(self.inner.clone()) + } + + fn enter_impl(gate: Arc) -> Result { + let permit = gate.sem.try_acquire().map_err(|_| GateError::GateClosed)?; // we now have the permit, let's disable the normal raii functionality and leave // "returning" the permit to our GateGuard::drop. 
@@ -122,7 +128,7 @@ impl Gate { Ok(GateGuard { span_at_enter: tracing::Span::current(), - gate: self.inner.clone(), + gate, }) } @@ -252,4 +258,39 @@ mod tests { // Attempting to enter() is still forbidden gate.enter().expect_err("enter should fail finishing close"); } + + #[tokio::test(start_paused = true)] + async fn clone_gate_guard() { + let gate = Gate::default(); + let forever = Duration::from_secs(24 * 7 * 365); + + let guard1 = gate.enter().expect("gate isn't closed"); + + let guard2 = guard1.try_clone().expect("gate isn't clsoed"); + + let mut close_fut = std::pin::pin!(gate.close()); + + tokio::time::timeout(forever, &mut close_fut) + .await + .unwrap_err(); + + // we polled close_fut once, that should prevent all later enters and clones + gate.enter().unwrap_err(); + guard1.try_clone().unwrap_err(); + guard2.try_clone().unwrap_err(); + + // guard2 keeps gate open even if guard1 is closed + drop(guard1); + tokio::time::timeout(forever, &mut close_fut) + .await + .unwrap_err(); + + drop(guard2); + + // now that the last guard is dropped, closing should complete + close_fut.await; + + // entering is still forbidden + gate.enter().expect_err("enter should stilll fail"); + } } diff --git a/libs/utils/src/sync/spsc_fold.rs b/libs/utils/src/sync/spsc_fold.rs index b44f766ef0..0cab291d51 100644 --- a/libs/utils/src/sync/spsc_fold.rs +++ b/libs/utils/src/sync/spsc_fold.rs @@ -96,7 +96,11 @@ impl Sender { } } State::SenderWaitsForReceiverToConsume(_data) => { - // Really, we shouldn't be polled until receiver has consumed and wakes us. + // SAFETY: send is single threaded due to `&mut self` requirement, + // therefore register is not concurrent. + unsafe { + self.state.wake_sender.register(cx.waker()); + } Poll::Pending } State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)), @@ -449,4 +453,38 @@ mod tests { let err = recv_task.await.unwrap().expect_err("should error"); assert!(matches!(err, RecvError::SenderGone)); } + + #[tokio::test(start_paused = true)] + async fn test_receiver_drop_while_waiting_for_receiver_to_consume_unblocks_sender() { + let (mut sender, receiver) = channel(); + + let state = receiver.state.clone(); + + sender.send((), |_, _| unreachable!()).await.unwrap(); + + assert!(matches!(&*state.value.lock().unwrap(), &State::HasData(_))); + + let unmergeable = sender.send((), |_, _| Err(())); + let mut unmergeable = std::pin::pin!(unmergeable); + tokio::select! { + _ = tokio::time::sleep(FOREVER) => {}, + _ = &mut unmergeable => { + panic!("unmergeable should not complete"); + }, + } + + assert!(matches!( + &*state.value.lock().unwrap(), + &State::SenderWaitsForReceiverToConsume(_) + )); + + drop(receiver); + + assert!(matches!( + &*state.value.lock().unwrap(), + &State::ReceiverGone + )); + + unmergeable.await.unwrap_err(); + } } diff --git a/libs/utils/src/try_rcu.rs b/libs/utils/src/try_rcu.rs new file mode 100644 index 0000000000..6b53ab1316 --- /dev/null +++ b/libs/utils/src/try_rcu.rs @@ -0,0 +1,77 @@ +//! Try RCU extension lifted from + +pub trait ArcSwapExt { + /// [`ArcSwap::rcu`](arc_swap::ArcSwap::rcu), but with Result that short-circuits on error. 
+ fn try_rcu(&self, f: F) -> Result + where + F: FnMut(&T) -> Result, + R: Into; +} + +impl ArcSwapExt for arc_swap::ArcSwapAny +where + T: arc_swap::RefCnt, + S: arc_swap::strategy::CaS, +{ + fn try_rcu(&self, mut f: F) -> Result + where + F: FnMut(&T) -> Result, + R: Into, + { + fn ptr_eq(a: A, b: B) -> bool + where + A: arc_swap::AsRaw, + B: arc_swap::AsRaw, + { + let a = a.as_raw(); + let b = b.as_raw(); + std::ptr::eq(a, b) + } + + let mut cur = self.load(); + loop { + let new = f(&cur)?.into(); + let prev = self.compare_and_swap(&*cur, new); + let swapped = ptr_eq(&*cur, &*prev); + if swapped { + return Ok(arc_swap::Guard::into_inner(prev)); + } else { + cur = prev; + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arc_swap::ArcSwap; + use std::sync::Arc; + + #[test] + fn test_try_rcu_success() { + let swap = ArcSwap::from(Arc::new(42)); + + let result = swap.try_rcu(|value| -> Result<_, String> { Ok(**value + 1) }); + + assert!(result.is_ok()); + assert_eq!(**swap.load(), 43); + } + + #[test] + fn test_try_rcu_error() { + let swap = ArcSwap::from(Arc::new(42)); + + let result = swap.try_rcu(|value| -> Result { + if **value == 42 { + Err("err") + } else { + Ok(**value + 1) + } + }); + + assert!(result.is_err()); + assert_eq!(result.unwrap_err(), "err"); + assert_eq!(**swap.load(), 42); + } +} diff --git a/libs/vm_monitor/Cargo.toml b/libs/vm_monitor/Cargo.toml index ba73902d38..a70465921c 100644 --- a/libs/vm_monitor/Cargo.toml +++ b/libs/vm_monitor/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "vm_monitor" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [[bin]] diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 1d70cedcf9..dda9b23818 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -1,12 +1,10 @@ use std::fmt::{self, Debug, Formatter}; use std::time::{Duration, Instant}; -use anyhow::{anyhow, Context}; -use cgroups_rs::{ - hierarchies::{self, is_cgroup2_unified_mode}, - memory::MemController, - Subsystem, -}; +use anyhow::{Context, anyhow}; +use cgroups_rs::Subsystem; +use cgroups_rs::hierarchies::{self, is_cgroup2_unified_mode}; +use cgroups_rs::memory::MemController; use tokio::sync::watch; use tracing::{info, warn}; diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index 6a965ace9b..7b7201ab77 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -6,17 +6,15 @@ //! the cgroup (requesting upscale), and the signals that go to the cgroup //! (notifying it of upscale). -use anyhow::{bail, Context}; -use axum::extract::ws::{Message, WebSocket}; -use futures::{ - stream::{SplitSink, SplitStream}, - SinkExt, StreamExt, -}; +use anyhow::{Context, bail}; +use axum::extract::ws::{Message, Utf8Bytes, WebSocket}; +use futures::stream::{SplitSink, SplitStream}; +use futures::{SinkExt, StreamExt}; use tracing::{debug, info}; use crate::protocol::{ - OutboundMsg, OutboundMsgKind, ProtocolRange, ProtocolResponse, ProtocolVersion, - PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, + OutboundMsg, OutboundMsgKind, PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, ProtocolRange, + ProtocolResponse, ProtocolVersion, }; /// The central handler for all communications in the monitor. 
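The dispatcher hunks below adapt to `Message::Text` carrying `Utf8Bytes` instead of `String`. As a minimal sketch of the recurring serialize-then-wrap pattern (the `to_text_frame` helper is hypothetical, not part of the diff):

```rust
use axum::extract::ws::{Message, Utf8Bytes};
use serde::Serialize;

// Serialize a value to JSON and wrap it in the Utf8Bytes-backed text frame.
fn to_text_frame<T: Serialize>(value: &T) -> anyhow::Result<Message> {
    let json = serde_json::to_string(value)?;
    Ok(Message::Text(Utf8Bytes::from(json)))
}
```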
@@ -82,21 +80,21 @@ impl Dispatcher { let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) { Ok(version) => { - sink.send(Message::Text( + sink.send(Message::Text(Utf8Bytes::from( serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(), - )) + ))) .await .context("failed to notify agent of negotiated protocol version")?; version } Err(e) => { - sink.send(Message::Text( + sink.send(Message::Text(Utf8Bytes::from( serde_json::to_string(&ProtocolResponse::Error(format!( "Received protocol version range {} which does not overlap with {}", agent_range, monitor_range ))) .unwrap(), - )) + ))) .await .context("failed to notify agent of no overlap between protocol version ranges")?; Err(e).context("error determining suitable protocol version range")? @@ -126,7 +124,7 @@ impl Dispatcher { let json = serde_json::to_string(&message).context("failed to serialize message")?; self.sink - .send(Message::Text(json)) + .send(Message::Text(Utf8Bytes::from(json))) .await .context("stream error sending message") } diff --git a/libs/vm_monitor/src/filecache.rs b/libs/vm_monitor/src/filecache.rs index fe71e11197..bc42347e5a 100644 --- a/libs/vm_monitor/src/filecache.rs +++ b/libs/vm_monitor/src/filecache.rs @@ -2,12 +2,14 @@ use std::num::NonZeroU64; -use crate::MiB; -use anyhow::{anyhow, Context}; -use tokio_postgres::{types::ToSql, Client, NoTls, Row}; +use anyhow::{Context, anyhow}; +use tokio_postgres::types::ToSql; +use tokio_postgres::{Client, NoTls, Row}; use tokio_util::sync::CancellationToken; use tracing::{error, info}; +use crate::MiB; + /// Manages Postgres' file cache by keeping a connection open. #[derive(Debug)] pub struct FileCacheState { @@ -177,8 +179,8 @@ impl FileCacheState { crate::spawn_with_cancel( token, |res| { - if let Err(error) = res { - error!(%error, "postgres error") + if let Err(e) = res { + error!(error = format_args!("{e:#}"), "postgres error"); } }, conn, @@ -205,7 +207,7 @@ impl FileCacheState { { Ok(rows) => Ok(rows), Err(e) => { - error!(error = ?e, "postgres error: {e} -> retrying"); + error!(error = format_args!("{e:#}"), "postgres error -> retrying"); let client = FileCacheState::connect(&self.conn_str, self.token.clone()) .await diff --git a/libs/vm_monitor/src/lib.rs b/libs/vm_monitor/src/lib.rs index 1b13c8e0b2..7c77aca35d 100644 --- a/libs/vm_monitor/src/lib.rs +++ b/libs/vm_monitor/src/lib.rs @@ -2,24 +2,26 @@ #![deny(clippy::undocumented_unsafe_blocks)] #![cfg(target_os = "linux")] +use std::fmt::Debug; +use std::net::SocketAddr; +use std::time::Duration; + use anyhow::Context; -use axum::{ - extract::{ws::WebSocket, State, WebSocketUpgrade}, - response::Response, -}; -use axum::{routing::get, Router}; +use axum::Router; +use axum::extract::ws::WebSocket; +use axum::extract::{State, WebSocketUpgrade}; +use axum::response::Response; +use axum::routing::get; use clap::Parser; use futures::Future; -use std::net::SocketAddr; -use std::{fmt::Debug, time::Duration}; +use runner::Runner; use sysinfo::{RefreshKind, System, SystemExt}; use tokio::net::TcpListener; -use tokio::{sync::broadcast, task::JoinHandle}; +use tokio::sync::broadcast; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::{error, info}; -use runner::Runner; - // Code that interfaces with agent pub mod dispatcher; pub mod protocol; @@ -191,15 +193,12 @@ async fn start_monitor( .await; let mut monitor = match monitor { Ok(Ok(monitor)) => monitor, - Ok(Err(error)) => { - error!(?error, "failed to create monitor"); + Ok(Err(e)) => { 
+ error!(error = format_args!("{e:#}"), "failed to create monitor"); return; } Err(_) => { - error!( - ?timeout, - "creating monitor timed out (probably waiting to receive protocol range)" - ); + error!(?timeout, "creating monitor timed out"); return; } }; @@ -207,6 +206,9 @@ async fn start_monitor( match monitor.run().await { Ok(()) => info!("monitor was killed due to new connection"), - Err(e) => error!(error = ?e, "monitor terminated unexpectedly"), + Err(e) => error!( + error = format_args!("{e:#}"), + "monitor terminated unexpectedly" + ), } } diff --git a/libs/vm_monitor/src/protocol.rs b/libs/vm_monitor/src/protocol.rs index 5f07435503..4fce3cdefc 100644 --- a/libs/vm_monitor/src/protocol.rs +++ b/libs/vm_monitor/src/protocol.rs @@ -35,7 +35,8 @@ use core::fmt; use std::cmp; -use serde::{de::Error, Deserialize, Serialize}; +use serde::de::Error; +use serde::{Deserialize, Serialize}; /// A Message we send to the agent. #[derive(Serialize, Deserialize, Debug, Clone)] diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index 8605314ba9..6f75ff0abd 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -7,7 +7,7 @@ use std::fmt::Debug; use std::time::{Duration, Instant}; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use axum::extract::ws::{Message, WebSocket}; use futures::StreamExt; use tokio::sync::{broadcast, watch}; @@ -18,7 +18,7 @@ use crate::cgroup::{self, CgroupWatcher}; use crate::dispatcher::Dispatcher; use crate::filecache::{FileCacheConfig, FileCacheState}; use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources}; -use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB}; +use crate::{Args, MiB, bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel}; /// Central struct that interacts with agent, dispatcher, and cgroup to handle /// signals from the agent. @@ -233,7 +233,9 @@ impl Runner { // // TODO: make the duration here configurable. if last_time.elapsed() > Duration::from_secs(5) { - bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information"); + bail!( + "haven't gotten cgroup memory stats recently enough to determine downscaling information" + ); } else if last_history.samples_count <= 1 { let status = "haven't received enough cgroup memory stats yet"; info!(status, "discontinuing downscale"); @@ -370,12 +372,16 @@ impl Runner { }), InboundMsgKind::InvalidMessage { error } => { warn!( - %error, id, "received notification of an invalid message we sent" + error = format_args!("{error:#}"), + id, "received notification of an invalid message we sent" ); Ok(None) } InboundMsgKind::InternalError { error } => { - warn!(error, id, "agent experienced an internal error"); + warn!( + error = format_args!("{error:#}"), + id, "agent experienced an internal error" + ); Ok(None) } InboundMsgKind::HealthCheck {} => { @@ -476,7 +482,7 @@ impl Runner { // gives the outermost cause, and the debug impl // pretty-prints the error, whereas {:#} contains all the // causes, but is compact (no newlines). 
- warn!(error = format!("{e:#}"), "error handling message"); + warn!(error = format_args!("{e:#}"), "error handling message"); OutboundMsg::new( OutboundMsgKind::InternalError { error: e.to_string(), @@ -492,7 +498,7 @@ impl Runner { .context("failed to send message")?; } Err(e) => warn!( - error = format!("{e}"), + error = format_args!("{e:#}"), msg = ?msg, "received error message" ), diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml index 8fac4e38ca..cb0ef4b00d 100644 --- a/libs/wal_decoder/Cargo.toml +++ b/libs/wal_decoder/Cargo.toml @@ -17,10 +17,24 @@ postgres_ffi.workspace = true serde.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["io-util"] } -tonic.workspace = true tracing.workspace = true utils.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } [build-dependencies] tonic-build.workspace = true + +[dev-dependencies] +criterion.workspace = true +camino.workspace = true +camino-tempfile.workspace = true +remote_storage.workspace = true +tokio-util.workspace = true +serde_json.workspace = true +futures.workspace = true +tikv-jemallocator.workspace = true +pprof.workspace = true + +[[bench]] +name = "bench_interpret_wal" +harness = false diff --git a/libs/wal_decoder/benches/README.md b/libs/wal_decoder/benches/README.md new file mode 100644 index 0000000000..14885afecf --- /dev/null +++ b/libs/wal_decoder/benches/README.md @@ -0,0 +1,34 @@ +## WAL Decoding and Interpretation Benchmarks + +Note that these benchmarks pull WAL from a public bucket in S3 +as a preparation step. Hence, you need a way to auth with AWS. +You can achieve this by copying the `~/.aws/config` file from +the AWS SSO notion page and exporting `AWS_PROFILE=dev` when invoking +the benchmarks. + +To run benchmarks: + +```sh +aws sso login --profile dev + +# All benchmarks. +AWS_PROFILE=dev cargo bench --package wal_decoder + +# Specific file. +AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal + +# Specific benchmark. +AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded + +# List available benchmarks. +cargo bench --package wal_decoder --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded -- --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. 
diff --git a/libs/wal_decoder/benches/bench_interpret_wal.rs b/libs/wal_decoder/benches/bench_interpret_wal.rs new file mode 100644 index 0000000000..846904cf87 --- /dev/null +++ b/libs/wal_decoder/benches/bench_interpret_wal.rs @@ -0,0 +1,250 @@ +use anyhow::Context; +use criterion::{criterion_group, criterion_main, Criterion}; +use futures::{stream::FuturesUnordered, StreamExt}; +use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; +use postgres_ffi::{waldecoder::WalStreamDecoder, MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use pprof::criterion::{Output, PProfProfiler}; +use serde::Deserialize; +use std::{env, num::NonZeroUsize, sync::Arc}; + +use camino::{Utf8Path, Utf8PathBuf}; +use camino_tempfile::Utf8TempDir; +use remote_storage::{ + DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind, + S3Config, +}; +use tokio_util::sync::CancellationToken; +use utils::{ + lsn::Lsn, + shard::{ShardCount, ShardNumber}, +}; +use wal_decoder::models::InterpretedWalRecord; + +const S3_BUCKET: &str = "neon-github-public-dev"; +const S3_REGION: &str = "eu-central-1"; +const BUCKET_PREFIX: &str = "wal-snapshots/bulk-insert/"; +const METADATA_FILENAME: &str = "metadata.json"; + +/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. +/// This mirrors the configuration in bin/safekeeper.rs. +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + +async fn create_s3_client() -> anyhow::Result> { + let remote_storage_config = RemoteStorageConfig { + storage: RemoteStorageKind::AwsS3(S3Config { + bucket_name: S3_BUCKET.to_string(), + bucket_region: S3_REGION.to_string(), + prefix_in_bucket: Some(BUCKET_PREFIX.to_string()), + endpoint: None, + concurrency_limit: NonZeroUsize::new(100).unwrap(), + max_keys_per_list_response: None, + upload_storage_class: None, + }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, + }; + Ok(Arc::new( + GenericRemoteStorage::from_config(&remote_storage_config) + .await + .context("remote storage init")?, + )) +} + +async fn download_bench_data( + client: Arc, + cancel: &CancellationToken, +) -> anyhow::Result { + let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into()?; + let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent)?; + + eprintln!("Downloading benchmark data to {:?}", temp_dir); + + let listing = client + .list(None, ListingMode::NoDelimiter, None, cancel) + .await?; + + let mut downloads = listing + .keys + .into_iter() + .map(|obj| { + let client = client.clone(); + let temp_dir_path = temp_dir.path().to_owned(); + + async move { + let remote_path = obj.key; + let download = client + .download(&remote_path, &DownloadOpts::default(), cancel) + .await?; + let mut body = tokio_util::io::StreamReader::new(download.download_stream); + + let file_name = remote_path.object_name().unwrap(); + let file_path = temp_dir_path.join(file_name); + let file = tokio::fs::OpenOptions::new() + .create(true) + .truncate(true) + .write(true) + .open(&file_path) + .await?; + + let mut writer = tokio::io::BufWriter::new(file); + tokio::io::copy_buf(&mut body, &mut writer).await?; + + Ok::<(), anyhow::Error>(()) + } + }) + .collect::>(); + + while let Some(download) = downloads.next().await { + download?; + } + + Ok(temp_dir) +} + +struct BenchmarkData { + 
wal: Vec, + meta: BenchmarkMetadata, +} + +#[derive(Deserialize)] +struct BenchmarkMetadata { + pg_version: u32, + start_lsn: Lsn, +} + +async fn load_bench_data(path: &Utf8Path, input_size: usize) -> anyhow::Result { + eprintln!("Loading benchmark data from {:?}", path); + + let mut entries = tokio::fs::read_dir(path).await?; + let mut ordered_segment_paths = Vec::new(); + let mut metadata = None; + + while let Some(entry) = entries.next_entry().await? { + if entry.file_name() == METADATA_FILENAME { + let bytes = tokio::fs::read(entry.path()).await?; + metadata = Some( + serde_json::from_slice::(&bytes) + .context("failed to deserialize metadata.json")?, + ); + } else { + ordered_segment_paths.push(entry.path()); + } + } + + ordered_segment_paths.sort(); + + let mut buffer = Vec::new(); + for path in ordered_segment_paths { + if buffer.len() >= input_size { + break; + } + + use async_compression::tokio::bufread::ZstdDecoder; + let file = tokio::fs::File::open(path).await?; + let reader = tokio::io::BufReader::new(file); + let decoder = ZstdDecoder::new(reader); + let mut reader = tokio::io::BufReader::new(decoder); + tokio::io::copy_buf(&mut reader, &mut buffer).await?; + } + + buffer.truncate(input_size); + + Ok(BenchmarkData { + wal: buffer, + meta: metadata.unwrap(), + }) +} + +fn criterion_benchmark(c: &mut Criterion) { + const INPUT_SIZE: usize = 128 * 1024 * 1024; + + let setup_runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + let (_temp_dir, bench_data) = setup_runtime.block_on(async move { + let cancel = CancellationToken::new(); + let client = create_s3_client().await.unwrap(); + let temp_dir = download_bench_data(client, &cancel).await.unwrap(); + let bench_data = load_bench_data(temp_dir.path(), INPUT_SIZE).await.unwrap(); + + (temp_dir, bench_data) + }); + + eprintln!( + "Benchmarking against {} MiB of WAL", + INPUT_SIZE / 1024 / 1024 + ); + + let mut group = c.benchmark_group("decode-interpret-wal"); + group.throughput(criterion::Throughput::Bytes(bench_data.wal.len() as u64)); + group.sample_size(10); + + group.bench_function("unsharded", |b| { + b.iter(|| decode_interpret_main(&bench_data, &[ShardIdentity::unsharded()])) + }); + + let eight_shards = (0..8) + .map(|i| ShardIdentity::new(ShardNumber(i), ShardCount(8), ShardStripeSize(8)).unwrap()) + .collect::>(); + + group.bench_function("8/8-shards", |b| { + b.iter(|| decode_interpret_main(&bench_data, &eight_shards)) + }); + + let four_shards = eight_shards + .into_iter() + .filter(|s| s.number.0 % 2 == 0) + .collect::>(); + group.bench_function("4/8-shards", |b| { + b.iter(|| decode_interpret_main(&bench_data, &four_shards)) + }); + + let two_shards = four_shards + .into_iter() + .filter(|s| s.number.0 % 4 == 0) + .collect::>(); + group.bench_function("2/8-shards", |b| { + b.iter(|| decode_interpret_main(&bench_data, &two_shards)) + }); +} + +fn decode_interpret_main(bench: &BenchmarkData, shards: &[ShardIdentity]) { + let r = decode_interpret(bench, shards); + if let Err(e) = r { + panic!("{e:?}"); + } +} + +fn decode_interpret(bench: &BenchmarkData, shard: &[ShardIdentity]) -> anyhow::Result<()> { + let mut decoder = WalStreamDecoder::new(bench.meta.start_lsn, bench.meta.pg_version); + let xlogoff: usize = bench.meta.start_lsn.segment_offset(WAL_SEGMENT_SIZE); + + for chunk in bench.wal[xlogoff..].chunks(MAX_SEND_SIZE) { + decoder.feed_bytes(chunk); + while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { + assert!(lsn.is_aligned()); + let _ = 
InterpretedWalRecord::from_bytes_filtered( + recdata, + shard, + lsn, + bench.meta.pg_version, + ) + .unwrap(); + } + } + + Ok(()) +} +criterion_group!( + name=benches; + config=Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets=criterion_benchmark +); +criterion_main!(benches); diff --git a/libs/wal_decoder/proto/interpreted_wal.proto b/libs/wal_decoder/proto/interpreted_wal.proto index 0393392c1a..7b40201a75 100644 --- a/libs/wal_decoder/proto/interpreted_wal.proto +++ b/libs/wal_decoder/proto/interpreted_wal.proto @@ -5,6 +5,7 @@ package interpreted_wal; message InterpretedWalRecords { repeated InterpretedWalRecord records = 1; optional uint64 next_record_lsn = 2; + optional uint64 raw_wal_start_lsn = 3; } message InterpretedWalRecord { @@ -37,7 +38,7 @@ message ValueMeta { } message CompactKey { - int64 high = 1; - int64 low = 2; + uint64 high = 1; + uint64 low = 2; } diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index aa50c62911..ebb38ceb52 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -1,6 +1,8 @@ //! This module contains logic for decoding and interpreting //! raw bytes which represent a raw Postgres WAL record. +use std::collections::HashMap; + use crate::models::*; use crate::serialized_batch::SerializedValueBatch; use bytes::{Buf, Bytes}; @@ -14,15 +16,15 @@ use utils::lsn::Lsn; impl InterpretedWalRecord { /// Decode and interpreted raw bytes which represent one Postgres WAL record. - /// Data blocks which do not match the provided shard identity are filtered out. + /// Data blocks which do not match any of the provided shard identities are filtered out. /// Shard 0 is a special case since it tracks all relation sizes. We only give it /// the keys that are being written as that is enough for updating relation sizes. pub fn from_bytes_filtered( buf: Bytes, - shard: &ShardIdentity, + shards: &[ShardIdentity], next_record_lsn: Lsn, pg_version: u32, - ) -> anyhow::Result { + ) -> anyhow::Result> { let mut decoded = DecodedWALRecord::default(); decode_wal_record(buf, &mut decoded, pg_version)?; let xid = decoded.xl_xid; @@ -33,43 +35,57 @@ impl InterpretedWalRecord { FlushUncommittedRecords::No }; - let metadata_record = - MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?; - let batch = SerializedValueBatch::from_decoded_filtered( + let mut shard_records: HashMap = + HashMap::with_capacity(shards.len()); + for shard in shards { + shard_records.insert( + *shard, + InterpretedWalRecord { + metadata_record: None, + batch: SerializedValueBatch::default(), + next_record_lsn, + flush_uncommitted, + xid, + }, + ); + } + + MetadataRecord::from_decoded_filtered( + &decoded, + &mut shard_records, + next_record_lsn, + pg_version, + )?; + SerializedValueBatch::from_decoded_filtered( decoded, - shard, + &mut shard_records, next_record_lsn, pg_version, )?; - Ok(InterpretedWalRecord { - metadata_record, - batch, - next_record_lsn, - flush_uncommitted, - xid, - }) + Ok(shard_records) } } impl MetadataRecord { - /// Builds a metadata record for this WAL record, if any. + /// Populates the given `shard_records` with metadata records from this WAL record, if any, + /// discarding those belonging to other shards. /// - /// Only metadata records relevant for the given shard are emitted. Currently, most metadata + /// Only metadata records relevant for the given shards is emitted. 
Currently, most metadata /// records are broadcast to all shards for simplicity, but this should be improved. fn from_decoded_filtered( decoded: &DecodedWALRecord, - shard: &ShardIdentity, + shard_records: &mut HashMap, next_record_lsn: Lsn, pg_version: u32, - ) -> anyhow::Result> { + ) -> anyhow::Result<()> { // Note: this doesn't actually copy the bytes since // the [`Bytes`] type implements it via a level of indirection. let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); // First, generate metadata records from the decoded WAL record. - let mut metadata_record = match decoded.xl_rmid { + let metadata_record = match decoded.xl_rmid { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { Self::decode_heapam_record(&mut buf, decoded, pg_version)? } @@ -112,41 +128,65 @@ impl MetadataRecord { }; // Next, filter the metadata record by shard. - match metadata_record { - Some( - MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits)) - | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)), - ) => { - // Route VM page updates to the shards that own them. VM pages are stored in the VM fork - // of the main relation. These are sharded and managed just like regular relation pages. - // See: https://github.com/neondatabase/neon/issues/9855 - let is_local_vm_page = |heap_blk| { - let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); - shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) - }; - // Send the old and new VM page updates to their respective shards. - clear_vm_bits.old_heap_blkno = clear_vm_bits - .old_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - clear_vm_bits.new_heap_blkno = clear_vm_bits - .new_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - // If neither VM page belongs to this shard, discard the record. - if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() - { - metadata_record = None + for (shard, record) in shard_records.iter_mut() { + match metadata_record { + Some( + MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref clear_vm_bits)) + | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref clear_vm_bits)), + ) => { + // Route VM page updates to the shards that own them. VM pages are stored in the VM fork + // of the main relation. These are sharded and managed just like regular relation pages. + // See: https://github.com/neondatabase/neon/issues/9855 + let is_local_vm_page = |heap_blk| { + let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); + shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) + }; + // Send the old and new VM page updates to their respective shards. + let updated_old_heap_blkno = clear_vm_bits + .old_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + let updated_new_heap_blkno = clear_vm_bits + .new_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + // If neither VM page belongs to this shard, discard the record. + if updated_old_heap_blkno.is_some() || updated_new_heap_blkno.is_some() { + // Clone the record and update it for the current shard. 
+ let mut for_shard = metadata_record.clone(); + match for_shard { + Some( + MetadataRecord::Heapam(HeapamRecord::ClearVmBits( + ref mut clear_vm_bits, + )) + | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits( + ref mut clear_vm_bits, + )), + ) => { + clear_vm_bits.old_heap_blkno = updated_old_heap_blkno; + clear_vm_bits.new_heap_blkno = updated_new_heap_blkno; + record.metadata_record = for_shard; + } + _ => { + unreachable!("for_shard is a clone of what we checked above") + } + } + } + } + Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => { + // Filter LogicalMessage records (AUX files) to only be stored on shard zero + if shard.is_shard_zero() { + record.metadata_record = metadata_record; + // No other shards should receive this record, so we stop traversing shards early. + break; + } + } + _ => { + // All other metadata records are sent to all shards. + record.metadata_record = metadata_record.clone(); } } - Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => { - // Filter LogicalMessage records (AUX files) to only be stored on shard zero - if !shard.is_shard_zero() { - metadata_record = None; - } - } - _ => {} } - Ok(metadata_record) + Ok(()) } fn decode_heapam_record( diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index af22de5d95..7e1934c6c3 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -45,10 +45,10 @@ pub mod proto { #![allow(clippy::derive_partial_eq_without_eq)] // The generated ValueMeta has a `len` method generate for its `len` field. #![allow(clippy::len_without_is_empty)] - tonic::include_proto!("interpreted_wal"); + include!(concat!(env!("OUT_DIR"), concat!("/interpreted_wal.rs"))); } -#[derive(Serialize, Deserialize)] +#[derive(Copy, Clone, Serialize, Deserialize)] pub enum FlushUncommittedRecords { Yes, No, @@ -60,11 +60,15 @@ pub struct InterpretedWalRecords { pub records: Vec, // Start LSN of the next record after the batch. // Note that said record may not belong to the current shard. - pub next_record_lsn: Option, + pub next_record_lsn: Lsn, + // Inclusive start LSN of the PG WAL from which the interpreted + // WAL records were extracted. Note that this is not necessarily the + // start LSN of the first interpreted record in the batch. + pub raw_wal_start_lsn: Option, } /// An interpreted Postgres WAL record, ready to be handled by the pageserver -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct InterpretedWalRecord { /// Optional metadata record - may cause writes to metadata keys /// in the storage engine @@ -95,11 +99,19 @@ impl InterpretedWalRecord { && self.metadata_record.is_none() && matches!(self.flush_uncommitted, FlushUncommittedRecords::No) } + + /// Checks if the WAL record is observed (i.e. contains only metadata + /// for observed values) + pub fn is_observed(&self) -> bool { + self.batch.is_observed() + && self.metadata_record.is_none() + && matches!(self.flush_uncommitted, FlushUncommittedRecords::No) + } } /// The interpreted part of the Postgres WAL record which requires metadata /// writes to the underlying storage engine. 
-#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum MetadataRecord { Heapam(HeapamRecord), Neonrmgr(NeonrmgrRecord), @@ -115,12 +127,12 @@ pub enum MetadataRecord { Replorigin(ReploriginRecord), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum HeapamRecord { ClearVmBits(ClearVmBits), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct ClearVmBits { pub new_heap_blkno: Option, pub old_heap_blkno: Option, @@ -128,29 +140,29 @@ pub struct ClearVmBits { pub flags: u8, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum NeonrmgrRecord { ClearVmBits(ClearVmBits), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum SmgrRecord { Create(SmgrCreate), Truncate(XlSmgrTruncate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct SmgrCreate { pub rel: RelTag, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum DbaseRecord { Create(DbaseCreate), Drop(DbaseDrop), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct DbaseCreate { pub db_id: Oid, pub tablespace_id: Oid, @@ -158,32 +170,32 @@ pub struct DbaseCreate { pub src_tablespace_id: Oid, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct DbaseDrop { pub db_id: Oid, pub tablespace_ids: Vec, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum ClogRecord { ZeroPage(ClogZeroPage), Truncate(ClogTruncate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct ClogZeroPage { pub segno: u32, pub rpageno: u32, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct ClogTruncate { pub pageno: u32, pub oldest_xid: TransactionId, pub oldest_xid_db: Oid, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum XactRecord { Commit(XactCommon), Abort(XactCommon), @@ -192,7 +204,7 @@ pub enum XactRecord { Prepare(XactPrepare), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct XactCommon { pub parsed: XlXactParsedRecord, pub origin_id: u16, @@ -201,73 +213,73 @@ pub struct XactCommon { pub lsn: Lsn, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct XactPrepare { pub xl_xid: TransactionId, pub data: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum MultiXactRecord { ZeroPage(MultiXactZeroPage), Create(XlMultiXactCreate), Truncate(XlMultiXactTruncate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct MultiXactZeroPage { pub slru_kind: SlruKind, pub segno: u32, pub rpageno: u32, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum RelmapRecord { Update(RelmapUpdate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct RelmapUpdate { pub update: XlRelmapUpdate, pub buf: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum XlogRecord { Raw(RawXlogRecord), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct RawXlogRecord { pub info: u8, pub lsn: Lsn, pub buf: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum LogicalMessageRecord { Put(PutLogicalMessage), 
#[cfg(feature = "testing")] Failpoint, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct PutLogicalMessage { pub path: String, pub buf: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum StandbyRecord { RunningXacts(StandbyRunningXacts), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct StandbyRunningXacts { pub oldest_running_xid: TransactionId, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum ReploriginRecord { Set(XlReploriginSet), Drop(XlReploriginDrop), diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index 41294da7a0..d76f75f51f 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -5,7 +5,7 @@ //! Such batches are created from decoded PG wal records and ingested //! by the pageserver by writing directly to the ephemeral file. -use std::collections::BTreeSet; +use std::collections::{BTreeSet, HashMap}; use bytes::{Bytes, BytesMut}; use pageserver_api::key::rel_block_to_key; @@ -22,6 +22,8 @@ use utils::lsn::Lsn; use pageserver_api::key::Key; +use crate::models::InterpretedWalRecord; + static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); /// Accompanying metadata for the batch @@ -30,7 +32,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); /// relation sizes. In the case of "observed" values, we only need to know /// the key and LSN, so two types of metadata are supported to save on network /// bandwidth. -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub enum ValueMeta { Serialized(SerializedValueMeta), Observed(ObservedValueMeta), @@ -77,7 +79,7 @@ impl PartialEq for OrderedValueMeta { impl Eq for OrderedValueMeta {} /// Metadata for a [`Value`] serialized into the batch. -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct SerializedValueMeta { pub key: CompactKey, pub lsn: Lsn, @@ -89,14 +91,14 @@ pub struct SerializedValueMeta { } /// Metadata for a [`Value`] observed by the batch -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct ObservedValueMeta { pub key: CompactKey, pub lsn: Lsn, } /// Batch of serialized [`Value`]s. -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct SerializedValueBatch { /// [`Value`]s serialized in EphemeralFile's native format, /// ready for disk write by the pageserver @@ -128,7 +130,8 @@ impl Default for SerializedValueBatch { } impl SerializedValueBatch { - /// Build a batch of serialized values from a decoded PG WAL record + /// Populates the given `shard_records` with value batches from this WAL record, if any, + /// discarding those belonging to other shards. /// /// The batch will only contain values for keys targeting the specifiec /// shard. Shard 0 is a special case, where any keys that don't belong to @@ -136,21 +139,20 @@ impl SerializedValueBatch { /// but absent from the raw buffer [`SerializedValueBatch::raw`]). pub(crate) fn from_decoded_filtered( decoded: DecodedWALRecord, - shard: &ShardIdentity, + shard_records: &mut HashMap, next_record_lsn: Lsn, pg_version: u32, - ) -> anyhow::Result { - // First determine how big the buffer needs to be and allocate it up-front. + ) -> anyhow::Result<()> { + // First determine how big the buffers need to be and allocate it up-front. 
// This duplicates some of the work below, but it's empirically much faster. - let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version); - let mut buf = Vec::::with_capacity(estimated_buffer_size); + for (shard, record) in shard_records.iter_mut() { + assert!(record.batch.is_empty()); + + let estimate = Self::estimate_buffer_size(&decoded, shard, pg_version); + record.batch.raw = Vec::with_capacity(estimate); + } - let mut metadata: Vec = Vec::with_capacity(decoded.blocks.len()); - let mut max_lsn: Lsn = Lsn(0); - let mut len: usize = 0; for blk in decoded.blocks.iter() { - let relative_off = buf.len() as u64; - let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, @@ -168,99 +170,98 @@ impl SerializedValueBatch { ); } - let key_is_local = shard.is_key_local(&key); + for (shard, record) in shard_records.iter_mut() { + let key_is_local = shard.is_key_local(&key); - tracing::debug!( - lsn=%next_record_lsn, - key=%key, - "ingest: shard decision {}", - if !key_is_local { "drop" } else { "keep" }, - ); + tracing::debug!( + lsn=%next_record_lsn, + key=%key, + "ingest: shard decision {}", + if !key_is_local { "drop" } else { "keep" }, + ); - if !key_is_local { - if shard.is_shard_zero() { - // Shard 0 tracks relation sizes. Although we will not store this block, we will observe - // its blkno in case it implicitly extends a relation. - metadata.push(ValueMeta::Observed(ObservedValueMeta { + if !key_is_local { + if shard.is_shard_zero() { + // Shard 0 tracks relation sizes. Although we will not store this block, we will observe + // its blkno in case it implicitly extends a relation. + record + .batch + .metadata + .push(ValueMeta::Observed(ObservedValueMeta { + key: key.to_compact(), + lsn: next_record_lsn, + })) + } + + continue; + } + + // Instead of storing full-page-image WAL record, + // it is better to store extracted image: we can skip wal-redo + // in this case. Also some FPI records may contain multiple (up to 32) pages, + // so them have to be copied multiple times. + // + let val = if Self::block_is_image(&decoded, blk, pg_version) { + // Extract page image from FPI record + let img_len = blk.bimg_len as usize; + let img_offs = blk.bimg_offset as usize; + let mut image = BytesMut::with_capacity(BLCKSZ as usize); + // TODO(vlad): skip the copy + image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); + + if blk.hole_length != 0 { + let tail = image.split_off(blk.hole_offset as usize); + image.resize(image.len() + blk.hole_length as usize, 0u8); + image.unsplit(tail); + } + // + // Match the logic of XLogReadBufferForRedoExtended: + // The page may be uninitialized. If so, we can't set the LSN because + // that would corrupt the page. 
+ // + if !page_is_new(&image) { + page_set_lsn(&mut image, next_record_lsn) + } + assert_eq!(image.len(), BLCKSZ as usize); + + Value::Image(image.freeze()) + } else { + Value::WalRecord(NeonWalRecord::Postgres { + will_init: blk.will_init || blk.apply_image, + rec: decoded.record.clone(), + }) + }; + + let relative_off = record.batch.raw.len() as u64; + + val.ser_into(&mut record.batch.raw) + .expect("Writing into in-memory buffer is infallible"); + + let val_ser_size = record.batch.raw.len() - relative_off as usize; + + record + .batch + .metadata + .push(ValueMeta::Serialized(SerializedValueMeta { key: key.to_compact(), lsn: next_record_lsn, - })) - } - - continue; + batch_offset: relative_off, + len: val_ser_size, + will_init: val.will_init(), + })); + record.batch.max_lsn = std::cmp::max(record.batch.max_lsn, next_record_lsn); + record.batch.len += 1; } - - // Instead of storing full-page-image WAL record, - // it is better to store extracted image: we can skip wal-redo - // in this case. Also some FPI records may contain multiple (up to 32) pages, - // so them have to be copied multiple times. - // - let val = if Self::block_is_image(&decoded, blk, pg_version) { - // Extract page image from FPI record - let img_len = blk.bimg_len as usize; - let img_offs = blk.bimg_offset as usize; - let mut image = BytesMut::with_capacity(BLCKSZ as usize); - // TODO(vlad): skip the copy - image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); - - if blk.hole_length != 0 { - let tail = image.split_off(blk.hole_offset as usize); - image.resize(image.len() + blk.hole_length as usize, 0u8); - image.unsplit(tail); - } - // - // Match the logic of XLogReadBufferForRedoExtended: - // The page may be uninitialized. If so, we can't set the LSN because - // that would corrupt the page. 
- // - if !page_is_new(&image) { - page_set_lsn(&mut image, next_record_lsn) - } - assert_eq!(image.len(), BLCKSZ as usize); - - Value::Image(image.freeze()) - } else { - Value::WalRecord(NeonWalRecord::Postgres { - will_init: blk.will_init || blk.apply_image, - rec: decoded.record.clone(), - }) - }; - - val.ser_into(&mut buf) - .expect("Writing into in-memory buffer is infallible"); - - let val_ser_size = buf.len() - relative_off as usize; - - metadata.push(ValueMeta::Serialized(SerializedValueMeta { - key: key.to_compact(), - lsn: next_record_lsn, - batch_offset: relative_off, - len: val_ser_size, - will_init: val.will_init(), - })); - max_lsn = std::cmp::max(max_lsn, next_record_lsn); - len += 1; } if cfg!(any(debug_assertions, test)) { - let batch = Self { - raw: buf, - metadata, - max_lsn, - len, - }; - - batch.validate_lsn_order(); - - return Ok(batch); + // Validate that the batches are correct + for record in shard_records.values() { + record.batch.validate_lsn_order(); + } } - Ok(Self { - raw: buf, - metadata, - max_lsn, - len, - }) + Ok(()) } /// Look into the decoded PG WAL record and determine @@ -501,6 +502,11 @@ impl SerializedValueBatch { !self.has_data() && self.metadata.is_empty() } + /// Checks if the batch contains only observed values + pub fn is_observed(&self) -> bool { + !self.has_data() && !self.metadata.is_empty() + } + /// Checks if the batch contains data /// /// Note that if this returns false, it may still contain observed values or diff --git a/libs/wal_decoder/src/wire_format.rs b/libs/wal_decoder/src/wire_format.rs index 5a343054c3..52ed5c70b5 100644 --- a/libs/wal_decoder/src/wire_format.rs +++ b/libs/wal_decoder/src/wire_format.rs @@ -167,7 +167,8 @@ impl TryFrom for proto::InterpretedWalRecords { .collect::, _>>()?; Ok(proto::InterpretedWalRecords { records, - next_record_lsn: value.next_record_lsn.map(|l| l.0), + next_record_lsn: Some(value.next_record_lsn.0), + raw_wal_start_lsn: value.raw_wal_start_lsn.map(|l| l.0), }) } } @@ -236,8 +237,8 @@ impl From for proto::ValueMeta { impl From for proto::CompactKey { fn from(value: CompactKey) -> Self { proto::CompactKey { - high: (value.raw() >> 64) as i64, - low: value.raw() as i64, + high: (value.raw() >> 64) as u64, + low: value.raw() as u64, } } } @@ -254,7 +255,11 @@ impl TryFrom for InterpretedWalRecords { Ok(InterpretedWalRecords { records, - next_record_lsn: value.next_record_lsn.map(Lsn::from), + next_record_lsn: value + .next_record_lsn + .map(Lsn::from) + .expect("Always provided"), + raw_wal_start_lsn: value.raw_wal_start_lsn.map(Lsn::from), }) } } @@ -354,3 +359,64 @@ impl From for CompactKey { (((value.high as i128) << 64) | (value.low as i128)).into() } } + +#[test] +fn test_compact_key_with_large_relnode() { + use pageserver_api::key::Key; + + let inputs = vec![ + Key { + field1: 0, + field2: 0x100, + field3: 0x200, + field4: 0, + field5: 0x10, + field6: 0x5, + }, + Key { + field1: 0, + field2: 0x100, + field3: 0x200, + field4: 0x007FFFFF, + field5: 0x10, + field6: 0x5, + }, + Key { + field1: 0, + field2: 0x100, + field3: 0x200, + field4: 0x00800000, + field5: 0x10, + field6: 0x5, + }, + Key { + field1: 0, + field2: 0x100, + field3: 0x200, + field4: 0x00800001, + field5: 0x10, + field6: 0x5, + }, + Key { + field1: 0, + field2: 0xFFFFFFFF, + field3: 0xFFFFFFFF, + field4: 0xFFFFFFFF, + field5: 0x0, + field6: 0x0, + }, + ]; + + for input in inputs { + assert!(input.is_valid_key_on_write_path()); + let compact = input.to_compact(); + let proto: proto::CompactKey = compact.into(); + let from_proto: 
CompactKey = proto.into(); + + assert_eq!( + compact, from_proto, + "Round trip failed for key with relnode={:#x}", + input.field4 + ); + } +} diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 3f549889b8..8d5b1ade35 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -30,9 +30,9 @@ fn main() -> anyhow::Result<()> { let pgxn_neon = std::fs::canonicalize(pgxn_neon)?; let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?; + println!("cargo:rustc-link-lib=static=walproposer"); println!("cargo:rustc-link-lib=static=pgport"); println!("cargo:rustc-link-lib=static=pgcommon"); - println!("cargo:rustc-link-lib=static=walproposer"); println!("cargo:rustc-link-search={walproposer_lib_search_str}"); // Rebuild crate when libwalproposer.a changes diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index ba75171db2..60b606c64a 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -215,6 +215,7 @@ impl Wrapper { syncSafekeepers: config.sync_safekeepers, systemId: 0, pgTimeline: 1, + proto_version: 3, callback_data, }; let c_config = Box::into_raw(Box::new(c_config)); @@ -276,6 +277,7 @@ mod tests { use core::panic; use std::{ cell::Cell, + ffi::CString, sync::{atomic::AtomicUsize, mpsc::sync_channel}, }; @@ -496,57 +498,64 @@ mod tests { // Messages definitions are at walproposer.h // xxx: it would be better to extract them from safekeeper crate and // use serialization/deserialization here. - let greeting_tag = (b'g' as u64).to_ne_bytes(); - let proto_version = 2_u32.to_ne_bytes(); - let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes(); - let proposer_id = [0; 16]; - let system_id = 0_u64.to_ne_bytes(); - let tenant_id = ttid.tenant_id.as_arr(); - let timeline_id = ttid.timeline_id.as_arr(); - let pg_tli = 1_u32.to_ne_bytes(); - let wal_seg_size = 16777216_u32.to_ne_bytes(); + let greeting_tag = (b'g').to_be_bytes(); + let tenant_id = CString::new(ttid.tenant_id.to_string()) + .unwrap() + .into_bytes_with_nul(); + let timeline_id = CString::new(ttid.timeline_id.to_string()) + .unwrap() + .into_bytes_with_nul(); + let mconf_gen = 0_u32.to_be_bytes(); + let mconf_members_len = 0_u32.to_be_bytes(); + let mconf_members_new_len = 0_u32.to_be_bytes(); + let pg_version: [u8; 4] = PG_VERSION_NUM.to_be_bytes(); + let system_id = 0_u64.to_be_bytes(); + let wal_seg_size = 16777216_u32.to_be_bytes(); + let proposer_greeting = [ greeting_tag.as_slice(), - proto_version.as_slice(), - pg_version.as_slice(), - proposer_id.as_slice(), - system_id.as_slice(), tenant_id.as_slice(), timeline_id.as_slice(), - pg_tli.as_slice(), + mconf_gen.as_slice(), + mconf_members_len.as_slice(), + mconf_members_new_len.as_slice(), + pg_version.as_slice(), + system_id.as_slice(), wal_seg_size.as_slice(), ] .concat(); - let voting_tag = (b'v' as u64).to_ne_bytes(); - let vote_request_term = 3_u64.to_ne_bytes(); - let proposer_id = [0; 16]; + let voting_tag = (b'v').to_be_bytes(); + let vote_request_term = 3_u64.to_be_bytes(); let vote_request = [ voting_tag.as_slice(), + mconf_gen.as_slice(), vote_request_term.as_slice(), - proposer_id.as_slice(), ] .concat(); - let acceptor_greeting_term = 2_u64.to_ne_bytes(); - let acceptor_greeting_node_id = 1_u64.to_ne_bytes(); + let acceptor_greeting_term = 2_u64.to_be_bytes(); + let acceptor_greeting_node_id = 1_u64.to_be_bytes(); let acceptor_greeting = [ greeting_tag.as_slice(), - acceptor_greeting_term.as_slice(), acceptor_greeting_node_id.as_slice(), + 
mconf_gen.as_slice(), + mconf_members_len.as_slice(), + mconf_members_new_len.as_slice(), + acceptor_greeting_term.as_slice(), ] .concat(); - let vote_response_term = 3_u64.to_ne_bytes(); - let vote_given = 1_u64.to_ne_bytes(); - let flush_lsn = 0x539_u64.to_ne_bytes(); - let truncate_lsn = 0x539_u64.to_ne_bytes(); - let th_len = 1_u32.to_ne_bytes(); - let th_term = 2_u64.to_ne_bytes(); - let th_lsn = 0x539_u64.to_ne_bytes(); - let timeline_start_lsn = 0x539_u64.to_ne_bytes(); + let vote_response_term = 3_u64.to_be_bytes(); + let vote_given = 1_u8.to_be_bytes(); + let flush_lsn = 0x539_u64.to_be_bytes(); + let truncate_lsn = 0x539_u64.to_be_bytes(); + let th_len = 1_u32.to_be_bytes(); + let th_term = 2_u64.to_be_bytes(); + let th_lsn = 0x539_u64.to_be_bytes(); let vote_response = [ voting_tag.as_slice(), + mconf_gen.as_slice(), vote_response_term.as_slice(), vote_given.as_slice(), flush_lsn.as_slice(), @@ -554,7 +563,6 @@ mod tests { th_len.as_slice(), th_term.as_slice(), th_lsn.as_slice(), - timeline_start_lsn.as_slice(), ] .concat(); diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 140b287ccc..7330856be4 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -1,14 +1,14 @@ [package] name = "pageserver" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] default = [] # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"] +testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"] [dependencies] anyhow.workspace = true @@ -16,6 +16,7 @@ arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true bit_field.workspace = true +bincode.workspace = true byteorder.workspace = true bytes.workspace = true camino.workspace = true @@ -35,15 +36,15 @@ itertools.workspace = true md5.workspace = true nix.workspace = true # hack to get the number of worker threads tokio uses -num_cpus = { version = "1.15" } +num_cpus.workspace = true num-traits.workspace = true once_cell.workspace = true pin-project-lite.workspace = true -postgres.workspace = true postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true postgres_initdb.workspace = true +pprof.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true @@ -77,6 +78,7 @@ pq_proto.workspace = true remote_storage.workspace = true storage_broker.workspace = true tenant_size_model.workspace = true +http-utils.workspace = true utils.workspace = true workspace_hack.workspace = true reqwest.workspace = true @@ -108,3 +110,11 @@ harness = false [[bench]] name = "bench_ingest" harness = false + +[[bench]] +name = "upload_queue" +harness = false + +[[bin]] +name = "test_helper_slow_client_reads" +required-features = [ "testing" ] diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index c163603842..ad8c618b95 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -1,22 +1,20 @@ -use std::{env, num::NonZeroUsize}; +use std::env; +use std::num::NonZeroUsize; use bytes::Bytes; use camino::Utf8PathBuf; -use criterion::{criterion_group, criterion_main, Criterion}; -use pageserver::{ - config::PageServerConf, - context::{DownloadBehavior, RequestContext}, - 
l0_flush::{L0FlushConfig, L0FlushGlobalState}, - page_cache, - task_mgr::TaskKind, - tenant::storage_layer::InMemoryLayer, - virtual_file, -}; -use pageserver_api::{key::Key, shard::TenantShardId, value::Value}; -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, -}; +use criterion::{Criterion, criterion_group, criterion_main}; +use pageserver::config::PageServerConf; +use pageserver::context::{DownloadBehavior, RequestContext}; +use pageserver::l0_flush::{L0FlushConfig, L0FlushGlobalState}; +use pageserver::task_mgr::TaskKind; +use pageserver::tenant::storage_layer::InMemoryLayer; +use pageserver::{page_cache, virtual_file}; +use pageserver_api::key::Key; +use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; +use utils::bin_ser::BeSer; +use utils::id::{TenantId, TimelineId}; use wal_decoder::serialized_batch::SerializedValueBatch; // A very cheap hash for generating non-sequential keys. diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5c5b52db44..e11af49449 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,23 +1,21 @@ -use criterion::measurement::WallTime; -use pageserver::keyspace::{KeyPartitioning, KeySpace}; -use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::storage_layer::PersistentLayerDesc; -use pageserver_api::key::Key; -use pageserver_api::shard::TenantShardId; -use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; use std::io::{BufRead, BufReader}; use std::path::PathBuf; use std::str::FromStr; use std::time::Instant; + +use criterion::measurement::WallTime; +use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main}; +use pageserver::keyspace::{KeyPartitioning, KeySpace}; +use pageserver::tenant::layer_map::LayerMap; +use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc}; +use pageserver_api::key::Key; +use pageserver_api::shard::TenantShardId; +use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use utils::id::{TenantId, TimelineId}; - use utils::lsn::Lsn; -use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion}; - fn fixture_path(relative: &str) -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative) } diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index d3551b56e1..77b3f90b3e 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -56,20 +56,23 @@ //! medium/128 time: [10.412 ms 10.574 ms 10.718 ms] //! ``` +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use once_cell::sync::Lazy; -use pageserver::{config::PageServerConf, walredo::PostgresRedoManager}; +use pageserver::config::PageServerConf; +use pageserver::walredo::PostgresRedoManager; +use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; -use pageserver_api::{key::Key, shard::TenantShardId}; -use std::{ - future::Future, - sync::Arc, - time::{Duration, Instant}, -}; -use tokio::{sync::Barrier, task::JoinSet}; -use utils::{id::TenantId, lsn::Lsn}; +use pageserver_api::shard::TenantShardId; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use utils::id::TenantId; +use utils::lsn::Lsn; fn bench(c: &mut Criterion) { macro_rules! 
bench_group { diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs new file mode 100644 index 0000000000..8de06a6c25 --- /dev/null +++ b/pageserver/benches/upload_queue.rs @@ -0,0 +1,87 @@ +//! Upload queue benchmarks. + +use std::str::FromStr as _; +use std::sync::Arc; +use std::sync::atomic::AtomicU32; + +use criterion::{Bencher, Criterion, criterion_group, criterion_main}; +use pageserver::tenant::IndexPart; +use pageserver::tenant::metadata::TimelineMetadata; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::storage_layer::LayerName; +use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask}; +use pprof::criterion::{Output, PProfProfiler}; +use utils::generation::Generation; +use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_upload_queue_next_ready, +); +criterion_main!(benches); + +/// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks +/// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload +/// queue as a whole is thus quadratic. +/// +/// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test +/// Delete and UploadMetadata instead. This is incidentally the most expensive case. +fn bench_upload_queue_next_ready(c: &mut Criterion) { + let mut g = c.benchmark_group("upload_queue_next_ready"); + for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] { + g.bench_function(format!("inprogress={inprogress}"), |b| { + run_bench(b, inprogress).unwrap() + }); + } + + fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> { + // Construct two layers. layer0 is in the indexes, layer1 will be deleted. + let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name"); + let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name"); + + let metadata = LayerFileMetadata { + shard: ShardIndex::new(ShardNumber(1), ShardCount(2)), + generation: Generation::Valid(1), + file_size: 0, + }; + + // Construct the (initial and uploaded) index with layer0. + let mut index = IndexPart::empty(TimelineMetadata::example()); + index.layer_metadata.insert(layer0, metadata.clone()); + + // Construct the queue. + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&index, 0)?; + + // Populate inprogress_tasks with a bunch of layer1 deletions. + let delete = UploadOp::Delete(Delete { + layers: vec![(layer1, metadata)], + }); + + for task_id in 0..(inprogress as u64) { + queue.inprogress_tasks.insert( + task_id, + Arc::new(UploadTask { + task_id, + retries: AtomicU32::new(0), + op: delete.clone(), + coalesced_ops: Vec::new(), + }), + ); + } + + // Benchmark index upload scheduling. 
+ let index_upload = UploadOp::UploadMetadata { + uploaded: Box::new(index), + }; + + b.iter(|| { + queue.queued_operations.push_front(index_upload.clone()); + assert!(queue.next_ready().is_some()); + }); + + Ok(()) + } +} diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index d9b36bf3d4..970a437a42 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -4,10 +4,14 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +testing = [ "pageserver_api/testing" ] + [dependencies] pageserver_api.workspace = true thiserror.workspace = true reqwest = { workspace = true, features = [ "stream" ] } +http-utils.workspace = true utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } @@ -17,5 +21,4 @@ tokio.workspace = true futures.workspace = true tokio-util.workspace = true anyhow.workspace = true -postgres.workspace = true bytes.workspace = true diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index c3a1ef8140..bb0f64ca32 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,11 +1,12 @@ use std::{collections::HashMap, error::Error as _}; use bytes::Bytes; -use detach_ancestor::AncestorDetached; -use pageserver_api::{models::*, shard::TenantShardId}; use reqwest::{IntoUrl, Method, StatusCode}; + +use detach_ancestor::AncestorDetached; +use http_utils::error::HttpErrorBody; +use pageserver_api::{models::*, shard::TenantShardId}; use utils::{ - http::error::HttpErrorBody, id::{TenantId, TimelineId}, lsn::Lsn, }; @@ -270,12 +271,18 @@ impl Client { Ok(body) } - pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { + pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); self.request(Method::PUT, &uri, req).await?; Ok(()) } + pub async fn patch_tenant_config(&self, req: &TenantConfigPatchRequest) -> Result<()> { + let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); + self.request(Method::PATCH, &uri, req).await?; + Ok(()) + } + pub async fn tenant_secondary_download( &self, tenant_id: TenantShardId, @@ -470,6 +477,26 @@ impl Client { self.request(Method::POST, &uri, ()).await.map(|_| ()) } + pub async fn timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<()> { + let mut path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + )) + .expect("Cannot build URL"); + + if let Some(concurrency) = concurrency { + path.query_pairs_mut() + .append_pair("concurrency", &format!("{}", concurrency)); + } + + self.request(Method::POST, path, ()).await.map(|_| ()) + } + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { let uri = format!( "{}/v1/tenant/{}/reset", @@ -757,4 +784,19 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + pub async fn wait_lsn( + &self, + tenant_shard_id: TenantShardId, + request: TenantWaitLsnRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/wait_lsn", + self.mgmt_api_endpoint, + ); + + self.request_noerror(Method::POST, uri, request) + .await + .map(|resp| resp.status()) + } } diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index f9507fc47a..47da83b0eb 100644 --- 
a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -1,6 +1,9 @@ -use std::pin::Pin; +use std::sync::{Arc, Mutex}; -use futures::SinkExt; +use futures::{ + stream::{SplitSink, SplitStream}, + SinkExt, StreamExt, +}; use pageserver_api::{ models::{ PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, @@ -10,7 +13,6 @@ use pageserver_api::{ }; use tokio::task::JoinHandle; use tokio_postgres::CopyOutStream; -use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; use utils::{ id::{TenantId, TimelineId}, @@ -32,7 +34,8 @@ pub struct BasebackupRequest { impl Client { pub async fn new(connstring: String) -> anyhow::Result { - let (client, connection) = tokio_postgres::connect(&connstring, postgres::NoTls).await?; + let (client, connection) = + tokio_postgres::connect(&connstring, tokio_postgres::NoTls).await?; let conn_task_cancel = CancellationToken::new(); let conn_task = tokio::spawn({ @@ -60,17 +63,30 @@ impl Client { ) -> anyhow::Result { let copy_both: tokio_postgres::CopyBothDuplex = self .client - .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}")) + .copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}")) .await?; + let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away. let Client { cancel_on_client_drop, conn_task, client: _, } = self; + let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning( + ConnTaskRunning { + cancel_on_client_drop, + conn_task, + }, + ))); Ok(PagestreamClient { - copy_both: Box::pin(copy_both), - conn_task, - cancel_on_client_drop, + sink: PagestreamSender { + shared: shared.clone(), + sink, + }, + stream: PagestreamReceiver { + shared: shared.clone(), + stream, + }, + shared, }) } @@ -97,7 +113,28 @@ impl Client { /// Create using [`Client::pagestream`]. pub struct PagestreamClient { - copy_both: Pin>>, + shared: Arc>, + sink: PagestreamSender, + stream: PagestreamReceiver, +} + +pub struct PagestreamSender { + #[allow(dead_code)] + shared: Arc>, + sink: SplitSink, bytes::Bytes>, +} + +pub struct PagestreamReceiver { + #[allow(dead_code)] + shared: Arc>, + stream: SplitStream>, +} + +enum PagestreamShared { + ConnTaskRunning(ConnTaskRunning), + ConnTaskCancelledJoinHandleReturnedOrDropped, +} +struct ConnTaskRunning { cancel_on_client_drop: Option, conn_task: JoinHandle<()>, } @@ -110,11 +147,11 @@ pub struct RelTagBlockNo { impl PagestreamClient { pub async fn shutdown(self) { let Self { - copy_both, - cancel_on_client_drop: cancel_conn_task, - conn_task, - } = self; - // The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`. + shared, + sink, + stream, + } = { self }; + // The `copy_both` split into `sink` and `stream` contains internal channel sender, the receiver of which is polled by `conn_task`. // When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection. // (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56). // @@ -131,27 +168,77 @@ impl PagestreamClient { // // NB: page_service doesn't have a use case to exit the `pagestream` mode currently. 
// => https://github.com/neondatabase/neon/issues/6390 - let _ = cancel_conn_task.unwrap(); + let ConnTaskRunning { + cancel_on_client_drop, + conn_task, + } = { + let mut guard = shared.lock().unwrap(); + match std::mem::replace( + &mut *guard, + PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped, + ) { + PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running, + PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(), + } + }; + let _ = cancel_on_client_drop.unwrap(); conn_task.await.unwrap(); - drop(copy_both); + + // Now drop the split copy_both. + drop(sink); + drop(stream); + } + + pub fn split(self) -> (PagestreamSender, PagestreamReceiver) { + let Self { + shared: _, + sink, + stream, + } = self; + (sink, stream) } pub async fn getpage( &mut self, req: PagestreamGetPageRequest, ) -> anyhow::Result { - let req = PagestreamFeMessage::GetPage(req); - let req: bytes::Bytes = req.serialize(); - // let mut req = tokio_util::io::ReaderStream::new(&req); - let mut req = tokio_stream::once(Ok(req)); + self.getpage_send(req).await?; + self.getpage_recv().await + } - self.copy_both.send_all(&mut req).await?; + pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + self.sink.getpage_send(req).await + } - let next: Option> = self.copy_both.next().await; + pub async fn getpage_recv(&mut self) -> anyhow::Result { + self.stream.getpage_recv().await + } +} + +impl PagestreamSender { + // TODO: maybe make this impl Sink instead for better composability? + pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> { + let msg = msg.serialize(); + self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?; + Ok(()) + } + + pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + self.send(PagestreamFeMessage::GetPage(req)).await + } +} + +impl PagestreamReceiver { + // TODO: maybe make this impl Stream instead for better composability? 
+ pub async fn recv(&mut self) -> anyhow::Result { + let next: Option> = self.stream.next().await; let next: bytes::Bytes = next.unwrap()?; + PagestreamBeMessage::deserialize(next) + } - let msg = PagestreamBeMessage::deserialize(next)?; - match msg { + pub async fn getpage_recv(&mut self) -> anyhow::Result { + let next: PagestreamBeMessage = self.recv().await?; + match next { PagestreamBeMessage::GetPage(p) => Ok(p), PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), PagestreamBeMessage::Exists(_) @@ -160,7 +247,14 @@ impl PagestreamClient { | PagestreamBeMessage::GetSlruSegment(_) => { anyhow::bail!( "unexpected be message kind in response to getpage request: {}", - msg.kind() + next.kind() + ) + } + #[cfg(feature = "testing")] + PagestreamBeMessage::Test(_) => { + anyhow::bail!( + "unexpected be message kind in response to getpage request: {}", + next.kind() ) } } diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 20f88868f9..7779ffaf8b 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -272,7 +272,7 @@ struct CompactionJob { completed: bool, } -impl<'a, E> LevelCompactionState<'a, E> +impl LevelCompactionState<'_, E> where E: CompactionJobExecutor, { diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 6b739d85a7..7e4e3042b3 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -221,12 +221,12 @@ where // performed implicitly when `top` is dropped). if let Some(mut top) = this.heap.peek_mut() { match top.deref_mut() { - LazyLoadLayer::Unloaded(ref mut l) => { + LazyLoadLayer::Unloaded(l) => { let fut = l.load_keys(this.ctx); this.load_future.set(Some(Box::pin(fut))); continue; } - LazyLoadLayer::Loaded(ref mut entries) => { + LazyLoadLayer::Loaded(entries) => { let result = entries.pop_front().unwrap(); if entries.is_empty() { std::collections::binary_heap::PeekMut::pop(top); diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs index 1853afffdd..e04bd15396 100644 --- a/pageserver/compaction/src/identify_levels.rs +++ b/pageserver/compaction/src/identify_levels.rs @@ -224,9 +224,8 @@ impl Level { } // recalculate depth if this was the last event at this point - let more_events_at_this_key = events_iter - .peek() - .map_or(false, |next_e| next_e.key == e.key); + let more_events_at_this_key = + events_iter.peek().is_some_and(|next_e| next_e.key == e.key); if !more_events_at_this_key { let mut active_depth = 0; for (_end_lsn, is_image, _idx) in active_set.iter().rev() { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 5bc9b5ca1d..8ed393a645 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -148,7 +148,7 @@ pub trait CompactionDeltaLayer: CompactionLay Self: 'a; /// Return all keys in this delta layer. 
- fn load_keys<'a>( + fn load_keys( &self, ctx: &E::RequestContext, ) -> impl Future>>> + Send; diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 776c537d03..673b80c313 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -143,7 +143,7 @@ impl interface::CompactionLayer for Arc { impl interface::CompactionDeltaLayer for Arc { type DeltaEntry<'a> = MockRecord; - async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result> { + async fn load_keys(&self, _ctx: &MockRequestContext) -> anyhow::Result> { Ok(self.records.clone()) } } diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs index 997925067f..4559db09f1 100644 --- a/pageserver/compaction/src/simulator/draw.rs +++ b/pageserver/compaction/src/simulator/draw.rs @@ -160,9 +160,12 @@ pub fn draw_history(history: &[LayerTraceEvent], mut output: // Fill in and thicken rectangle if it's an // image layer so that we can see it. - let mut style = Style::default(); - style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); - style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5); + let mut style = Style { + fill: Fill::Color(rgb(0x80, 0x80, 0x80)), + stroke: Stroke::Color(rgb(0, 0, 0), 0.5), + opacity: 1.0, + stroke_opacity: 1.0, + }; let y_start = lsn_max - lsn_start; let y_end = lsn_max - lsn_end; @@ -214,10 +217,6 @@ pub fn draw_history(history: &[LayerTraceEvent], mut output: files_seen.insert(f); } - let mut record_style = Style::default(); - record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); - record_style.stroke = Stroke::None; - writeln!(svg, "{}", EndSvg)?; let mut layer_events_str = String::new(); diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index 39ca47568c..7b70f0dc87 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -8,9 +8,11 @@ license.workspace = true [dependencies] anyhow.workspace = true +bincode.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } humantime.workspace = true +itertools.workspace = true pageserver = { path = ".." 
} pageserver_api.workspace = true remote_storage = { path = "../../libs/remote_storage" } diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index af4b5a21ab..c7f0719c41 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -345,6 +345,7 @@ impl AuxFileV2 { AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash) } (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash), + (3, 1) => AuxFileV2::Recognized("pg_stat/pgstat.stat", hash), (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash), (0xff, 0xff) => AuxFileV2::Other(hash), _ => return None, diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index a0aac89dc8..353b4bd2f9 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -9,7 +9,9 @@ mod index_part; mod key; mod layer_map_analyzer; mod layers; +mod page_trace; +use page_trace::PageTraceCmd; use std::{ str::FromStr, time::{Duration, SystemTime}, @@ -64,6 +66,7 @@ enum Commands { Layer(LayerCmd), /// Debug print a hex key found from logs Key(key::DescribeKeyCommand), + PageTrace(PageTraceCmd), } /// Read and update pageserver metadata file @@ -183,6 +186,7 @@ async fn main() -> anyhow::Result<()> { .await?; } Commands::Key(dkc) => dkc.execute(), + Commands::PageTrace(cmd) => page_trace::main(&cmd)?, }; Ok(()) } diff --git a/pageserver/ctl/src/page_trace.rs b/pageserver/ctl/src/page_trace.rs new file mode 100644 index 0000000000..da0de72fd9 --- /dev/null +++ b/pageserver/ctl/src/page_trace.rs @@ -0,0 +1,73 @@ +use std::collections::HashMap; +use std::io::BufReader; + +use camino::Utf8PathBuf; +use clap::Parser; +use itertools::Itertools as _; +use pageserver_api::key::{CompactKey, Key}; +use pageserver_api::models::PageTraceEvent; +use pageserver_api::reltag::RelTag; + +/// Parses a page trace (as emitted by the `page_trace` timeline API), and outputs stats. +#[derive(Parser)] +pub(crate) struct PageTraceCmd { + /// Trace input file. 
+    path: Utf8PathBuf,
+}
+
+pub(crate) fn main(cmd: &PageTraceCmd) -> anyhow::Result<()> {
+    let mut file = BufReader::new(std::fs::OpenOptions::new().read(true).open(&cmd.path)?);
+    let mut events: Vec<PageTraceEvent> = Vec::new();
+    loop {
+        match bincode::deserialize_from(&mut file) {
+            Ok(event) => events.push(event),
+            Err(err) => {
+                if let bincode::ErrorKind::Io(ref err) = *err {
+                    if err.kind() == std::io::ErrorKind::UnexpectedEof {
+                        break;
+                    }
+                }
+                return Err(err.into());
+            }
+        }
+    }
+
+    let mut reads_by_relation: HashMap<RelTag, u64> = HashMap::new();
+    let mut reads_by_key: HashMap<CompactKey, u64> = HashMap::new();
+
+    for event in events {
+        let key = Key::from_compact(event.key);
+        let reltag = RelTag {
+            spcnode: key.field2,
+            dbnode: key.field3,
+            relnode: key.field4,
+            forknum: key.field5,
+        };
+
+        *reads_by_relation.entry(reltag).or_default() += 1;
+        *reads_by_key.entry(event.key).or_default() += 1;
+    }
+
+    let multi_read_keys = reads_by_key
+        .into_iter()
+        .filter(|(_, count)| *count > 1)
+        .sorted_by_key(|(key, count)| (-*count, *key))
+        .collect_vec();
+
+    println!("Multi-read keys: {}", multi_read_keys.len());
+    for (key, count) in multi_read_keys {
+        println!(" {key}: {count}");
+    }
+
+    let reads_by_relation = reads_by_relation
+        .into_iter()
+        .sorted_by_key(|(rel, count)| (-*count, *rel))
+        .collect_vec();
+
+    println!("Reads by relation:");
+    for (reltag, count) in reads_by_relation {
+        println!(" {reltag}: {count}");
+    }
+
+    Ok(())
+}
diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs
index 923a7f1f18..b869a0c6c7 100644
--- a/pageserver/pagebench/src/cmd/aux_files.rs
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -64,7 +64,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
     println!("operating on timeline {}", timeline);
 
     mgmt_api_client
-        .tenant_config(&TenantConfigRequest {
+        .set_tenant_config(&TenantConfigRequest {
             tenant_id: timeline.tenant_id,
             config: TenantConfig::default(),
         })
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index b2df01714d..a60efc7567 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -2,7 +2,7 @@ use anyhow::Context;
 use camino::Utf8PathBuf;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpaceAccum;
-use pageserver_api::models::PagestreamGetPageRequest;
+use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
 use pageserver_api::shard::TenantShardId;
 use tokio_util::sync::CancellationToken;
 
@@ -13,7 +13,7 @@ use rand::prelude::*;
 use tokio::task::JoinSet;
 use tracing::info;
 
-use std::collections::HashSet;
+use std::collections::{HashSet, VecDeque};
 use std::future::Future;
 use std::num::NonZeroUsize;
 use std::pin::Pin;
 
@@ -63,6 +63,10 @@ pub(crate) struct Args {
     #[clap(long)]
     set_io_mode: Option,
 
+    /// Queue depth generated in each client.
+ #[clap(long, default_value = "1")] + queue_depth: NonZeroUsize, + targets: Option>, } @@ -298,6 +302,7 @@ async fn main_impl( start_work_barrier.wait().await; let client_start = Instant::now(); let mut ticks_processed = 0; + let mut inflight = VecDeque::new(); while !cancel.is_cancelled() { // Detect if a request took longer than the RPS rate if let Some(period) = &rps_period { @@ -311,28 +316,37 @@ async fn main_impl( ticks_processed = periods_passed_until_now; } - let start = Instant::now(); - let req = { - let mut rng = rand::thread_rng(); - let r = &ranges[weights.sample(&mut rng)]; - let key: i128 = rng.gen_range(r.start..r.end); - let key = Key::from_i128(key); - assert!(key.is_rel_block_key()); - let (rel_tag, block_no) = key - .to_rel_block() - .expect("we filter non-rel-block keys out above"); - PagestreamGetPageRequest { - request_lsn: if rng.gen_bool(args.req_latest_probability) { - Lsn::MAX - } else { - r.timeline_lsn - }, - not_modified_since: r.timeline_lsn, - rel: rel_tag, - blkno: block_no, - } - }; - client.getpage(req).await.unwrap(); + while inflight.len() < args.queue_depth.get() { + let start = Instant::now(); + let req = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = Key::from_i128(key); + assert!(key.is_rel_block_key()); + let (rel_tag, block_no) = key + .to_rel_block() + .expect("we filter non-rel-block keys out above"); + PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, + }, + rel: rel_tag, + blkno: block_no, + } + }; + client.getpage_send(req).await.unwrap(); + inflight.push_back(start); + } + + let start = inflight.pop_front().unwrap(); + client.getpage_recv().await.unwrap(); let end = Instant::now(); live_stats.request_done(); ticks_processed += 1; diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs index 4aa6950782..ebe7bc031d 100644 --- a/pageserver/pagebench/src/util/request_stats.rs +++ b/pageserver/pagebench/src/util/request_stats.rs @@ -40,9 +40,7 @@ impl Stats { } } pub(crate) fn add(&mut self, other: &Self) { - let Self { - ref mut latency_histo, - } = self; + let Self { latency_histo } = self; latency_histo.add(&other.latency_histo).unwrap(); } } diff --git a/pageserver/src/assert_u64_eq_usize.rs b/pageserver/src/assert_u64_eq_usize.rs index 66ca7fd057..c4b8d9acba 100644 --- a/pageserver/src/assert_u64_eq_usize.rs +++ b/pageserver/src/assert_u64_eq_usize.rs @@ -2,7 +2,9 @@ pub(crate) const _ASSERT_U64_EQ_USIZE: () = { if std::mem::size_of::() != std::mem::size_of::() { - panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information"); + panic!( + "the traits defined in this module assume that usize and u64 can be converted to each other without loss of information" + ); } }; diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index 5e527b7d61..b76c0e045f 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use ::metrics::IntGauge; use bytes::{Buf, BufMut, Bytes}; -use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; +use pageserver_api::key::{AUX_KEY_PREFIX, Key, METADATA_KEY_SIZE}; use tracing::warn; // BEGIN Copyright (c) 2017 Servo Contributors @@ -39,6 +39,7 @@ fn 
aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key const AUX_DIR_PG_LOGICAL: u8 = 0x01; const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_STAT: u8 = 0x03; const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; /// Encode the aux file into a fixed-size key. @@ -53,6 +54,7 @@ const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; /// * pg_logical/replorigin_checkpoint -> 0x0103 /// * pg_logical/others -> 0x01FF /// * pg_replslot/ -> 0x0201 +/// * pg_stat/pgstat.stat -> 0x0301 /// * others -> 0xFFFF /// /// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. @@ -75,6 +77,8 @@ pub fn encode_aux_file_key(path: &str) -> Key { aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) } else if let Some(fname) = path.strip_prefix("pg_replslot/") { aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_stat/") { + aux_hash_to_metadata_key(AUX_DIR_PG_STAT, 0x01, fname.as_bytes()) } else { if cfg!(debug_assertions) { warn!( diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index cae0ffb980..ce54bd9c1c 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,39 +10,38 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{anyhow, Context}; -use bytes::{BufMut, Bytes, BytesMut}; -use fail::fail_point; -use pageserver_api::key::Key; -use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::{Instant, SystemTime}; + +use anyhow::{Context, anyhow}; +use bytes::{BufMut, Bytes, BytesMut}; +use fail::fail_point; +use pageserver_api::key::{Key, rel_block_to_key}; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::pg_constants::{ + DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID, PG_HBA, PGDATA_SPECIAL_FILES, +}; +use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; +use postgres_ffi::{ + BLCKSZ, PG_TLI, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, dispatch_pgversion, pg_constants, +}; use tokio::io; use tokio::io::AsyncWrite; -use tracing::*; - use tokio_tar::{Builder, EntryType, Header}; +use tracing::*; +use utils::lsn::Lsn; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; use crate::tenant::Timeline; -use pageserver_api::reltag::{RelTag, SlruKind}; - -use postgres_ffi::dispatch_pgversion; -use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; -use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA}; -use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; -use postgres_ffi::XLogFileName; -use postgres_ffi::PG_TLI; -use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; -use utils::lsn::Lsn; +use crate::tenant::storage_layer::IoConcurrency; #[derive(Debug, thiserror::Error)] pub enum BasebackupError { #[error("basebackup pageserver error {0:#}")] Server(#[from] anyhow::Error), - #[error("basebackup client error {0:#}")] - Client(#[source] io::Error), + #[error("basebackup client error {0:#} when {1}")] + Client(#[source] io::Error, &'static str), } /// Create basebackup with non-rel data in it. 
@@ -123,6 +122,13 @@ where full_backup, replica, ctx, + io_concurrency: IoConcurrency::spawn_from_conf( + timeline.conf, + timeline + .gate + .enter() + .map_err(|e| BasebackupError::Server(e.into()))?, + ), }; basebackup .send_tarball() @@ -144,6 +150,7 @@ where full_backup: bool, replica: bool, ctx: &'a RequestContext, + io_concurrency: IoConcurrency, } /// A sink that accepts SLRU blocks ordered by key and forwards @@ -225,7 +232,7 @@ where self.ar .append(&header, self.buf.as_slice()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "flush"))?; self.total_blocks += nblocks; debug!("Added to basebackup slru {} relsize {}", segname, nblocks); @@ -248,13 +255,38 @@ where } } -impl<'a, W> Basebackup<'a, W> +impl Basebackup<'_, W> where W: AsyncWrite + Send + Sync + Unpin, { async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum + // Construct the pg_control file from the persisted checkpoint and pg_control + // information. But we only add this to the tarball at the end, so that if the + // writing is interrupted half-way through, the resulting incomplete tarball will + // be missing the pg_control file, which prevents PostgreSQL from starting up on + // it. With proper error handling, you should never try to start up from an + // incomplete basebackup in the first place, of course, but this is a nice little + // extra safety measure. + let checkpoint_bytes = self + .timeline + .get_checkpoint(self.lsn, self.ctx) + .await + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = self + .timeline + .get_control_file(self.lsn, self.ctx) + .await + .context("failed to get control bytes")?; + let (pg_control_bytes, system_identifier, was_shutdown) = + postgres_ffi::generate_pg_control( + &pg_control_bytes, + &checkpoint_bytes, + self.lsn, + self.timeline.pg_version, + )?; + let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; let pgversion = self.timeline.pg_version; @@ -264,9 +296,9 @@ where for dir in subdirs.iter() { let header = new_tar_header_dir(dir)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .context("could not add directory to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball"))?; } // Send config files. 
@@ -277,13 +309,13 @@ where self.ar .append(&header, data) .await - .context("could not add config file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,pg_hba.conf"))?; } else { let header = new_tar_header(filepath, 0)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .context("could not add config file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,add_config_file"))?; } } if !lazy_slru_download { @@ -303,7 +335,7 @@ where for part in slru_partitions.parts { let blocks = self .timeline - .get_vectored(part, self.lsn, self.ctx) + .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx) .await .map_err(|e| BasebackupError::Server(e.into()))?; @@ -358,7 +390,7 @@ where let start_time = Instant::now(); let aux_files = self .timeline - .list_aux_files(self.lsn, self.ctx) + .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone()) .await .map_err(|e| BasebackupError::Server(e.into()))?; let aux_scan_time = start_time.elapsed(); @@ -392,12 +424,16 @@ where // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, // but now we should handle (skip) it for backward compatibility. continue; + } else if path == "pg_stat/pgstat.stat" && !was_shutdown { + // Drop statistic in case of abnormal termination, i.e. if we're not starting from the exact LSN + // of a shutdown checkpoint. + continue; } let header = new_tar_header(&path, content.len() as u64)?; self.ar .append(&header, &*content) .await - .context("could not add aux file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,add_aux_file"))?; } if min_restart_lsn != Lsn::MAX { @@ -410,7 +446,7 @@ where self.ar .append(&header, &data[..]) .await - .context("could not add restart.lsn file to basebackup tarball")?; + .map_err(|e| BasebackupError::Client(e, "send_tarball,restart.lsn"))?; } for xid in self .timeline @@ -422,7 +458,7 @@ where } let repl_origins = self .timeline - .get_replorigins(self.lsn, self.ctx) + .get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone()) .await .map_err(|e| BasebackupError::Server(e.into()))?; let n_origins = repl_origins.len(); @@ -442,9 +478,9 @@ where let crc32 = crc32c::crc32c(&content); content.extend_from_slice(&crc32.to_le_bytes()); let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?; - self.ar.append(&header, &*content).await.context( - "could not add pg_logical/replorigin_checkpoint file to basebackup tarball", - )?; + self.ar.append(&header, &*content).await.map_err(|e| { + BasebackupError::Client(e, "send_tarball,pg_logical/replorigin_checkpoint") + })?; } fail_point!("basebackup-before-control-file", |_| { @@ -453,9 +489,13 @@ where ))) }); - // Generate pg_control and bootstrap WAL segment. - self.add_pgcontrol_file().await?; - self.ar.finish().await.map_err(BasebackupError::Client)?; + // Last, add the pg_control file and bootstrap WAL segment. 
+ self.add_pgcontrol_file(pg_control_bytes, system_identifier) + .await?; + self.ar + .finish() + .await + .map_err(|e| BasebackupError::Client(e, "send_tarball,finish"))?; debug!("all tarred up!"); Ok(()) } @@ -473,9 +513,9 @@ where let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_rel,empty"))?; return Ok(()); } @@ -489,7 +529,9 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx) + // TODO: investigate using get_vectored for the entire startblk..endblk range. + // But this code path is not on the critical path for most basebackups (?). + .get(rel_block_to_key(src, blknum), self.lsn, self.ctx) .await .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); @@ -500,7 +542,7 @@ where self.ar .append(&header, segment_data.as_slice()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_rel,segment"))?; seg += 1; startblk = endblk; @@ -551,7 +593,7 @@ where self.ar .append(&header, pg_version_str.as_bytes()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,PG_VERSION"))?; info!("timeline.pg_version {}", self.timeline.pg_version); @@ -561,7 +603,7 @@ where self.ar .append(&header, &img[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,global/pg_filenode.map"))?; } else { warn!("global/pg_filenode.map is missing"); } @@ -597,9 +639,9 @@ where let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; self.ar - .append(&header, &mut io::empty()) + .append(&header, io::empty()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?; if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); @@ -612,14 +654,14 @@ where self.ar .append(&header, pg_version_str.as_bytes()) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; self.ar .append(&header, &img[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/pg_filenode.map"))?; } }; Ok(()) @@ -648,7 +690,7 @@ where self.ar .append(&header, &buf[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_twophase_file"))?; Ok(()) } @@ -657,7 +699,11 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. 
// - async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { + async fn add_pgcontrol_file( + &mut self, + pg_control_bytes: Bytes, + system_identifier: u64, + ) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -678,32 +724,14 @@ where zenith_signal.as_bytes(), ) .await - .map_err(BasebackupError::Client)?; - - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn, self.ctx) - .await - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn, self.ctx) - .await - .context("failed get control bytes")?; - - let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( - &pg_control_bytes, - &checkpoint_bytes, - self.lsn, - self.timeline.pg_version, - )?; + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?; //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; self.ar .append(&header, &pg_control_bytes[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,pg_control"))?; //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); @@ -727,7 +755,7 @@ where self.ar .append(&header, &wal_seg[..]) .await - .map_err(BasebackupError::Client)?; + .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,wal_segment"))?; Ok(()) } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 567a69da3b..ab8d37df2e 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -3,49 +3,41 @@ //! Main entry point for the Page Server executable. use std::env; -use std::env::{var, VarError}; +use std::env::{VarError, var}; use std::io::Read; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use camino::Utf8Path; use clap::{Arg, ArgAction, Command}; - -use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; -use pageserver::config::PageserverIdentity; +use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric}; +use metrics::set_build_info_metric; +use pageserver::config::{PageServerConf, PageserverIdentity}; use pageserver::controller_upcall_client::ControllerUpcallClient; +use pageserver::deletion_queue::DeletionQueue; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; -use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME}; -use pageserver::tenant::{secondary, TenantSharedResources}; -use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener}; +use pageserver::task_mgr::{ + BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, +}; +use pageserver::tenant::{TenantSharedResources, mgr, secondary}; +use pageserver::{ + CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, http, page_cache, page_service, + task_mgr, virtual_file, +}; +use postgres_backend::AuthType; use remote_storage::GenericRemoteStorage; use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; - -use metrics::set_build_info_metric; -use pageserver::{ - config::PageServerConf, - deletion_queue::DeletionQueue, - http, page_cache, page_service, task_mgr, - task_mgr::{BACKGROUND_RUNTIME, 
MGMT_REQUEST_RUNTIME}, - tenant::mgr, - virtual_file, -}; -use postgres_backend::AuthType; +use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::crashsafe::syncfs; -use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; -use utils::{ - auth::{JwtAuth, SwappableJwtAuth}, - logging, project_build_tag, project_git_version, - sentry_init::init_sentry, - tcp_listener, -}; +use utils::sentry_init::init_sentry; +use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -53,10 +45,12 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +#[unsafe(export_name = "malloc_conf")] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "pageserver.pid"; @@ -83,6 +77,9 @@ fn main() -> anyhow::Result<()> { return Ok(()); } + // Initialize up failpoints support + let scenario = failpoint_support::init(); + let workdir = arg_matches .get_one::("workdir") .map(Utf8Path::new) @@ -132,7 +129,9 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); + info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation"); info!(?conf.page_service_pipelining, "starting with page service pipelining config"); + info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. @@ -174,9 +173,6 @@ fn main() -> anyhow::Result<()> { } } - // Initialize up failpoints support - let scenario = failpoint_support::init(); - // Basic initialization of things that don't change after startup tracing::info!("Initializing virtual_file..."); virtual_file::init( @@ -213,7 +209,9 @@ fn initialize_config( Ok(mut f) => { let md = f.metadata().context("stat config file")?; if !md.is_file() { - anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..."); + anyhow::bail!( + "Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ..." + ); } let mut s = String::new(); @@ -221,7 +219,9 @@ fn initialize_config( toml_edit::de::from_str::(&s)? } Err(e) => { - anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..."); + anyhow::bail!( + "Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ..." 
+ ); } }; @@ -397,11 +397,9 @@ fn start_pageserver( Err(VarError::NotPresent) => { info!("No JWT token for authentication with Safekeeper detected"); } - Err(e) => { - return Err(e).with_context(|| { - "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable" - }) - } + Err(e) => return Err(e).with_context( + || "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable", + ), }; // Top-level cancellation token for the process @@ -589,7 +587,7 @@ fn start_pageserver( let router = http::make_router(router_state, launch_ts, http_auth.clone())? .build() .map_err(|err| anyhow!(err))?; - let service = utils::http::RouterService::new(router).unwrap(); + let service = http_utils::RouterService::new(router).unwrap(); let server = hyper0::Server::from_tcp(http_listener)? .serve(service) .with_graceful_shutdown({ @@ -707,7 +705,9 @@ async fn create_remote_storage_client( // wrapper that simulates failures. if conf.test_remote_failures > 0 { if !cfg!(feature = "testing") { - anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"); + anyhow::bail!( + "test_remote_failures option is not available because pageserver was compiled without the 'testing' feature" + ); } info!( "Simulating remote failures for first {} attempts of each op", diff --git a/pageserver/src/bin/test_helper_slow_client_reads.rs b/pageserver/src/bin/test_helper_slow_client_reads.rs new file mode 100644 index 0000000000..0215dd06fb --- /dev/null +++ b/pageserver/src/bin/test_helper_slow_client_reads.rs @@ -0,0 +1,61 @@ +use std::io::{Read, Write, stdin, stdout}; +use std::time::Duration; + +use clap::Parser; +use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +#[derive(clap::Parser)] +struct Args { + connstr: String, + tenant_id: TenantId, + timeline_id: TimelineId, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let Args { + connstr, + tenant_id, + timeline_id, + } = Args::parse(); + let client = pageserver_client::page_service::Client::new(connstr).await?; + let client = client.pagestream(tenant_id, timeline_id).await?; + let (mut sender, _receiver) = client.split(); + + eprintln!("filling the pipe"); + let mut msg = 0; + loop { + msg += 1; + let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test( + PagestreamTestRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(23), + not_modified_since: Lsn(23), + }, + batch_key: 42, + message: format!("message {}", msg), + }, + )); + let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else { + eprintln!("pipe seems full"); + break; + }; + let _: () = res?; + } + + let n = stdout().write(b"R")?; + assert_eq!(n, 1); + stdout().flush()?; + + eprintln!("waiting for signal to tell us to exit"); + + let mut buf = [0u8; 1]; + stdin().read_exact(&mut buf)?; + + eprintln!("termination signal received, exiting"); + + anyhow::Ok(()) +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 1651db8500..64d00882b9 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,36 +4,29 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. 
-use anyhow::{bail, ensure, Context}; -use pageserver_api::models::ImageCompressionAlgorithm; -use pageserver_api::{ - config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}, - shard::TenantShardId, -}; -use remote_storage::{RemotePath, RemoteStorageConfig}; use std::env; -use storage_broker::Uri; -use utils::logging::SecretString; -use utils::postgres_client::PostgresClientProtocol; - -use once_cell::sync::OnceCell; -use reqwest::Url; use std::num::NonZeroUsize; use std::sync::Arc; use std::time::Duration; +use anyhow::{Context, bail, ensure}; use camino::{Utf8Path, Utf8PathBuf}; +use once_cell::sync::OnceCell; +use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::shard::TenantShardId; use postgres_backend::AuthType; -use utils::{ - id::{NodeId, TimelineId}, - logging::LogFormat, -}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use reqwest::Url; +use storage_broker::Uri; +use utils::id::{NodeId, TimelineId}; +use utils::logging::{LogFormat, SecretString}; +use utils::postgres_client::PostgresClientProtocol; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; -use crate::virtual_file; use crate::virtual_file::io_engine; -use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME}; +use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, virtual_file}; /// Global state of pageserver. /// @@ -191,6 +184,16 @@ pub struct PageServerConf { pub wal_receiver_protocol: PostgresClientProtocol, pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig, + + pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo, + + /// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer + /// files read. + pub enable_read_path_debugging: bool, + + /// Interpreted protocol feature: if enabled, validate that the logical WAL received from + /// safekeepers does not have gaps. + pub validate_wal_contiguity: bool, } /// Token for authentication to safekeepers @@ -352,6 +355,9 @@ impl PageServerConf { no_sync, wal_receiver_protocol, page_service_pipelining, + get_vectored_concurrent_io, + enable_read_path_debugging, + validate_wal_contiguity, } = config_toml; let mut conf = PageServerConf { @@ -396,6 +402,7 @@ impl PageServerConf { import_pgdata_aws_endpoint_url, wal_receiver_protocol, page_service_pipelining, + get_vectored_concurrent_io, // ------------------------------------------------------------ // fields that require additional validation or custom handling @@ -426,7 +433,9 @@ impl PageServerConf { io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise io_engine::FeatureTestResult::Worse { engine, remark } => { // TODO: bubble this up to the caller so we can tracing::warn! it. 
- eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + eprintln!( + "auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}" + ); engine } }, @@ -436,6 +445,8 @@ impl PageServerConf { .unwrap_or_default(), virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), no_sync: no_sync.unwrap_or(false), + enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false), + validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false), }; // ------------------------------------------------------------ diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 7e8c00c293..0231190e69 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,13 +1,9 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. -use crate::config::PageServerConf; -use crate::consumption_metrics::metrics::MetricsKey; -use crate::consumption_metrics::upload::KeyGen as _; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::size::CalculateSyntheticSizeError; -use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant}; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; + use camino::Utf8PathBuf; use consumption_metrics::EventType; use itertools::Itertools as _; @@ -15,14 +11,21 @@ use pageserver_api::models::TenantState; use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::sync::Arc; -use std::time::{Duration, SystemTime}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::NodeId; +use crate::config::PageServerConf; +use crate::consumption_metrics::metrics::MetricsKey; +use crate::consumption_metrics::upload::KeyGen as _; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; +use crate::tenant::mgr::TenantManager; +use crate::tenant::size::CalculateSyntheticSizeError; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::{LogicalSizeCalculationCause, Tenant}; + mod disk_cache; mod metrics; mod upload; diff --git a/pageserver/src/consumption_metrics/disk_cache.rs b/pageserver/src/consumption_metrics/disk_cache.rs index 54a505a134..f1dad8793d 100644 --- a/pageserver/src/consumption_metrics/disk_cache.rs +++ b/pageserver/src/consumption_metrics/disk_cache.rs @@ -1,10 +1,10 @@ -use anyhow::Context; -use camino::{Utf8Path, Utf8PathBuf}; use std::sync::Arc; -use crate::consumption_metrics::NewMetricsRefRoot; +use anyhow::Context; +use camino::{Utf8Path, Utf8PathBuf}; use super::{NewMetricsRoot, NewRawMetric, RawMetric}; +use crate::consumption_metrics::NewMetricsRefRoot; pub(super) fn read_metrics_from_serde_value( json_value: serde_json::Value, diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 07fac09f6f..71910011ea 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -1,15 +1,16 @@ -use crate::tenant::mgr::TenantManager; -use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize}; +use std::sync::Arc; +use 
std::time::SystemTime; + use chrono::{DateTime, Utc}; use consumption_metrics::EventType; use futures::stream::StreamExt; -use std::{sync::Arc, time::SystemTime}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use super::{Cache, NewRawMetric}; +use crate::context::RequestContext; +use crate::tenant::mgr::TenantManager; +use crate::tenant::timeline::logical_size::CurrentLogicalSize; /// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events` /// instead of static str. diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 3ed7b44123..52b4fb8680 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -1,7 +1,7 @@ -use crate::consumption_metrics::RawMetric; +use std::collections::HashMap; use super::*; -use std::collections::HashMap; +use crate::consumption_metrics::RawMetric; #[test] fn startup_collected_timeline_metrics_before_advancing() { diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 448bf47525..59e0145a5b 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -2,15 +2,16 @@ use std::error::Error as _; use std::time::SystemTime; use chrono::{DateTime, Utc}; -use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE}; +use consumption_metrics::{CHUNK_SIZE, Event, EventChunk, IdempotencyKey}; use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::Instrument; - -use super::{metrics::Name, Cache, MetricsKey, NewRawMetric, RawMetric}; use utils::id::{TenantId, TimelineId}; +use super::metrics::Name; +use super::{Cache, MetricsKey, NewRawMetric, RawMetric}; + /// How the metrics from pageserver are identified. #[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)] struct Ids { @@ -438,14 +439,13 @@ async fn upload( #[cfg(test)] mod tests { - use crate::consumption_metrics::{ - disk_cache::read_metrics_from_serde_value, NewMetricsRefRoot, - }; - - use super::*; use chrono::{DateTime, Utc}; use once_cell::sync::Lazy; + use super::*; + use crate::consumption_metrics::NewMetricsRefRoot; + use crate::consumption_metrics::disk_cache::read_metrics_from_serde_value; + #[test] fn chunked_serialization() { let examples = metric_samples(); diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 8f2177fe5b..da9c095a15 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -98,6 +98,7 @@ pub struct RequestContext { download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, + read_path_debug: bool, } /// The kind of access to the page cache. 
@@ -155,6 +156,7 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, + read_path_debug: false, }, } } @@ -168,6 +170,7 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, + read_path_debug: original.read_path_debug, }, } } @@ -191,6 +194,11 @@ impl RequestContextBuilder { self } + pub(crate) fn read_path_debug(mut self, b: bool) -> Self { + self.inner.read_path_debug = b; + self + } + pub fn build(self) -> RequestContext { self.inner } @@ -291,4 +299,8 @@ impl RequestContext { pub(crate) fn page_content_kind(&self) -> PageContentKind { self.page_content_kind } + + pub(crate) fn read_path_debug(&self) -> bool { + self.read_path_debug + } } diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index d41bfd9021..8462594607 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -1,21 +1,23 @@ use std::collections::HashMap; use futures::Future; -use pageserver_api::{ - controller_api::{AvailabilityZone, NodeRegisterRequest}, - shard::TenantShardId, - upcall_api::{ - ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, - ValidateRequestTenant, ValidateResponse, - }, +use pageserver_api::config::NodeMetadata; +use pageserver_api::controller_api::{AvailabilityZone, NodeRegisterRequest}; +use pageserver_api::shard::TenantShardId; +use pageserver_api::upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, + ValidateRequestTenant, ValidateResponse, }; -use serde::{de::DeserializeOwned, Serialize}; +use serde::Serialize; +use serde::de::DeserializeOwned; use tokio_util::sync::CancellationToken; use url::Url; -use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; +use utils::generation::Generation; +use utils::id::NodeId; +use utils::{backoff, failpoint_support}; -use crate::{config::PageServerConf, virtual_file::on_fatal_io_error}; -use pageserver_api::config::NodeMetadata; +use crate::config::PageServerConf; +use crate::virtual_file::on_fatal_io_error; /// The Pageserver's client for using the storage controller upcall API: this is a small API /// for dealing with generations (see docs/rfcs/025-generation-numbers.md). @@ -157,14 +159,18 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { match az_id_from_metadata { Some(az_id) => Some(AvailabilityZone(az_id)), None => { - tracing::warn!("metadata.json does not contain an 'availability_zone_id' field"); + tracing::warn!( + "metadata.json does not contain an 'availability_zone_id' field" + ); conf.availability_zone.clone().map(AvailabilityZone) } } }; if az_id.is_none() { - panic!("Availablity zone id could not be inferred from metadata.json or pageserver config"); + panic!( + "Availablity zone id could not be inferred from metadata.json or pageserver config" + ); } Some(NodeRegisterRequest { @@ -173,6 +179,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { listen_pg_port: m.postgres_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, + listen_https_port: None, // TODO: Support https. 
availability_zone_id: az_id.expect("Checked above"), }) } @@ -235,7 +242,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { .iter() .map(|(id, generation)| ValidateRequestTenant { id: *id, - gen: (*generation).into().expect( + r#gen: (*generation).into().expect( "Generation should always be valid for a Tenant doing deletions", ), }) diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 1d508f5fe9..8118f66252 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -6,39 +6,31 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; -use crate::controller_upcall_client::ControlPlaneGenerationsApi; -use crate::metrics; -use crate::tenant::remote_timeline_client::remote_layer_path; -use crate::tenant::remote_timeline_client::remote_timeline_path; -use crate::tenant::remote_timeline_client::LayerFileMetadata; -use crate::virtual_file::MaybeFatalIo; -use crate::virtual_file::VirtualFile; use anyhow::Context; use camino::Utf8PathBuf; +use deleter::DeleterMessage; +use list_writer::ListWriterQueueMessage; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, RemotePath}; -use serde::Deserialize; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio_util::sync::CancellationToken; -use tracing::Instrument; -use tracing::{debug, error}; +use tracing::{Instrument, debug, error}; use utils::crashsafe::path_with_suffix_extension; use utils::generation::Generation; use utils::id::TimelineId; -use utils::lsn::AtomicLsn; -use utils::lsn::Lsn; - -use self::deleter::Deleter; -use self::list_writer::DeletionOp; -use self::list_writer::ListWriter; -use self::list_writer::RecoverOp; -use self::validator::Validator; -use deleter::DeleterMessage; -use list_writer::ListWriterQueueMessage; +use utils::lsn::{AtomicLsn, Lsn}; use validator::ValidatorQueueMessage; -use crate::{config::PageServerConf, tenant::storage_layer::LayerName}; +use self::deleter::Deleter; +use self::list_writer::{DeletionOp, ListWriter, RecoverOp}; +use self::validator::Validator; +use crate::config::PageServerConf; +use crate::controller_upcall_client::ControlPlaneGenerationsApi; +use crate::metrics; +use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_timeline_path}; +use crate::tenant::storage_layer::LayerName; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; // TODO: configurable for how long to wait before executing deletions @@ -463,45 +455,18 @@ impl DeletionQueueClient { /// /// The `current_generation` is the generation of this pageserver's current attachment. The /// generations in `layers` are the generations in which those layers were written. - pub(crate) async fn push_layers( + pub(crate) fn push_layers( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { - if current_generation.is_none() { - debug!("Enqueuing deletions in legacy mode, skipping queue"); + // None generations are not valid for attached tenants: they must always be attached in + // a known generation. None generations are still permitted for layers in the index because + // they may be historical. 
+ assert!(!current_generation.is_none()); - let mut layer_paths = Vec::new(); - for (layer, meta) in layers { - layer_paths.push(remote_layer_path( - &tenant_shard_id.tenant_id, - &timeline_id, - meta.shard, - &layer, - meta.generation, - )); - } - self.push_immediate(layer_paths).await?; - return self.flush_immediate().await; - } - - self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers) - } - - /// When a Tenant has a generation, push_layers is always synchronous because - /// the ListValidator channel is an unbounded channel. - /// - /// This can be merged into push_layers when we remove the Generation-less mode - /// support (``) - pub(crate) fn push_layers_sync( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - current_generation: Generation, - layers: Vec<(LayerName, LayerFileMetadata)>, - ) -> Result<(), DeletionQueueError> { metrics::DELETION_QUEUE .keys_submitted .inc_by(layers.len() as u64); @@ -692,21 +657,22 @@ impl DeletionQueue { #[cfg(test)] mod test { + use std::io::ErrorKind; + use std::time::Duration; + use camino::Utf8Path; use hex_literal::hex; - use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant}; - use std::{io::ErrorKind, time::Duration}; - use tracing::info; - + use pageserver_api::key::Key; + use pageserver_api::shard::ShardIndex; + use pageserver_api::upcall_api::ReAttachResponseTenant; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; use tokio::task::JoinHandle; - - use crate::{ - controller_upcall_client::RetryForeverError, - tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, - }; + use tracing::info; use super::*; + use crate::controller_upcall_client::RetryForeverError; + use crate::tenant::harness::TenantHarness; + use crate::tenant::storage_layer::DeltaLayerName; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -752,26 +718,26 @@ mod test { .expect("Failed to join workers for previous deletion queue"); } - fn set_latest_generation(&self, gen: Generation) { + fn set_latest_generation(&self, gen_: Generation) { let tenant_shard_id = self.harness.tenant_shard_id; self.mock_control_plane .latest_generation .lock() .unwrap() - .insert(tenant_shard_id, gen); + .insert(tenant_shard_id, gen_); } /// Returns remote layer file name, suitable for use in assert_remote_files fn write_remote_layer( &self, file_name: LayerName, - gen: Generation, + gen_: Generation, ) -> anyhow::Result { let tenant_shard_id = self.harness.tenant_shard_id; let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID); let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path()); std::fs::create_dir_all(&remote_timeline_path)?; - let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix()); + let remote_layer_file_name = format!("{}{}", file_name, gen_.get_suffix()); let content: Vec = format!("placeholder contents of {file_name}").into(); @@ -957,14 +923,12 @@ mod test { // File should still be there after we push it to the queue (we haven't pushed enough to flush anything) info!("Pushing"); - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - now_generation, - [(layer_file_name_1.clone(), layer_metadata)].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + now_generation, + [(layer_file_name_1.clone(), layer_metadata)].to_vec(), + )?; assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path); assert_local_files(&[], 
&deletion_prefix); @@ -1017,14 +981,12 @@ mod test { assert_remote_files(&[&remote_layer_name], &remote_timeline_path); tracing::debug!("Pushing..."); - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - stale_generation, - [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + stale_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), + )?; // We enqueued the operation in a stale generation: it should have failed validation tracing::debug!("Flushing..."); @@ -1032,14 +994,12 @@ mod test { assert_remote_files(&[&remote_layer_name], &remote_timeline_path); tracing::debug!("Pushing..."); - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - latest_generation, - [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + latest_generation, + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), + )?; // We enqueued the operation in a fresh generation: it should have passed validation tracing::debug!("Flushing..."); @@ -1074,28 +1034,24 @@ mod test { // generation gets that treatment) let remote_layer_file_name_historical = ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?; - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - now_generation.previous(), - [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + now_generation.previous(), + [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(), + )?; // Inject a deletion in the generation before generation_now: after restart, // this deletion should get executed, because we execute deletions in the // immediately previous generation on the same node. let remote_layer_file_name_previous = ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?; - client - .push_layers( - tenant_shard_id, - TIMELINE_ID, - now_generation, - [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(), - ) - .await?; + client.push_layers( + tenant_shard_id, + TIMELINE_ID, + now_generation, + [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(), + )?; client.flush().await?; assert_remote_files( @@ -1136,10 +1092,12 @@ mod test { /// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it. #[cfg(test)] pub(crate) mod mock { + use std::sync::atomic::{AtomicUsize, Ordering}; + use tracing::info; use super::*; - use std::sync::atomic::{AtomicUsize, Ordering}; + use crate::tenant::remote_timeline_client::remote_layer_path; pub struct ConsumerState { rx: tokio::sync::mpsc::UnboundedReceiver, diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index ef1dfbac19..691ba75cc7 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -6,21 +6,16 @@ //! number of full-sized DeleteObjects requests, rather than a larger number of //! smaller requests. 
-use remote_storage::GenericRemoteStorage; -use remote_storage::RemotePath; -use remote_storage::TimeoutOrCancel; use std::time::Duration; + +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio_util::sync::CancellationToken; -use tracing::info; -use tracing::warn; -use utils::backoff; -use utils::pausable_failpoint; +use tracing::{info, warn}; +use utils::{backoff, pausable_failpoint}; +use super::{DeletionQueueError, FlushOp}; use crate::metrics; -use super::DeletionQueueError; -use super::FlushOp; - const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10); pub(super) enum DeleterMessage { diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index ae3b2c9180..a385e35a02 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -10,11 +10,6 @@ //! //! DeletionLists are passed onwards to the Validator. -use super::DeletionHeader; -use super::DeletionList; -use super::FlushOp; -use super::ValidatorQueueMessage; - use std::collections::HashMap; use std::fs::create_dir_all; use std::time::Duration; @@ -23,20 +18,17 @@ use pageserver_api::shard::TenantShardId; use regex::Regex; use remote_storage::RemotePath; use tokio_util::sync::CancellationToken; -use tracing::debug; -use tracing::info; -use tracing::warn; +use tracing::{debug, info, warn}; use utils::generation::Generation; use utils::id::TimelineId; +use super::{DeletionHeader, DeletionList, FlushOp, ValidatorQueueMessage}; use crate::config::PageServerConf; use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; -use crate::tenant::remote_timeline_client::remote_layer_path; -use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_layer_path}; use crate::tenant::storage_layer::LayerName; -use crate::virtual_file::on_fatal_io_error; -use crate::virtual_file::MaybeFatalIo; +use crate::virtual_file::{MaybeFatalIo, on_fatal_io_error}; // The number of keys in a DeletionList before we will proactively persist it // (without reaching a flush deadline). This aims to deliver objects of the order diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index 1d55581ebd..b0ce2b80b4 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -20,22 +20,14 @@ use std::time::Duration; use camino::Utf8PathBuf; use tokio_util::sync::CancellationToken; -use tracing::debug; -use tracing::info; -use tracing::warn; - -use crate::config::PageServerConf; -use crate::controller_upcall_client::ControlPlaneGenerationsApi; -use crate::controller_upcall_client::RetryForeverError; -use crate::metrics; -use crate::virtual_file::MaybeFatalIo; +use tracing::{debug, info, warn}; use super::deleter::DeleterMessage; -use super::DeletionHeader; -use super::DeletionList; -use super::DeletionQueueError; -use super::FlushOp; -use super::VisibleLsnUpdates; +use super::{DeletionHeader, DeletionList, DeletionQueueError, FlushOp, VisibleLsnUpdates}; +use crate::config::PageServerConf; +use crate::controller_upcall_client::{ControlPlaneGenerationsApi, RetryForeverError}; +use crate::metrics; +use crate::virtual_file::MaybeFatalIo; // After this length of time, do any validation work that is pending, // even if we haven't accumulated many keys to delete. 
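The deleter's module comment above describes accumulating deletion keys so they go out as a small number of full-sized DeleteObjects requests, flushed either when a batch fills up or when AUTOFLUSH_INTERVAL elapses. Below is a minimal, self-contained sketch of that accumulate-and-flush loop; the batch size, channel payload type, and flush stand-in are assumptions for illustration, not the pageserver's actual deleter.

use std::time::Duration;

use tokio::sync::mpsc;

// Assumed limits for illustration; S3's DeleteObjects, for example, caps one request at 1000 keys.
const BATCH_SIZE: usize = 1000;
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);

async fn deleter_loop(mut rx: mpsc::Receiver<String>) {
    let mut pending: Vec<String> = Vec::new();
    let mut ticker = tokio::time::interval(AUTOFLUSH_INTERVAL);
    loop {
        tokio::select! {
            msg = rx.recv() => {
                match msg {
                    Some(key) => pending.push(key),
                    None => break, // all senders gone: flush the remainder and exit
                }
                if pending.len() >= BATCH_SIZE {
                    flush(&mut pending).await;
                }
            }
            _ = ticker.tick() => {
                // Periodic flush so a slow trickle of keys doesn't sit in memory indefinitely.
                if !pending.is_empty() {
                    flush(&mut pending).await;
                }
            }
        }
    }
    if !pending.is_empty() {
        flush(&mut pending).await;
    }
}

async fn flush(pending: &mut Vec<String>) {
    // Stand-in for a single bulk remote-storage deletion.
    println!("deleting {} keys in one request", pending.len());
    pending.clear();
}

The real deleter additionally handles explicit flush requests (DeleterMessage) and cancellation; the sketch only shows the batching shape.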
@@ -190,7 +182,10 @@ where } } else { // If we failed validation, then do not apply any of the projected updates - info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + info!( + "Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", + tenant_lsn_state.generation + ); metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); } } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index ca44fbe6ae..13252037e5 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -41,29 +41,31 @@ // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl // reading these fields. We use the Debug impl for semi-structured logging, though. -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId}; +use pageserver_api::config::DiskUsageEvictionTaskConfig; +use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, instrument, warn, Instrument}; -use utils::{completion, id::TimelineId}; +use tracing::{Instrument, debug, error, info, instrument, warn}; +use utils::completion; +use utils::id::TimelineId; -use crate::{ - config::PageServerConf, - metrics::disk_usage_based_eviction::METRICS, - task_mgr::{self, BACKGROUND_RUNTIME}, - tenant::{ - mgr::TenantManager, - remote_timeline_client::LayerFileMetadata, - secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint}, - }, - CancellableTask, DiskUsageEvictionTask, +use crate::config::PageServerConf; +use crate::metrics::disk_usage_based_eviction::METRICS; +use crate::task_mgr::{self, BACKGROUND_RUNTIME}; +use crate::tenant::mgr::TenantManager; +use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::secondary::SecondaryTenant; +use crate::tenant::storage_layer::{ + AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint, }; +use crate::tenant::tasks::sleep_random; +use crate::{CancellableTask, DiskUsageEvictionTask}; /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. 
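In the hunk below, the eviction task's startup jitter moves from `random_init_delay` to `sleep_random`, which returns an error when the cancellation token fires so the task can exit early. A rough sketch of what such a helper can look like; the name, signature, and error type here are assumptions for illustration, not the actual implementation in `tenant::tasks`.

use std::time::Duration;

use rand::Rng;
use tokio_util::sync::CancellationToken;

/// Sleep for a random duration in [0, period]; return Err if cancelled first.
async fn sleep_random_sketch(period: Duration, cancel: &CancellationToken) -> Result<Duration, ()> {
    let jitter_ms = rand::thread_rng().gen_range(0..=period.as_millis() as u64);
    let delay = Duration::from_millis(jitter_ms);
    tokio::select! {
        _ = tokio::time::sleep(delay) => Ok(delay),
        // Shutting down: the caller returns without starting its loop.
        _ = cancel.cancelled() => Err(()),
    }
}

The call site then reduces to `if sleep_random(period, &cancel).await.is_err() { return; }`, as in the hunk that follows.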
@@ -210,14 +212,8 @@ async fn disk_usage_eviction_task( info!("disk usage based eviction task finishing"); }; - use crate::tenant::tasks::random_init_delay; - { - if random_init_delay(task_config.period, &cancel) - .await - .is_err() - { - return; - } + if sleep_random(task_config.period, &cancel).await.is_err() { + return; } let mut iteration_no = 0; @@ -1012,10 +1008,14 @@ async fn collect_eviction_candidates( } } - debug_assert!(EvictionPartition::Above < EvictionPartition::Below, - "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above, - "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); + debug_assert!( + EvictionPartition::Above < EvictionPartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first" + ); + debug_assert!( + EvictionPartition::EvictNow < EvictionPartition::Above, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first" + ); eviction_order.sort(&mut candidates); @@ -1162,9 +1162,8 @@ mod filesystem_level_usage { use anyhow::Context; use camino::Utf8Path; - use crate::statvfs::Statvfs; - use super::DiskUsageEvictionTaskConfig; + use crate::statvfs::Statvfs; #[derive(Debug, Clone, Copy)] pub struct Usage<'a> { @@ -1229,10 +1228,12 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { - use super::Usage as _; use std::time::Duration; + use utils::serde_percent::Percent; + use super::Usage as _; + let mut usage = Usage { config: &DiskUsageEvictionTaskConfig { max_usage_pct: Percent::new(85).unwrap(), diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 7fb9247feb..12252739fd 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -767,7 +767,27 @@ paths: /v1/tenant/config: put: description: | - Update tenant's config. + Update tenant's config by setting it to the provided value + + Invalid fields in the tenant config will cause the request to be rejected with status 400. + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/TenantConfigRequest" + responses: + "200": + description: OK + content: + application/json: + schema: + type: array + items: + $ref: "#/components/schemas/TenantInfo" + patch: + description: | + Update tenant's config additively by patching the updated fields provided. + Null values unset the field and non-null values upsert it. Invalid fields in the tenant config will cause the request to be rejected with status 400. requestBody: @@ -804,6 +824,38 @@ paths: schema: $ref: "#/components/schemas/TenantConfigResponse" + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + - name: concurrency + description: Maximum number of concurrent downloads (capped at remote storage concurrency) + in: query + required: false + schema: + type: integer + post: + description: | + Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter + may be used to target all shards of a tenant when the unsharded form is used, or a specific + tenant shard with the sharded form. 
+ responses: + "200": + description: Success + delete: + description: Stop any on-going background downloads of heatmap layers for the specified timeline. + responses: + "200": + description: Success + /v1/utilization: get: description: | @@ -862,6 +914,8 @@ components: properties: reason: type: string + gc_blocking: + type: string TenantCreateRequest: allOf: @@ -964,6 +1018,8 @@ components: type: string compaction_threshold: type: string + compaction_upper_limit: + type: string image_creation_threshold: type: integer walreceiver_connect_timeout: @@ -1058,9 +1114,15 @@ components: type: integer state: type: string + min_readable_lsn: + type: string + format: hex latest_gc_cutoff_lsn: type: string format: hex + applied_gc_cutoff_lsn: + type: string + format: hex SyntheticSizeResponse: type: object diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 75d25d0a6a..dd5a24a41f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2,116 +2,83 @@ //! Management HTTP API //! use std::cmp::Reverse; -use std::collections::BinaryHeap; -use std::collections::HashMap; +use std::collections::{BinaryHeap, HashMap}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{anyhow, Context, Result}; +use anyhow::{Context, Result, anyhow}; use enumset::EnumSet; -use futures::StreamExt; -use futures::TryFutureExt; +use futures::future::join_all; +use futures::{StreamExt, TryFutureExt}; +use http_utils::endpoint::{ + self, attach_openapi_ui, auth_middleware, check_permission_with, profile_cpu_handler, + profile_heap_handler, prometheus_metrics_handler, request_span, +}; +use http_utils::error::{ApiError, HttpErrorBody}; +use http_utils::failpoints::failpoints_handler; +use http_utils::json::{json_request, json_request_maybe, json_response}; +use http_utils::request::{ + get_request_param, must_get_query_param, must_parse_query_param, parse_query_param, + parse_request_param, +}; +use http_utils::{RequestExt, RouterBuilder}; use humantime::format_rfc3339; -use hyper::header; -use hyper::StatusCode; -use hyper::{Body, Request, Response, Uri}; +use hyper::{Body, Request, Response, StatusCode, Uri, header}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::virtual_file::IoMode; -use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; -use pageserver_api::models::IngestAuxFilesRequest; -use pageserver_api::models::ListAuxFilesRequest; -use pageserver_api::models::LocationConfig; -use pageserver_api::models::LocationConfigListResponse; -use pageserver_api::models::LocationConfigMode; -use pageserver_api::models::LsnLease; -use pageserver_api::models::LsnLeaseRequest; -use pageserver_api::models::OffloadedTimelineInfo; -use pageserver_api::models::ShardParameters; -use pageserver_api::models::TenantDetails; -use pageserver_api::models::TenantLocationConfigRequest; -use pageserver_api::models::TenantLocationConfigResponse; -use pageserver_api::models::TenantScanRemoteStorageResponse; -use pageserver_api::models::TenantScanRemoteStorageShard; -use pageserver_api::models::TenantShardLocation; -use pageserver_api::models::TenantShardSplitRequest; -use pageserver_api::models::TenantShardSplitResponse; -use pageserver_api::models::TenantSorting; -use pageserver_api::models::TenantState; -use pageserver_api::models::TimelineArchivalConfigRequest; -use pageserver_api::models::TimelineCreateRequestMode; -use pageserver_api::models::TimelineCreateRequestModeImportPgdata; -use 
pageserver_api::models::TimelinesInfoAndOffloaded; -use pageserver_api::models::TopTenantShardItem; -use pageserver_api::models::TopTenantShardsRequest; -use pageserver_api::models::TopTenantShardsResponse; -use pageserver_api::shard::ShardCount; -use pageserver_api::shard::TenantShardId; -use remote_storage::DownloadError; -use remote_storage::GenericRemoteStorage; -use remote_storage::TimeTravelError; -use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use pageserver_api::models::{ + DownloadRemoteLayersTaskSpawnRequest, IngestAuxFilesRequest, ListAuxFilesRequest, + LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, LsnLeaseRequest, + OffloadedTimelineInfo, PageTraceEvent, ShardParameters, StatusResponse, + TenantConfigPatchRequest, TenantConfigRequest, TenantDetails, TenantInfo, + TenantLocationConfigRequest, TenantLocationConfigResponse, TenantScanRemoteStorageResponse, + TenantScanRemoteStorageShard, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest, + TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode, + TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo, + TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse, +}; +use pageserver_api::shard::{ShardCount, TenantShardId}; +use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; +use scopeguard::defer; +use tenant_size_model::svg::SvgBranchKind; +use tenant_size_model::{SizeResult, StorageModel}; +use tokio::time::Instant; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::auth::JwtAuth; -use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{ - profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, -}; -use utils::http::request::must_parse_query_param; -use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; +use utils::auth::SwappableJwtAuth; +use utils::generation::Generation; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext}; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; -use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::{ - GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError, - TenantSlotUpsertError, TenantStateError, + GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, + TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError, +}; +use crate::tenant::remote_timeline_client::{ + download_index_part, list_remote_tenant_shards, list_remote_timelines, }; -use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; -use crate::tenant::remote_timeline_client; -use crate::tenant::remote_timeline_client::download_index_part; -use crate::tenant::remote_timeline_client::list_remote_tenant_shards; -use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; -use crate::tenant::storage_layer::LayerAccessStatsReset; -use 
crate::tenant::storage_layer::LayerName; -use crate::tenant::timeline::import_pgdata; -use crate::tenant::timeline::offload::offload_timeline; -use crate::tenant::timeline::offload::OffloadError; -use crate::tenant::timeline::CompactFlags; -use crate::tenant::timeline::CompactOptions; -use crate::tenant::timeline::CompactRequest; -use crate::tenant::timeline::CompactionError; -use crate::tenant::timeline::Timeline; -use crate::tenant::GetTimelineError; -use crate::tenant::OffloadedTimeline; -use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; -use crate::DEFAULT_PG_VERSION; -use crate::{disk_usage_eviction_task, tenant}; -use pageserver_api::models::{ - StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, - TimelineInfo, +use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName}; +use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; +use crate::tenant::timeline::{ + CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout, + WaitLsnWaiter, import_pgdata, }; -use utils::{ - auth::SwappableJwtAuth, - generation::Generation, - http::{ - endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, - error::{ApiError, HttpErrorBody}, - json::{json_request, json_request_maybe, json_response}, - request::parse_request_param, - RequestExt, RouterBuilder, - }, - id::{TenantId, TimelineId}, - lsn::Lsn, +use crate::tenant::{ + GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError, + remote_timeline_client, }; +use crate::{DEFAULT_PG_VERSION, disk_usage_eviction_task, tenant}; // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of @@ -474,6 +441,11 @@ async fn build_timeline_info_common( let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + let min_readable_lsn = std::cmp::max( + timeline.get_gc_cutoff_lsn(), + *timeline.get_applied_gc_cutoff_lsn(), + ); + let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, @@ -485,7 +457,12 @@ async fn build_timeline_info_common( initdb_lsn, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), - latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), + // Externally, expose the lowest LSN that can be used to create a branch as the "GC cutoff", although internally + // we distinguish between the "planned" GC cutoff (PITR point) and the "latest" GC cutoff (where we + // actually trimmed data to), which can pass each other when PITR is changed. 
+ latest_gc_cutoff_lsn: min_readable_lsn, + min_readable_lsn, + applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(), current_logical_size: current_logical_size.size_dont_care_about_accuracy(), current_logical_size_is_accurate: match current_logical_size.accuracy() { tenant::timeline::logical_size::Accuracy::Approximate => false, @@ -552,7 +529,7 @@ async fn reload_auth_validation_keys_handler( let key_path = config.auth_validation_public_key_path.as_ref().unwrap(); info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}"); - match JwtAuth::from_key_path(key_path) { + match utils::auth::JwtAuth::from_key_path(key_path) { Ok(new_auth) => { shared_auth.swap(new_auth); json_response(StatusCode::OK, ()) @@ -1109,12 +1086,12 @@ async fn tenant_list_handler( ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into()) })? .iter() - .map(|(id, state, gen)| TenantInfo { + .map(|(id, state, gen_)| TenantInfo { id: *id, state: state.clone(), current_physical_size: None, attachment_status: state.attachment_status(), - generation: (*gen) + generation: (*gen_) .into() .expect("Tenants are always attached with a generation"), gc_blocking: None, @@ -1445,6 +1422,59 @@ async fn timeline_layer_scan_disposable_keys( ) } +async fn timeline_download_heatmap_layers_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + // Only used in the case where remote storage is not configured. + const DEFAULT_MAX_CONCURRENCY: usize = 100; + // A conservative default. + const DEFAULT_CONCURRENCY: usize = 16; + + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let desired_concurrency = + parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY); + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let max_concurrency = get_config(&request) + .remote_storage_config + .as_ref() + .map(|c| c.concurrency_limit()) + .unwrap_or(DEFAULT_MAX_CONCURRENCY); + let concurrency = std::cmp::min(max_concurrency, desired_concurrency); + + timeline.start_heatmap_layers_download(concurrency).await?; + + json_response(StatusCode::ACCEPTED, ()) +} + +async fn timeline_shutdown_download_heatmap_layers_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + timeline.stop_and_drain_heatmap_layers_download().await; + + json_response(StatusCode::OK, ()) +} + async fn layer_download_handler( request: Request, _cancel: CancellationToken, @@ -1463,7 +1493,13 @@ async fn layer_download_handler( let downloaded = timeline .download_layer(&layer_name) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| match e { + tenant::storage_layer::layer::DownloadError::TimelineShutdown + | tenant::storage_layer::layer::DownloadError::DownloadCancelled => { + ApiError::ShuttingDown + } + other => ApiError::InternalServerError(other.into()), + })?; match 
downloaded { Some(true) => json_response(StatusCode::OK, ()), @@ -1520,6 +1556,71 @@ async fn timeline_gc_unblocking_handler( block_or_unblock_gc(request, false).await } +/// Traces GetPage@LSN requests for a timeline, and emits metadata in an efficient binary encoding. +/// Use the `pagectl page-trace` command to decode and analyze the output. +async fn timeline_page_trace_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); + check_permission(&request, None)?; + + let size_limit: usize = parse_query_param(&request, "size_limit_bytes")?.unwrap_or(1024 * 1024); + let time_limit_secs: u64 = parse_query_param(&request, "time_limit_secs")?.unwrap_or(5); + + // Convert size limit to event limit based on the serialized size of an event. The event size is + // fixed, as the default bincode serializer uses fixed-width integer encoding. + let event_size = bincode::serialize(&PageTraceEvent::default()) + .map_err(|err| ApiError::InternalServerError(err.into()))? + .len(); + let event_limit = size_limit / event_size; + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + // Install a page trace, unless one is already in progress. We just use a buffered channel, + // which may 2x the memory usage in the worst case, but it's still bounded. + let (trace_tx, mut trace_rx) = tokio::sync::mpsc::channel(event_limit); + let cur = timeline.page_trace.load(); + let installed = cur.is_none() + && timeline + .page_trace + .compare_and_swap(cur, Some(Arc::new(trace_tx))) + .is_none(); + if !installed { + return Err(ApiError::Conflict("page trace already active".to_string())); + } + defer!(timeline.page_trace.store(None)); // uninstall on return + + // Collect the trace and return it to the client. We could stream the response, but this is + // simple and fine. + let mut body = Vec::with_capacity(size_limit); + let deadline = Instant::now() + Duration::from_secs(time_limit_secs); + + while body.len() < size_limit { + tokio::select! { + event = trace_rx.recv() => { + let Some(event) = event else { + break; // shouldn't happen (sender doesn't close, unless timeline dropped) + }; + bincode::serialize_into(&mut body, &event) + .map_err(|err| ApiError::InternalServerError(err.into()))?; + } + _ = tokio::time::sleep_until(deadline) => break, // time limit reached + _ = cancel.cancelled() => return Err(ApiError::Cancelled), + } + } + + Ok(Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "application/octet-stream") + .body(hyper::Body::from(body)) + .unwrap()) +} + /// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`. /// /// Both are technically unsafe because they might fire off index uploads, thus they are POST. 
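The page-trace handler above converts its byte budget into an event budget by measuring the serialized size of one default event, which works because bincode's default configuration uses fixed-width integer encoding, so every event serializes to the same size. A self-contained illustration of that idea; `ExampleEvent` is a stand-in with made-up fields, not the real `PageTraceEvent`.

use serde::Serialize;

// Stand-in event type; the real PageTraceEvent has different fields.
#[derive(Serialize, Default)]
struct ExampleEvent {
    key: u64,
    lsn: u64,
    received_at_micros: u64,
}

fn main() -> Result<(), bincode::Error> {
    // With bincode 1.x defaults (fixed-width integers), every ExampleEvent serializes
    // to the same number of bytes, so a byte budget divides cleanly into an event budget.
    let event_size = bincode::serialize(&ExampleEvent::default())?.len();
    let size_limit = 1024 * 1024; // 1 MiB, mirroring the handler's default size_limit_bytes
    let event_limit = size_limit / event_size;
    println!("{event_size} bytes/event -> room for {event_limit} events");
    Ok(())
}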
@@ -1527,9 +1628,8 @@ async fn block_or_unblock_gc( request: Request, block: bool, ) -> Result, ApiError> { - use crate::tenant::{ - remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized, - }; + use crate::tenant::remote_timeline_client::WaitCompletionError; + use crate::tenant::upload_queue::NotInitialized; let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -1695,7 +1795,47 @@ async fn update_tenant_config_handler( crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; - tenant.set_new_tenant_config(new_tenant_conf); + + let _ = tenant + .update_tenant_config(|_crnt| Ok(new_tenant_conf.clone())) + .expect("Closure returns Ok()"); + + json_response(StatusCode::OK, ()) +} + +async fn patch_tenant_config_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let request_data: TenantConfigPatchRequest = json_request(&mut request).await?; + let tenant_id = request_data.tenant_id; + check_permission(&request, Some(tenant_id))?; + + let state = get_state(&request); + + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let updated = tenant + .update_tenant_config(|crnt| crnt.apply_patch(request_data.config.clone())) + .map_err(ApiError::BadRequest)?; + + // This is a legacy API that only operates on attached tenants: the preferred + // API to use is the location_config/ endpoint, which lets the caller provide + // the full LocationConf. + let location_conf = LocationConf::attached_single( + updated, + tenant.get_generation(), + &ShardParameters::default(), + ); + + crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; json_response(StatusCode::OK, ()) } @@ -1875,7 +2015,9 @@ async fn tenant_time_travel_remote_storage_handler( ))); } - tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}"); + tracing::info!( + "Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}" + ); remote_timeline_client::upload::time_travel_recover_tenant( &state.remote_storage, @@ -1998,6 +2140,26 @@ async fn timeline_cancel_compact_handler( .await } +// Get compact info of a timeline +async fn timeline_compact_info_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + let resp = tenant.get_scheduled_compaction_tasks(timeline_id); + json_response(StatusCode::OK, resp) + } + .instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await +} + // Run compaction immediately on given timeline. 
async fn timeline_compact_handler( mut request: Request, @@ -2012,6 +2174,7 @@ async fn timeline_compact_handler( let state = get_state(&request); let mut flags = EnumSet::empty(); + flags |= CompactFlags::NoYield; // run compaction to completion if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? { flags |= CompactFlags::ForceL0Compaction; @@ -2040,13 +2203,20 @@ async fn timeline_compact_handler( .as_ref() .map(|r| r.sub_compaction) .unwrap_or(false); + let sub_compaction_max_job_size_mb = compact_request + .as_ref() + .and_then(|r| r.sub_compaction_max_job_size_mb); + let options = CompactOptions { - compact_range: compact_request + compact_key_range: compact_request .as_ref() - .and_then(|r| r.compact_range.clone()), - compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn), + .and_then(|r| r.compact_key_range.clone()), + compact_lsn_range: compact_request + .as_ref() + .and_then(|r| r.compact_lsn_range.clone()), flags, sub_compaction, + sub_compaction_max_job_size_mb, }; let scheduled = compact_request @@ -2184,7 +2354,9 @@ async fn timeline_checkpoint_handler( match e { CompactionError::ShuttingDown => ApiError::ShuttingDown, CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), - CompactionError::Other(e) => ApiError::InternalServerError(e) + CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), + CompactionError::Other(e) => ApiError::InternalServerError(e), + CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)), } )?; } @@ -2246,9 +2418,10 @@ async fn timeline_detach_ancestor_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - use crate::tenant::timeline::detach_ancestor; use pageserver_api::models::detach_ancestor::AncestorDetached; + use crate::tenant::timeline::detach_ancestor; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -2361,14 +2534,30 @@ async fn deletion_queue_flush( } } -/// Try if `GetPage@Lsn` is successful, useful for manual debugging. async fn getpage_at_lsn_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + getpage_at_lsn_handler_inner(false, request, cancel).await +} + +async fn touchpage_at_lsn_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + getpage_at_lsn_handler_inner(true, request, cancel).await +} + +/// Try if `GetPage@Lsn` is successful, useful for manual debugging. +async fn getpage_at_lsn_handler_inner( + touch: bool, request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_shard_id.tenant_id))?; + // Require pageserver admin permission for this API instead of only tenant-level token. + check_permission(&request, None)?; let state = get_state(&request); struct Key(pageserver_api::key::Key); @@ -2383,22 +2572,29 @@ async fn getpage_at_lsn_handler( let key: Key = parse_query_param(&request, "key")? .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?; - let lsn: Lsn = parse_query_param(&request, "lsn")? 
- .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + let lsn: Option = parse_query_param(&request, "lsn")?; async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + // Enable read path debugging + let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build(); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; + // Use last_record_lsn if no lsn is provided + let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let page = timeline.get(key.0, lsn, &ctx).await?; - Result::<_, ApiError>::Ok( - Response::builder() - .status(StatusCode::OK) - .header(header::CONTENT_TYPE, "application/octet-stream") - .body(hyper::Body::from(page)) - .unwrap(), - ) + if touch { + json_response(StatusCode::OK, ()) + } else { + Result::<_, ApiError>::Ok( + Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "application/octet-stream") + .body(hyper::Body::from(page)) + .unwrap(), + ) + } } .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await @@ -2570,14 +2766,19 @@ async fn tenant_scan_remote_handler( .await { Ok((index_part, index_generation, _index_mtime)) => { - tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", - index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn()); + tracing::info!( + "Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", + index_part.layer_metadata.len(), + index_part.metadata.disk_consistent_lsn() + ); generation = std::cmp::max(generation, index_generation); } Err(DownloadError::NotFound) => { // This is normal for tenants that were created with multiple shards: they have an unsharded path // containing the timeline's initdb tarball but no index. Otherwise it is a bit strange. - tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping"); + tracing::info!( + "Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping" + ); continue; } Err(e) => { @@ -2653,6 +2854,63 @@ async fn secondary_download_handler( json_response(status, progress) } +async fn wait_lsn_handler( + mut request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let wait_lsn_request: TenantWaitLsnRequest = json_request(&mut request).await?; + + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + let mut wait_futures = Vec::default(); + for timeline in tenant.list_timelines() { + let Some(lsn) = wait_lsn_request.timelines.get(&timeline.timeline_id) else { + continue; + }; + + let fut = { + let timeline = timeline.clone(); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error); + async move { + timeline + .wait_lsn( + *lsn, + WaitLsnWaiter::HttpEndpoint, + WaitLsnTimeout::Custom(wait_lsn_request.timeout), + &ctx, + ) + .await + } + }; + wait_futures.push(fut); + } + + if wait_futures.is_empty() { + return json_response(StatusCode::NOT_FOUND, ()); + } + + let all_done = tokio::select! 
{ + results = join_all(wait_futures) => { + results.iter().all(|res| res.is_ok()) + }, + _ = cancel.cancelled() => { + return Err(ApiError::Cancelled); + } + }; + + let status = if all_done { + StatusCode::OK + } else { + StatusCode::ACCEPTED + }; + + json_response(status, ()) +} + async fn secondary_status_handler( request: Request, _cancel: CancellationToken, @@ -2802,8 +3060,15 @@ async fn list_aux_files( active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) .await?; + let io_concurrency = IoConcurrency::spawn_from_conf( + state.conf, + timeline.gate.enter().map_err(|_| ApiError::Cancelled)?, + ); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let files = timeline.list_aux_files(body.lsn, &ctx).await?; + let files = timeline + .list_aux_files(body.lsn, &ctx, io_concurrency) + .await?; json_response(StatusCode::OK, files) } @@ -2964,12 +3229,16 @@ async fn put_tenant_timeline_import_basebackup( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + let span = info_span!("import_basebackup", + tenant_id=%tenant_id, timeline_id=%timeline_id, shard_id=%tenant_shard_id.shard_slug(), + base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); async move { let state = get_state(&request); let tenant = state .tenant_manager - .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?; + .get_attached_tenant_shard(tenant_shard_id)?; let broker_client = state.broker_client.clone(); @@ -3128,7 +3397,9 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow anyhow::bail!("unexpected non-zero bytes after the tar archive"); } if trailing_bytes % 512 != 0 { - anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); + anyhow::bail!( + "unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive" + ); } Ok(()) } @@ -3178,7 +3449,17 @@ where let status = response.status(); info!(%status, "Cancelled request finished successfully") } - Err(e) => error!("Cancelled request finished with an error: {e:?}"), + Err(e) => match e { + ApiError::ShuttingDown | ApiError::ResourceUnavailable(_) => { + // Don't log this at error severity: they are normal during lifecycle of tenants/process + info!("Cancelled request aborted for shutdown") + } + _ => { + // Log these in a highly visible way, because we have no client to send the response to, but + // would like to know that something went wrong. 
+ error!("Cancelled request finished with an error: {e:?}") + } + }, } } // only logging for cancelled panicked request handlers is the tracing_panic_hook, @@ -3288,6 +3569,9 @@ pub fn make_router( .get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| { api_handler(r, tenant_size_handler) }) + .patch("/v1/tenant/config", |r| { + api_handler(r, patch_tenant_config_handler) + }) .put("/v1/tenant/config", |r| { api_handler(r, update_tenant_config_handler) }) @@ -3349,6 +3633,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc", |r| api_handler(r, timeline_gc_handler), ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", + |r| api_handler(r, timeline_compact_info_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_handler), @@ -3384,6 +3672,14 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer", |r| api_handler(r, layer_map_info_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| api_handler(r, timeline_download_heatmap_layers_handler), + ) + .delete( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| api_handler(r, timeline_shutdown_download_heatmap_layers_handler), + ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, layer_download_handler), @@ -3404,6 +3700,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc", |r| api_handler(r, timeline_gc_unblocking_handler), ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/page_trace", + |r| api_handler(r, timeline_page_trace_handler), + ) .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) @@ -3422,6 +3722,9 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { api_handler(r, secondary_download_handler) }) + .post("/v1/tenant/:tenant_shard_id/wait_lsn", |r| { + api_handler(r, wait_lsn_handler) + }) .put("/v1/tenant/:tenant_shard_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) @@ -3433,6 +3736,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage", |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage", + |r| api_handler(r, touchpage_at_lsn_handler), + ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", |r| api_handler(r, timeline_collect_keyspace), diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index c061714010..6dd005de50 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -4,14 +4,22 @@ //! 
use std::path::{Path, PathBuf}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{Context, Result, bail, ensure}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; use pageserver_api::key::rel_block_to_key; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::relfile_utils::*; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{ + BLCKSZ, ControlFileData, DBState_DB_SHUTDOWNED, Oid, WAL_SEGMENT_SIZE, XLogFileName, + pg_constants, +}; use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; use tracing::*; +use utils::lsn::Lsn; use wal_decoder::models::InterpretedWalRecord; use walkdir::WalkDir; @@ -20,16 +28,6 @@ use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; use crate::walingest::WalIngest; -use pageserver_api::reltag::{RelTag, SlruKind}; -use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::*; -use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::ControlFileData; -use postgres_ffi::DBState_DB_SHUTDOWNED; -use postgres_ffi::Oid; -use postgres_ffi::XLogFileName; -use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE}; -use utils::lsn::Lsn; // Returns checkpoint LSN from controlfile pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result { @@ -278,6 +276,8 @@ async fn import_wal( let mut walingest = WalIngest::new(tline, startpoint, ctx).await?; + let shard = vec![*tline.get_shard_identity()]; + while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now let filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); @@ -314,10 +314,12 @@ async fn import_wal( if let Some((lsn, recdata)) = waldecoder.poll_decode()? { let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - tline.get_shard_identity(), + &shard, lsn, tline.pg_version, - )?; + )? + .remove(tline.get_shard_identity()) + .unwrap(); walingest .ingest_record(interpreted, &mut modification, ctx) @@ -411,6 +413,7 @@ pub async fn import_wal_from_tar( let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?; + let shard = vec![*tline.get_shard_identity()]; // Ingest wal until end_lsn info!("importing wal until {}", end_lsn); @@ -459,10 +462,12 @@ pub async fn import_wal_from_tar( if let Some((lsn, recdata)) = waldecoder.poll_decode()? { let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - tline.get_shard_identity(), + &shard, lsn, tline.pg_version, - )?; + )? 
+ .remove(tline.get_shard_identity()) + .unwrap(); walingest .ingest_record(interpreted, &mut modification, ctx) diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 491c9fb96c..6cfecef0cf 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -1,4 +1,5 @@ -use std::{num::NonZeroUsize, sync::Arc}; +use std::num::NonZeroUsize; +use std::sync::Arc; #[derive(Debug, PartialEq, Eq, Clone)] pub enum L0FlushConfig { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ff6af3566c..02767055fb 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -15,7 +15,8 @@ pub mod l0_flush; extern crate hyper0 as hyper; -use futures::{stream::FuturesUnordered, StreamExt}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; pub use pageserver_api::keyspace; use tokio_util::sync::CancellationToken; mod assert_u64_eq_usize; @@ -35,10 +36,8 @@ pub mod walredo; use camino::Utf8Path; use deletion_queue::DeletionQueue; -use tenant::{ - mgr::{BackgroundPurges, TenantManager}, - secondary, -}; +use tenant::mgr::{BackgroundPurges, TenantManager}; +use tenant::secondary; use tracing::{info, info_span}; /// Current storage format version @@ -263,14 +262,6 @@ pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; /// data directory at pageserver startup can be automatically removed. pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp"; -/// A marker file to mark that a timeline directory was not fully initialized. -/// If a timeline directory with this marker is encountered at pageserver startup, -/// the timeline directory and the marker file are both removed. -/// Full path: `tenants//timelines/___uninit`. -pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; - -pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; - pub fn is_temporary(path: &Utf8Path) -> bool { match path.file_name() { Some(name) => name.ends_with(TEMP_FILE_SUFFIX), @@ -278,25 +269,6 @@ pub fn is_temporary(path: &Utf8Path) -> bool { } } -fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool { - match path.file_name() { - Some(name) => name.ends_with(suffix), - None => false, - } -} - -// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid -// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once -// from the name. - -pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool { - ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) -} - -pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool { - ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) -} - /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by /// blocking. 
/// @@ -377,9 +349,10 @@ async fn timed_after_cancellation( #[cfg(test)] mod timed_tests { - use super::timed; use std::time::Duration; + use super::timed; + #[tokio::test] async fn timed_completes_when_inner_future_completes() { // A future that completes on time should have its result returned diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 96ee157856..eb8a9b8e24 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,24 +1,46 @@ -use enum_map::EnumMap; +use std::collections::HashMap; +use std::num::NonZeroUsize; +use std::os::fd::RawFd; +use std::pin::Pin; +use std::sync::atomic::AtomicU64; +use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll}; +use std::time::{Duration, Instant}; + +use enum_map::{Enum as _, EnumMap}; +use futures::Future; use metrics::{ + Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, - IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedExecutionStrategy, }; +use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use postgres_backend::{is_expected_io_error, QueryError}; +use pin_project_lite::pin_project; +use postgres_backend::{QueryError, is_expected_io_error}; use pq_proto::framed::ConnectionError; -use strum::{EnumCount, VariantNames}; +use strum::{EnumCount, IntoEnumIterator as _, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; -use tracing::warn; use utils::id::TimelineId; +use crate::config::PageServerConf; +use crate::context::{PageContentKind, RequestContext}; +use crate::pgdatadir_mapping::DatadirModificationStats; +use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; +use crate::tenant::layer_map::LayerMap; +use crate::tenant::mgr::TenantSlot; +use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc}; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::throttle::ThrottleResult; + /// Prometheus histogram buckets (in seconds) for operations in the critical /// path. In other words, operations that directly affect that latency of user /// queries. @@ -39,6 +61,9 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "layer flush")] LayerFlush, + #[strum(serialize = "layer flush delay")] + LayerFlushDelay, + #[strum(serialize = "compact")] Compact, @@ -79,7 +104,7 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::n .expect("failed to define a metric") }); -// Buckets for background operations like compaction, GC, size calculation +// Buckets for background operation duration in seconds, like compaction, GC, size calculation. 
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0]; pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { @@ -92,89 +117,66 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { - register_histogram!( - "pageserver_layers_visited_per_read_global", - "Number of layers visited to reconstruct one key", - vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], +/// Measures layers visited per read (i.e. read amplification). +/// +/// NB: for a batch, we count all visited layers towards each read. While the cost of layer visits +/// are amortized across the batch, and some layers may not intersect with a given key, each visited +/// layer contributes directly to the observed latency for every read in the batch, which is what we +/// care about. +pub(crate) static LAYERS_PER_READ: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_layers_per_read", + "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", + &["tenant_id", "shard_id", "timeline_id"], + // Low resolution to reduce cardinality. + vec![4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0], ) .expect("failed to define a metric") }); -pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { +pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_layers_visited_per_vectored_read_global", - "Average number of layers visited to reconstruct one key", - vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + "pageserver_layers_per_read_global", + "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", + vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], ) .expect("failed to define a metric") }); -// Metrics collected on operations on the storage repository. -#[derive( - Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, -)] -pub(crate) enum GetKind { - Singular, - Vectored, -} - -pub(crate) struct ReconstructTimeMetrics { - singular: Histogram, - vectored: Histogram, -} - -pub(crate) static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { - let inner = register_histogram_vec!( - "pageserver_getpage_reconstruct_seconds", - "Time spent in reconstruct_value (reconstruct a page from deltas)", - &["get_kind"], - CRITICAL_OP_BUCKETS.into(), +pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { + // We expect this to be low because of Postgres checkpoints. Let's see if that holds. 
+ register_histogram!( + "pageserver_deltas_per_read_global", + "Number of delta pages applied to image page per read", + vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0], ) - .expect("failed to define a metric"); - - ReconstructTimeMetrics { - singular: inner.with_label_values(&[GetKind::Singular.into()]), - vectored: inner.with_label_values(&[GetKind::Vectored.into()]), - } + .expect("failed to define a metric") }); -impl ReconstructTimeMetrics { - pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { - match get_kind { - GetKind::Singular => &self.singular, - GetKind::Vectored => &self.vectored, - } - } -} - -pub(crate) struct ReconstructDataTimeMetrics { - singular: Histogram, - vectored: Histogram, -} - -impl ReconstructDataTimeMetrics { - pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { - match get_kind { - GetKind::Singular => &self.singular, - GetKind::Vectored => &self.vectored, - } - } -} - -pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { - let inner = register_histogram_vec!( - "pageserver_getpage_get_reconstruct_data_seconds", - "Time spent in get_reconstruct_value_data", - &["get_kind"], - CRITICAL_OP_BUCKETS.into(), +pub(crate) static CONCURRENT_INITDBS: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_concurrent_initdb", + "Number of initdb processes running" ) - .expect("failed to define a metric"); + .expect("failed to define a metric") +}); - ReconstructDataTimeMetrics { - singular: inner.with_label_values(&[GetKind::Singular.into()]), - vectored: inner.with_label_values(&[GetKind::Vectored.into()]), - } +pub(crate) static INITDB_SEMAPHORE_ACQUISITION_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_initdb_semaphore_seconds_global", + "Time spent getting a permit from the global initdb semaphore", + STORAGE_OP_BUCKETS.into() + ) + .expect("failed to define metric") +}); + +pub(crate) static INITDB_RUN_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_initdb_seconds_global", + "Time spent performing initdb", + STORAGE_OP_BUCKETS.into() + ) + .expect("failed to define metric") }); pub(crate) struct GetVectoredLatency { @@ -234,7 +236,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| GetVectoredLatency { map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { - let task_kind = ::from_usize(task_kind_idx); + let task_kind = TaskKind::from_usize(task_kind_idx); if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) { let task_kind = task_kind.into(); @@ -257,7 +259,7 @@ pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { ScanLatency { map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { - let task_kind = ::from_usize(task_kind_idx); + let task_kind = TaskKind::from_usize(task_kind_idx); if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) { let task_kind = task_kind.into(); @@ -298,10 +300,10 @@ static PAGE_CACHE_READ_ACCESSES: Lazy = Lazy::new(|| { pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { map: EnumMap::from_array(std::array::from_fn(|task_kind| { - let task_kind = ::from_usize(task_kind); + let task_kind = TaskKind::from_usize(task_kind); let task_kind: &'static str = task_kind.into(); EnumMap::from_array(std::array::from_fn(|content_kind| { - let content_kind = ::from_usize(content_kind); + let content_kind = PageContentKind::from_usize(content_kind); let content_kind: &'static str = content_kind.into(); PageCacheMetricsForTaskKind { read_accesses_immutable: { @@ -360,7 +362,7 @@ pub(crate) 
static PAGE_CACHE_SIZE: Lazy = pub(crate) mod page_cache_eviction_metrics { use std::num::NonZeroUsize; - use metrics::{register_int_counter_vec, IntCounter, IntCounterVec}; + use metrics::{IntCounter, IntCounterVec, register_int_counter_vec}; use once_cell::sync::Lazy; #[derive(Clone, Copy)] @@ -491,18 +493,38 @@ static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] +#[derive( + strum_macros::EnumIter, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, +)] #[strum(serialize_all = "kebab_case")] -pub(crate) enum MetricLayerKind { +pub(crate) enum LayerKind { Delta, Image, } +#[derive( + strum_macros::EnumIter, + strum_macros::EnumString, + strum_macros::Display, + strum_macros::IntoStaticStr, +)] +#[strum(serialize_all = "kebab_case")] +pub(crate) enum LayerLevel { + // We don't track the currently open ephemeral layer, since there's always exactly 1 and its + // size changes. See `TIMELINE_EPHEMERAL_BYTES`. + Frozen, + L0, + L1, +} + static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_layer_bytes", - "Sum of layer physical sizes in bytes", - &["tenant_id", "shard_id", "timeline_id", "kind"] + "Sum of frozen, L0, and L1 layer physical sizes in bytes (excluding the open ephemeral layer)", + &["tenant_id", "shard_id", "timeline_id", "level", "kind"] ) .expect("failed to define a metric") }); @@ -510,8 +532,8 @@ static TIMELINE_LAYER_SIZE: Lazy = Lazy::new(|| { static TIMELINE_LAYER_COUNT: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_layer_count", - "Number of layers that exist", - &["tenant_id", "shard_id", "timeline_id", "kind"] + "Number of frozen, L0, and L1 layers (excluding the open ephemeral layer)", + &["tenant_id", "shard_id", "timeline_id", "level", "kind"] ) .expect("failed to define a metric") }); @@ -699,7 +721,7 @@ pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy = Lazy::new(|| { }); pub(crate) mod initial_logical_size { - use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; + use metrics::{IntCounter, IntCounterVec, register_int_counter, register_int_counter_vec}; use once_cell::sync::Lazy; pub(crate) struct StartCalculation(IntCounterVec); @@ -1082,12 +1104,17 @@ impl EvictionsWithLowResidenceDuration { // - future "drop panick => abort" // // so just nag: (the error has the labels) - tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}"); + tracing::warn!( + "failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}" + ); } Ok(()) => { // to help identify cases where we double-remove the same values, let's log all // deletions? 
- tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source); + tracing::info!( + "removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", + self.data_source + ); } } } @@ -1225,115 +1252,258 @@ pub(crate) mod virtual_file_io_engine { pub(crate) struct SmgrOpTimer(Option); pub(crate) struct SmgrOpTimerInner { - global_latency_histo: Histogram, + global_execution_latency_histo: Histogram, + per_timeline_execution_latency_histo: Option, - // Optional because not all op types are tracked per-timeline - per_timeline_latency_histo: Option, + global_batch_wait_time: Histogram, + per_timeline_batch_wait_time: Histogram, global_flush_in_progress_micros: IntCounter, per_timeline_flush_in_progress_micros: IntCounter, - start: Instant, - throttled: Duration, - op: SmgrQueryType, + throttling: Arc, + + timings: SmgrOpTimerState, } -pub(crate) struct SmgrOpFlushInProgress { - base: Instant, - global_micros: IntCounter, - per_timeline_micros: IntCounter, +/// The stages of request processing are represented by the enum variants. +/// Used as part of [`SmgrOpTimerInner::timings`]. +/// +/// Request processing calls into the `SmgrOpTimer::observe_*` methods at the +/// transition points. +/// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`] +/// to the next state. +/// +/// Each request goes through every stage, in all configurations. +/// +#[derive(Debug)] +enum SmgrOpTimerState { + Received { + // In the future, we may want to track the full time the request spent + // inside pageserver process (time spent in kernel buffers can't be tracked). + // `received_at` would be used for that. + #[allow(dead_code)] + received_at: Instant, + }, + Throttling { + throttle_started_at: Instant, + }, + Batching { + throttle_done_at: Instant, + }, + Executing { + execution_started_at: Instant, + }, + Flushing, + // NB: when adding observation points, remember to update the Drop impl. } +// NB: when adding observation points, remember to update the Drop impl. impl SmgrOpTimer { - pub(crate) fn deduct_throttle(&mut self, throttle: &Option) { - let Some(throttle) = throttle else { + /// See [`SmgrOpTimerState`] for more context. + pub(crate) fn observe_throttle_start(&mut self, at: Instant) { + let Some(inner) = self.0.as_mut() else { return; }; - let inner = self.0.as_mut().expect("other public methods consume self"); - inner.throttled += *throttle; + let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else { + return; + }; + inner.throttling.count_accounted_start.inc(); + inner.timings = SmgrOpTimerState::Throttling { + throttle_started_at: at, + }; } - pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress { - let (flush_start, inner) = self - .smgr_op_end() - .expect("this method consume self, and the only other caller is drop handler"); + /// See [`SmgrOpTimerState`] for more context. 
+ pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) { + let Some(inner) = self.0.as_mut() else { + return; + }; + let SmgrOpTimerState::Throttling { + throttle_started_at, + } = &inner.timings + else { + return; + }; + inner.throttling.count_accounted_finish.inc(); + match throttle { + ThrottleResult::NotThrottled { end } => { + inner.timings = SmgrOpTimerState::Batching { + throttle_done_at: end, + }; + } + ThrottleResult::Throttled { end } => { + // update metrics + inner.throttling.count_throttled.inc(); + inner + .throttling + .wait_time + .inc_by((end - *throttle_started_at).as_micros().try_into().unwrap()); + // state transition + inner.timings = SmgrOpTimerState::Batching { + throttle_done_at: end, + }; + } + } + } + + /// See [`SmgrOpTimerState`] for more context. + pub(crate) fn observe_execution_start(&mut self, at: Instant) { + let Some(inner) = self.0.as_mut() else { + return; + }; + let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else { + return; + }; + // update metrics + let batch = at - *throttle_done_at; + inner.global_batch_wait_time.observe(batch.as_secs_f64()); + inner + .per_timeline_batch_wait_time + .observe(batch.as_secs_f64()); + // state transition + inner.timings = SmgrOpTimerState::Executing { + execution_started_at: at, + } + } + + /// For all but the first caller, this is a no-op. + /// The first caller receives Some, subsequent ones None. + /// + /// See [`SmgrOpTimerState`] for more context. + pub(crate) fn observe_execution_end(&mut self, at: Instant) -> Option { + // NB: unlike the other observe_* methods, this one take()s. + #[allow(clippy::question_mark)] // maintain similar code pattern. + let Some(mut inner) = self.0.take() else { + return None; + }; + let SmgrOpTimerState::Executing { + execution_started_at, + } = &inner.timings + else { + return None; + }; + // update metrics + let execution = at - *execution_started_at; + inner + .global_execution_latency_histo + .observe(execution.as_secs_f64()); + if let Some(per_timeline_execution_latency_histo) = + &inner.per_timeline_execution_latency_histo + { + per_timeline_execution_latency_histo.observe(execution.as_secs_f64()); + } + + // state transition + inner.timings = SmgrOpTimerState::Flushing; + + // return the flush in progress object which + // will do the remaining metrics updates let SmgrOpTimerInner { global_flush_in_progress_micros, per_timeline_flush_in_progress_micros, .. } = inner; - SmgrOpFlushInProgress { - base: flush_start, + Some(SmgrOpFlushInProgress { global_micros: global_flush_in_progress_micros, per_timeline_micros: per_timeline_flush_in_progress_micros, - } + }) } +} - /// Returns `None`` if this method has already been called, `Some` otherwise.
- fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> { - let inner = self.0.take()?; - - let now = Instant::now(); - let elapsed = now - inner.start; - - let elapsed = match elapsed.checked_sub(inner.throttled) { - Some(elapsed) => elapsed, - None => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy>> = - Lazy::new(|| { - Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { - RateLimit::new(Duration::from_secs(10)) - }))) - }); - let mut guard = LOGGED.lock().unwrap(); - let rate_limit = &mut guard[inner.op]; - rate_limit.call(|| { - warn!(op=?inner.op, ?elapsed, ?inner.throttled, "implementation error: time spent throttled exceeds total request wall clock time"); - }); - elapsed // un-throttled time, more info than just saturating to 0 - } - }; - - let elapsed = elapsed.as_secs_f64(); - - inner.global_latency_histo.observe(elapsed); - if let Some(per_timeline_getpage_histo) = &inner.per_timeline_latency_histo { - per_timeline_getpage_histo.observe(elapsed); - } - - Some((now, inner)) - } +/// The last stage of request processing is serializing and flushing the request +/// into the TCP connection. We want to make slow flushes observable +/// _while they are occurring_, so this struct provides a wrapper method [`Self::measure`] +/// to periodically bump the metric. +/// +/// If in the future we decide that we're not interested in live updates, we can +/// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there, +/// and remove this struct from the code base. +pub(crate) struct SmgrOpFlushInProgress { + global_micros: IntCounter, + per_timeline_micros: IntCounter, } impl Drop for SmgrOpTimer { fn drop(&mut self) { - self.smgr_op_end(); + // In case of early drop, update any of the remaining metrics with + // observations so that (started,finished) counter pairs balance out + // and all counters on the latency path have the same number of + // observations. + // It's technically lying and it would be better if each metric had + // a separate label or similar for cancelled requests. + // But we don't have that right now and counter pairs balancing + // out is useful when using the metrics in panels and whatnot. + let now = Instant::now(); + self.observe_throttle_start(now); + self.observe_throttle_done(ThrottleResult::NotThrottled { end: now }); + self.observe_execution_start(now); + let maybe_flush_timer = self.observe_execution_end(now); + drop(maybe_flush_timer); } } impl SmgrOpFlushInProgress { - pub(crate) async fn measure(mut self, mut fut: Fut) -> O + /// The caller must guarantee that `socket_fd` outlives this function. + pub(crate) async fn measure( + self, + started_at: Instant, + mut fut: Fut, + socket_fd: RawFd, + ) -> O where Fut: std::future::Future, { let mut fut = std::pin::pin!(fut); - let now = Instant::now(); - // Whenever observe_guard gets called, or dropped, - // it adds the time elapsed since its last call to metrics. - // Last call is tracked in `now`.
+ let mut logged = false; + let mut last_counter_increment_at = started_at; let mut observe_guard = scopeguard::guard( - || { - let elapsed = now - self.base; - self.global_micros - .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - self.per_timeline_micros - .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); - self.base = now; + |is_timeout| { + let now = Instant::now(); + + // Increment counter + { + let elapsed_since_last_observe = now - last_counter_increment_at; + self.global_micros + .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap()); + self.per_timeline_micros + .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap()); + last_counter_increment_at = now; + } + + // Log something on every timeout, and on completion but only if we hit a timeout. + if is_timeout || logged { + logged = true; + let elapsed_total = now - started_at; + let msg = if is_timeout { + "slow flush ongoing" + } else { + "slow flush completed or cancelled" + }; + + let (inq, outq) = { + // SAFETY: caller guarantees that `socket_fd` outlives this function. + #[cfg(target_os = "linux")] + unsafe { + ( + utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2), + utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2), + ) + } + #[cfg(not(target_os = "linux"))] + { + _ = socket_fd; // appease unused lint on macOS + (-1, -1) + } + }; + + let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64()); + tracing::info!(elapsed_total_secs, inq, outq, msg); + } }, |mut observe| { - observe(); + observe(false); }, ); @@ -1341,7 +1511,7 @@ impl SmgrOpFlushInProgress { match tokio::time::timeout(Duration::from_secs(10), &mut fut).await { Ok(v) => return v, Err(_timeout) => { - (*observe_guard)(); + (*observe_guard)(true); } } } @@ -1365,9 +1535,10 @@ pub enum SmgrQueryType { GetPageAtLsn, GetDbSize, GetSlruSegment, + #[cfg(feature = "testing")] + Test, } -#[derive(Debug)] pub(crate) struct SmgrQueryTimePerTimeline { global_started: [IntCounter; SmgrQueryType::COUNT], global_latency: [Histogram; SmgrQueryType::COUNT], @@ -1377,6 +1548,9 @@ pub(crate) struct SmgrQueryTimePerTimeline { per_timeline_batch_size: Histogram, global_flush_in_progress_micros: IntCounter, per_timeline_flush_in_progress_micros: IntCounter, + global_batch_wait_time: Histogram, + per_timeline_batch_wait_time: Histogram, + throttling: Arc, } static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { @@ -1399,12 +1573,15 @@ static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy = Lazy::new(| .expect("failed to define a metric") }); +// Alias so all histograms recording per-timeline smgr timings use the same buckets. 
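The flush-side accounting above hinges on one pattern: rather than awaiting the flush future directly, `measure` polls it through repeated `tokio::time::timeout` calls so it can bump counters and log while the flush is still in progress. A stripped-down sketch of that pattern, with a plain `AtomicU64` in place of the prometheus counters and without the Linux socket-queue introspection (the function and counter names here are made up):

```rust
// Sketch of "observe progress while the operation is still running"; assumed names.
use std::future::Future;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, Instant};

static BUSY_MICROS: AtomicU64 = AtomicU64::new(0);

async fn measure<Fut, O>(fut: Fut, tick: Duration) -> O
where
    Fut: Future<Output = O>,
{
    let mut fut = std::pin::pin!(fut);
    let mut last_observed_at = Instant::now();
    // Accounts time since the last observation into the counter.
    let mut observe = move || {
        let now = Instant::now();
        let elapsed = now - last_observed_at;
        BUSY_MICROS.fetch_add(elapsed.as_micros() as u64, Ordering::Relaxed);
        last_observed_at = now;
    };
    loop {
        match tokio::time::timeout(tick, &mut fut).await {
            Ok(v) => {
                observe(); // final increment on completion
                return v;
            }
            Err(_elapsed) => observe(), // still running: make partial progress visible
        }
    }
}

#[tokio::main]
async fn main() {
    measure(
        tokio::time::sleep(Duration::from_millis(250)),
        Duration::from_millis(100),
    )
    .await;
    println!("busy micros: {}", BUSY_MICROS.load(Ordering::Relaxed));
}
```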
+static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS; + static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds", - "Time spent on smgr query handling, aggegated by query type and tenant/timeline.", + "Time spent _executing_ smgr query handling, excluding batch and throttle delays.", &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"], - CRITICAL_OP_BUCKETS.into(), + SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(), ) .expect("failed to define a metric") }); @@ -1462,7 +1639,7 @@ static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy> = Lazy::new(|| { static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds_global", - "Time spent on smgr query handling, aggregated by query type.", + "Like pageserver_smgr_query_seconds, but aggregated to instance level.", &["smgr_query_type"], SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(), ) @@ -1559,8 +1736,31 @@ static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy = Lazy .expect("failed to define a metric") }); +static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_page_service_pagestream_batch_wait_time_seconds", + "Time a request spent waiting in its batch until the batch moved to throttle&execution.", + &["tenant_id", "shard_id", "timeline_id"], + SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(), + ) + .expect("failed to define a metric") +}); + +static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_page_service_pagestream_batch_wait_time_seconds_global", + "Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.", + SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(), + ) + .expect("failed to define a metric") +}); + impl SmgrQueryTimePerTimeline { - pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { + pub(crate) fn new( + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + pagestream_throttle_metrics: Arc, + ) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); @@ -1599,6 +1799,11 @@ impl SmgrQueryTimePerTimeline { .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) .unwrap(); + let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone(); + let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME + .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) + .unwrap(); + let global_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone(); let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS @@ -1614,9 +1819,12 @@ impl SmgrQueryTimePerTimeline { per_timeline_batch_size, global_flush_in_progress_micros, per_timeline_flush_in_progress_micros, + global_batch_wait_time, + per_timeline_batch_wait_time, + throttling: pagestream_throttle_metrics, } } - pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer { + pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer { self.global_started[op as usize].inc(); let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) { @@ -1627,90 +1835,26 @@ impl SmgrQueryTimePerTimeline { }; SmgrOpTimer(Some(SmgrOpTimerInner { - global_latency_histo: self.global_latency[op 
as usize].clone(), - per_timeline_latency_histo, - start: started_at, - op, - throttled: Duration::ZERO, + global_execution_latency_histo: self.global_latency[op as usize].clone(), + per_timeline_execution_latency_histo: per_timeline_latency_histo, global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(), per_timeline_flush_in_progress_micros: self .per_timeline_flush_in_progress_micros .clone(), + global_batch_wait_time: self.global_batch_wait_time.clone(), + per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(), + throttling: self.throttling.clone(), + timings: SmgrOpTimerState::Received { received_at }, })) } + /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) { self.global_batch_size.observe(batch_size as f64); self.per_timeline_batch_size.observe(batch_size as f64); } } -#[cfg(test)] -mod smgr_query_time_tests { - use std::time::Instant; - - use pageserver_api::shard::TenantShardId; - use strum::IntoEnumIterator; - use utils::id::{TenantId, TimelineId}; - - // Regression test, we used hard-coded string constants before using an enum. - #[test] - fn op_label_name() { - use super::SmgrQueryType::*; - let expect: [(super::SmgrQueryType, &'static str); 5] = [ - (GetRelExists, "get_rel_exists"), - (GetRelSize, "get_rel_size"), - (GetPageAtLsn, "get_page_at_lsn"), - (GetDbSize, "get_db_size"), - (GetSlruSegment, "get_slru_segment"), - ]; - for (op, expect) in expect { - let actual: &'static str = op.into(); - assert_eq!(actual, expect); - } - } - - #[test] - fn basic() { - let ops: Vec<_> = super::SmgrQueryType::iter().collect(); - - for op in &ops { - let tenant_id = TenantId::generate(); - let timeline_id = TimelineId::generate(); - let metrics = super::SmgrQueryTimePerTimeline::new( - &TenantShardId::unsharded(tenant_id), - &timeline_id, - ); - - let get_counts = || { - let global: u64 = ops - .iter() - .map(|op| metrics.global_latency[*op as usize].get_sample_count()) - .sum(); - ( - global, - metrics.per_timeline_getpage_latency.get_sample_count(), - ) - }; - - let (pre_global, pre_per_tenant_timeline) = get_counts(); - assert_eq!(pre_per_tenant_timeline, 0); - - let timer = metrics.start_smgr_op(*op, Instant::now()); - drop(timer); - - let (post_global, post_per_tenant_timeline) = get_counts(); - if matches!(op, super::SmgrQueryType::GetPageAtLsn) { - // getpage ops are tracked per-timeline, others aren't - assert_eq!(post_per_tenant_timeline, 1); - } else { - assert_eq!(post_per_tenant_timeline, 0); - } - assert!(post_global > pre_global); - } - } -} - // keep in sync with control plane Go code so that we can validate // compute's basebackup_ms metric with our perspective in the context of SLI/SLO. 
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { @@ -1788,6 +1932,7 @@ pub(crate) static LIVE_CONNECTIONS: Lazy = Lazy::new(|| { #[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)] pub(crate) enum ComputeCommandKind { + PageStreamV3, PageStreamV2, Basebackup, Fullbackup, @@ -1808,7 +1953,7 @@ pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy = Lazy ComputeCommandCounters { map: EnumMap::from_array(std::array::from_fn(|i| { - let command = ::from_usize(i); + let command = ComputeCommandKind::from_usize(i); let command_str: &'static str = command.into(); inner.with_label_values(&[command_str]) })), @@ -2108,11 +2253,13 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { pub struct BackgroundLoopSemaphoreMetrics { counters: EnumMap, - durations: EnumMap, + durations: EnumMap, + waiting_tasks: EnumMap, + running_tasks: EnumMap, } -pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = Lazy::new( - || { +pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy = + Lazy::new(|| { let counters = register_int_counter_pair_vec!( "pageserver_background_loop_semaphore_wait_start_count", "Counter for background loop concurrency-limiting semaphore acquire calls started", @@ -2122,45 +2269,101 @@ pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy::from_usize(i); + counters: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); counters.with_label_values(&[kind.into()]) })), - durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| { - let kind = ::from_usize(i); + durations: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); durations.with_label_values(&[kind.into()]) })), + waiting_tasks: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); + waiting_tasks.with_label_values(&[kind.into()]) + })), + running_tasks: EnumMap::from_array(std::array::from_fn(|i| { + let kind = BackgroundLoopKind::from_usize(i); + running_tasks.with_label_values(&[kind.into()]) + })), } - }, -); + }); impl BackgroundLoopSemaphoreMetrics { - pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ { - struct Record<'a> { - metrics: &'a BackgroundLoopSemaphoreMetrics, - task: BackgroundLoopKind, - _counter_guard: metrics::IntCounterPairGuard, - start: Instant, - } - impl Drop for Record<'_> { - fn drop(&mut self) { - let elapsed = self.start.elapsed().as_secs_f64(); - self.metrics.durations[self.task].inc_by(elapsed); - } - } - Record { - metrics: self, + /// Starts recording semaphore metrics. Call `acquired()` on the returned recorder when the + /// semaphore is acquired, and drop it when the task completes or is cancelled. + pub(crate) fn record( + &self, + task: BackgroundLoopKind, + ) -> BackgroundLoopSemaphoreMetricsRecorder { + BackgroundLoopSemaphoreMetricsRecorder::start(self, task) + } +} + +/// Records metrics for a background task. +pub struct BackgroundLoopSemaphoreMetricsRecorder<'a> { + metrics: &'a BackgroundLoopSemaphoreMetrics, + task: BackgroundLoopKind, + start: Instant, + wait_counter_guard: Option, +} + +impl<'a> BackgroundLoopSemaphoreMetricsRecorder<'a> { + /// Starts recording semaphore metrics, by recording wait time and incrementing + /// `wait_start_count` and `waiting_tasks`. 
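The recorder introduced for these semaphore metrics keeps the waiting/running gauges balanced even if a task is cancelled while still queued: `acquired()` consumes the wait guard, and `Drop` decrements whichever gauge is still outstanding. A minimal stand-alone sketch of that idea, using hypothetical names and `AtomicI64`s in place of the prometheus gauges:

```rust
// Hypothetical miniature of the semaphore metrics recorder; not the real code.
use std::sync::atomic::{AtomicI64, Ordering};
use std::time::{Duration, Instant};
use tokio::sync::Semaphore;

#[derive(Default)]
struct Gauges {
    waiting: AtomicI64,
    running: AtomicI64,
}

struct Recorder<'a> {
    gauges: &'a Gauges,
    started: Instant,
    acquired: bool,
}

impl<'a> Recorder<'a> {
    fn start(gauges: &'a Gauges) -> Self {
        gauges.waiting.fetch_add(1, Ordering::Relaxed);
        Self { gauges, started: Instant::now(), acquired: false }
    }
    /// Call once the semaphore permit has been obtained; returns the wait time.
    fn acquired(&mut self) -> Duration {
        self.acquired = true;
        self.gauges.waiting.fetch_sub(1, Ordering::Relaxed);
        self.gauges.running.fetch_add(1, Ordering::Relaxed);
        self.started.elapsed()
    }
}

impl Drop for Recorder<'_> {
    // Whether the task finished or was cancelled mid-wait, the gauges balance out.
    fn drop(&mut self) {
        if self.acquired {
            self.gauges.running.fetch_sub(1, Ordering::Relaxed);
        } else {
            self.gauges.waiting.fetch_sub(1, Ordering::Relaxed);
        }
    }
}

#[tokio::main]
async fn main() {
    let gauges = Gauges::default();
    let semaphore = Semaphore::new(2);

    let mut recorder = Recorder::start(&gauges);
    let _permit = semaphore.acquire().await.unwrap();
    let waited = recorder.acquired();
    println!("waited {:?} for a permit", waited);
    // `recorder` and `_permit` drop here, decrementing `running` and releasing the permit.
}
```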
+ fn start(metrics: &'a BackgroundLoopSemaphoreMetrics, task: BackgroundLoopKind) -> Self { + metrics.waiting_tasks[task].inc(); + Self { + metrics, task, - _counter_guard: self.counters[task].guard(), start: Instant::now(), + wait_counter_guard: Some(metrics.counters[task].guard()), + } + } + + /// Signals that the semaphore has been acquired, and updates relevant metrics. + pub fn acquired(&mut self) -> Duration { + let waited = self.start.elapsed(); + self.wait_counter_guard.take().expect("already acquired"); + self.metrics.durations[self.task].observe(waited.as_secs_f64()); + self.metrics.waiting_tasks[self.task].dec(); + self.metrics.running_tasks[self.task].inc(); + waited + } +} + +impl Drop for BackgroundLoopSemaphoreMetricsRecorder<'_> { + /// The task either completed or was cancelled. + fn drop(&mut self) { + if self.wait_counter_guard.take().is_some() { + // Waiting. + self.metrics.durations[self.task].observe(self.start.elapsed().as_secs_f64()); + self.metrics.waiting_tasks[self.task].dec(); + } else { + // Running. + self.metrics.running_tasks[self.task].dec(); } } } @@ -2271,13 +2474,44 @@ macro_rules! redo_bytes_histogram_count_buckets { pub(crate) struct WalIngestMetrics { pub(crate) bytes_received: IntCounter, pub(crate) records_received: IntCounter, + pub(crate) records_observed: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, + pub(crate) values_committed_metadata_images: IntCounter, + pub(crate) values_committed_metadata_deltas: IntCounter, + pub(crate) values_committed_data_images: IntCounter, + pub(crate) values_committed_data_deltas: IntCounter, pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, - pub(crate) clear_vm_bits_unknown: IntCounterVec, } -pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { +impl WalIngestMetrics { + pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) { + if stats.metadata_images > 0 { + self.values_committed_metadata_images + .inc_by(stats.metadata_images); + } + if stats.metadata_deltas > 0 { + self.values_committed_metadata_deltas + .inc_by(stats.metadata_deltas); + } + if stats.data_images > 0 { + self.values_committed_data_images.inc_by(stats.data_images); + } + if stats.data_deltas > 0 { + self.values_committed_data_deltas.inc_by(stats.data_deltas); + } + } +} + +pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| { + let values_committed = register_int_counter_vec!( + "pageserver_wal_ingest_values_committed", + "Number of values committed to pageserver storage from WAL records", + &["class", "kind"], + ) + .expect("failed to define a metric"); + + WalIngestMetrics { bytes_received: register_int_counter!( "pageserver_wal_ingest_bytes_received", "Bytes of WAL ingested from safekeepers", @@ -2288,6 +2522,11 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet "Number of WAL records received from safekeepers" ) .expect("failed to define a metric"), + records_observed: register_int_counter!( + "pageserver_wal_ingest_records_observed", + "Number of WAL records observed from safekeepers. These are metadata only records for shard 0." 
+ ) + .expect("failed to define a metric"), records_committed: register_int_counter!( "pageserver_wal_ingest_records_committed", "Number of WAL records which resulted in writes to pageserver storage" @@ -2298,17 +2537,16 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet "Number of WAL records filtered out due to sharding" ) .expect("failed to define a metric"), + values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]), + values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]), + values_committed_data_images: values_committed.with_label_values(&["data", "image"]), + values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]), gap_blocks_zeroed_on_rel_extend: register_int_counter!( "pageserver_gap_blocks_zeroed_on_rel_extend", "Total number of zero gap blocks written on relation extends" ) .expect("failed to define a metric"), - clear_vm_bits_unknown: register_int_counter_vec!( - "pageserver_wal_ingest_clear_vm_bits_unknown", - "Number of ignored ClearVmBits operations due to unknown pages/relations", - &["entity"], - ) - .expect("failed to define a metric"), +} }); pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy = Lazy::new(|| { @@ -2374,7 +2612,7 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy = pub(crate) struct WalRedoProcessCounters { pub(crate) started: IntCounter, - pub(crate) killed_by_cause: enum_map::EnumMap, + pub(crate) killed_by_cause: EnumMap, pub(crate) active_stderr_logger_tasks_started: IntCounter, pub(crate) active_stderr_logger_tasks_finished: IntCounter, } @@ -2416,7 +2654,7 @@ impl Default for WalRedoProcessCounters { Self { started, killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| { - let cause = ::from_usize(i); + let cause = WalRedoKillCause::from_usize(i); let cause_str: &'static str = cause.into(); killed.with_label_values(&[cause_str]) })), @@ -2443,12 +2681,19 @@ impl StorageTimeMetricsTimer { } } - /// Record the time from creation to now. - pub fn stop_and_record(self) { - let duration = self.start.elapsed().as_secs_f64(); - self.metrics.timeline_sum.inc_by(duration); + /// Returns the elapsed duration of the timer. + pub fn elapsed(&self) -> Duration { + self.start.elapsed() + } + + /// Record the time from creation to now and return it. + pub fn stop_and_record(self) -> Duration { + let duration = self.elapsed(); + let seconds = duration.as_secs_f64(); + self.metrics.timeline_sum.inc_by(seconds); self.metrics.timeline_count.inc(); - self.metrics.global_histogram.observe(duration); + self.metrics.global_histogram.observe(seconds); + duration } /// Turns this timer into a timer, which will always record -- usually this means recording @@ -2468,6 +2713,13 @@ impl Drop for AlwaysRecordingStorageTimeMetricsTimer { } } +impl AlwaysRecordingStorageTimeMetricsTimer { + /// Returns the elapsed duration of the timer. + pub fn elapsed(&self) -> Duration { + self.0.as_ref().expect("not dropped yet").elapsed() + } +} + /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and /// timeline total sum and count. 
#[derive(Clone, Debug)] @@ -2520,6 +2772,7 @@ pub(crate) struct TimelineMetrics { shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, + pub flush_delay_histo: StorageTimeMetrics, pub flush_wait_upload_time_gauge: Gauge, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, @@ -2532,10 +2785,7 @@ pub(crate) struct TimelineMetrics { pub disk_consistent_lsn_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, - pub(crate) layer_size_image: UIntGauge, - pub(crate) layer_count_image: UIntGauge, - pub(crate) layer_size_delta: UIntGauge, - pub(crate) layer_count_delta: UIntGauge, + pub layers_per_read: Histogram, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, pub visible_physical_size_gauge: UIntGauge, @@ -2566,6 +2816,12 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let flush_delay_histo = StorageTimeMetrics::new( + StorageTimeOperation::LayerFlushDelay, + &tenant_id, + &shard_id, + &timeline_id, + ); let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2627,40 +2883,8 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); - let layer_size_image = TIMELINE_LAYER_SIZE - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Image.into(), - ]) - .unwrap(); - - let layer_count_image = TIMELINE_LAYER_COUNT - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Image.into(), - ]) - .unwrap(); - - let layer_size_delta = TIMELINE_LAYER_SIZE - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Delta.into(), - ]) - .unwrap(); - - let layer_count_delta = TIMELINE_LAYER_COUNT - .get_metric_with_label_values(&[ - &tenant_id, - &shard_id, - &timeline_id, - MetricLayerKind::Delta.into(), - ]) + let layers_per_read = LAYERS_PER_READ + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let standby_horizon_gauge = STANDBY_HORIZON @@ -2714,6 +2938,7 @@ impl TimelineMetrics { shard_id, timeline_id, flush_time_histo, + flush_delay_histo, flush_wait_upload_time_gauge, compact_time_histo, create_images_time_histo, @@ -2726,10 +2951,7 @@ impl TimelineMetrics { disk_consistent_lsn_gauge, pitr_history_size, archival_size, - layer_size_image, - layer_count_image, - layer_size_delta, - layer_count_delta, + layers_per_read, standby_horizon_gauge, resident_physical_size_gauge, visible_physical_size_gauge, @@ -2772,6 +2994,92 @@ impl TimelineMetrics { .add(duration); } + /// Generates TIMELINE_LAYER labels for a persistent layer. + fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] { + let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) { + true => LayerLevel::L0, + false => LayerLevel::L1, + }; + let kind = match layer_desc.is_delta() { + true => LayerKind::Delta, + false => LayerKind::Image, + }; + [ + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + level.into(), + kind.into(), + ] + } + + /// Generates TIMELINE_LAYER labels for a frozen ephemeral layer. 
+ fn make_frozen_layer_labels(&self, _layer: &InMemoryLayer) -> [&str; 5] { + [ + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + LayerLevel::Frozen.into(), + LayerKind::Delta.into(), // by definition + ] + } + + /// Removes a frozen ephemeral layer from TIMELINE_LAYER metrics. + pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) { + assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); + let labels = self.make_frozen_layer_labels(layer); + let size = layer.try_len().expect("frozen layer should have no writer"); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .dec(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .sub(size); + } + + /// Adds a frozen ephemeral layer to TIMELINE_LAYER metrics. + pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) { + assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); + let labels = self.make_frozen_layer_labels(layer); + let size = layer.try_len().expect("frozen layer should have no writer"); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .inc(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .add(size); + } + + /// Removes a persistent layer from TIMELINE_LAYER metrics. + pub fn dec_layer(&self, layer_desc: &PersistentLayerDesc) { + let labels = self.make_layer_labels(layer_desc); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .dec(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .sub(layer_desc.file_size); + } + + /// Adds a persistent layer to TIMELINE_LAYER metrics. + pub fn inc_layer(&self, layer_desc: &PersistentLayerDesc) { + let labels = self.make_layer_labels(layer_desc); + TIMELINE_LAYER_COUNT + .get_metric_with_label_values(&labels) + .unwrap() + .inc(); + TIMELINE_LAYER_SIZE + .get_metric_with_label_values(&labels) + .unwrap() + .add(layer_desc.file_size); + } + pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2804,30 +3112,16 @@ impl TimelineMetrics { let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); - let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Image.into(), - ]); - let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Image.into(), - ]); - let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Delta.into(), - ]); - let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[ - tenant_id, - shard_id, - timeline_id, - MetricLayerKind::Delta.into(), - ]); + for ref level in LayerLevel::iter() { + for ref kind in LayerKind::iter() { + let labels: [&str; 5] = + [tenant_id, shard_id, timeline_id, level.into(), kind.into()]; + let _ = TIMELINE_LAYER_SIZE.remove_label_values(&labels); + let _ = TIMELINE_LAYER_COUNT.remove_label_values(&labels); + } + } + + let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); @@ -2889,6 +3183,11 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); } } @@ -2904,23 +3203,6 @@ pub(crate) fn 
remove_tenant_metrics(tenant_shard_id: &TenantShardId) { // we leave the BROKEN_TENANTS_SET entry if any } -use futures::Future; -use pin_project_lite::pin_project; -use std::collections::HashMap; -use std::num::NonZeroUsize; -use std::pin::Pin; -use std::sync::atomic::AtomicU64; -use std::sync::{Arc, Mutex}; -use std::task::{Context, Poll}; -use std::time::{Duration, Instant}; - -use crate::config::PageServerConf; -use crate::context::{PageContentKind, RequestContext}; -use crate::task_mgr::TaskKind; -use crate::tenant::mgr::TenantSlot; -use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::Timeline; - /// Maintain a per timeline gauge in addition to the global gauge. pub(crate) struct PerTimelineRemotePhysicalSizeGauge { last_set: AtomicU64, @@ -3296,12 +3578,10 @@ impl>, O, E> Future for MeasuredRemoteOp { } pub mod tokio_epoll_uring { - use std::{ - collections::HashMap, - sync::{Arc, Mutex}, - }; + use std::collections::HashMap; + use std::sync::{Arc, Mutex}; - use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge}; + use metrics::{Histogram, LocalHistogram, UIntGauge, register_histogram, register_int_counter}; use once_cell::sync::Lazy; /// Shared storage for tokio-epoll-uring thread local metrics. @@ -3310,7 +3590,9 @@ pub mod tokio_epoll_uring { let slots_submission_queue_depth = register_histogram!( "pageserver_tokio_epoll_uring_slots_submission_queue_depth", "The slots waiters queue depth of each tokio_epoll_uring system", - vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + vec![ + 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0 + ], ) .expect("failed to define a metric"); ThreadLocalMetricsStorage { @@ -3487,13 +3769,11 @@ pub mod tokio_epoll_uring { } pub(crate) mod tenant_throttling { - use metrics::{register_int_counter_vec, IntCounter}; + use metrics::{IntCounter, register_int_counter_vec}; use once_cell::sync::Lazy; use utils::shard::TenantShardId; - use crate::tenant::{self}; - - struct GlobalAndPerTenantIntCounter { + pub(crate) struct GlobalAndPerTenantIntCounter { global: IntCounter, per_tenant: IntCounter, } @@ -3511,10 +3791,10 @@ pub(crate) mod tenant_throttling { } pub(crate) struct Metrics { - count_accounted_start: GlobalAndPerTenantIntCounter, - count_accounted_finish: GlobalAndPerTenantIntCounter, - wait_time: GlobalAndPerTenantIntCounter, - count_throttled: GlobalAndPerTenantIntCounter, + pub(super) count_accounted_start: GlobalAndPerTenantIntCounter, + pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter, + pub(super) wait_time: GlobalAndPerTenantIntCounter, + pub(super) count_throttled: GlobalAndPerTenantIntCounter, } static COUNT_ACCOUNTED_START: Lazy = Lazy::new(|| { @@ -3649,26 +3929,6 @@ pub(crate) mod tenant_throttling { } } } - - impl tenant::throttle::Metric for Metrics { - #[inline(always)] - fn accounting_start(&self) { - self.count_accounted_start.inc(); - } - #[inline(always)] - fn accounting_finish(&self) { - self.count_accounted_finish.inc(); - } - #[inline(always)] - fn observe_throttling( - &self, - tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation, - ) { - let val = u64::try_from(wait_time.as_micros()).unwrap(); - self.wait_time.inc_by(val); - self.count_throttled.inc(); - } - } } pub(crate) mod disk_usage_based_eviction { @@ -3773,6 +4033,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { &REMOTE_ONDEMAND_DOWNLOADED_BYTES, &CIRCUIT_BREAKERS_BROKEN, &CIRCUIT_BREAKERS_UNBROKEN, + 
&PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL, ] .into_iter() .for_each(|c| { @@ -3812,14 +4073,15 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ - &READ_NUM_LAYERS_VISITED, - &VEC_READ_NUM_LAYERS_VISITED, + &LAYERS_PER_READ_GLOBAL, + &DELTAS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, &WAL_REDO_BYTES_HISTOGRAM, &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, &PAGE_SERVICE_BATCH_SIZE_GLOBAL, + &PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL, ] .into_iter() .for_each(|h| { @@ -3827,7 +4089,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { }); // Custom - Lazy::force(&RECONSTRUCT_TIME); Lazy::force(&BASEBACKUP_QUERY_TIME); Lazy::force(&COMPUTE_COMMANDS_COUNTERS); Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE); diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 45bf02362a..984dd125a9 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -67,23 +67,18 @@ //! mapping is automatically removed and the slot is marked free. //! -use std::{ - collections::{hash_map::Entry, HashMap}, - sync::{ - atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering}, - Arc, Weak, - }, - time::Duration, -}; +use std::collections::HashMap; +use std::collections::hash_map::Entry; +use std::sync::atomic::{AtomicU8, AtomicU64, AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::time::Duration; use anyhow::Context; use once_cell::sync::OnceCell; -use crate::{ - context::RequestContext, - metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics}, - virtual_file::{IoBufferMut, IoPageSlice}, -}; +use crate::context::RequestContext; +use crate::metrics::{PageCacheSizeMetrics, page_cache_eviction_metrics}; +use crate::virtual_file::{IoBufferMut, IoPageSlice}; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 50; @@ -168,11 +163,7 @@ impl Slot { let count_res = self.usage_count .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| { - if val == 0 { - None - } else { - Some(val - 1) - } + if val == 0 { None } else { Some(val - 1) } }); match count_res { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 97d94bbe7f..8972515163 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1,7 +1,15 @@ //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. 
-use anyhow::{bail, Context}; +use std::borrow::Cow; +use std::num::NonZeroUsize; +use std::os::fd::AsRawFd; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime}; +use std::{io, str}; + +use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; use futures::FutureExt; @@ -11,65 +19,57 @@ use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedExecutionStrategy, }; -use pageserver_api::models::{self, TenantState}; +use pageserver_api::key::rel_block_to_key; use pageserver_api::models::{ - PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, + self, PageTraceEvent, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse, - PagestreamProtocolVersion, + PagestreamProtocolVersion, PagestreamRequest, TenantState, }; +use pageserver_api::reltag::SlruKind; use pageserver_api::shard::TenantShardId; use postgres_backend::{ - is_expected_io_error, AuthType, PostgresBackend, PostgresBackendReader, QueryError, + AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error, }; +use postgres_ffi::BLCKSZ; +use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use pq_proto::framed::ConnectionError; -use pq_proto::FeStartupPacket; -use pq_proto::{BeMessage, FeMessage, RowDescriptor}; -use std::borrow::Cow; -use std::io; -use std::num::NonZeroUsize; -use std::str; -use std::str::FromStr; -use std::sync::Arc; -use std::time::SystemTime; -use std::time::{Duration, Instant}; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::io::{AsyncWriteExt, BufWriter}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor}; +use strum_macros::IntoStaticStr; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::auth::{Claims, Scope, SwappableJwtAuth}; +use utils::failpoint_support; +use utils::id::{TenantId, TimelineId}; +use utils::logging::log_slow; +use utils::lsn::Lsn; +use utils::simple_rcu::RcuReadGuard; +use utils::sync::gate::{Gate, GateGuard}; use utils::sync::spsc_fold; -use utils::{ - auth::{Claims, Scope, SwappableJwtAuth}, - id::{TenantId, TimelineId}, - lsn::Lsn, - simple_rcu::RcuReadGuard, -}; use crate::auth::check_permission; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::{self, SmgrOpTimer}; -use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; +use crate::metrics::{ + self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer, +}; use crate::pgdatadir_mapping::Version; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; -use crate::task_mgr::TaskKind; -use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME}; -use crate::tenant::mgr::ShardSelector; -use crate::tenant::mgr::TenantManager; -use crate::tenant::mgr::{GetActiveTenantError, GetTenantError, ShardResolveResult}; +use crate::span::{ + debug_assert_current_span_has_tenant_and_timeline_id, + 
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, +}; +use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind}; +use crate::tenant::mgr::{ + GetActiveTenantError, GetTenantError, ShardResolveResult, ShardSelector, TenantManager, +}; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::{self, WaitLsnError}; -use crate::tenant::GetTimelineError; -use crate::tenant::PageReconstructError; -use crate::tenant::Timeline; +use crate::tenant::{GetTimelineError, PageReconstructError, Timeline}; use crate::{basebackup, timed_after_cancellation}; -use pageserver_api::key::rel_block_to_key; -use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; -use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; -use postgres_ffi::BLCKSZ; /// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which /// is not yet in state [`TenantState::Active`]. @@ -77,6 +77,9 @@ use postgres_ffi::BLCKSZ; /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); +/// Threshold at which to log slow GetPage requests. +const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); + /////////////////////////////////////////////////////////////////////////////// pub struct Listener { @@ -89,6 +92,7 @@ pub struct Listener { pub struct Connections { cancel: CancellationToken, tasks: tokio::task::JoinSet, + gate: Gate, } pub fn spawn( @@ -109,6 +113,7 @@ pub fn spawn( let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error( "libpq listener", libpq_listener_main( + conf, tenant_manager, pg_auth, tcp_listener, @@ -133,11 +138,16 @@ impl Listener { } impl Connections { pub(crate) async fn shutdown(self) { - let Self { cancel, mut tasks } = self; + let Self { + cancel, + mut tasks, + gate, + } = self; cancel.cancel(); while let Some(res) = tasks.join_next().await { Self::handle_connection_completion(res); } + gate.close().await; } fn handle_connection_completion(res: Result, tokio::task::JoinError>) { @@ -157,7 +167,9 @@ impl Connections { /// Returns Ok(()) upon cancellation via `cancel`, returning the set of /// open connections. /// +#[allow(clippy::too_many_arguments)] pub async fn libpq_listener_main( + conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, listener: tokio::net::TcpListener, @@ -167,9 +179,15 @@ pub async fn libpq_listener_main( listener_cancel: CancellationToken, ) -> Connections { let connections_cancel = CancellationToken::new(); + let connections_gate = Gate::default(); let mut connection_handler_tasks = tokio::task::JoinSet::default(); loop { + let gate_guard = match connections_gate.enter() { + Ok(guard) => guard, + Err(_) => break, + }; + let accepted = tokio::select! 
{ biased; _ = listener_cancel.cancelled() => break, @@ -189,6 +207,7 @@ pub async fn libpq_listener_main( let connection_ctx = listener_ctx .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); connection_handler_tasks.spawn(page_service_conn_main( + conf, tenant_manager.clone(), local_auth, socket, @@ -196,6 +215,7 @@ pub async fn libpq_listener_main( pipelining_config.clone(), connection_ctx, connections_cancel.child_token(), + gate_guard, )); } Err(err) => { @@ -210,13 +230,16 @@ pub async fn libpq_listener_main( Connections { cancel: connections_cancel, tasks: connection_handler_tasks, + gate: connections_gate, } } type ConnectionHandlerResult = anyhow::Result<()>; -#[instrument(skip_all, fields(peer_addr))] +#[instrument(skip_all, fields(peer_addr, application_name))] +#[allow(clippy::too_many_arguments)] async fn page_service_conn_main( + conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, socket: tokio::net::TcpStream, @@ -224,6 +247,7 @@ async fn page_service_conn_main( pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, + gate_guard: GateGuard, ) -> ConnectionHandlerResult { let _guard = LIVE_CONNECTIONS .with_label_values(&["page_service"]) @@ -233,6 +257,8 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; + let socket_fd = socket.as_raw_fd(); + let peer_addr = socket.peer_addr().context("get peer address")?; tracing::Span::current().record("peer_addr", field::display(peer_addr)); @@ -273,13 +299,15 @@ async fn page_service_conn_main( // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. let mut conn_handler = PageServerHandler::new( + conf, tenant_manager, auth, pipelining_config, connection_ctx, cancel.clone(), + gate_guard, ); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { @@ -309,6 +337,7 @@ async fn page_service_conn_main( } struct PageServerHandler { + conf: &'static PageServerConf, auth: Option>, claims: Option, @@ -324,6 +353,8 @@ struct PageServerHandler { timeline_handles: Option, pipelining_config: PageServicePipeliningConfig, + + gate_guard: GateGuard, } struct TimelineHandles { @@ -460,7 +491,6 @@ impl timeline::handle::TenantManager for TenantManagerWrappe let timeline = tenant_shard .get_timeline(timeline_id, true) .map_err(GetActiveTimelineError::Timeline)?; - set_tracing_field_shard_id(&timeline); Ok(timeline) } } @@ -537,93 +567,124 @@ impl From for QueryError { } } +#[derive(thiserror::Error, Debug)] +struct BatchedPageStreamError { + req: PagestreamRequest, + err: PageStreamError, +} + +impl std::fmt::Display for BatchedPageStreamError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.err.fmt(f) + } +} + +struct BatchedGetPageRequest { + req: PagestreamGetPageRequest, + timer: SmgrOpTimer, +} + +#[cfg(feature = "testing")] +struct BatchedTestRequest { + req: models::PagestreamTestRequest, + timer: SmgrOpTimer, +} + +/// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum, +/// so that we don't keep the [`Timeline::gate`] open while the batch +/// is being built up inside the [`spsc_fold`] (pagestream pipelining). 
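The connection handling above follows a common tokio shutdown shape: the accept loop stops on cancellation (the `biased` select arm), each handler gets a child token, and shutdown drains the `JoinSet` after cancelling; the `Gate` additionally keeps per-connection guards alive until `close()` returns. A minimal sketch of the accept-loop portion, with a made-up echo handler and without the gate, auth, or page-service specifics:

```rust
// Minimal sketch of the accept-loop shutdown pattern; handler and names are hypothetical.
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::TcpListener;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;

async fn accept_loop(listener: TcpListener, cancel: CancellationToken) -> JoinSet<()> {
    let mut tasks = JoinSet::new();
    loop {
        // `biased` lets the cancellation branch win once it is ready, so we stop
        // accepting new connections promptly on shutdown.
        let accepted = tokio::select! {
            biased;
            _ = cancel.cancelled() => break,
            res = listener.accept() => res,
        };
        match accepted {
            Ok((mut socket, _peer)) => {
                let conn_cancel = cancel.child_token();
                tasks.spawn(async move {
                    let mut buf = [0u8; 1024];
                    loop {
                        // Read until EOF, error, or cancellation; echo back what we read.
                        let n = tokio::select! {
                            _ = conn_cancel.cancelled() => break,
                            res = socket.read(&mut buf) => match res {
                                Ok(0) | Err(_) => break,
                                Ok(n) => n,
                            },
                        };
                        if socket.write_all(&buf[..n]).await.is_err() {
                            break;
                        }
                    }
                });
            }
            Err(err) => {
                eprintln!("accept error: {err}");
                break;
            }
        }
    }
    tasks
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let listener = TcpListener::bind("127.0.0.1:0").await?;
    let cancel = CancellationToken::new();

    let accept = tokio::spawn(accept_loop(listener, cancel.clone()));

    // ... serve traffic, then shut down:
    cancel.cancel();
    let mut tasks = accept.await.expect("accept loop panicked");
    while let Some(res) = tasks.join_next().await {
        if let Err(e) = res {
            eprintln!("connection handler failed: {e}");
        }
    }
    Ok(())
}
```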
+#[derive(IntoStaticStr)] enum BatchedFeMessage { Exists { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamExistsRequest, }, Nblocks { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamNblocksRequest, }, GetPage { span: Span, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, effective_request_lsn: Lsn, - pages: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, + pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, }, DbSize { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamGetSlruSegmentRequest, }, + #[cfg(feature = "testing")] + Test { + span: Span, + shard: timeline::handle::WeakHandle, + requests: Vec, + }, RespondError { span: Span, - error: PageStreamError, + error: BatchedPageStreamError, }, } impl BatchedFeMessage { - async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> { - let (shard, tokens, timers) = match self { - BatchedFeMessage::Exists { shard, timer, .. } - | BatchedFeMessage::Nblocks { shard, timer, .. } - | BatchedFeMessage::DbSize { shard, timer, .. } - | BatchedFeMessage::GetSlruSegment { shard, timer, .. } => { - ( - shard, - // 1 token is probably under-estimating because these - // request handlers typically do several Timeline::get calls. - 1, - itertools::Either::Left(std::iter::once(timer)), - ) + fn as_static_str(&self) -> &'static str { + self.into() + } + + fn observe_execution_start(&mut self, at: Instant) { + match self { + BatchedFeMessage::Exists { timer, .. } + | BatchedFeMessage::Nblocks { timer, .. } + | BatchedFeMessage::DbSize { timer, .. } + | BatchedFeMessage::GetSlruSegment { timer, .. } => { + timer.observe_execution_start(at); } - BatchedFeMessage::GetPage { shard, pages, .. } => ( - shard, - pages.len(), - itertools::Either::Right(pages.iter_mut().map(|(_, _, timer)| timer)), - ), - BatchedFeMessage::RespondError { .. } => return Ok(()), - }; - let throttled = tokio::select! { - throttled = shard.pagestream_throttle.throttle(tokens) => { throttled } - _ = cancel.cancelled() => { - return Err(QueryError::Shutdown); + BatchedFeMessage::GetPage { pages, .. } => { + for page in pages { + page.timer.observe_execution_start(at); + } } - }; - for timer in timers { - timer.deduct_throttle(&throttled); + #[cfg(feature = "testing")] + BatchedFeMessage::Test { requests, .. } => { + for req in requests { + req.timer.observe_execution_start(at); + } + } + BatchedFeMessage::RespondError { .. 
} => {} } - Ok(()) } } impl PageServerHandler { pub fn new( + conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, + gate_guard: GateGuard, ) -> Self { PageServerHandler { + conf, auth, claims: None, connection_ctx, timeline_handles: Some(TimelineHandles::new(tenant_manager)), cancel, pipelining_config, + gate_guard, } } @@ -651,6 +712,7 @@ impl PageServerHandler { ) } + #[allow(clippy::too_many_arguments)] async fn pagestream_read_message( pgb: &mut PostgresBackendReader, tenant_id: TenantId, @@ -658,6 +720,7 @@ impl PageServerHandler { timeline_handles: &mut TimelineHandles, cancel: &CancellationToken, ctx: &RequestContext, + protocol_version: PagestreamProtocolVersion, parent_span: Span, ) -> Result, QueryError> where @@ -692,128 +755,173 @@ impl PageServerHandler { fail::fail_point!("ps::handle-pagerequest-message"); // parse request - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + let neon_fe_msg = + PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; + + // TODO: turn in to async closure once available to avoid repeating received_at + async fn record_op_start_and_throttle( + shard: &timeline::handle::Handle, + op: metrics::SmgrQueryType, + received_at: Instant, + ) -> Result { + // It's important to start the smgr op metric recorder as early as possible + // so that the _started counters are incremented before we do + // any serious waiting, e.g., for throttle, batching, or actual request handling. + let mut timer = shard.query_metrics.start_smgr_op(op, received_at); + let now = Instant::now(); + timer.observe_throttle_start(now); + let throttled = tokio::select! { + res = shard.pagestream_throttle.throttle(1, now) => res, + _ = shard.cancel.cancelled() => return Err(QueryError::Shutdown), + }; + timer.observe_throttle_done(throttled); + Ok(timer) + } let batched_msg = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at); + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetRelExists, + received_at, + ) + .await?; BatchedFeMessage::Exists { span, timer, - shard, + shard: shard.downgrade(), req, } } PagestreamFeMessage::Nblocks(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at); + let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); + let timer = record_op_start_and_throttle( + &shard, + 
metrics::SmgrQueryType::GetRelSize, + received_at, + ) + .await?; BatchedFeMessage::Nblocks { span, timer, - shard, + shard: shard.downgrade(), req, } } PagestreamFeMessage::DbSize(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at); + let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetDbSize, + received_at, + ) + .await?; BatchedFeMessage::DbSize { span, timer, - shard, + shard: shard.downgrade(), req, } } PagestreamFeMessage::GetSlruSegment(req) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); let shard = timeline_handles .get(tenant_id, timeline_id, ShardSelector::Zero) - .instrument(span.clone()) // sets `shard_id` field .await?; - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at); + let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug()); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetSlruSegment, + received_at, + ) + .await?; BatchedFeMessage::GetSlruSegment { span, timer, - shard, + shard: shard.downgrade(), req, } } - PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - request_lsn, - not_modified_since, - rel, - blkno, - }) => { - let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %request_lsn); + PagestreamFeMessage::GetPage(req) => { + // avoid a somewhat costly Span::record() by constructing the entire span in one go. + macro_rules! mkspan { + (before shard routing) => {{ + tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn) + }}; + ($shard_id:expr) => {{ + tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id) + }}; + } macro_rules! respond_error { - ($error:expr) => {{ + ($span:expr, $error:expr) => {{ let error = BatchedFeMessage::RespondError { - span, - error: $error, + span: $span, + error: BatchedPageStreamError { + req: req.hdr, + err: $error, + }, }; Ok(Some(error)) }}; } - let key = rel_block_to_key(rel, blkno); + let key = rel_block_to_key(req.rel, req.blkno); let shard = match timeline_handles .get(tenant_id, timeline_id, ShardSelector::Page(key)) - .instrument(span.clone()) // sets `shard_id` field .await { Ok(tl) => tl, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. 
- // - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. - return respond_error!(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into() - )); - } Err(e) => { - return respond_error!(e.into()); + let span = mkspan!(before shard routing); + match e { + GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_)) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. + return respond_error!( + span, + PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into() + ) + ); + } + e => { + return respond_error!(span, e.into()); + } + } } }; + let span = mkspan!(shard.tenant_shard_id.shard_slug()); - // It's important to start the timer before waiting for the LSN - // so that the _started counters are incremented before we do - // any serious waiting, e.g., for LSNs. - let timer = shard - .query_metrics - .start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at); + let timer = record_op_start_and_throttle( + &shard, + metrics::SmgrQueryType::GetPageAtLsn, + received_at, + ) + .await?; + // We're holding the Handle let effective_request_lsn = match Self::wait_or_get_last_lsn( &shard, - request_lsn, - not_modified_since, - &shard.get_latest_gc_cutoff_lsn(), + req.hdr.request_lsn, + req.hdr.not_modified_since, + &shard.get_applied_gc_cutoff_lsn(), ctx, ) // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait @@ -821,14 +929,29 @@ impl PageServerHandler { { Ok(lsn) => lsn, Err(e) => { - return respond_error!(e); + return respond_error!(span, e); } }; BatchedFeMessage::GetPage { span, - shard, + shard: shard.downgrade(), effective_request_lsn, - pages: smallvec::smallvec![(rel, blkno, timer)], + pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }], + } + } + #[cfg(feature = "testing")] + PagestreamFeMessage::Test(req) => { + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .await?; + let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug()); + let timer = + record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at) + .await?; + BatchedFeMessage::Test { + span, + shard: shard.downgrade(), + requests: vec![BatchedTestRequest { req, timer }], } } }; @@ -856,7 +979,7 @@ impl PageServerHandler { Ok(BatchedFeMessage::GetPage { span: _, shard: accum_shard, - pages: ref mut accum_pages, + pages: accum_pages, effective_request_lsn: accum_lsn, }), BatchedFeMessage::GetPage { @@ -872,9 +995,7 @@ impl PageServerHandler { assert_eq!(accum_pages.len(), max_batch_size.get()); return false; } - if (accum_shard.tenant_shard_id, accum_shard.timeline_id) - != (this_shard.tenant_shard_id, this_shard.timeline_id) - { + if !accum_shard.is_same_handle_as(&this_shard) { 
trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch"); // TODO: we _could_ batch & execute each shard seperately (and in parallel). // But the current logic for keeping responses in order does not support that. @@ -893,6 +1014,44 @@ impl PageServerHandler { accum_pages.extend(this_pages); Ok(()) } + #[cfg(feature = "testing")] + ( + Ok(BatchedFeMessage::Test { + shard: accum_shard, + requests: accum_requests, + .. + }), + BatchedFeMessage::Test { + shard: this_shard, + requests: this_requests, + .. + }, + ) if (|| { + assert!(this_requests.len() == 1); + if accum_requests.len() >= max_batch_size.get() { + trace!(%max_batch_size, "stopping batching because of batch size"); + assert_eq!(accum_requests.len(), max_batch_size.get()); + return false; + } + if !accum_shard.is_same_handle_as(&this_shard) { + trace!("stopping batching because timeline object mismatch"); + // TODO: we _could_ batch & execute each shard seperately (and in parallel). + // But the current logic for keeping responses in order does not support that. + return false; + } + let this_batch_key = this_requests[0].req.batch_key; + let accum_batch_key = accum_requests[0].req.batch_key; + if this_requests[0].req.batch_key != accum_requests[0].req.batch_key { + trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed"); + return false; + } + true + })() => + { + // ok to batch + accum_requests.extend(this_requests); + Ok(()) + } // something batched already but this message is unbatchable (_, this_msg) => { // by default, don't continue batching @@ -906,120 +1065,73 @@ impl PageServerHandler { &mut self, pgb_writer: &mut PostgresBackend, batch: BatchedFeMessage, + io_concurrency: IoConcurrency, cancel: &CancellationToken, + protocol_version: PagestreamProtocolVersion, ctx: &RequestContext, ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - // invoke handler function - let (handler_results, span): ( - Vec>, - _, - ) = match batch { - BatchedFeMessage::Exists { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::exists"); - ( - vec![self - .handle_get_rel_exists_request(&shard, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer))], - span, - ) - } - BatchedFeMessage::Nblocks { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::nblocks"); - ( - vec![self - .handle_get_nblocks_request(&shard, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer))], - span, - ) - } - BatchedFeMessage::GetPage { - span, - shard, - effective_request_lsn, - pages, - } => { - fail::fail_point!("ps::handle-pagerequest-message::getpage"); - ( - { - let npages = pages.len(); - trace!(npages, "handling getpage request"); - let res = self - .handle_get_page_at_lsn_request_batched( - &shard, - effective_request_lsn, - pages, - ctx, - ) - .instrument(span.clone()) - .await; - assert_eq!(res.len(), npages); - res - }, - span, - ) - } - BatchedFeMessage::DbSize { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::dbsize"); - ( - vec![self - .handle_db_size_request(&shard, &req, ctx) - .instrument(span.clone()) - .await - .map(|msg| (msg, timer))], - span, - ) - } - BatchedFeMessage::GetSlruSegment { - span, - timer, - shard, - req, - } => { - fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); - ( - vec![self - .handle_get_slru_segment_request(&shard, &req, ctx) - 
.instrument(span.clone()) - .await - .map(|msg| (msg, timer))], - span, - ) - } - BatchedFeMessage::RespondError { span, error } => { - // We've already decided to respond with an error, so we don't need to - // call the handler. - (vec![Err(error)], span) + let started_at = Instant::now(); + let batch = { + let mut batch = batch; + batch.observe_execution_start(started_at); + batch + }; + + // Dispatch the batch to the appropriate request handler. + let (mut handler_results, span) = log_slow( + batch.as_static_str(), + LOG_SLOW_GETPAGE_THRESHOLD, + self.pagestream_dispatch_batched_message(batch, io_concurrency, ctx), + ) + .await?; + + // We purposefully don't count flush time into the smgr operation timer. + // + // The reason is that current compute client will not perform protocol processing + // if the postgres backend process is doing things other than `->smgr_read()`. + // This is especially the case for prefetch. + // + // If the compute doesn't read from the connection, eventually TCP will backpressure + // all the way into our flush call below. + // + // The timer's underlying metric is used for a storage-internal latency SLO and + // we don't want to include latency in it that we can't control. + // And as pointed out above, in this case, we don't control the time that flush will take. + // + // We put each response in the batch onto the wire in a separate pgb_writer.flush() + // call, which (all unmeasured) adds syscall overhead but reduces time to first byte + // and avoids building up a "giant" contiguous userspace buffer to hold the entire response. + // TODO: vectored socket IO would be great, but pgb_writer doesn't support that. + let flush_timers = { + let flushing_start_time = Instant::now(); + let mut flush_timers = Vec::with_capacity(handler_results.len()); + for handler_result in &mut handler_results { + let flush_timer = match handler_result { + Ok((_, timer)) => Some( + timer + .observe_execution_end(flushing_start_time) + .expect("we are the first caller"), + ), + Err(_) => { + // TODO: measure errors + None + } + }; + flush_timers.push(flush_timer); } + assert_eq!(flush_timers.len(), handler_results.len()); + flush_timers }; // Map handler result to protocol behavior. // Some handler errors cause exit from pagestream protocol. // Other handler errors are sent back as an error message and we stay in pagestream protocol. - for handler_result in handler_results { - let (response_msg, timer) = match handler_result { - Err(e) => match &e { + for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) { + let response_msg = match handler_result { + Err(e) => match &e.err { PageStreamError::Shutdown => { // If we fail to fulfil a request during shutdown, which may be _because_ of // shutdown, then do not send the error to the client. Instead just drop the @@ -1038,49 +1150,40 @@ impl PageServerHandler { // print the all details to the log with {:#}, but for the client the // error message is enough. Do not log if shutting down, as the anyhow::Error // here includes cancellation which is not an error. 
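// A minimal, self-contained sketch of the metric lifecycle described in the comments
// above, assuming hypothetical simplified types (OpTimer, a generic AsyncWrite sink)
// rather than the real SmgrOpTimer/PostgresBackend: the per-op timer starts before any
// waiting (throttle, batching, execution), stops once the response is serialized, and
// the flush is timed separately so TCP backpressure from a slow compute cannot leak
// into the storage-internal latency SLO.
use std::time::{Duration, Instant};
use tokio::io::{AsyncWrite, AsyncWriteExt};

struct OpTimer {
    started: Instant,
}

impl OpTimer {
    fn start() -> Self {
        // The real code also bumps a "_started" counter here, before throttling.
        Self { started: Instant::now() }
    }
    // Stop charging the op timer; flushing is intentionally not included.
    fn observe_execution_end(self) -> Duration {
        self.started.elapsed()
    }
}

async fn handle_one_request<W: AsyncWrite + Unpin>(w: &mut W, payload: &[u8]) -> std::io::Result<()> {
    let timer = OpTimer::start();
    // ... throttle / batch / execute the request here ...
    tokio::time::sleep(Duration::from_millis(1)).await;
    w.write_all(payload).await?; // serialize the response, no flush yet
    let op_latency = timer.observe_execution_end(); // SLO metric excludes flush
    let flush_started = Instant::now();
    w.flush().await?; // flush measured on its own
    println!("op {op_latency:?}, flush {:?}", flush_started.elapsed());
    Ok(())
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    handle_one_request(&mut tokio::io::sink(), b"page bytes").await
}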
- let full = utils::error::report_compact_sources(&e); + let full = utils::error::report_compact_sources(&e.err); span.in_scope(|| { error!("error reading relation or page version: {full:#}") }); - ( - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }), - None, // TODO: measure errors - ) + + PagestreamBeMessage::Error(PagestreamErrorResponse { + req: e.req, + message: e.err.to_string(), + }) } }, - Ok((response_msg, timer)) => (response_msg, Some(timer)), + Ok((response_msg, _op_timer_already_observed)) => response_msg, }; // // marshal & transmit response message // - pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + pgb_writer.write_message_noflush(&BeMessage::CopyData( + &response_msg.serialize(protocol_version), + ))?; - // We purposefully don't count flush time into the timer. - // - // The reason is that current compute client will not perform protocol processing - // if the postgres backend process is doing things other than `->smgr_read()`. - // This is especially the case for prefetch. - // - // If the compute doesn't read from the connection, eventually TCP will backpressure - // all the way into our flush call below. - // - // The timer's underlying metric is used for a storage-internal latency SLO and - // we don't want to include latency in it that we can't control. - // And as pointed out above, in this case, we don't control the time that flush will take. - let flushing_timer = - timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing()); + failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel); // what we want to do + let socket_fd = pgb_writer.socket_fd; let flush_fut = pgb_writer.flush(); // metric for how long flushing takes let flush_fut = match flushing_timer { - Some(flushing_timer) => { - futures::future::Either::Left(flushing_timer.measure(flush_fut)) - } + Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure( + Instant::now(), + flush_fut, + socket_fd, + )), None => futures::future::Either::Right(flush_fut), }; // do it while respecting cancellation @@ -1098,13 +1201,154 @@ impl PageServerHandler { } Ok(()) } - // and log the info! line inside the request span - .instrument(span.clone()) .await?; } Ok(()) } + /// Helper which dispatches a batched message to the appropriate handler. + /// Returns a vec of results, along with the extracted trace span. 
+ async fn pagestream_dispatch_batched_message( + &mut self, + batch: BatchedFeMessage, + io_concurrency: IoConcurrency, + ctx: &RequestContext, + ) -> Result< + ( + Vec>, + Span, + ), + QueryError, + > { + Ok(match batch { + BatchedFeMessage::Exists { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); + ( + vec![ + self.handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + BatchedFeMessage::Nblocks { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + ( + vec![ + self.handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages, + } => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); + ( + { + let npages = pages.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_get_page_at_lsn_request_batched( + &*shard.upgrade()?, + effective_request_lsn, + pages, + io_concurrency, + ctx, + ) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::DbSize { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + ( + vec![ + self.handle_db_size_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + BatchedFeMessage::GetSlruSegment { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + ( + vec![ + self.handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer)) + .map_err(|err| BatchedPageStreamError { err, req: req.hdr }), + ], + span, + ) + } + #[cfg(feature = "testing")] + BatchedFeMessage::Test { + span, + shard, + requests, + } => { + fail::fail_point!("ps::handle-pagerequest-message::test"); + ( + { + let npages = requests.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_test_request_batch(&*shard.upgrade()?, requests, ctx) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::RespondError { span, error } => { + // We've already decided to respond with an error, so we don't need to + // call the handler. + (vec![Err(error)], span) + } + }) + } + /// Pagestream sub-protocol handler. /// /// It is a simple request-response protocol inside a COPYBOTH session. 
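// A minimal sketch of the per-variant dispatch shape used by
// pagestream_dispatch_batched_message above, with simplified stand-in types (Batched,
// PerReqError, u64 request ids) instead of the real BatchedFeMessage/BatchedPageStreamError:
// each arm runs its handler and pairs every error with the request that caused it, so the
// caller gets one result per request, in order.
#[derive(Debug)]
enum Batched {
    Exists { req_id: u64 },
    GetPage { req_ids: Vec<u64> },
}

#[derive(Debug)]
struct PerReqError {
    req_id: u64,
    msg: String,
}

fn dispatch(batch: Batched) -> Vec<Result<String, PerReqError>> {
    match batch {
        Batched::Exists { req_id } => {
            vec![handle_exists(req_id).map_err(|msg| PerReqError { req_id, msg })]
        }
        Batched::GetPage { req_ids } => req_ids
            .into_iter()
            .map(|req_id| handle_get_page(req_id).map_err(|msg| PerReqError { req_id, msg }))
            .collect(),
    }
}

fn handle_exists(req_id: u64) -> Result<String, String> {
    Ok(format!("exists response for request {req_id}"))
}

fn handle_get_page(req_id: u64) -> Result<String, String> {
    Ok(format!("page response for request {req_id}"))
}

fn main() {
    for batch in [Batched::Exists { req_id: 7 }, Batched::GetPage { req_ids: vec![1, 2, 3] }] {
        println!("{:?}", dispatch(batch));
    }
}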
@@ -1120,7 +1364,7 @@ impl PageServerHandler { pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, - _protocol_version: PagestreamProtocolVersion, + protocol_version: PagestreamProtocolVersion, ctx: RequestContext, ) -> Result<(), QueryError> where @@ -1140,6 +1384,17 @@ impl PageServerHandler { } } + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + match self.gate_guard.try_clone() { + Ok(guard) => guard, + Err(_) => { + info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown); + } + }, + ); + let pgb_reader = pgb .split() .context("implementation error: split pgb into reader and writer")?; @@ -1149,7 +1404,7 @@ impl PageServerHandler { .take() .expect("implementation error: timeline_handles should not be locked"); - let request_span = info_span!("request", shard_id = tracing::field::Empty); + let request_span = info_span!("request"); let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() { PageServicePipeliningConfig::Pipelined(pipelining_config) => { self.handle_pagerequests_pipelined( @@ -1160,6 +1415,8 @@ impl PageServerHandler { timeline_handles, request_span, pipelining_config, + protocol_version, + io_concurrency, &ctx, ) .await @@ -1172,6 +1429,8 @@ impl PageServerHandler { timeline_id, timeline_handles, request_span, + protocol_version, + io_concurrency, &ctx, ) .await @@ -1198,6 +1457,8 @@ impl PageServerHandler { timeline_id: TimelineId, mut timeline_handles: TimelineHandles, request_span: Span, + protocol_version: PagestreamProtocolVersion, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> ( (PostgresBackendReader, TimelineHandles), @@ -1215,6 +1476,7 @@ impl PageServerHandler { &mut timeline_handles, &cancel, ctx, + protocol_version, request_span.clone(), ) .await; @@ -1222,7 +1484,7 @@ impl PageServerHandler { Ok(msg) => msg, Err(e) => break e, }; - let mut msg = match msg { + let msg = match msg { Some(msg) => msg, None => { debug!("pagestream subprotocol end observed"); @@ -1230,14 +1492,17 @@ impl PageServerHandler { } }; - if let Err(cancelled) = msg.throttle(&self.cancel).await { - break cancelled; - } - - let err = self - .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx) + let result = self + .pagesteam_handle_batched_message( + pgb_writer, + msg, + io_concurrency.clone(), + &cancel, + protocol_version, + ctx, + ) .await; - match err { + match result { Ok(()) => {} Err(e) => break e, } @@ -1258,6 +1523,8 @@ impl PageServerHandler { mut timeline_handles: TimelineHandles, request_span: Span, pipelining_config: PageServicePipeliningConfigPipelined, + protocol_version: PagestreamProtocolVersion, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> ( (PostgresBackendReader, TimelineHandles), @@ -1355,6 +1622,7 @@ impl PageServerHandler { &mut timeline_handles, &cancel_batcher, &ctx, + protocol_version, request_span.clone(), ) .await; @@ -1391,15 +1659,21 @@ impl PageServerHandler { return Ok(()); } }; - let mut batch = match batch { + let batch = match batch { Ok(batch) => batch, Err(e) => { return Err(e); } }; - batch.throttle(&self.cancel).await?; - self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx) - .await?; + self.pagesteam_handle_batched_message( + pgb_writer, + batch, + io_concurrency.clone(), + &cancel, + protocol_version, + &ctx, + ) + .await?; } } }); @@ -1480,7 +1754,7 @@ impl PageServerHandler { // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN). 
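// A minimal sketch of the cutoff-and-lease check that follows, using plain u64 LSNs and
// an in-memory BTreeMap of leases as stand-ins for the real Lsn/GcInfo types (the lease
// coverage rule here is a simplified assumption): reads below the applied GC cutoff are
// rejected unless a still-valid LSN lease covers the requested LSN.
use std::collections::BTreeMap;
use std::time::{Duration, Instant};

struct GcInfo {
    applied_cutoff: u64,
    // lease LSN -> expiry; a request is considered covered if some lease at or below
    // its LSN has not expired yet (simplified model)
    leases: BTreeMap<u64, Instant>,
}

impl GcInfo {
    fn lsn_covered_by_lease(&self, request_lsn: u64) -> bool {
        self.leases
            .range(..=request_lsn)
            .next_back()
            .is_some_and(|(_, valid_until)| *valid_until > Instant::now())
    }

    fn check_request_lsn(&self, request_lsn: u64) -> Result<(), String> {
        if request_lsn < self.applied_cutoff && !self.lsn_covered_by_lease(request_lsn) {
            return Err(format!(
                "tried to request a page version that was garbage collected. requested at {request_lsn} gc cutoff {}",
                self.applied_cutoff
            ));
        }
        Ok(())
    }
}

fn main() {
    let mut gc = GcInfo { applied_cutoff: 100, leases: BTreeMap::new() };
    assert!(gc.check_request_lsn(50).is_err());
    gc.leases.insert(40, Instant::now() + Duration::from_secs(600));
    assert!(gc.check_request_lsn(50).is_ok());
    println!("lease-covered read below the cutoff is allowed");
}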
if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() { let gc_info = &timeline.gc_info.read().unwrap(); - if !gc_info.leases.contains_key(&request_lsn) { + if !gc_info.lsn_covered_by_lease(request_lsn) { return Err( PageStreamError::BadRequest(format!( "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", @@ -1496,6 +1770,7 @@ impl PageServerHandler { .wait_lsn( not_modified_since, crate::tenant::timeline::WaitLsnWaiter::PageService, + timeline::WaitLsnTimeout::Default, ctx, ) .await?; @@ -1553,6 +1828,13 @@ impl PageServerHandler { .as_millis() .to_string() }); + + info!( + "acquired lease for {} until {}", + lsn, + valid_until_str.as_deref().unwrap_or("") + ); + let bytes = valid_until_str.as_ref().map(|x| x.as_bytes()); pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( @@ -1570,11 +1852,11 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, - req.request_lsn, - req.not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, &latest_gc_cutoff_lsn, ctx, ) @@ -1585,6 +1867,7 @@ impl PageServerHandler { .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { + req: *req, exists, })) } @@ -1596,11 +1879,11 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, - req.request_lsn, - req.not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, &latest_gc_cutoff_lsn, ctx, ) @@ -1611,6 +1894,7 @@ impl PageServerHandler { .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { + req: *req, n_blocks, })) } @@ -1622,11 +1906,11 @@ impl PageServerHandler { req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, - req.request_lsn, - req.not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, &latest_gc_cutoff_lsn, ctx, ) @@ -1638,6 +1922,7 @@ impl PageServerHandler { let db_size = total_blocks as i64 * BLCKSZ as i64; Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { + req: *req, db_size, })) } @@ -1647,19 +1932,35 @@ impl PageServerHandler { &mut self, timeline: &Timeline, effective_lsn: Lsn, - requests: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, + requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, + io_concurrency: IoConcurrency, ctx: &RequestContext, - ) -> Vec> { + ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); timeline .query_metrics .observe_getpage_batch_start(requests.len()); + // If a page trace is running, submit an event for this request. + if let Some(page_trace) = timeline.page_trace.load().as_ref() { + let time = SystemTime::now(); + for batch in &requests { + let key = rel_block_to_key(batch.req.rel, batch.req.blkno).to_compact(); + // Ignore error (trace buffer may be full or tracer may have disconnected). 
+ _ = page_trace.try_send(PageTraceEvent { + key, + effective_lsn, + time, + }); + } + } + let results = timeline .get_rel_page_at_lsn_batched( - requests.iter().map(|(reltag, blkno, _)| (reltag, blkno)), + requests.iter().map(|p| (&p.req.rel, &p.req.blkno)), effective_lsn, + io_concurrency, ctx, ) .await; @@ -1670,16 +1971,20 @@ impl PageServerHandler { requests .into_iter() .zip(results.into_iter()) - .map(|((_, _, timer), res)| { + .map(|(req, res)| { res.map(|page| { ( PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { + req: req.req, page, }), - timer, + req.timer, ) }) - .map_err(PageStreamError::from) + .map_err(|e| BatchedPageStreamError { + err: PageStreamError::from(e), + req: req.req.hdr, + }) }), ) } @@ -1691,11 +1996,11 @@ impl PageServerHandler { req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, - req.request_lsn, - req.not_modified_since, + req.hdr.request_lsn, + req.hdr.not_modified_since, &latest_gc_cutoff_lsn, ctx, ) @@ -1706,10 +2011,55 @@ impl PageServerHandler { let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?; Ok(PagestreamBeMessage::GetSlruSegment( - PagestreamGetSlruSegmentResponse { segment }, + PagestreamGetSlruSegmentResponse { req: *req, segment }, )) } + // NB: this impl mimics what we do for batched getpage requests. + #[cfg(feature = "testing")] + #[instrument(skip_all, fields(shard_id))] + async fn handle_test_request_batch( + &mut self, + timeline: &Timeline, + requests: Vec, + _ctx: &RequestContext, + ) -> Vec> { + // real requests would do something with the timeline + let mut results = Vec::with_capacity(requests.len()); + for _req in requests.iter() { + tokio::task::yield_now().await; + + results.push({ + if timeline.cancel.is_cancelled() { + Err(PageReconstructError::Cancelled) + } else { + Ok(()) + } + }); + } + + // TODO: avoid creating the new Vec here + Vec::from_iter( + requests + .into_iter() + .zip(results.into_iter()) + .map(|(req, res)| { + res.map(|()| { + ( + PagestreamBeMessage::Test(models::PagestreamTestResponse { + req: req.req.clone(), + }), + req.timer, + ) + }) + .map_err(|e| BatchedPageStreamError { + err: PageStreamError::from(e), + req: req.req.hdr, + }) + }), + ) + } + /// Note on "fullbackup": /// Full basebackups should only be used for debugging purposes. /// Originally, it was introduced to enable breaking storage format changes, @@ -1742,7 +2092,8 @@ impl PageServerHandler { { fn map_basebackup_error(err: BasebackupError) -> QueryError { match err { - BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)), + // TODO: passthrough the error site to the final error message? + BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)), BasebackupError::Server(e) => QueryError::Other(e), } } @@ -1755,8 +2106,16 @@ impl PageServerHandler { .unwrap() .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; + set_tracing_field_shard_id(&timeline); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + if timeline.is_archived() == Some(true) { + tracing::info!( + "timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it." 
+ ); + return Err(QueryError::NotFound("timeline is archived".into())); + } + + let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); @@ -1764,6 +2123,7 @@ impl PageServerHandler { .wait_lsn( lsn, crate::tenant::timeline::WaitLsnWaiter::PageService, + crate::tenant::timeline::WaitLsnTimeout::Default, ctx, ) .await?; @@ -1835,10 +2195,12 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } - writer - .flush() - .await - .map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?; + writer.flush().await.map_err(|e| { + map_basebackup_error(BasebackupError::Client( + e, + "handle_basebackup_request,flush", + )) + })?; } pgb.write_message_noflush(&BeMessage::CopyDone) @@ -1901,6 +2263,7 @@ struct FullBackupCmd { struct PageStreamCmd { tenant_id: TenantId, timeline_id: TimelineId, + protocol_version: PagestreamProtocolVersion, } /// `lease lsn tenant timeline lsn` @@ -1921,7 +2284,7 @@ enum PageServiceCmd { } impl PageStreamCmd { - fn parse(query: &str) -> anyhow::Result { + fn parse(query: &str, protocol_version: PagestreamProtocolVersion) -> anyhow::Result { let parameters = query.split_whitespace().collect_vec(); if parameters.len() != 2 { bail!( @@ -1936,6 +2299,7 @@ impl PageStreamCmd { Ok(Self { tenant_id, timeline_id, + protocol_version, }) } } @@ -2073,7 +2437,14 @@ impl PageServiceCmd { bail!("cannot parse query: {query}") }; match cmd.to_ascii_lowercase().as_str() { - "pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)), + "pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse( + other, + PagestreamProtocolVersion::V2, + )?)), + "pagestream_v3" => Ok(Self::PageStream(PageStreamCmd::parse( + other, + PagestreamProtocolVersion::V3, + )?)), "basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)), "fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)), "lease" => { @@ -2129,9 +2500,16 @@ where fn startup( &mut self, _pgb: &mut PostgresBackend, - _sm: &FeStartupPacket, + sm: &FeStartupPacket, ) -> Result<(), QueryError> { fail::fail_point!("ps::connection-start::startup-packet"); + + if let FeStartupPacket::StartupMessage { params, .. 
} = sm { + if let Some(app_name) = params.get("application_name") { + Span::current().record("application_name", field::display(app_name)); + } + }; + Ok(()) } @@ -2155,25 +2533,21 @@ where PageServiceCmd::PageStream(PageStreamCmd { tenant_id, timeline_id, + protocol_version, }) => { tracing::Span::current() .record("tenant_id", field::display(tenant_id)) .record("timeline_id", field::display(timeline_id)); self.check_permission(Some(tenant_id))?; + let command_kind = match protocol_version { + PagestreamProtocolVersion::V2 => ComputeCommandKind::PageStreamV2, + PagestreamProtocolVersion::V3 => ComputeCommandKind::PageStreamV3, + }; + COMPUTE_COMMANDS_COUNTERS.for_command(command_kind).inc(); - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::PageStreamV2) - .inc(); - - self.handle_pagerequests( - pgb, - tenant_id, - timeline_id, - PagestreamProtocolVersion::V2, - ctx, - ) - .await?; + self.handle_pagerequests(pgb, tenant_id, timeline_id, protocol_version, ctx) + .await?; } PageServiceCmd::BaseBackup(BaseBackupCmd { tenant_id, @@ -2320,6 +2694,14 @@ impl From for QueryError { } } +impl From for QueryError { + fn from(e: crate::tenant::timeline::handle::HandleUpgradeError) -> Self { + match e { + crate::tenant::timeline::handle::HandleUpgradeError::ShutDown => QueryError::Shutdown, + } + } +} + fn set_tracing_field_shard_id(timeline: &Timeline) { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); tracing::Span::current().record( @@ -2352,7 +2734,8 @@ mod tests { cmd, PageServiceCmd::PageStream(PageStreamCmd { tenant_id, - timeline_id + timeline_id, + protocol_version: PagestreamProtocolVersion::V2, }) ); let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap(); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 255bd01e25..787b1b895c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,6 +6,36 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! 
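// A minimal sketch of the query-string parsing exercised by the tests above, with String
// ids standing in for the real TenantId/TimelineId types: the first word selects the
// pagestream protocol version, and the remaining two whitespace-separated parameters are
// the tenant and timeline ids.
#[derive(Debug, PartialEq)]
enum ProtocolVersion {
    V2,
    V3,
}

#[derive(Debug, PartialEq)]
struct PageStreamCmd {
    tenant: String,
    timeline: String,
    protocol_version: ProtocolVersion,
}

fn parse(query: &str) -> Result<PageStreamCmd, String> {
    let mut parts = query.split_whitespace();
    let cmd = parts.next().ok_or("cannot parse query: empty query")?;
    let protocol_version = match cmd.to_ascii_lowercase().as_str() {
        "pagestream_v2" => ProtocolVersion::V2,
        "pagestream_v3" => ProtocolVersion::V3,
        other => return Err(format!("cannot parse query: unknown command {other}")),
    };
    let rest: Vec<&str> = parts.collect();
    let [tenant, timeline] = rest.as_slice() else {
        return Err(format!("invalid number of parameters: expected 2, got {}", rest.len()));
    };
    Ok(PageStreamCmd {
        tenant: tenant.to_string(),
        timeline: timeline.to_string(),
        protocol_version,
    })
}

fn main() {
    let cmd = parse("pagestream_v3 some-tenant-id some-timeline-id").unwrap();
    assert_eq!(cmd.protocol_version, ProtocolVersion::V3);
    println!("{cmd:?}");
}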
+use std::collections::{BTreeMap, HashMap, HashSet, hash_map}; +use std::ops::{ControlFlow, Range}; + +use anyhow::{Context, ensure}; +use bytes::{Buf, Bytes, BytesMut}; +use enum_map::Enum; +use itertools::Itertools; +use pageserver_api::key::{ + AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists, + TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, + rel_size_to_key, rel_tag_sparse_key, rel_tag_sparse_key_range, relmap_file_key, + repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, + slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, +}; +use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use pageserver_api::value::Value; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId}; +use serde::{Deserialize, Serialize}; +use strum::IntoEnumIterator; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info, trace, warn}; +use utils::bin_ser::{BeSer, DeserializeError}; +use utils::lsn::Lsn; +use utils::pausable_failpoint; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; + use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; use crate::context::RequestContext; @@ -17,37 +47,8 @@ use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, }; +use crate::tenant::storage_layer::IoConcurrency; use crate::tenant::timeline::GetVectoredError; -use anyhow::{ensure, Context}; -use bytes::{Buf, Bytes, BytesMut}; -use enum_map::Enum; -use itertools::Itertools; -use pageserver_api::key::Key; -use pageserver_api::key::{ - dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, - relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, - slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, - CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, -}; -use pageserver_api::keyspace::SparseKeySpace; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::value::Value; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::BLCKSZ; -use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId}; -use serde::{Deserialize, Serialize}; -use std::collections::{hash_map, BTreeMap, HashMap, HashSet}; -use std::ops::ControlFlow; -use std::ops::Range; -use strum::IntoEnumIterator; -use tokio_util::sync::CancellationToken; -use tracing::{debug, trace, warn}; -use utils::bin_ser::DeserializeError; -use utils::pausable_failpoint; -use utils::{bin_ser::BeSer, lsn::Lsn}; -use wal_decoder::serialized_batch::SerializedValueBatch; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. 
pub const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -200,6 +201,7 @@ impl Timeline { blknum: BlockNumber, version: Version<'_>, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result { match version { Version::Lsn(effective_lsn) => { @@ -208,6 +210,7 @@ impl Timeline { .get_rel_page_at_lsn_batched( pages.iter().map(|(tag, blknum)| (tag, blknum)), effective_lsn, + io_concurrency.clone(), ctx, ) .await; @@ -246,6 +249,7 @@ impl Timeline { &self, pages: impl ExactSizeIterator, effective_lsn: Lsn, + io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -309,7 +313,10 @@ impl Timeline { acc.to_keyspace() }; - match self.get_vectored(keyspace, effective_lsn, ctx).await { + match self + .get_vectored(keyspace, effective_lsn, io_concurrency, ctx) + .await + { Ok(results) => { for (key, res) in results { let mut key_slots = keys_slots.remove(&key).unwrap().into_iter(); @@ -319,16 +326,16 @@ impl Timeline { let clone = match &res { Ok(buf) => Ok(buf.clone()), Err(err) => Err(match err { - PageReconstructError::Cancelled => { - PageReconstructError::Cancelled - } + PageReconstructError::Cancelled => PageReconstructError::Cancelled, - x @ PageReconstructError::Other(_) | - x @ PageReconstructError::AncestorLsnTimeout(_) | - x @ PageReconstructError::WalRedo(_) | - x @ PageReconstructError::MissingKey(_) => { - PageReconstructError::Other(anyhow::anyhow!("there was more than one request for this key in the batch, error logged once: {x:?}")) - }, + x @ PageReconstructError::Other(_) + | x @ PageReconstructError::AncestorLsnTimeout(_) + | x @ PageReconstructError::WalRedo(_) + | x @ PageReconstructError::MissingKey(_) => { + PageReconstructError::Other(anyhow::anyhow!( + "there was more than one request for this key in the batch, error logged once: {x:?}" + )) + } }), }; @@ -347,23 +354,23 @@ impl Timeline { // this whole `match` is a lot like `From for PageReconstructError` // but without taking ownership of the GetVectoredError let err = match &err { - GetVectoredError::Cancelled => { - Err(PageReconstructError::Cancelled) - } + GetVectoredError::Cancelled => Err(PageReconstructError::Cancelled), // TODO: restructure get_vectored API to make this error per-key GetVectoredError::MissingKey(err) => { - Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more of the requested keys were missing: {err:?}"))) + Err(PageReconstructError::Other(anyhow::anyhow!( + "whole vectored get request failed because one or more of the requested keys were missing: {err:?}" + ))) } // TODO: restructure get_vectored API to make this error per-key GetVectoredError::GetReadyAncestorError(err) => { - Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}"))) + Err(PageReconstructError::Other(anyhow::anyhow!( + "whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}" + ))) } // TODO: restructure get_vectored API to make this error per-key - GetVectoredError::Other(err) => { - Err(PageReconstructError::Other( - anyhow::anyhow!("whole vectored get request failed: {err:?}"), - )) - } + GetVectoredError::Other(err) => Err(PageReconstructError::Other( + anyhow::anyhow!("whole vectored get request failed: {err:?}"), + )), // TODO: we can prevent this error class by moving this check into the type system GetVectoredError::InvalidLsn(e) => { Err(anyhow::anyhow!("invalid 
LSN: {e:?}").into()) @@ -371,10 +378,7 @@ impl Timeline { // NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS // TODO: we can prevent this error class by moving this check into the type system GetVectoredError::Oversized(err) => { - Err(anyhow::anyhow!( - "batching oversized: {err:?}" - ) - .into()) + Err(anyhow::anyhow!("batching oversized: {err:?}").into()) } }; @@ -483,12 +487,33 @@ impl Timeline { if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) { return Ok(false); } - // fetch directory listing + + // Read path: first read the new reldir keyspace. Early return if the relation exists. + // Otherwise, read the old reldir keyspace. + // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2. + + if self.get_rel_size_v2_enabled() { + // fetch directory listing (new) + let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum); + let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?) + .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?; + let exists_v2 = buf == RelDirExists::Exists; + // Fast path: if the relation exists in the new format, return true. + // TODO: we should have a verification mode that checks both keyspaces + // to ensure the relation only exists in one of them. + if exists_v2 { + return Ok(true); + } + } + + // fetch directory listing (old) + let key = rel_dir_to_key(tag.spcnode, tag.dbnode); let buf = version.get(self, key, ctx).await?; let dir = RelDirectory::des(&buf)?; - Ok(dir.rels.contains(&(tag.relnode, tag.forknum))) + let exists_v1 = dir.rels.contains(&(tag.relnode, tag.forknum)); + Ok(exists_v1) } /// Get a list of all existing relations in given tablespace and database. @@ -506,12 +531,12 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { - // fetch directory listing + // fetch directory listing (old) let key = rel_dir_to_key(spcnode, dbnode); let buf = version.get(self, key, ctx).await?; let dir = RelDirectory::des(&buf)?; - let rels: HashSet = + let rels_v1: HashSet = HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { spcnode, dbnode, @@ -519,6 +544,46 @@ impl Timeline { forknum: *forknum, })); + if !self.get_rel_size_v2_enabled() { + return Ok(rels_v1); + } + + // scan directory listing (new), merge with the old results + let key_range = rel_tag_sparse_key_range(spcnode, dbnode); + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| PageReconstructError::Cancelled)?, + ); + let results = self + .scan( + KeySpace::single(key_range), + version.get_lsn(), + ctx, + io_concurrency, + ) + .await?; + let mut rels = rels_v1; + for (key, val) in results { + let val = RelDirExists::decode(&val?) 
+ .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?; + assert_eq!(key.field6, 1); + assert_eq!(key.field2, spcnode); + assert_eq!(key.field3, dbnode); + let tag = RelTag { + spcnode, + dbnode, + relnode: key.field4, + forknum: key.field5, + }; + if val == RelDirExists::Removed { + debug_assert!(!rels.contains(&tag), "removed reltag in v2"); + continue; + } + let did_not_contain = rels.insert(tag); + debug_assert!(did_not_contain, "duplicate reltag in v2"); + } Ok(rels) } @@ -604,12 +669,19 @@ impl Timeline { ) -> Result { pausable_failpoint!("find-lsn-for-timestamp-pausable"); - let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); + let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn(); + let gc_cutoff_planned = { + let gc_info = self.gc_info.read().unwrap(); + gc_info.min_cutoff() + }; + // Usually the planned cutoff is newer than the cutoff of the last gc run, + // but let's be defensive. + let gc_cutoff = gc_cutoff_planned.max(*gc_cutoff_lsn_guard); // We use this method to figure out the branching LSN for the new branch, but the // GC cutoff could be before the branching point and we cannot create a new branch // with LSN < `ancestor_lsn`. Thus, pick the maximum of these two to be // on the safe side. - let min_lsn = std::cmp::max(*gc_cutoff_lsn_guard, self.get_ancestor_lsn()); + let min_lsn = std::cmp::max(gc_cutoff, self.get_ancestor_lsn()); let max_lsn = self.get_last_record_lsn(); // LSNs are always 8-byte aligned. low/mid/high represent the @@ -627,7 +699,7 @@ impl Timeline { // cannot overflow, high and low are both smaller than u64::MAX / 2 let mid = (high + low) / 2; - let cmp = self + let cmp = match self .is_latest_commit_timestamp_ge_than( search_timestamp, Lsn(mid * 8), @@ -635,7 +707,19 @@ impl Timeline { &mut found_larger, ctx, ) - .await?; + .await + { + Ok(res) => res, + Err(PageReconstructError::MissingKey(e)) => { + warn!( + "Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}", + e + ); + // Return that we didn't find any requests smaller than the LSN, and logging the error. + return Ok(LsnForTimestamp::Past(min_lsn)); + } + Err(e) => return Err(e), + }; if cmp { high = mid; @@ -643,6 +727,7 @@ impl Timeline { low = mid + 1; } } + // If `found_smaller == true`, `low = t + 1` where `t` is the target LSN, // so the LSN of the last commit record before or at `search_timestamp`. // Remove one from `low` to get `t`. 
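// A minimal, self-contained sketch of the binary-search shape used by
// find_lsn_for_timestamp above: probe the midpoint with a monotonic predicate ("is the
// latest commit timestamp at this LSN >= the search timestamp?") and shrink [low, high)
// until the bounds meet. Plain u64 values and an in-memory timestamp array stand in for
// the real Lsn type and the timeline lookup.
fn find_first_ge(mut low: u64, mut high: u64, pred: impl Fn(u64) -> bool) -> u64 {
    while low < high {
        // cannot overflow: both bounds stay well below u64::MAX / 2 in this usage
        let mid = (high + low) / 2;
        if pred(mid) {
            high = mid; // mid satisfies the predicate, so the answer is <= mid
        } else {
            low = mid + 1; // mid is too early, so the answer is > mid
        }
    }
    low
}

fn main() {
    // commit timestamps indexed by (lsn / 8), non-decreasing over time
    let commit_ts = [10u64, 10, 20, 30, 30, 40];
    let search_timestamp = 30;
    let unit = find_first_ge(0, commit_ts.len() as u64, |i| commit_ts[i as usize] >= search_timestamp);
    // LSNs are 8-byte aligned, as in the code above, so scale the index back up
    println!("first commit at or after ts {search_timestamp}: lsn {}", unit * 8);
}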
@@ -879,9 +964,15 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result, PageReconstructError> { let kv = self - .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx) + .scan( + KeySpace::single(Key::metadata_aux_key_range()), + lsn, + ctx, + io_concurrency, + ) .await?; let mut result = HashMap::new(); let mut sz = 0; @@ -904,8 +995,9 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result<(), PageReconstructError> { - self.list_aux_files_v2(lsn, ctx).await?; + self.list_aux_files_v2(lsn, ctx, io_concurrency).await?; Ok(()) } @@ -913,17 +1005,24 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result, PageReconstructError> { - self.list_aux_files_v2(lsn, ctx).await + self.list_aux_files_v2(lsn, ctx, io_concurrency).await } pub(crate) async fn get_replorigins( &self, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> Result, PageReconstructError> { let kv = self - .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx) + .scan( + KeySpace::single(repl_origin_key_range()), + lsn, + ctx, + io_concurrency, + ) .await?; let mut result = HashMap::new(); for (k, v) in kv { @@ -1106,7 +1205,11 @@ impl Timeline { let dense_keyspace = result.to_keyspace(); let sparse_keyspace = SparseKeySpace(KeySpace { - ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()], + ranges: vec![ + Key::metadata_aux_key_range(), + repl_origin_key_range(), + Key::rel_dir_sparse_key_range(), + ], }); if cfg!(debug_assertions) { @@ -1236,13 +1339,23 @@ pub struct DatadirModification<'a> { /// For special "directory" keys that store key-value maps, track the size of the map /// if it was updated in this modification. - pending_directory_entries: Vec<(DirectoryKind, usize)>, + pending_directory_entries: Vec<(DirectoryKind, MetricsUpdate)>, /// An **approximation** of how many metadata bytes will be written to the EphemeralFile. pending_metadata_bytes: usize, } -impl<'a> DatadirModification<'a> { +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MetricsUpdate { + /// Set the metrics to this value + Set(u64), + /// Increment the metrics by this value + Add(u64), + /// Decrement the metrics by this value + Sub(u64), +} + +impl DatadirModification<'_> { // When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed. @@ -1263,7 +1376,27 @@ impl<'a> DatadirModification<'a> { pub(crate) fn has_dirty_data(&self) -> bool { self.pending_data_batch .as_ref() - .map_or(false, |b| b.has_data()) + .is_some_and(|b| b.has_data()) + } + + /// Returns statistics about the currently pending modifications. 
+ pub(crate) fn stats(&self) -> DatadirModificationStats { + let mut stats = DatadirModificationStats::default(); + for (_, _, value) in self.pending_metadata_pages.values().flatten() { + match value { + Value::Image(_) => stats.metadata_images += 1, + Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1, + Value::WalRecord(_) => stats.metadata_deltas += 1, + } + } + for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) { + match valuemeta { + ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1, + ValueMeta::Serialized(_) => stats.data_deltas += 1, + ValueMeta::Observed(_) => {} + } + } + stats } /// Set the current lsn @@ -1301,7 +1434,8 @@ impl<'a> DatadirModification<'a> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; - self.pending_directory_entries.push((DirectoryKind::Db, 0)); + self.pending_directory_entries + .push((DirectoryKind::Db, MetricsUpdate::Set(0))); self.put(DBDIR_KEY, Value::Image(buf.into())); let buf = if self.tline.pg_version >= 17 { @@ -1314,23 +1448,34 @@ impl<'a> DatadirModification<'a> { }) }?; self.pending_directory_entries - .push((DirectoryKind::TwoPhase, 0)); + .push((DirectoryKind::TwoPhase, MetricsUpdate::Set(0))); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); let empty_dir = Value::Image(buf); - self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); - self.put( - slru_dir_to_key(SlruKind::MultiXactMembers), - empty_dir.clone(), - ); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); - self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); + + // Initialize SLRUs on shard 0 only: creating these on other shards would be + // harmless but they'd just be dropped on later compaction. 
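// A minimal sketch of how the MetricsUpdate values introduced above can be replayed onto
// a directory-entry gauge when the modification is committed; the plain u64 gauge and the
// apply() helper are illustrative stand-ins, not the pageserver's actual metrics plumbing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MetricsUpdate {
    Set(u64),
    Add(u64),
    Sub(u64),
}

fn apply(gauge: &mut u64, update: MetricsUpdate) {
    match update {
        MetricsUpdate::Set(v) => *gauge = v,
        MetricsUpdate::Add(v) => *gauge += v,
        MetricsUpdate::Sub(v) => *gauge = gauge.saturating_sub(v),
    }
}

fn main() {
    let mut rel_entries = 0u64;
    // e.g. initial directory creation sets the count, later create/drop adjust it
    let updates = [MetricsUpdate::Set(0), MetricsUpdate::Add(1), MetricsUpdate::Add(1), MetricsUpdate::Sub(1)];
    for update in updates {
        apply(&mut rel_entries, update);
    }
    assert_eq!(rel_entries, 1);
    println!("rel directory entries: {rel_entries}");
}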
+ if self.tline.tenant_shard_id.is_shard_zero() { + self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::Clog), + MetricsUpdate::Set(0), + )); + self.put( + slru_dir_to_key(SlruKind::MultiXactMembers), + empty_dir.clone(), + ); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::Clog), + MetricsUpdate::Set(0), + )); + self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), + MetricsUpdate::Set(0), + )); + } Ok(()) } @@ -1595,10 +1740,16 @@ impl<'a> DatadirModification<'a> { } if r.is_none() { // Create RelDirectory + // TODO: if we have fully migrated to v2, no need to create this directory let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), })?; - self.pending_directory_entries.push((DirectoryKind::Rel, 0)); + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); + if self.tline.get_rel_size_v2_enabled() { + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); + } self.put( rel_dir_to_key(spcnode, dbnode), Value::Image(Bytes::from(buf)), @@ -1622,8 +1773,10 @@ impl<'a> DatadirModification<'a> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { let xid = xid as u32; @@ -1631,8 +1784,10 @@ impl<'a> DatadirModification<'a> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectory::ser(&dir)?) }; self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); @@ -1681,8 +1836,10 @@ impl<'a> DatadirModification<'a> { let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; - self.pending_directory_entries - .push((DirectoryKind::Db, dir.dbdirs.len())); + self.pending_directory_entries.push(( + DirectoryKind::Db, + MetricsUpdate::Set(dir.dbdirs.len() as u64), + )); self.put(DBDIR_KEY, Value::Image(buf.into())); } else { warn!( @@ -1715,39 +1872,85 @@ impl<'a> DatadirModification<'a> { // tablespace. Create the reldir entry for it if so. let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) .context("deserialize db")?; - let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let mut rel_dir = + + let dbdir_exists = if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { // Didn't exist. Update dbdir e.insert(false); let buf = DbDirectory::ser(&dbdir).context("serialize db")?; - self.pending_directory_entries - .push((DirectoryKind::Db, dbdir.dbdirs.len())); + self.pending_directory_entries.push(( + DirectoryKind::Db, + MetricsUpdate::Set(dbdir.dbdirs.len() as u64), + )); self.put(DBDIR_KEY, Value::Image(buf.into())); - - // and create the RelDirectory - RelDirectory::default() + false } else { - // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) 
- .context("deserialize db")? + true }; + let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); + let mut rel_dir = if !dbdir_exists { + // Create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) + .context("deserialize db")? + }; + // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { return Err(RelationError::AlreadyExists); } - self.pending_directory_entries - .push((DirectoryKind::Rel, rel_dir.rels.len())); - - self.put( - rel_dir_key, - Value::Image(Bytes::from( - RelDirectory::ser(&rel_dir).context("serialize")?, - )), - ); - + if self.tline.get_rel_size_v2_enabled() { + let sparse_rel_dir_key = + rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); + // check if the rel_dir_key exists in v2 + let val = self + .sparse_get(sparse_rel_dir_key, ctx) + .await + .map_err(|e| RelationError::Other(e.into()))?; + let val = RelDirExists::decode_option(val) + .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + if val == RelDirExists::Exists { + return Err(RelationError::AlreadyExists); + } + self.put( + sparse_rel_dir_key, + Value::Image(RelDirExists::Exists.encode()), + ); + if !dbdir_exists { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); + // We don't write `rel_dir_key -> rel_dir.rels` back to the storage in the v2 path unless it's the initial creation. + // TODO: if we have fully migrated to v2, no need to create this directory. Otherwise, there + // will be key not found errors if we don't create an empty one for rel_size_v2. + self.put( + rel_dir_key, + Value::Image(Bytes::from( + RelDirectory::ser(&RelDirectory::default()).context("serialize")?, + )), + ); + } + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Add(1))); + } else { + if !dbdir_exists { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Set(0))) + } + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Add(1))); + self.put( + rel_dir_key, + Value::Image(Bytes::from( + RelDirectory::ser(&rel_dir).context("serialize")?, + )), + ); + } // Put size let size_key = rel_size_to_key(rel); let buf = nblocks.to_le_bytes(); @@ -1833,9 +2036,34 @@ impl<'a> DatadirModification<'a> { let mut dirty = false; for rel_tag in rel_tags { - if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) { + let found = if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) { + self.pending_directory_entries + .push((DirectoryKind::Rel, MetricsUpdate::Sub(1))); dirty = true; + true + } else if self.tline.get_rel_size_v2_enabled() { + // The rel is not found in the old reldir key, so we need to check the new sparse keyspace. + // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion + // logic). + let key = + rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum); + let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?) 
+ .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + if val == RelDirExists::Exists { + self.pending_directory_entries + .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1))); + // put tombstone + self.put(key, Value::Image(RelDirExists::Removed.encode())); + // no need to set dirty to true + true + } else { + false + } + } else { + false + }; + if found { // update logical size let size_key = rel_size_to_key(rel_tag); let old_size = self.get(size_key, ctx).await?.get_u32_le(); @@ -1851,8 +2079,6 @@ impl<'a> DatadirModification<'a> { if dirty { self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); - self.pending_directory_entries - .push((DirectoryKind::Rel, dir.rels.len())); } } @@ -1876,8 +2102,10 @@ impl<'a> DatadirModification<'a> { if !dir.segments.insert(segno) { anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(kind), + MetricsUpdate::Set(dir.segments.len() as u64), + )); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1924,8 +2152,10 @@ impl<'a> DatadirModification<'a> { if !dir.segments.remove(&segno) { warn!("slru segment {:?}/{} does not exist", kind, segno); } - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); + self.pending_directory_entries.push(( + DirectoryKind::SlruSegment(kind), + MetricsUpdate::Set(dir.segments.len() as u64), + )); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1957,8 +2187,10 @@ impl<'a> DatadirModification<'a> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { let xid: u32 = u32::try_from(xid)?; @@ -1967,8 +2199,10 @@ impl<'a> DatadirModification<'a> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); + self.pending_directory_entries.push(( + DirectoryKind::TwoPhase, + MetricsUpdate::Set(dir.xids.len() as u64), + )); Bytes::from(TwoPhaseDirectory::ser(&dir)?) }; self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); @@ -2029,6 +2263,13 @@ impl<'a> DatadirModification<'a> { self.tline.aux_file_size_estimator.on_add(content.len()); new_files.push((path, content)); } + // Compute may request delete of old version of pgstat AUX file if new one exceeds size limit. + // Compute doesn't know if previous version of this file exists or not, so + // attempt to delete non-existing file can cause this message. + // To avoid false alarms, log it as info rather than warning. 
+ (None, true) if path.starts_with("pg_stat/") => { + info!("removing non-existing pg_stat file: {}", path) + } (None, true) => warn!("removing non-existing aux file: {}", path), } let new_val = aux_file::encode_file_value(&new_files)?; @@ -2084,7 +2325,7 @@ impl<'a> DatadirModification<'a> { } for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { - writer.update_directory_entries_count(kind, count as u64); + writer.update_directory_entries_count(kind, count); } Ok(()) @@ -2170,7 +2411,7 @@ impl<'a> DatadirModification<'a> { } for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { - writer.update_directory_entries_count(kind, count as u64); + writer.update_directory_entries_count(kind, count); } self.pending_metadata_bytes = 0; @@ -2222,10 +2463,12 @@ impl<'a> DatadirModification<'a> { // modifications before ingesting DB create operations, which are the only kind that reads // data pages during ingest. if cfg!(debug_assertions) { - assert!(!self - .pending_data_batch - .as_ref() - .map_or(false, |b| b.updates_key(&key))); + assert!( + !self + .pending_data_batch + .as_ref() + .is_some_and(|b| b.updates_key(&key)) + ); } } @@ -2234,6 +2477,22 @@ impl<'a> DatadirModification<'a> { self.tline.get(key, lsn, ctx).await } + /// Get a key from the sparse keyspace. Automatically converts the missing key error + /// and the empty value into None. + async fn sparse_get( + &self, + key: Key, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let val = self.get(key, ctx).await; + match val { + Ok(val) if val.is_empty() => Ok(None), + Ok(val) => Ok(Some(val)), + Err(PageReconstructError::MissingKey(_)) => Ok(None), + Err(e) => Err(e), + } + } + fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) @@ -2281,6 +2540,15 @@ impl<'a> DatadirModification<'a> { } } +/// Statistics for a DatadirModification. +#[derive(Default)] +pub struct DatadirModificationStats { + pub metadata_images: u64, + pub metadata_deltas: u64, + pub data_images: u64, + pub data_deltas: u64, +} + /// This struct facilitates accessing either a committed key from the timeline at a /// specific LSN, or the latest uncommitted key from a pending modification. /// @@ -2294,7 +2562,7 @@ pub enum Version<'a> { Modified(&'a DatadirModification<'a>), } -impl<'a> Version<'a> { +impl Version<'_> { async fn get( &self, timeline: &Timeline, @@ -2307,6 +2575,23 @@ impl<'a> Version<'a> { } } + /// Get a key from the sparse keyspace. Automatically converts the missing key error + /// and the empty value into None. 
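// Editor's aside (illustrative, not part of the patch): both sparse_get helpers added
// here follow the same convention -- in the sparse keyspace, a key that was never
// written and a key holding an empty value are both treated as "absent". The same
// conversion in isolation, with a simplified error type standing in for
// PageReconstructError:
#[derive(Debug)]
enum ReadError {
    MissingKey,
    Other(String),
}

fn normalize_sparse_read(res: Result<Vec<u8>, ReadError>) -> Result<Option<Vec<u8>>, ReadError> {
    match res {
        Ok(val) if val.is_empty() => Ok(None),
        Ok(val) => Ok(Some(val)),
        Err(ReadError::MissingKey) => Ok(None),
        Err(e) => Err(e),
    }
}

// Callers such as the reldir v2 lookups can then match on Option instead of
// special-casing the missing-key error at every call site.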
+ async fn sparse_get( + &self, + timeline: &Timeline, + key: Key, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let val = self.get(timeline, key, ctx).await; + match val { + Ok(val) if val.is_empty() => Ok(None), + Ok(val) => Ok(Some(val)), + Err(PageReconstructError::MissingKey(_)) => Ok(None), + Err(e) => Err(e), + } + } + fn get_lsn(&self) -> Lsn { match self { Version::Lsn(lsn) => *lsn, @@ -2366,6 +2651,7 @@ pub(crate) enum DirectoryKind { Rel, AuxFiles, SlruSegment(SlruKind), + RelV2, } impl DirectoryKind { @@ -2381,15 +2667,14 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); #[cfg(test)] mod tests { use hex_literal::hex; - use pageserver_api::{models::ShardParameters, shard::ShardStripeSize}; - use utils::{ - id::TimelineId, - shard::{ShardCount, ShardNumber}, - }; + use pageserver_api::models::ShardParameters; + use pageserver_api::shard::ShardStripeSize; + use utils::id::TimelineId; + use utils::shard::{ShardCount, ShardNumber}; use super::*; - - use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION}; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::TenantHarness; /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline #[tokio::test] @@ -2417,7 +2702,11 @@ mod tests { ("foo/bar2".to_string(), Bytes::from_static(b"content2")), ]); - let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + let io_concurrency = IoConcurrency::spawn_for_test(); + + let readback = tline + .list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone()) + .await?; assert_eq!(readback, expect_1008); // Second modification: update one key, remove the other @@ -2429,11 +2718,15 @@ mod tests { let expect_2008 = HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]); - let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?; + let readback = tline + .list_aux_files(Lsn(0x2008), &ctx, io_concurrency.clone()) + .await?; assert_eq!(readback, expect_2008); // Reading back in time works - let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + let readback = tline + .list_aux_files(Lsn(0x1008), &ctx, io_concurrency.clone()) + .await?; assert_eq!(readback, expect_1008); Ok(()) diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 4e8be58d58..85c2ed8499 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -73,11 +73,10 @@ impl Statvfs { pub mod mock { use camino::Utf8Path; + pub use pageserver_api::config::statvfs::mock::Behavior; use regex::Regex; use tracing::log::info; - pub use pageserver_api::config::statvfs::mock::Behavior; - pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result { info!("running mocked statvfs"); @@ -85,7 +84,7 @@ pub mod mock { Behavior::Success { blocksize, total_blocks, - ref name_filter, + name_filter, } => { let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap(); @@ -134,7 +133,7 @@ pub mod mock { } Err(e) => { return Err(anyhow::Error::new(e) - .context(format!("get metadata of {:?}", entry.path()))) + .context(format!("get metadata of {:?}", entry.path()))); } }; total += m.len(); diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 622738022a..0b71b2cf5b 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -40,15 +40,12 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; +use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use 
tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; - use tracing::{debug, error, info, warn}; - -use once_cell::sync::Lazy; - use utils::env; use utils::id::TimelineId; @@ -328,8 +325,8 @@ pub enum TaskKind { // Eviction. One per timeline. Eviction, - // Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure) - IngestHousekeeping, + // Tenant housekeeping (flush idle ephemeral layers, shut down idle walredo, etc.). + TenantHousekeeping, /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 54fa95fc47..71dc3c9075 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,137 +12,99 @@ //! parent timeline, and the last LSN that has been written to disk. //! -use anyhow::{bail, Context}; +use std::collections::hash_map::Entry; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::fmt::{Debug, Display}; +use std::fs::File; +use std::future::Future; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::{Arc, Mutex, Weak}; +use std::time::{Duration, Instant, SystemTime}; +use std::{fmt, fs}; + +use anyhow::{Context, bail}; use arc_swap::ArcSwap; -use camino::Utf8Path; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use chrono::NaiveDateTime; use enumset::EnumSet; -use futures::stream::FuturesUnordered; use futures::StreamExt; +use futures::stream::FuturesUnordered; +use itertools::Itertools as _; +use once_cell::sync::Lazy; use pageserver_api::models; -use pageserver_api::models::LsnLease; -use pageserver_api::models::TimelineArchivalState; -use pageserver_api::models::TimelineState; -use pageserver_api::models::TopTenantShardItem; -use pageserver_api::models::WalRedoManagerStatus; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::shard::ShardStripeSize; -use pageserver_api::shard::TenantShardId; -use remote_storage::DownloadError; -use remote_storage::GenericRemoteStorage; -use remote_storage::TimeoutOrCancel; -use remote_timeline_client::manifest::{ - OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION, +pub use pageserver_api::models::TenantState; +use pageserver_api::models::{ + CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem, + WalRedoManagerStatus, }; -use remote_timeline_client::UploadQueueNotReadyError; -use std::collections::BTreeMap; -use std::collections::VecDeque; -use std::fmt; -use std::future::Future; -use std::sync::atomic::AtomicBool; -use std::sync::Weak; -use std::time::SystemTime; +use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId}; +use remote_storage::{DownloadError, GenericRemoteStorage, TimeoutOrCancel}; +use remote_timeline_client::index::GcCompactionState; +use remote_timeline_client::manifest::{ + LATEST_TENANT_MANIFEST_VERSION, OffloadedTimelineManifest, TenantManifest, +}; +use remote_timeline_client::{ + FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD, UploadQueueNotReadyError, +}; +use secondary::heatmap::{HeatMapTenant, HeatMapTimeline}; use storage_broker::BrokerClientChannel; -use timeline::compaction::ScheduledCompactionTask; -use timeline::import_pgdata; -use timeline::offload::offload_timeline; -use timeline::CompactFlags; -use timeline::CompactOptions; -use timeline::CompactionError; -use timeline::ShutdownMode; +use timeline::compaction::{CompactionOutcome, GcCompactionQueue}; +use timeline::offload::{OffloadError, offload_timeline}; +use 
timeline::{ + CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata, +}; use tokio::io::BufReader; -use tokio::sync::watch; +use tokio::sync::{Notify, Semaphore, watch}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use upload_queue::NotInitialized; -use utils::backoff; use utils::circuit_breaker::CircuitBreaker; -use utils::completion; use utils::crashsafe::path_with_suffix_extension; -use utils::failpoint_support; -use utils::fs_ext; -use utils::pausable_failpoint; -use utils::sync::gate::Gate; -use utils::sync::gate::GateGuard; -use utils::timeout::timeout_cancellable; -use utils::timeout::TimeoutCancellableError; -use utils::zstd::create_zst_tarball; -use utils::zstd::extract_zst_tarball; +use utils::sync::gate::{Gate, GateGuard}; +use utils::timeout::{TimeoutCancellableError, timeout_cancellable}; +use utils::try_rcu::ArcSwapExt; +use utils::zstd::{create_zst_tarball, extract_zst_tarball}; +use utils::{backoff, completion, failpoint_support, fs_ext, pausable_failpoint}; -use self::config::AttachedLocationConfig; -use self::config::AttachmentMode; -use self::config::LocationConf; -use self::config::TenantConf; +use self::config::{AttachedLocationConfig, AttachmentMode, LocationConf, TenantConf}; use self::metadata::TimelineMetadata; -use self::mgr::GetActiveTenantError; -use self::mgr::GetTenantError; +use self::mgr::{GetActiveTenantError, GetTenantError}; use self::remote_timeline_client::upload::{upload_index_part, upload_tenant_manifest}; use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; -use self::timeline::uninit::TimelineCreateGuard; -use self::timeline::uninit::TimelineExclusionError; -use self::timeline::uninit::UninitializedTimeline; -use self::timeline::EvictionTaskTenantState; -use self::timeline::GcCutoffs; -use self::timeline::TimelineDeleteProgress; -use self::timeline::TimelineResources; -use self::timeline::WaitLsnError; +use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, UninitializedTimeline}; +use self::timeline::{ + EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError, +}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::deletion_queue::DeletionQueueClient; -use crate::deletion_queue::DeletionQueueError; -use crate::import_datadir; -use crate::is_uninit_mark; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::l0_flush::L0FlushGlobalState; -use crate::metrics::TENANT; use crate::metrics::{ - remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, - TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, + BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS, + INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC, + TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics, }; -use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::config::LocationMode; -use crate::tenant::config::TenantConfOpt; +use crate::tenant::config::{LocationMode, TenantConfOpt}; use crate::tenant::gc_result::GcResult; pub use crate::tenant::remote_timeline_client::index::IndexPart; -use crate::tenant::remote_timeline_client::remote_initdb_archive_path; -use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; -use crate::tenant::remote_timeline_client::INITDB_PATH; -use crate::tenant::storage_layer::DeltaLayer; -use 
crate::tenant::storage_layer::ImageLayer; -use crate::walingest::WalLagCooldown; -use crate::walredo; -use crate::InitializationOrder; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::collections::HashSet; -use std::fmt::Debug; -use std::fmt::Display; -use std::fs; -use std::fs::File; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::Arc; -use std::sync::Mutex; -use std::time::{Duration, Instant}; - -use crate::span; +use crate::tenant::remote_timeline_client::{ + INITDB_PATH, MaybeDeletedIndexPart, remote_initdb_archive_path, +}; +use crate::tenant::storage_layer::{DeltaLayer, ImageLayer}; use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; +use crate::walingest::WalLagCooldown; use crate::walredo::PostgresRedoManager; -use crate::TEMP_FILE_SUFFIX; -use once_cell::sync::Lazy; -pub use pageserver_api::models::TenantState; -use tokio::sync::Semaphore; +use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo}; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); -use utils::{ - crashsafe, - generation::Generation, - id::TimelineId, - lsn::{Lsn, RecordLsn}, -}; +use utils::crashsafe; +use utils::generation::Generation; +use utils::id::TimelineId; +use utils::lsn::{Lsn, RecordLsn}; pub mod blob_io; pub mod block_io; @@ -171,9 +133,9 @@ mod gc_block; mod gc_result; pub(crate) mod throttle; -pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; +pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; @@ -238,7 +200,9 @@ impl AttachedTenantConf { Ok(Self::new(location_conf.tenant_conf, *attach_conf)) } LocationMode::Secondary(_) => { - anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode") + anyhow::bail!( + "Attempted to construct AttachedTenantConf from a LocationConf in secondary mode" + ) } } } @@ -253,6 +217,7 @@ struct TimelinePreload { timeline_id: TimelineId, client: RemoteTimelineClient, index_part: Result, + previous_heatmap: Option, } pub(crate) struct TenantPreload { @@ -344,10 +309,11 @@ pub struct Tenant { /// Overhead of mutex is acceptable because compaction is done with a multi-second period. compaction_circuit_breaker: std::sync::Mutex, - /// Scheduled compaction tasks. Currently, this can only be populated by triggering - /// a manual gc-compaction from the manual compaction API. - scheduled_compaction_tasks: - std::sync::Mutex>>, + /// Signals the tenant compaction loop that there is L0 compaction work to be done. + pub(crate) l0_compaction_trigger: Arc, + + /// Scheduled gc-compaction tasks. + scheduled_compaction_tasks: std::sync::Mutex>>, /// If the tenant is in Activating state, notify this to encourage it /// to proceed to Active as soon as possible, rather than waiting for lazy @@ -367,8 +333,9 @@ pub struct Tenant { /// Throttle applied at the top of [`Timeline::get`]. /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. - pub(crate) pagestream_throttle: - Arc>, + pub(crate) pagestream_throttle: Arc, + + pub(crate) pagestream_throttle_metrics: Arc, /// An ongoing timeline detach concurrency limiter. 
/// @@ -449,7 +416,9 @@ impl WalredoManagerId { static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed); if id == 0 { - panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique"); + panic!( + "WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique" + ); } Self(id) } @@ -1117,6 +1086,7 @@ impl Tenant { resources: TimelineResources, mut index_part: IndexPart, metadata: TimelineMetadata, + previous_heatmap: Option, ancestor: Option>, cause: LoadTimelineCause, ctx: &RequestContext, @@ -1147,10 +1117,12 @@ impl Tenant { let timeline = self.create_timeline_struct( timeline_id, &metadata, + previous_heatmap, ancestor.clone(), resources, CreateTimelineCause::Load, idempotency.clone(), + index_part.gc_compaction.clone(), )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -1172,12 +1144,47 @@ impl Tenant { format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}") })?; + // When unarchiving, we've mostly likely lost the heatmap generated prior + // to the archival operation. To allow warming this timeline up, generate + // a previous heatmap which contains all visible layers in the layer map. + // This previous heatmap will be used whenever a fresh heatmap is generated + // for the timeline. + if matches!(cause, LoadTimelineCause::Unoffload) { + let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn())); + while let Some((tline, end_lsn)) = tline_ending_at { + let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await; + if !tline.is_previous_heatmap_active() { + tline + .previous_heatmap + .store(Some(Arc::new(unarchival_heatmap))); + } else { + tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.") + } + + match tline.ancestor_timeline() { + Some(ancestor) => { + if ancestor.update_layer_visibility().await.is_err() { + // Ancestor timeline is shutting down. + break; + } + + tline_ending_at = Some((ancestor, tline.get_ancestor_lsn())); + } + None => { + tline_ending_at = None; + } + } + } + } + match import_pgdata { Some(import_pgdata) if !import_pgdata.is_done() => { match cause { LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), LoadTimelineCause::ImportPgdata { .. } => { - unreachable!("ImportPgdata should not be reloading timeline import is done and persisted as such in s3") + unreachable!( + "ImportPgdata should not be reloading timeline import is done and persisted as such in s3" + ) } } let mut guard = self.timelines_creating.lock().unwrap(); @@ -1210,8 +1217,8 @@ impl Tenant { // We should never try and load the same timeline twice during startup Entry::Occupied(_) => { unreachable!( - "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" - ); + "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" + ); } Entry::Vacant(v) => { v.insert(Arc::clone(&timeline)); @@ -1546,8 +1553,18 @@ impl Tenant { } } + // TODO(vlad): Could go to S3 if the secondary is freezing cold and hasn't even + // pulled the first heatmap. Not entirely necessary since the storage controller + // will kick the secondary in any case and cause a download. 
+ let maybe_heatmap_at = self.read_on_disk_heatmap().await; + let timelines = self - .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel) + .load_timelines_metadata( + remote_timeline_ids, + remote_storage, + maybe_heatmap_at, + cancel, + ) .await?; Ok(TenantPreload { @@ -1560,6 +1577,26 @@ impl Tenant { }) } + async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> { + let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id); + match tokio::fs::read_to_string(on_disk_heatmap_path).await { + Ok(heatmap) => match serde_json::from_str::(&heatmap) { + Ok(heatmap) => Some((heatmap, std::time::Instant::now())), + Err(err) => { + error!("Failed to deserialize old heatmap: {err}"); + None + } + }, + Err(err) => match err.kind() { + std::io::ErrorKind::NotFound => None, + _ => { + error!("Unexpected IO error reading old heatmap: {err}"); + None + } + }, + } + } + /// /// Background task that downloads all data for a tenant and brings it to Active state. /// @@ -1575,7 +1612,9 @@ impl Tenant { failpoint_support::sleep_millis_async!("before-attaching-tenant"); let Some(preload) = preload else { - anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); + anyhow::bail!( + "local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624" + ); }; let mut offloaded_timeline_ids = HashSet::new(); @@ -1647,7 +1686,10 @@ impl Tenant { match index_part { MaybeDeletedIndexPart::IndexPart(index_part) => { timeline_ancestors.insert(timeline_id, index_part.metadata.clone()); - remote_index_and_client.insert(timeline_id, (index_part, preload.client)); + remote_index_and_client.insert( + timeline_id, + (index_part, preload.client, preload.previous_heatmap), + ); } MaybeDeletedIndexPart::Deleted(index_part) => { info!( @@ -1666,7 +1708,7 @@ impl Tenant { // layer file. 
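// Editor's aside (illustrative, not part of the patch): read_on_disk_heatmap above
// treats a missing heatmap file and an unparsable one the same way -- the tenant just
// starts without a previous heatmap. The same "optional cached state" read in generic
// form, using tokio::fs and serde_json as the hunk does; logging is simplified to
// eprintln! here.
use serde::de::DeserializeOwned;

async fn read_optional_state<T: DeserializeOwned>(path: &std::path::Path) -> Option<T> {
    let raw = match tokio::fs::read_to_string(path).await {
        Ok(raw) => raw,
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => return None,
        Err(err) => {
            eprintln!("unexpected IO error reading {}: {err}", path.display());
            return None;
        }
    };
    match serde_json::from_str::<T>(&raw) {
        Ok(state) => Some(state),
        Err(err) => {
            eprintln!("failed to deserialize {}: {err}", path.display());
            None
        }
    }
}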
let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?; for (timeline_id, remote_metadata) in sorted_timelines { - let (index_part, remote_client) = remote_index_and_client + let (index_part, remote_client, previous_heatmap) = remote_index_and_client .remove(&timeline_id) .expect("just put it in above"); @@ -1686,11 +1728,8 @@ impl Tenant { timeline_id, index_part, remote_metadata, - TimelineResources { - remote_client, - pagestream_throttle: self.pagestream_throttle.clone(), - l0_flush_global_state: self.l0_flush_global_state.clone(), - }, + previous_heatmap, + self.get_timeline_resources_for(remote_client), LoadTimelineCause::Attach, ctx, ) @@ -1788,11 +1827,7 @@ impl Tenant { let entry = entry.context("read timeline dir entry")?; let entry_path = entry.path(); - let purge = if crate::is_temporary(entry_path) - // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718) - || is_uninit_mark(entry_path) - || crate::is_delete_mark(entry_path) - { + let purge = if crate::is_temporary(entry_path) { true } else { match TimelineId::try_from(entry_path.file_name()) { @@ -1843,11 +1878,13 @@ impl Tenant { } #[instrument(skip_all, fields(timeline_id=%timeline_id))] + #[allow(clippy::too_many_arguments)] async fn load_remote_timeline( self: &Arc, timeline_id: TimelineId, index_part: IndexPart, remote_metadata: TimelineMetadata, + previous_heatmap: Option, resources: TimelineResources, cause: LoadTimelineCause, ctx: &RequestContext, @@ -1877,6 +1914,7 @@ impl Tenant { resources, index_part, remote_metadata, + previous_heatmap, ancestor, cause, ctx, @@ -1888,14 +1926,29 @@ impl Tenant { self: &Arc, timeline_ids: HashSet, remote_storage: &GenericRemoteStorage, + heatmap: Option<(HeatMapTenant, std::time::Instant)>, cancel: CancellationToken, ) -> anyhow::Result> { + let mut timeline_heatmaps = heatmap.map(|h| (h.0.into_timelines_index(), h.1)); + let mut part_downloads = JoinSet::new(); for timeline_id in timeline_ids { let cancel_clone = cancel.clone(); + + let previous_timeline_heatmap = timeline_heatmaps.as_mut().and_then(|hs| { + hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active { + heatmap: h, + read_at: hs.1, + }) + }); part_downloads.spawn( - self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone) - .instrument(info_span!("download_index_part", %timeline_id)), + self.load_timeline_metadata( + timeline_id, + remote_storage.clone(), + previous_timeline_heatmap, + cancel_clone, + ) + .instrument(info_span!("download_index_part", %timeline_id)), ); } @@ -1943,8 +1996,9 @@ impl Tenant { self: &Arc, timeline_id: TimelineId, remote_storage: GenericRemoteStorage, + previous_heatmap: Option, cancel: CancellationToken, - ) -> impl Future { + ) -> impl Future + use<> { let client = self.build_timeline_client(timeline_id, remote_storage); async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -1958,6 +2012,7 @@ impl Tenant { client, timeline_id, index_part, + previous_heatmap, } } } @@ -2037,7 +2092,7 @@ impl Tenant { ) -> Result, TimelineArchivalError> { info!("unoffloading timeline"); - // We activate the timeline below manually, so this must be called on an active timeline. + // We activate the timeline below manually, so this must be called on an active tenant. // We expect callers of this function to ensure this. match self.current_state() { TenantState::Activating { .. 
} @@ -2069,7 +2124,12 @@ impl Tenant { })?; let timeline_preload = self - .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone()) + .load_timeline_metadata( + timeline_id, + self.remote_storage.clone(), + None, + cancel.clone(), + ) .await; let index_part = match timeline_preload.index_part { @@ -2103,6 +2163,7 @@ impl Tenant { timeline_id, index_part, remote_metadata, + None, timeline_resources, LoadTimelineCause::Unoffload, &ctx, @@ -2421,7 +2482,7 @@ impl Tenant { // Make sure the freeze_and_flush reaches remote storage. tline.remote_client.wait_completion().await.unwrap(); - let tl = uninit_tl.finish_creation()?; + let tl = uninit_tl.finish_creation().await?; // The non-test code would call tl.activate() here. tl.set_state(TimelineState::Active); Ok(tl) @@ -2555,7 +2616,12 @@ impl Tenant { // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. ancestor_timeline - .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx) + .wait_lsn( + *lsn, + timeline::WaitLsnWaiter::Tenant, + timeline::WaitLsnTimeout::Default, + ctx, + ) .await .map_err(|e| match e { e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => { @@ -2602,9 +2668,15 @@ impl Tenant { WaitCompletionError::NotInitialized( e, // If the queue is already stopped, it's a shutdown error. ) if e.is_stopping() => CreateTimelineError::ShuttingDown, - e => CreateTimelineError::Other(e.into()), - }) - .context("wait for timeline initial uploads to complete")?; + WaitCompletionError::NotInitialized(_) => { + // This is a bug: we should never try to wait for uploads before initializing the timeline + debug_assert!(false); + CreateTimelineError::Other(anyhow::anyhow!("timeline not initialized")) + } + WaitCompletionError::UploadQueueShutDownOrStopped => { + CreateTimelineError::ShuttingDown + } + })?; // The creating task is responsible for activating the timeline. // We do this after `wait_completion()` so that we don't spin up tasks that start @@ -2621,7 +2693,9 @@ impl Tenant { timeline } CreateTimelineResult::ImportSpawned(timeline) => { - info!("import task spawned, timeline will become visible and activated once the import is done"); + info!( + "import task spawned, timeline will become visible and activated once the import is done" + ); timeline } }; @@ -2667,7 +2741,7 @@ impl Tenant { { StartCreatingTimelineResult::CreateGuard(guard) => guard, StartCreatingTimelineResult::Idempotent(timeline) => { - return Ok(CreateTimelineResult::Idempotent(timeline)) + return Ok(CreateTimelineResult::Idempotent(timeline)); } }; @@ -2801,13 +2875,15 @@ impl Tenant { let index_part = match index_part { MaybeDeletedIndexPart::Deleted(_) => { // likely concurrent delete call, cplane should prevent this - anyhow::bail!("index part says deleted but we are not done creating yet, this should not happen but") + anyhow::bail!( + "index part says deleted but we are not done creating yet, this should not happen but" + ) } MaybeDeletedIndexPart::IndexPart(p) => p, }; let metadata = index_part.metadata.clone(); self - .load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{ + .load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{ create_guard: timeline_create_guard, activate, }, &ctx) .await? .ready_to_activate() @@ -2886,217 +2962,232 @@ impl Tenant { .await } - /// Perform one compaction iteration. - /// This function is periodically called by compactor task. 
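// Editor's aside (illustrative, not part of the patch): compaction_iteration now
// returns a CompactionOutcome instead of a bool, so the caller can tell "nothing to
// do" apart from "more work pending" and "yield so L0 compaction runs first". A
// minimal model of how a driving loop might translate the variants into a back-off;
// the variant names match the diff, the scheduling policy here is purely illustrative.
#[derive(PartialEq, Clone, Copy)]
enum CompactionOutcome {
    Done,
    Skipped,
    Pending,
    YieldForL0,
}

fn sleep_before_next_iteration_ms(outcome: CompactionOutcome, period_ms: u64) -> u64 {
    match outcome {
        // Nothing left to do: wait for the full compaction period.
        CompactionOutcome::Done | CompactionOutcome::Skipped => period_ms,
        // Pending work or L0 debt somewhere: run another iteration promptly.
        CompactionOutcome::Pending | CompactionOutcome::YieldForL0 => 0,
    }
}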
- /// Also it can be explicitly requested per timeline through page server - /// api's 'compact' command. + /// Performs one compaction iteration. Called periodically from the compaction loop. Returns + /// whether another compaction is needed, if we still have pending work or if we yield for + /// immediate L0 compaction. /// - /// Returns whether we have pending compaction task. + /// Compaction can also be explicitly requested for a timeline via the HTTP API. async fn compaction_iteration( self: &Arc, cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result { - // Don't start doing work during shutdown, or when broken, we do not need those in the logs + ) -> Result { + // Don't compact inactive tenants. if !self.is_active() { - return Ok(false); + return Ok(CompactionOutcome::Skipped); } - { - let conf = self.tenant_conf.load(); + // Don't compact tenants that can't upload layers. We don't check `may_delete_layers_hint`, + // since we need to compact L0 even in AttachedMulti to bound read amplification. + let location = self.tenant_conf.load().location; + if !location.may_upload_layers_hint() { + info!("skipping compaction in location state {location:?}"); + return Ok(CompactionOutcome::Skipped); + } - // Note that compaction usually requires deletions, but we don't respect - // may_delete_layers_hint here: that is because tenants in AttachedMulti - // should proceed with compaction even if they can't do deletion, to avoid - // accumulating dangerously deep stacks of L0 layers. Deletions will be - // enqueued inside RemoteTimelineClient, and executed layer if/when we transition - // to AttachedSingle state. - if !conf.location.may_upload_layers_hint() { - info!("Skipping compaction in location state {:?}", conf.location); - return Ok(false); + // Don't compact if the circuit breaker is tripped. + if self.compaction_circuit_breaker.lock().unwrap().is_broken() { + info!("skipping compaction due to previous failures"); + return Ok(CompactionOutcome::Skipped); + } + + // Collect all timelines to compact, along with offload instructions and L0 counts. + let mut compact: Vec> = Vec::new(); + let mut offload: HashSet = HashSet::new(); + let mut l0_counts: HashMap = HashMap::new(); + + { + let offload_enabled = self.get_timeline_offloading_enabled(); + let timelines = self.timelines.lock().unwrap(); + for (&timeline_id, timeline) in timelines.iter() { + // Skip inactive timelines. + if !timeline.is_active() { + continue; + } + + // Schedule the timeline for compaction. + compact.push(timeline.clone()); + + // Schedule the timeline for offloading if eligible. + let can_offload = offload_enabled + && timeline.can_offload().0 + && !timelines + .iter() + .any(|(_, tli)| tli.get_ancestor_timeline_id() == Some(timeline_id)); + if can_offload { + offload.insert(timeline_id); + } + } + } // release timelines lock + + for timeline in &compact { + // Collect L0 counts. Can't await while holding lock above. + if let Ok(lm) = timeline.layers.read().await.layer_map() { + l0_counts.insert(timeline.timeline_id, lm.level0_deltas().len()); } } - // Scan through the hashmap and collect a list of all the timelines, - // while holding the lock. Then drop the lock and actually perform the - // compactions. We don't want to block everything else while the - // compaction runs. - let timelines_to_compact_or_offload; - { - let timelines = self.timelines.lock().unwrap(); - timelines_to_compact_or_offload = timelines + // Pass 1: L0 compaction across all timelines, in order of L0 count. 
We prioritize this to + // bound read amplification. + // + // TODO: this may spin on one or more ingest-heavy timelines, starving out image/GC + // compaction and offloading. We leave that as a potential problem to solve later. Consider + // splitting L0 and image/GC compaction to separate background jobs. + if self.get_compaction_l0_first() { + let compaction_threshold = self.get_compaction_threshold(); + let compact_l0 = compact .iter() - .filter_map(|(timeline_id, timeline)| { - let (is_active, (can_offload, _)) = - (timeline.is_active(), timeline.can_offload()); - let has_no_unoffloaded_children = { - !timelines - .iter() - .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id)) - }; - let config_allows_offload = self.conf.timeline_offloading - || self - .tenant_conf - .load() - .tenant_conf - .timeline_offloading - .unwrap_or_default(); - let can_offload = - can_offload && has_no_unoffloaded_children && config_allows_offload; - if (is_active, can_offload) == (false, false) { - None - } else { - Some((*timeline_id, timeline.clone(), (is_active, can_offload))) - } - }) - .collect::>(); - drop(timelines); - } + .map(|tli| (tli, l0_counts.get(&tli.timeline_id).copied().unwrap_or(0))) + .filter(|&(_, l0)| l0 >= compaction_threshold) + .sorted_by_key(|&(_, l0)| l0) + .rev() + .map(|(tli, _)| tli.clone()) + .collect_vec(); - // Before doing any I/O work, check our circuit breaker - if self.compaction_circuit_breaker.lock().unwrap().is_broken() { - info!("Skipping compaction due to previous failures"); - return Ok(false); - } - - let mut has_pending_task = false; - - for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload - { - // pending_task_left == None: cannot compact, maybe still pending tasks - // pending_task_left == Some(true): compaction task left - // pending_task_left == Some(false): no compaction task left - let pending_task_left = if *can_compact { - let has_pending_l0_compaction_task = timeline - .compact(cancel, EnumSet::empty(), ctx) - .instrument(info_span!("compact_timeline", %timeline_id)) + let mut has_pending_l0 = false; + for timeline in compact_l0 { + let outcome = timeline + .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx) + .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id)) .await - .inspect_err(|e| match e { - timeline::CompactionError::ShuttingDown => (), - timeline::CompactionError::Offload(_) => { - // Failures to offload timelines do not trip the circuit breaker, because - // they do not do lots of writes the way compaction itself does: it is cheap - // to retry, and it would be bad to stop all compaction because of an issue with offloading. 
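// Editor's aside (illustrative, not part of the patch): pass 1 above only visits
// timelines whose L0 count has reached the compaction threshold, highest L0 count
// first, so read amplification is bounded before any image/GC compaction runs. The
// selection logic in isolation, using itertools as the hunk does; u64 stands in for
// the real TimelineId.
use itertools::Itertools as _;

fn l0_pass_order(l0_counts: &[(u64, usize)], threshold: usize) -> Vec<u64> {
    l0_counts
        .iter()
        .filter(|&&(_, l0)| l0 >= threshold)
        .sorted_by_key(|&&(_, l0)| l0)
        .rev()
        .map(|&(id, _)| id)
        .collect()
}

// With threshold 10 and counts [(1, 4), (2, 25), (3, 12)], the pass visits timeline 2,
// then timeline 3, and skips timeline 1 entirely.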
- } - timeline::CompactionError::Other(e) => { - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, e); - } - })?; - if has_pending_l0_compaction_task { - Some(true) - } else { - let mut has_pending_scheduled_compaction_task; - let next_scheduled_compaction_task = { - let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) { - if !tline_pending_tasks.is_empty() { - info!( - "{} tasks left in the compaction schedule queue", - tline_pending_tasks.len() - ); - } - let next_task = tline_pending_tasks.pop_front(); - has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty(); - next_task - } else { - has_pending_scheduled_compaction_task = false; - None - } - }; - if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task - { - if !next_scheduled_compaction_task - .options - .flags - .contains(CompactFlags::EnhancedGcBottomMostCompaction) - { - warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options); - } else if next_scheduled_compaction_task.options.sub_compaction { - info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); - let jobs = timeline - .gc_compaction_split_jobs(next_scheduled_compaction_task.options) - .await - .map_err(CompactionError::Other)?; - if jobs.is_empty() { - info!("no jobs to run, skipping scheduled compaction task"); - } else { - has_pending_scheduled_compaction_task = true; - let jobs_len = jobs.len(); - let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - let tline_pending_tasks = guard.entry(*timeline_id).or_default(); - for (idx, job) in jobs.into_iter().enumerate() { - tline_pending_tasks.push_back(if idx == jobs_len - 1 { - ScheduledCompactionTask { - options: job, - // The last job in the queue sends the signal and releases the gc guard - result_tx: next_scheduled_compaction_task - .result_tx - .take(), - gc_block: next_scheduled_compaction_task - .gc_block - .take(), - } - } else { - ScheduledCompactionTask { - options: job, - result_tx: None, - gc_block: None, - } - }); - } - info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); - } - } else { - let _ = timeline - .compact_with_options( - cancel, - next_scheduled_compaction_task.options, - ctx, - ) - .instrument(info_span!("scheduled_compact_timeline", %timeline_id)) - .await?; - if let Some(tx) = next_scheduled_compaction_task.result_tx.take() { - // TODO: we can send compaction statistics in the future - tx.send(()).ok(); - } - } - } - Some(has_pending_scheduled_compaction_task) + .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?; + match outcome { + CompactionOutcome::Done => {} + CompactionOutcome::Skipped => {} + CompactionOutcome::Pending => has_pending_l0 = true, + CompactionOutcome::YieldForL0 => has_pending_l0 = true, } - } else { - None - }; - has_pending_task |= pending_task_left.unwrap_or(false); - if pending_task_left == Some(false) && *can_offload { - offload_timeline(self, timeline) - .instrument(info_span!("offload_timeline", %timeline_id)) + } + if has_pending_l0 { + return Ok(CompactionOutcome::YieldForL0); // do another pass + } + } + + // Pass 2: image compaction and timeline offloading. If any timelines have accumulated + // more L0 layers, they may also be compacted here. + // + // NB: image compaction may yield if there is pending L0 compaction. 
+ // + // TODO: it will only yield if there is pending L0 compaction on the same timeline. If a + // different timeline needs compaction, it won't. It should check `l0_compaction_trigger`. + // We leave this for a later PR. + // + // TODO: consider ordering timelines by some priority, e.g. time since last full compaction, + // amount of L1 delta debt or garbage, offload-eligible timelines first, etc. + let mut has_pending = false; + for timeline in compact { + if !timeline.is_active() { + continue; + } + + let mut outcome = timeline + .compact(cancel, EnumSet::default(), ctx) + .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id)) + .await + .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?; + + // If we're done compacting, check the scheduled GC compaction queue for more work. + if outcome == CompactionOutcome::Done { + let queue = { + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard + .entry(timeline.timeline_id) + .or_insert_with(|| Arc::new(GcCompactionQueue::new())) + .clone() + }; + outcome = queue + .iteration(cancel, ctx, &self.gc_block, &timeline) + .instrument( + info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id), + ) .await?; } + + // If we're done compacting, offload the timeline if requested. + if outcome == CompactionOutcome::Done && offload.contains(&timeline.timeline_id) { + pausable_failpoint!("before-timeline-auto-offload"); + offload_timeline(self, &timeline) + .instrument(info_span!("offload_timeline", timeline_id = %timeline.timeline_id)) + .await + .or_else(|err| match err { + // Ignore this, we likely raced with unarchival. + OffloadError::NotArchived => Ok(()), + err => Err(err), + })?; + } + + match outcome { + CompactionOutcome::Done => {} + CompactionOutcome::Skipped => {} + CompactionOutcome::Pending => has_pending = true, + // This mostly makes sense when the L0-only pass above is enabled, since there's + // otherwise no guarantee that we'll start with the timeline that has high L0. + CompactionOutcome::YieldForL0 => return Ok(CompactionOutcome::YieldForL0), + } } + // Success! Untrip the breaker if necessary. self.compaction_circuit_breaker .lock() .unwrap() .success(&CIRCUIT_BREAKERS_UNBROKEN); - Ok(has_pending_task) + match has_pending { + true => Ok(CompactionOutcome::Pending), + false => Ok(CompactionOutcome::Done), + } + } + + /// Trips the compaction circuit breaker if appropriate. + pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) { + match err { + CompactionError::ShuttingDown => (), + // Offload failures don't trip the circuit breaker, since they're cheap to retry and + // shouldn't block compaction. 
+ CompactionError::Offload(_) => {} + CompactionError::CollectKeySpaceError(err) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, err); + } + CompactionError::Other(err) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, err); + } + CompactionError::AlreadyRunning(_) => {} + } } /// Cancel scheduled compaction tasks - pub(crate) fn cancel_scheduled_compaction( + pub(crate) fn cancel_scheduled_compaction(&self, timeline_id: TimelineId) { + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + if let Some(q) = guard.get_mut(&timeline_id) { + q.cancel_scheduled(); + } + } + + pub(crate) fn get_scheduled_compaction_tasks( &self, timeline_id: TimelineId, - ) -> Vec { - let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) { - let current_tline_pending_tasks = std::mem::take(tline_pending_tasks); - current_tline_pending_tasks.into_iter().collect() - } else { - Vec::new() + ) -> Vec { + let res = { + let guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard.get(&timeline_id).map(|q| q.remaining_jobs()) + }; + let Some((running, remaining)) = res else { + return Vec::new(); + }; + let mut result = Vec::new(); + if let Some((id, running)) = running { + result.extend(running.into_compact_info_resp(id, true)); } + for (id, job) in remaining { + result.extend(job.into_compact_info_resp(id, false)); + } + result } /// Schedule a compaction task for a timeline. @@ -3105,49 +3196,37 @@ impl Tenant { timeline_id: TimelineId, options: CompactOptions, ) -> anyhow::Result> { - let gc_guard = match self.gc_block.start().await { - Ok(guard) => guard, - Err(e) => { - bail!("cannot run gc-compaction because gc is blocked: {}", e); - } - }; let (tx, rx) = tokio::sync::oneshot::channel(); let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - let tline_pending_tasks = guard.entry(timeline_id).or_default(); - tline_pending_tasks.push_back(ScheduledCompactionTask { - options, - result_tx: Some(tx), - gc_block: Some(gc_guard), - }); + let q = guard + .entry(timeline_id) + .or_insert_with(|| Arc::new(GcCompactionQueue::new())); + q.schedule_manual_compaction(options, Some(tx)); Ok(rx) } - // Call through to all timelines to freeze ephemeral layers if needed. Usually - // this happens during ingest: this background housekeeping is for freezing layers - // that are open but haven't been written to for some time. - async fn ingest_housekeeping(&self) { - // Scan through the hashmap and collect a list of all the timelines, - // while holding the lock. Then drop the lock and actually perform the - // compactions. We don't want to block everything else while the - // compaction runs. - let timelines = { - self.timelines - .lock() - .unwrap() - .values() - .filter_map(|timeline| { - if timeline.is_active() { - Some(timeline.clone()) - } else { - None - } - }) - .collect::>() - }; + /// Performs periodic housekeeping, via the tenant housekeeping background task. + async fn housekeeping(&self) { + // Call through to all timelines to freeze ephemeral layers as needed. This usually happens + // during ingest, but we don't want idle timelines to hold open layers for too long. 
+ let timelines = self + .timelines + .lock() + .unwrap() + .values() + .filter(|tli| tli.is_active()) + .cloned() + .collect_vec(); - for timeline in &timelines { + for timeline in timelines { timeline.maybe_freeze_ephemeral_layer().await; } + + // Shut down walredo if idle. + const WALREDO_IDLE_TIMEOUT: Duration = Duration::from_secs(180); + if let Some(ref walredo_mgr) = self.walredo_mgr { + walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT); + } } pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool { @@ -3789,7 +3868,9 @@ where if !later.is_empty() { for (missing_id, orphan_ids) in later { for (orphan_id, _) in orphan_ids { - error!("could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded"); + error!( + "could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded" + ); } } bail!("could not load tenant because some timelines are missing ancestors"); @@ -3850,6 +3931,27 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub fn get_rel_size_v2_enabled(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .rel_size_v2_enabled + .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) + } + + pub fn get_compaction_upper_limit(&self) -> usize { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_upper_limit + .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) + } + + pub fn get_compaction_l0_first(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_l0_first + .unwrap_or(self.conf.default_tenant_conf.compaction_l0_first) + } + pub fn get_gc_horizon(&self) -> u64 { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -3904,6 +4006,16 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) } + pub fn get_timeline_offloading_enabled(&self) -> bool { + if self.conf.timeline_offloading { + return true; + } + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .timeline_offloading + .unwrap_or(self.conf.default_tenant_conf.timeline_offloading) + } + /// Generate an up-to-date TenantManifest based on the state of this Tenant. fn build_tenant_manifest(&self) -> TenantManifest { let timelines_offloaded = self.timelines_offloaded.lock().unwrap(); @@ -3921,25 +4033,28 @@ impl Tenant { } } - pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { + pub fn update_tenant_config anyhow::Result>( + &self, + update: F, + ) -> anyhow::Result { // Use read-copy-update in order to avoid overwriting the location config // state if this races with [`Tenant::set_new_location_config`]. Note that // this race is not possible if both request types come from the storage // controller (as they should!) because an exclusive op lock is required // on the storage controller side. - self.tenant_conf.rcu(|inner| { - Arc::new(AttachedTenantConf { - tenant_conf: new_tenant_conf.clone(), - location: inner.location, - // Attached location is not changed, no need to update lsn lease deadline. 
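// Editor's aside (illustrative, not part of the patch): the new getters above
// (get_rel_size_v2_enabled, get_compaction_l0_first, get_timeline_offloading_enabled,
// ...) all share one shape -- an optional per-tenant override falling back to the
// pageserver-wide default. The pattern in isolation, with placeholder structs:
struct GlobalDefaults {
    rel_size_v2_enabled: bool,
}

struct TenantOverrides {
    rel_size_v2_enabled: Option<bool>,
}

fn effective_rel_size_v2(overrides: &TenantOverrides, defaults: &GlobalDefaults) -> bool {
    overrides
        .rel_size_v2_enabled
        .unwrap_or(defaults.rel_size_v2_enabled)
}

// get_timeline_offloading_enabled is the one exception: the global flag short-circuits
// to true before the per-tenant value is consulted, as shown in the hunk.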
- lsn_lease_deadline: inner.lsn_lease_deadline, - }) - }); + self.tenant_conf + .try_rcu(|attached_conf| -> Result<_, anyhow::Error> { + Ok(Arc::new(AttachedTenantConf { + tenant_conf: update(attached_conf.tenant_conf.clone())?, + location: attached_conf.location, + lsn_lease_deadline: attached_conf.lsn_lease_deadline, + })) + })?; - let updated = self.tenant_conf.load().clone(); + let updated = self.tenant_conf.load(); - self.tenant_conf_updated(&new_tenant_conf); + self.tenant_conf_updated(&updated.tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. @@ -3947,6 +4062,8 @@ impl Tenant { for timeline in timelines { timeline.tenant_conf_updated(&updated); } + + Ok(updated.tenant_conf.clone()) } pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { @@ -3993,10 +4110,12 @@ impl Tenant { &self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, + previous_heatmap: Option, ancestor: Option>, resources: TimelineResources, cause: CreateTimelineCause, create_idempotency: CreateTimelineIdempotency, + gc_compaction_state: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -4016,6 +4135,7 @@ impl Tenant { self.conf, Arc::clone(&self.tenant_conf), new_metadata, + previous_heatmap, ancestor, new_timeline_id, self.tenant_shard_id, @@ -4027,12 +4147,16 @@ impl Tenant { state, self.attach_wal_lag_cooldown.clone(), create_idempotency, + gc_compaction_state, self.cancel.child_token(), ); Ok(timeline) } + /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object + /// to ensure proper cleanup of background tasks and metrics. + // // Allow too_many_arguments because a constructor's argument list naturally grows with the // number of attributes in the struct: breaking these out into a builder wouldn't be helpful. #[allow(clippy::too_many_arguments)] @@ -4134,6 +4258,7 @@ impl Tenant { // use an extremely long backoff. Some(Duration::from_secs(3600 * 24)), )), + l0_compaction_trigger: Arc::new(Notify::new()), scheduled_compaction_tasks: Mutex::new(Default::default()), activate_now_sem: tokio::sync::Semaphore::new(0), attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()), @@ -4141,8 +4266,10 @@ impl Tenant { gate: Gate::default(), pagestream_throttle: Arc::new(throttle::Throttle::new( Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf), - crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id), )), + pagestream_throttle_metrics: Arc::new( + crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id), + ), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), gc_block: Default::default(), @@ -4437,13 +4564,17 @@ impl Tenant { let mut gc_cutoffs: HashMap = HashMap::with_capacity(timelines.len()); + // Ensures all timelines use the same start time when computing the time cutoff. 
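// Editor's aside (illustrative, not part of the patch): update_tenant_config above
// switches from an unconditional rcu() to try_rcu() so a rejected override never
// clobbers the stored config. The same "validate, then publish atomically" shape with
// plain std types; the real code uses arc_swap plus a project-local ArcSwapExt trait.
use std::sync::{Arc, Mutex};

#[derive(Clone)]
struct TenantConfSketch {
    compaction_threshold: usize,
}

fn try_update_conf(
    slot: &Mutex<Arc<TenantConfSketch>>,
    update: impl FnOnce(TenantConfSketch) -> anyhow::Result<TenantConfSketch>,
) -> anyhow::Result<Arc<TenantConfSketch>> {
    let mut guard = slot.lock().unwrap();
    // Run the update on a copy of the current value; only publish if it succeeds, so
    // readers never observe a half-applied or invalid config.
    let candidate = Arc::new(update(guard.as_ref().clone())?);
    *guard = Arc::clone(&candidate);
    Ok(candidate)
}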
+ let now_ts_for_pitr_calc = SystemTime::now(); for timeline in timelines.iter() { let cutoff = timeline .get_last_record_lsn() .checked_sub(horizon) .unwrap_or(Lsn(0)); - let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?; + let cutoffs = timeline + .find_gc_cutoffs(now_ts_for_pitr_calc, cutoff, pitr, cancel, ctx) + .await?; let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs); assert!(old.is_none()); } @@ -4648,29 +4779,36 @@ impl Tenant { // We check it against both the planned GC cutoff stored in 'gc_info', // and the 'latest_gc_cutoff' of the last GC that was performed. The // planned GC cutoff in 'gc_info' is normally larger than - // 'latest_gc_cutoff_lsn', but beware of corner cases like if you just + // 'applied_gc_cutoff_lsn', but beware of corner cases like if you just // changed the GC settings for the tenant to make the PITR window // larger, but some of the data was already removed by an earlier GC // iteration. // check against last actual 'latest_gc_cutoff' first - let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); - src_timeline - .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) - .context(format!( - "invalid branch start lsn: less than latest GC cutoff {}", - *latest_gc_cutoff_lsn, - )) - .map_err(CreateTimelineError::AncestorLsn)?; - - // and then the planned GC cutoff + let applied_gc_cutoff_lsn = src_timeline.get_applied_gc_cutoff_lsn(); { let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = gc_info.min_cutoff(); - if start_lsn < cutoff { - return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( - "invalid branch start lsn: less than planned GC cutoff {cutoff}" - ))); + let planned_cutoff = gc_info.min_cutoff(); + if gc_info.lsn_covered_by_lease(start_lsn) { + tracing::info!( + "skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", + *applied_gc_cutoff_lsn + ); + } else { + src_timeline + .check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn) + .context(format!( + "invalid branch start lsn: less than latest GC cutoff {}", + *applied_gc_cutoff_lsn, + )) + .map_err(CreateTimelineError::AncestorLsn)?; + + // and then the planned GC cutoff + if start_lsn < planned_cutoff { + return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( + "invalid branch start lsn: less than planned GC cutoff {planned_cutoff}" + ))); + } } } @@ -4700,7 +4838,7 @@ impl Tenant { dst_prev, Some(src_id), start_lsn, - *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer? + *src_timeline.applied_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer? src_timeline.initdb_lsn, src_timeline.pg_version, ); @@ -4715,7 +4853,7 @@ impl Tenant { ) .await?; - let new_timeline = uninitialized_timeline.finish_creation()?; + let new_timeline = uninitialized_timeline.finish_creation().await?; // Root timeline gets its layers during creation and uploads them along with the metadata. // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created. 
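// Editor's aside (illustrative, not part of the patch): the branch-creation hunk above
// now skips both GC-cutoff comparisons when the requested start LSN is covered by an
// LSN lease, since the lease guarantees that data is retained. The decision in
// isolation, with plain u64 LSNs and a String error standing in for the real types:
fn validate_branch_start_lsn(
    start_lsn: u64,
    applied_gc_cutoff: u64,
    planned_gc_cutoff: u64,
    covered_by_lease: bool,
) -> Result<(), String> {
    if covered_by_lease {
        // Leased LSNs bypass the cutoff checks (the hunk logs this case and proceeds).
        return Ok(());
    }
    if start_lsn < applied_gc_cutoff {
        return Err(format!(
            "invalid branch start lsn: less than latest GC cutoff {applied_gc_cutoff}"
        ));
    }
    if start_lsn < planned_gc_cutoff {
        return Err(format!(
            "invalid branch start lsn: less than planned GC cutoff {planned_gc_cutoff}"
        ));
    }
    Ok(())
}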
@@ -4801,7 +4939,9 @@ impl Tenant { } // Idempotent <=> CreateTimelineIdempotency is identical (x, y) if x == y => { - info!("timeline already exists and idempotency matches, succeeding request"); + info!( + "timeline already exists and idempotency matches, succeeding request" + ); // fallthrough } (_, _) => { @@ -4883,7 +5023,7 @@ impl Tenant { { StartCreatingTimelineResult::CreateGuard(guard) => guard, StartCreatingTimelineResult::Idempotent(timeline) => { - return Ok(CreateTimelineResult::Idempotent(timeline)) + return Ok(CreateTimelineResult::Idempotent(timeline)); } }; @@ -4905,10 +5045,11 @@ impl Tenant { } // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it + let pgdata_path_deferred = pgdata_path.clone(); scopeguard::defer! { - if let Err(e) = fs::remove_dir_all(&pgdata_path) { + if let Err(e) = fs::remove_dir_all(&pgdata_path_deferred) { // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call - error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}"); + error!("Failed to remove temporary initdb directory '{pgdata_path_deferred}': {e}"); } } if let Some(existing_initdb_timeline_id) = load_existing_initdb { @@ -4975,7 +5116,7 @@ impl Tenant { pgdata_lsn, pg_version, ); - let raw_timeline = self + let mut raw_timeline = self .prepare_new_timeline( timeline_id, &new_metadata, @@ -4986,42 +5127,33 @@ impl Tenant { .await?; let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id; - let unfinished_timeline = raw_timeline.raw_timeline()?; - - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - unfinished_timeline.maybe_spawn_flush_loop(); - - import_datadir::import_timeline_from_postgres_datadir( - unfinished_timeline, - &pgdata_path, - pgdata_lsn, - ctx, - ) - .await - .with_context(|| { - format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}") - })?; - - fail::fail_point!("before-checkpoint-new-timeline", |_| { - Err(CreateTimelineError::Other(anyhow::anyhow!( - "failpoint before-checkpoint-new-timeline" - ))) - }); - - unfinished_timeline - .freeze_and_flush() - .await - .with_context(|| { - format!( - "Failed to flush after pgdatadir import for timeline {tenant_shard_id}/{timeline_id}" + raw_timeline + .write(|unfinished_timeline| async move { + import_datadir::import_timeline_from_postgres_datadir( + &unfinished_timeline, + &pgdata_path, + pgdata_lsn, + ctx, ) - })?; + .await + .with_context(|| { + format!( + "Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}" + ) + })?; + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + Err(CreateTimelineError::Other(anyhow::anyhow!( + "failpoint before-checkpoint-new-timeline" + ))) + }); + + Ok(()) + }) + .await?; // All done! - let timeline = raw_timeline.finish_creation()?; + let timeline = raw_timeline.finish_creation().await?; // Callers are responsible to wait for uploads to complete and for activating the timeline. @@ -5040,11 +5172,19 @@ impl Tenant { ) } - /// Call this before constructing a timeline, to build its required structures + /// Builds required resources for a new timeline. 
fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { + let remote_client = self.build_timeline_remote_client(timeline_id); + self.get_timeline_resources_for(remote_client) + } + + /// Builds timeline resources for the given remote client. + fn get_timeline_resources_for(&self, remote_client: RemoteTimelineClient) -> TimelineResources { TimelineResources { - remote_client: self.build_timeline_remote_client(timeline_id), + remote_client, pagestream_throttle: self.pagestream_throttle.clone(), + pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), + l0_compaction_trigger: self.l0_compaction_trigger.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), } } @@ -5073,10 +5213,12 @@ impl Tenant { .create_timeline_struct( new_timeline_id, new_metadata, + None, ancestor, resources, CreateTimelineCause::Load, create_guard.idempotency.clone(), + None, ) .context("Failed to create timeline data structure")?; @@ -5086,7 +5228,9 @@ impl Tenant { .create_timeline_files(&create_guard.timeline_path) .await { - error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"); + error!( + "Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}" + ); cleanup_timeline_directory(create_guard); return Err(e); } @@ -5334,27 +5478,37 @@ impl Tenant { return Ok(()); } - upload_tenant_manifest( - &self.remote_storage, - &self.tenant_shard_id, - self.generation, - &manifest, + // Remote storage does no retries internally, so wrap it + match backoff::retry( + || async { + upload_tenant_manifest( + &self.remote_storage, + &self.tenant_shard_id, + self.generation, + &manifest, + &self.cancel, + ) + .await + }, + |_e| self.cancel.is_cancelled(), + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "uploading tenant manifest", &self.cancel, ) .await - .map_err(|e| { - if self.cancel.is_cancelled() { - TenantManifestError::Cancelled - } else { - TenantManifestError::RemoteStorage(e) + { + None => Err(TenantManifestError::Cancelled), + Some(Err(_)) if self.cancel.is_cancelled() => Err(TenantManifestError::Cancelled), + Some(Err(e)) => Err(TenantManifestError::RemoteStorage(e)), + Some(Ok(_)) => { + // Store the successfully uploaded manifest, so that future callers can avoid + // re-uploading the same thing. + *guard = Some(manifest); + + Ok(()) } - })?; - - // Store the successfully uploaded manifest, so that future callers can avoid - // re-uploading the same thing. - *guard = Some(manifest); - - Ok(()) + } } } @@ -5376,8 +5530,17 @@ async fn run_initdb( initdb_bin_path, initdb_target_dir, initdb_lib_dir, ); - let _permit = INIT_DB_SEMAPHORE.acquire().await; + let _permit = { + let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer(); + INIT_DB_SEMAPHORE.acquire().await + }; + CONCURRENT_INITDBS.inc(); + scopeguard::defer! 
{ + CONCURRENT_INITDBS.dec(); + } + + let _timer = INITDB_RUN_TIME.start_timer(); let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { superuser: &conf.superuser, locale: &conf.locale, @@ -5432,20 +5595,19 @@ pub async fn dump_layerfile_from_path( #[cfg(test)] pub(crate) mod harness { use bytes::{Bytes, BytesMut}; + use hex_literal::hex; use once_cell::sync::OnceCell; + use pageserver_api::key::Key; use pageserver_api::models::ShardParameters; + use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::ShardIndex; + use utils::id::TenantId; use utils::logging; + use super::*; use crate::deletion_queue::mock::MockDeletionQueue; use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; - use pageserver_api::key::Key; - use pageserver_api::record::NeonWalRecord; - - use super::*; - use hex_literal::hex; - use utils::id::TenantId; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -5469,7 +5631,13 @@ pub(crate) mod harness { compaction_target_size: Some(tenant_conf.compaction_target_size), compaction_period: Some(tenant_conf.compaction_period), compaction_threshold: Some(tenant_conf.compaction_threshold), + compaction_upper_limit: Some(tenant_conf.compaction_upper_limit), compaction_algorithm: Some(tenant_conf.compaction_algorithm), + compaction_l0_first: Some(tenant_conf.compaction_l0_first), + compaction_l0_semaphore: Some(tenant_conf.compaction_l0_semaphore), + l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold, + l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold, + l0_flush_wait_upload: Some(tenant_conf.l0_flush_wait_upload), gc_horizon: Some(tenant_conf.gc_horizon), gc_period: Some(tenant_conf.gc_period), image_creation_threshold: Some(tenant_conf.image_creation_threshold), @@ -5488,10 +5656,19 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), + image_creation_preempt_threshold: Some( + tenant_conf.image_creation_preempt_threshold, + ), lsn_lease_length: Some(tenant_conf.lsn_lease_length), lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), timeline_offloading: Some(tenant_conf.timeline_offloading), wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override, + rel_size_v2_enabled: Some(tenant_conf.rel_size_v2_enabled), + gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled), + gc_compaction_initial_threshold_kb: Some( + tenant_conf.gc_compaction_initial_threshold_kb, + ), + gc_compaction_ratio_percent: Some(tenant_conf.gc_compaction_ratio_percent), } } } @@ -5711,32 +5888,34 @@ pub(crate) mod harness { mod tests { use std::collections::{BTreeMap, BTreeSet}; - use super::*; - use crate::keyspace::KeySpaceAccum; - use crate::tenant::harness::*; - use crate::tenant::timeline::CompactFlags; - use crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + #[cfg(feature = "testing")] + use models::CompactLsnRange; + use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; + #[cfg(feature = "testing")] + use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; use pageserver_compaction::helpers::overlaps_with; - use rand::{thread_rng, Rng}; - use 
storage_layer::PersistentLayerKey; + use rand::{Rng, thread_rng}; + use storage_layer::{IoConcurrency, PersistentLayerKey}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; + #[cfg(feature = "testing")] + use timeline::GcInfo; + #[cfg(feature = "testing")] + use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{CompactOptions, DeltaLayerTestDesc}; use utils::id::TenantId; - #[cfg(feature = "testing")] - use pageserver_api::record::NeonWalRecord; - #[cfg(feature = "testing")] - use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; - #[cfg(feature = "testing")] - use timeline::GcInfo; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::keyspace::KeySpaceAccum; + use crate::tenant::harness::*; + use crate::tenant::timeline::CompactFlags; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -5986,11 +6165,12 @@ mod tests { panic!("wrong error type") }; assert!(err.to_string().contains("invalid branch start lsn")); - assert!(err - .source() - .unwrap() - .to_string() - .contains("we might've already garbage collected needed data")) + assert!( + err.source() + .unwrap() + .to_string() + .contains("we might've already garbage collected needed data") + ) } } @@ -6019,11 +6199,12 @@ mod tests { panic!("wrong error type"); }; assert!(&err.to_string().contains("invalid branch start lsn")); - assert!(&err - .source() - .unwrap() - .to_string() - .contains("is earlier than latest GC cutoff")); + assert!( + &err.source() + .unwrap() + .to_string() + .contains("is earlier than latest GC cutoff") + ); } } @@ -6043,8 +6224,8 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?; repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?; - let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); - assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); + let applied_gc_cutoff_lsn = tline.get_applied_gc_cutoff_lsn(); + assert!(*applied_gc_cutoff_lsn > Lsn(0x25)); match tline.get(*TEST_KEY, Lsn(0x25)) { Ok(_) => panic!("request for page should have failed"), Err(err) => assert!(err.to_string().contains("not found at")), @@ -6504,6 +6685,7 @@ mod tests { async fn test_get_vectored() -> anyhow::Result<()> { let harness = TenantHarness::create("test_get_vectored").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; @@ -6568,7 +6750,7 @@ mod tests { .get_vectored_impl( read.clone(), reads_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; @@ -6615,6 +6797,7 @@ mod tests { let harness = TenantHarness::create("test_get_vectored_aux_files").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; @@ -6649,7 +6832,7 @@ mod tests { .get_vectored_impl( aux_keyspace.clone(), read_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; @@ -6697,6 +6880,7 @@ mod tests { ) .await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let gap_at_key = current_key.add(100); @@ -6797,7 +6981,7 @@ mod tests { 
.get_vectored_impl( read.clone(), current_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await?; @@ -6840,6 +7024,7 @@ mod tests { async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_key = start_key.add(1000); @@ -6932,7 +7117,7 @@ mod tests { ranges: vec![child_gap_at_key..child_gap_at_key.next()], }, query_lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await; @@ -7328,10 +7513,12 @@ mod tests { } } - assert!(!harness - .conf - .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) - .exists()); + assert!( + !harness + .conf + .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) + .exists() + ); Ok(()) } @@ -7378,6 +7565,7 @@ mod tests { async fn test_metadata_scan() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_scan").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -7431,7 +7619,7 @@ mod tests { .get_vectored_impl( keyspace.clone(), lsn, - &mut ValuesReconstructState::default(), + &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) .await? @@ -7531,7 +7719,10 @@ mod tests { let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); - assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); + assert!( + after_num_l0_delta_files < before_num_l0_delta_files, + "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}" + ); assert_eq!( tline.get(test_key, lsn, &ctx).await?, @@ -7546,6 +7737,7 @@ mod tests { let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap(); let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let mut lsn = Lsn(0x08); @@ -7565,7 +7757,10 @@ mod tests { } // we can read everything from the storage - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + let files = tline + .list_aux_files(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); assert_eq!( files.get("pg_logical/mappings/test1"), Some(&bytes::Bytes::from_static(b"first")) @@ -7581,7 +7776,10 @@ mod tests { modification.commit(&ctx).await.unwrap(); } - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + let files = tline + .list_aux_files(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); assert_eq!( files.get("pg_logical/mappings/test2"), Some(&bytes::Bytes::from_static(b"second")) @@ -7592,7 +7790,10 @@ mod tests { .await .unwrap(); - let files = child.list_aux_files(lsn, &ctx).await.unwrap(); + let files = child + .list_aux_files(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); assert_eq!(files.get("pg_logical/mappings/test1"), None); assert_eq!(files.get("pg_logical/mappings/test2"), None); } @@ -7601,6 +7802,7 @@ mod tests { async fn test_metadata_image_creation() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_image_creation").await?; let (tenant, ctx) = harness.load().await; + let 
io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -7620,8 +7822,9 @@ mod tests { keyspace: &KeySpace, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> anyhow::Result<(BTreeMap>, usize)> { - let mut reconstruct_state = ValuesReconstructState::default(); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let res = tline .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) .await?; @@ -7669,7 +7872,8 @@ mod tests { if iter % 5 == 0 { let (_, before_delta_file_accessed) = - scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone()) + .await?; tline .compact( &cancel, @@ -7683,8 +7887,12 @@ mod tests { ) .await?; let (_, after_delta_file_accessed) = - scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; - assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); + scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone()) + .await?; + assert!( + after_delta_file_accessed < before_delta_file_accessed, + "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}" + ); // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. assert!( after_delta_file_accessed <= 2, @@ -7738,10 +7946,12 @@ mod tests { get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, Some(test_img("data key 1")) ); - assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) - .await - .unwrap_err() - .is_missing_key_error()); + assert!( + get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); assert!( get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx) .await @@ -7772,11 +7982,23 @@ mod tests { async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap(); let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap(); + let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap(); + + let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap(); + let base_inherited_key_child = + Key::from_hex("610000000033333333444444445500000001").unwrap(); + let base_inherited_key_nonexist = + Key::from_hex("610000000033333333444444445500000002").unwrap(); + let base_inherited_key_overwrite = + Key::from_hex("610000000033333333444444445500000003").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... 
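// Annotation (not part of the patch): the two prefix assertions here pin down the
// split of the metadata keyspace that this test relies on. Keys under AUX_KEY_PREFIX
// (0x62, the `base_key*` values) are non-inherited, so a child timeline does not see
// the parent's values for them, while keys under RELATION_SIZE_PREFIX (0x61, the
// `base_inherited_key*` values) are inherited and fall back to the ancestor, which is
// what the parent/child assertions further down rely on.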
+ assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX); let tline = tenant .create_test_timeline_with_layers( @@ -7785,7 +8007,18 @@ mod tests { DEFAULT_PG_VERSION, &ctx, Vec::new(), // delta layers - vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers + vec![( + Lsn(0x20), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 1a"), + ), + (base_key, test_img("metadata key 1")), + (base_key_overwrite, test_img("metadata key overwrite 1b")), + ], + )], // image layers Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN ) .await?; @@ -7799,7 +8032,18 @@ mod tests { Vec::new(), // delta layers vec![( Lsn(0x30), - vec![(base_key_child, test_img("metadata key 2"))], + vec![ + ( + base_inherited_key_child, + test_img("metadata inherited key 2"), + ), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 2a"), + ), + (base_key_child, test_img("metadata key 2")), + (base_key_overwrite, test_img("metadata key overwrite 2b")), + ], )], // image layers Lsn(0x30), ) @@ -7821,6 +8065,26 @@ mod tests { get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?, None ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 1b")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?, + Some(test_img("metadata inherited key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 1a")) + ); // test vectored get on child timeline assert_eq!( @@ -7835,6 +8099,82 @@ mod tests { get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?, None ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?, + Some(test_img("metadata inherited key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?, + Some(test_img("metadata inherited key 2")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 2b")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 2a")) + ); + + // test vectored scan on parent timeline + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); + let res = tline + .get_vectored_impl( + KeySpace::single(Key::metadata_key_range()), + lsn, + &mut reconstruct_state, + &ctx, + ) + .await?; + + assert_eq!( + res.into_iter() + .map(|(k, v)| (k, v.unwrap())) + .collect::>(), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 1a") + ), + (base_key, test_img("metadata key 1")), + (base_key_overwrite, test_img("metadata key overwrite 1b")), + ] + ); + + // test vectored scan on child timeline + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); + let 
res = child + .get_vectored_impl( + KeySpace::single(Key::metadata_key_range()), + lsn, + &mut reconstruct_state, + &ctx, + ) + .await?; + + assert_eq!( + res.into_iter() + .map(|(k, v)| (k, v.unwrap())) + .collect::>(), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_child, + test_img("metadata inherited key 2") + ), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 2a") + ), + (base_key_child, test_img("metadata key 2")), + (base_key_overwrite, test_img("metadata key overwrite 2b")), + ] + ); Ok(()) } @@ -7845,7 +8185,9 @@ mod tests { lsn: Lsn, ctx: &RequestContext, ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); + let io_concurrency = + IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap()); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let mut res = tline .get_vectored_impl( KeySpace::single(key..key.next()), @@ -7946,6 +8288,7 @@ mod tests { .await .unwrap(); let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -8005,7 +8348,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - .inspect_image_layers(Lsn(0x40), &ctx) + .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone()) .await .unwrap() .into_iter() @@ -8020,6 +8363,7 @@ mod tests { .await .unwrap(); let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); @@ -8070,7 +8414,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - .inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone()) .await .unwrap() .into_iter() @@ -8083,6 +8427,7 @@ mod tests { async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?; let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); fn get_key(id: u32) -> Key { // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. @@ -8172,7 +8517,7 @@ mod tests { .await?; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -8224,7 +8569,7 @@ mod tests { // Check if the image layer at the GC horizon contains exactly what we want let image_at_gc_horizon = tline - .inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x30), &ctx, io_concurrency.clone()) .await .unwrap() .into_iter() @@ -8280,7 +8625,7 @@ mod tests { // increase GC horizon and compact again { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x40)) .wait() @@ -8448,8 +8793,8 @@ mod tests { // Force set disk consistent lsn so we can get the cutoff at `end_lsn`. info!( - "latest_gc_cutoff_lsn: {}", - *timeline.get_latest_gc_cutoff_lsn() + "applied_gc_cutoff_lsn: {}", + *timeline.get_applied_gc_cutoff_lsn() ); timeline.force_set_disk_consistent_lsn(end_lsn); @@ -8475,7 +8820,7 @@ mod tests { // Make lease on a already GC-ed LSN. 
// 0/80 does not have a valid lease + is below latest_gc_cutoff - assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn()); + assert!(Lsn(0x80) < *timeline.get_applied_gc_cutoff_lsn()); timeline .init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx) .expect_err("lease request on GC-ed LSN should fail"); @@ -8666,7 +9011,7 @@ mod tests { }; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -8753,7 +9098,7 @@ mod tests { // increase GC horizon and compact again { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x40)) .wait() @@ -9206,7 +9551,7 @@ mod tests { .await?; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -9327,7 +9672,6 @@ mod tests { &cancel, CompactOptions { flags: dryrun_flags, - compact_range: None, ..Default::default() }, &ctx, @@ -9354,7 +9698,7 @@ mod tests { // increase GC horizon and compact again { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x38)) .wait() @@ -9455,7 +9799,7 @@ mod tests { .await?; { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -9576,7 +9920,6 @@ mod tests { &cancel, CompactOptions { flags: dryrun_flags, - compact_range: None, ..Default::default() }, &ctx, @@ -9606,6 +9949,8 @@ mod tests { #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { + use models::CompactLsnRange; + let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; let (tenant, ctx) = harness.load().await; @@ -9705,7 +10050,7 @@ mod tests { { parent_tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x10)) .wait() @@ -9725,7 +10070,7 @@ mod tests { { branch_tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x50)) .wait() @@ -9798,6 +10143,22 @@ mod tests { verify_result().await; + // Piggyback a compaction with above_lsn. Ensure it works correctly when the specified LSN intersects with the layer files. + // Now we already have a single large delta layer, so the compaction min_layer_lsn should be the same as ancestor LSN (0x18). 
+ branch_tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x40))), + ..Default::default() + }, + &ctx, + ) + .await + .unwrap(); + + verify_result().await; + Ok(()) } @@ -9921,7 +10282,12 @@ mod tests { let keyspace = KeySpace::single(get_key(0)..get_key(10)); let results = tline - .get_vectored(keyspace, delta_layer_end_lsn, &ctx) + .get_vectored( + keyspace, + delta_layer_end_lsn, + IoConcurrency::sequential(), + &ctx, + ) .await .expect("No vectored errors"); for (key, res) in results { @@ -10060,7 +10426,7 @@ mod tests { { tline - .latest_gc_cutoff_lsn + .applied_gc_cutoff_lsn .lock_for_write() .store_and_unlock(Lsn(0x30)) .wait() @@ -10086,7 +10452,7 @@ mod tests { &cancel, CompactOptions { flags: EnumSet::new(), - compact_range: Some((get_key(0)..get_key(2)).into()), + compact_key_range: Some((get_key(0)..get_key(2)).into()), ..Default::default() }, &ctx, @@ -10133,7 +10499,7 @@ mod tests { &cancel, CompactOptions { flags: EnumSet::new(), - compact_range: Some((get_key(2)..get_key(4)).into()), + compact_key_range: Some((get_key(2)..get_key(4)).into()), ..Default::default() }, &ctx, @@ -10185,7 +10551,7 @@ mod tests { &cancel, CompactOptions { flags: EnumSet::new(), - compact_range: Some((get_key(4)..get_key(9)).into()), + compact_key_range: Some((get_key(4)..get_key(9)).into()), ..Default::default() }, &ctx, @@ -10236,7 +10602,7 @@ mod tests { &cancel, CompactOptions { flags: EnumSet::new(), - compact_range: Some((get_key(9)..get_key(10)).into()), + compact_key_range: Some((get_key(9)..get_key(10)).into()), ..Default::default() }, &ctx, @@ -10292,7 +10658,7 @@ mod tests { &cancel, CompactOptions { flags: EnumSet::new(), - compact_range: Some((get_key(0)..get_key(10)).into()), + compact_key_range: Some((get_key(0)..get_key(10)).into()), ..Default::default() }, &ctx, @@ -10321,7 +10687,6 @@ mod tests { }, ], ); - Ok(()) } @@ -10374,4 +10739,602 @@ mod tests { Ok(()) } + + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_simple_bottom_most_compaction_above_lsn() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_above_lsn").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
+ let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + )]; + let delta4 = vec![( + get_key(1), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + )]; + let delta2 = vec![ + ( + get_key(1), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(1), + Lsn(0x38), + Value::WalRecord(NeonWalRecord::wal_append("@0x38")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + // delta1/2/4 only contain a single key but multiple updates + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x28)..Lsn(0x30), delta4), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + tline + .applied_gc_cutoff_lsn + .lock_for_write() + .store_and_unlock(Lsn(0x30)) + .wait() + .await; + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![ + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), + ], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_20 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_10 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 
5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + let gc_horizon = { + let gc_info = tline.gc_info.read().unwrap(); + gc_info.cutoffs.time + }; + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), gc_horizon, &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x20), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_20[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x10), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_10[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x28))), + ..Default::default() + }, + &ctx, + ) + .await + .unwrap(); + verify_result().await; + + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + // The original image layer, not compacted + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + // Delta layer below the specified above_lsn not compacted + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x28), + is_delta: true, + }, + // Delta layer compacted above the LSN + PersistentLayerKey { + key_range: get_key(1)..get_key(10), + lsn_range: Lsn(0x28)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + // compact again + tline + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .await + .unwrap(); + verify_result().await; + + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + // The compacted image layer (full key range) + PersistentLayerKey { + key_range: Key::MIN..Key::MAX, + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + // All other data in the delta layer + PersistentLayerKey { + key_range: get_key(1)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + Ok(()) + } + + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_simple_bottom_most_compaction_rectangle() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_rectangle").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
+ let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + )]; + let delta4 = vec![( + get_key(1), + Lsn(0x28), + Value::WalRecord(NeonWalRecord::wal_append("@0x28")), + )]; + let delta2 = vec![ + ( + get_key(1), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append("@0x30")), + ), + ( + get_key(1), + Lsn(0x38), + Value::WalRecord(NeonWalRecord::wal_append("@0x38")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + // delta1/2/4 only contain a single key but multiple updates + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x28)..Lsn(0x30), delta4), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + tline + .applied_gc_cutoff_lsn + .lock_for_write() + .store_and_unlock(Lsn(0x30)) + .wait() + .await; + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![ + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), + ], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let expected_result = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), + ]; + + let expected_result_at_gc_horizon = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_20 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10@0x20"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let expected_result_at_lsn_10 = [ + Bytes::from_static(b"value 0@0x10"), + Bytes::from_static(b"value 1@0x10"), + Bytes::from_static(b"value 2@0x10"), + Bytes::from_static(b"value 3@0x10"), + Bytes::from_static(b"value 4@0x10"), + Bytes::from_static(b"value 
5@0x10"), + Bytes::from_static(b"value 6@0x10"), + Bytes::from_static(b"value 7@0x10"), + Bytes::from_static(b"value 8@0x10"), + Bytes::from_static(b"value 9@0x10"), + ]; + + let verify_result = || async { + let gc_horizon = { + let gc_info = tline.gc_info.read().unwrap(); + gc_info.cutoffs.time + }; + for idx in 0..10 { + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x50), &ctx) + .await + .unwrap(), + &expected_result[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), gc_horizon, &ctx) + .await + .unwrap(), + &expected_result_at_gc_horizon[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x20), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_20[idx] + ); + assert_eq!( + tline + .get(get_key(idx as u32), Lsn(0x10), &ctx) + .await + .unwrap(), + &expected_result_at_lsn_10[idx] + ); + } + }; + + verify_result().await; + + let cancel = CancellationToken::new(); + + tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_key_range: Some((get_key(0)..get_key(2)).into()), + compact_lsn_range: Some((Lsn(0x20)..Lsn(0x28)).into()), + ..Default::default() + }, + &ctx, + ) + .await + .unwrap(); + verify_result().await; + + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + // The original image layer, not compacted + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + // According the selection logic, we select all layers with start key <= 0x28, so we would merge the layer 0x20-0x28 and + // the layer 0x28-0x30 into one. + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x30), + is_delta: true, + }, + // Above the upper bound and untouched + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x30)..Lsn(0x50), + is_delta: true, + }, + // This layer is untouched + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x30)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_key_range: Some((get_key(3)..get_key(8)).into()), + compact_lsn_range: Some((Lsn(0x28)..Lsn(0x40)).into()), + ..Default::default() + }, + &ctx, + ) + .await + .unwrap(); + verify_result().await; + + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + // The original image layer, not compacted + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + // Not in the compaction key range, uncompacted + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x30), + is_delta: true, + }, + // Not in the compaction key range, uncompacted but need rewrite because the delta layer overlaps with the range + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x30)..Lsn(0x50), + is_delta: true, + }, + // Note that when we specify the LSN upper bound to be 0x40, the compaction algorithm will not try to cut the layer + // horizontally in half. Instead, it will include all LSNs that overlap with 0x40. So the real max_lsn of the compaction + // becomes 0x50. 
+ PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x30)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + // compact again + tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_key_range: Some((get_key(0)..get_key(5)).into()), + compact_lsn_range: Some((Lsn(0x20)..Lsn(0x50)).into()), + ..Default::default() + }, + &ctx, + ) + .await + .unwrap(); + verify_result().await; + + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + // The original image layer, not compacted + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + // The range gets compacted + PersistentLayerKey { + key_range: get_key(1)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x50), + is_delta: true, + }, + // Not touched during this iteration of compaction + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x30)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + // final full compaction + tline + .compact_with_gc(&cancel, CompactOptions::default(), &ctx) + .await + .unwrap(); + verify_result().await; + + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + // The compacted image layer (full key range) + PersistentLayerKey { + key_range: Key::MIN..Key::MAX, + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + // All other data in the delta layer + PersistentLayerKey { + key_range: get_key(1)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 4a5158349a..ad66f7b4a7 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -14,6 +14,10 @@ //! len < 128: 0XXXXXXX //! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! +use std::cmp::min; +use std::io::{Error, ErrorKind}; +use std::sync::Arc; + use async_compression::Level; use bytes::{BufMut, BytesMut}; use pageserver_api::models::ImageCompressionAlgorithm; @@ -24,12 +28,10 @@ use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; +use crate::virtual_file::IoBufferMut; +use crate::virtual_file::VirtualFile; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::owned_buffers_io::write::BufferedWriter; -use crate::virtual_file::{IoBufferMut, VirtualFile}; -use std::cmp::min; -use std::io::{Error, ErrorKind}; -use std::sync::Arc; #[derive(Copy, Clone, Debug)] pub struct CompressionInfo { @@ -37,7 +39,7 @@ pub struct CompressionInfo { pub compressed_size: Option, } -impl<'a> BlockCursor<'a> { +impl BlockCursor<'_> { /// Read a blob into a new buffer. 
pub async fn read_blob( &self, @@ -326,12 +328,15 @@ impl BlobWriter { #[cfg(test)] pub(crate) mod tests { - use super::*; - use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use rand::{Rng, SeedableRng}; + use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::tenant::block_io::BlockReaderRef; + async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { round_trip_test_compressed(blobs, false).await } @@ -397,7 +402,7 @@ pub(crate) mod tests { pub(crate) fn random_array(len: usize) -> Vec { let mut rng = rand::thread_rng(); - (0..len).map(|_| rng.gen()).collect::<_>() + (0..len).map(|_| rng.r#gen()).collect::<_>() } #[tokio::test] @@ -449,9 +454,9 @@ pub(crate) mod tests { let mut rng = rand::rngs::StdRng::seed_from_u64(42); let blobs = (0..1024) .map(|_| { - let mut sz: u16 = rng.gen(); + let mut sz: u16 = rng.r#gen(); // Make 50% of the arrays small - if rng.gen() { + if rng.r#gen() { sz &= 63; } random_array(sz.into()) diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 5fcb91a57d..a8cffa2aa1 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -2,13 +2,14 @@ //! Low-level Block-oriented I/O functions //! +use std::ops::Deref; + use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; -use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; +use crate::page_cache::{self, FileId, PAGE_SZ, PageReadGuard, PageWriteGuard, ReadBufResult}; #[cfg(test)] use crate::virtual_file::IoBufferMut; use crate::virtual_file::{IoBuffer, VirtualFile}; -use std::ops::Deref; /// This is implemented by anything that can read 8 kB (PAGE_SZ) /// blocks, using the page cache @@ -88,7 +89,7 @@ pub(crate) enum BlockReaderRef<'a> { VirtualFile(&'a VirtualFile), } -impl<'a> BlockReaderRef<'a> { +impl BlockReaderRef<'_> { #[inline(always)] async fn read_blk( &self, diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs index 1e8fa8d1d6..d5b979ab2a 100644 --- a/pageserver/src/tenant/checks.rs +++ b/pageserver/src/tenant/checks.rs @@ -1,12 +1,15 @@ use std::collections::BTreeSet; use itertools::Itertools; +use pageserver_compaction::helpers::overlaps_with; use super::storage_layer::LayerName; /// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). /// -/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, +/// The function implements a fast path check and a slow path check. +/// +/// The fast path checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, /// /// ```plain /// | | | | @@ -25,31 +28,47 @@ use super::storage_layer::LayerName; /// | | | 4 | | | /// /// If layer 2 and 4 contain the same single key, this is also a valid layer map. +/// +/// However, if a partial compaction is still going on, it is possible that we get a layer map not satisfying the above condition. +/// Therefore, we fallback to simply check if any of the two delta layers overlap. (See "A slow path...") pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) 
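// Annotation (not part of the patch): the slow path described in the doc comment of
// check_valid_layermap above, and implemented further down, only reports an error
// when two delta layers overlap in both dimensions, using
// `pageserver_compaction::helpers::overlaps_with` on the LSN range and on the key
// range. Assuming the usual half-open-range definition
// (`a.start < b.end && b.start < a.end`), for example:
//
//     overlaps_with(&(10..20), &(15..25))  // true: partial overlap
//     overlaps_with(&(10..20), &(20..30))  // false: adjacent ranges only touch
//
// so stacks of adjacent delta layers produced by normal compaction do not trigger
// the check.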
let mut all_delta_layers = Vec::new(); for name in metadata { if let LayerName::Delta(layer) = name { - if layer.key_range.start.next() != layer.key_range.end { - all_delta_layers.push(layer.clone()); - } + all_delta_layers.push(layer.clone()); } } for layer in &all_delta_layers { - let lsn_range = &layer.lsn_range; - lsn_split_point.insert(lsn_range.start); - lsn_split_point.insert(lsn_range.end); + if layer.key_range.start.next() != layer.key_range.end { + let lsn_range = &layer.lsn_range; + lsn_split_point.insert(lsn_range.start); + lsn_split_point.insert(lsn_range.end); + } } - for layer in &all_delta_layers { + for (idx, layer) in all_delta_layers.iter().enumerate() { + if layer.key_range.start.next() == layer.key_range.end { + continue; + } let lsn_range = layer.lsn_range.clone(); let intersects = lsn_split_point.range(lsn_range).collect_vec(); if intersects.len() > 1 { - let err = format!( - "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", - layer, - intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") - ); - return Some(err); + // A slow path to check if the layer intersects with any other delta layer. + for (other_idx, other_layer) in all_delta_layers.iter().enumerate() { + if other_idx == idx { + // do not check self intersects with self + continue; + } + if overlaps_with(&layer.lsn_range, &other_layer.lsn_range) + && overlaps_with(&layer.key_range, &other_layer.key_range) + { + let err = format!( + "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", + layer, other_layer + ); + return Some(err); + } + } } } None diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 5d3ac5a8e3..334fb04604 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -8,16 +8,17 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! 
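// Annotation (not part of the patch): a minimal sketch of the override pattern used
// by the per-tenant configuration code below. Each tenant-level setting is an
// `Option` that falls back to the global default at merge time, mirroring the
// `unwrap_or(global_conf.field)` calls in `TenantConfOpt::merge`. The struct and
// field names here are illustrative, not the real types.
struct GlobalConf {
    compaction_threshold: usize,
    gc_horizon: u64,
}

#[derive(Default)]
struct TenantOverrides {
    compaction_threshold: Option<usize>,
    gc_horizon: Option<u64>,
}

impl TenantOverrides {
    /// Resolve the effective configuration for one tenant.
    fn merge(&self, global: &GlobalConf) -> GlobalConf {
        GlobalConf {
            compaction_threshold: self
                .compaction_threshold
                .unwrap_or(global.compaction_threshold),
            gc_horizon: self.gc_horizon.unwrap_or(global.gc_horizon),
        }
    }
}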
+use std::num::NonZeroU64; +use std::time::Duration; + pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; -use pageserver_api::models::CompactionAlgorithmSettings; -use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::{self, ThrottleConfig}; +use pageserver_api::models::{ + self, CompactionAlgorithmSettings, EvictionPolicy, TenantConfigPatch, +}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; use serde_json::Value; -use std::num::NonZeroU64; -use std::time::Duration; use utils::generation::Generation; use utils::postgres_client::PostgresClientProtocol; @@ -277,10 +278,34 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_upper_limit: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub compaction_algorithm: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_l0_first: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_l0_semaphore: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub l0_flush_delay_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub l0_flush_stall_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub l0_flush_wait_upload: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub gc_horizon: Option, @@ -341,6 +366,9 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_creation_check_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub image_creation_preempt_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] #[serde(default)] @@ -357,6 +385,18 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub wal_receiver_protocol_override: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub rel_size_v2_enabled: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_enabled: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_initial_threshold_kb: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_ratio_percent: Option, } impl TenantConfOpt { @@ -377,11 +417,29 @@ impl TenantConfOpt { compaction_threshold: self .compaction_threshold .unwrap_or(global_conf.compaction_threshold), + compaction_upper_limit: self + .compaction_upper_limit + .unwrap_or(global_conf.compaction_upper_limit), compaction_algorithm: self .compaction_algorithm .as_ref() .unwrap_or(&global_conf.compaction_algorithm) .clone(), + compaction_l0_first: self + .compaction_l0_first + .unwrap_or(global_conf.compaction_l0_first), + compaction_l0_semaphore: self + .compaction_l0_semaphore + .unwrap_or(global_conf.compaction_l0_semaphore), + l0_flush_delay_threshold: self + .l0_flush_delay_threshold + .or(global_conf.l0_flush_delay_threshold), + l0_flush_stall_threshold: self + .l0_flush_stall_threshold + .or(global_conf.l0_flush_stall_threshold), + l0_flush_wait_upload: self + .l0_flush_wait_upload + .unwrap_or(global_conf.l0_flush_wait_upload), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: 
self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self @@ -413,6 +471,9 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), + image_creation_preempt_threshold: self + .image_creation_preempt_threshold + .unwrap_or(global_conf.image_creation_preempt_threshold), lsn_lease_length: self .lsn_lease_length .unwrap_or(global_conf.lsn_lease_length), @@ -420,13 +481,197 @@ impl TenantConfOpt { .lsn_lease_length_for_ts .unwrap_or(global_conf.lsn_lease_length_for_ts), timeline_offloading: self - .lazy_slru_download + .timeline_offloading .unwrap_or(global_conf.timeline_offloading), wal_receiver_protocol_override: self .wal_receiver_protocol_override .or(global_conf.wal_receiver_protocol_override), + rel_size_v2_enabled: self + .rel_size_v2_enabled + .unwrap_or(global_conf.rel_size_v2_enabled), + gc_compaction_enabled: self + .gc_compaction_enabled + .unwrap_or(global_conf.gc_compaction_enabled), + gc_compaction_initial_threshold_kb: self + .gc_compaction_initial_threshold_kb + .unwrap_or(global_conf.gc_compaction_initial_threshold_kb), + gc_compaction_ratio_percent: self + .gc_compaction_ratio_percent + .unwrap_or(global_conf.gc_compaction_ratio_percent), } } + + pub fn apply_patch(self, patch: TenantConfigPatch) -> anyhow::Result { + let Self { + mut checkpoint_distance, + mut checkpoint_timeout, + mut compaction_target_size, + mut compaction_period, + mut compaction_threshold, + mut compaction_upper_limit, + mut compaction_algorithm, + mut compaction_l0_first, + mut compaction_l0_semaphore, + mut l0_flush_delay_threshold, + mut l0_flush_stall_threshold, + mut l0_flush_wait_upload, + mut gc_horizon, + mut gc_period, + mut image_creation_threshold, + mut pitr_interval, + mut walreceiver_connect_timeout, + mut lagging_wal_timeout, + mut max_lsn_wal_lag, + mut eviction_policy, + mut min_resident_size_override, + mut evictions_low_residence_duration_metric_threshold, + mut heatmap_period, + mut lazy_slru_download, + mut timeline_get_throttle, + mut image_layer_creation_check_threshold, + mut image_creation_preempt_threshold, + mut lsn_lease_length, + mut lsn_lease_length_for_ts, + mut timeline_offloading, + mut wal_receiver_protocol_override, + mut rel_size_v2_enabled, + mut gc_compaction_enabled, + mut gc_compaction_initial_threshold_kb, + mut gc_compaction_ratio_percent, + } = self; + + patch.checkpoint_distance.apply(&mut checkpoint_distance); + patch + .checkpoint_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut checkpoint_timeout); + patch + .compaction_target_size + .apply(&mut compaction_target_size); + patch + .compaction_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut compaction_period); + patch.compaction_threshold.apply(&mut compaction_threshold); + patch + .compaction_upper_limit + .apply(&mut compaction_upper_limit); + patch.compaction_algorithm.apply(&mut compaction_algorithm); + patch.compaction_l0_first.apply(&mut compaction_l0_first); + patch + .compaction_l0_semaphore + .apply(&mut compaction_l0_semaphore); + patch + .l0_flush_delay_threshold + .apply(&mut l0_flush_delay_threshold); + patch + .l0_flush_stall_threshold + .apply(&mut l0_flush_stall_threshold); + patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); + patch.gc_horizon.apply(&mut gc_horizon); + patch + .gc_period + .map(|v| humantime::parse_duration(&v))? 
+ .apply(&mut gc_period); + patch + .image_creation_threshold + .apply(&mut image_creation_threshold); + patch + .pitr_interval + .map(|v| humantime::parse_duration(&v))? + .apply(&mut pitr_interval); + patch + .walreceiver_connect_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut walreceiver_connect_timeout); + patch + .lagging_wal_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lagging_wal_timeout); + patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag); + patch.eviction_policy.apply(&mut eviction_policy); + patch + .min_resident_size_override + .apply(&mut min_resident_size_override); + patch + .evictions_low_residence_duration_metric_threshold + .map(|v| humantime::parse_duration(&v))? + .apply(&mut evictions_low_residence_duration_metric_threshold); + patch + .heatmap_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut heatmap_period); + patch.lazy_slru_download.apply(&mut lazy_slru_download); + patch + .timeline_get_throttle + .apply(&mut timeline_get_throttle); + patch + .image_layer_creation_check_threshold + .apply(&mut image_layer_creation_check_threshold); + patch + .image_creation_preempt_threshold + .apply(&mut image_creation_preempt_threshold); + patch + .lsn_lease_length + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lsn_lease_length); + patch + .lsn_lease_length_for_ts + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lsn_lease_length_for_ts); + patch.timeline_offloading.apply(&mut timeline_offloading); + patch + .wal_receiver_protocol_override + .apply(&mut wal_receiver_protocol_override); + patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled); + patch + .gc_compaction_enabled + .apply(&mut gc_compaction_enabled); + patch + .gc_compaction_initial_threshold_kb + .apply(&mut gc_compaction_initial_threshold_kb); + patch + .gc_compaction_ratio_percent + .apply(&mut gc_compaction_ratio_percent); + + Ok(Self { + checkpoint_distance, + checkpoint_timeout, + compaction_target_size, + compaction_period, + compaction_threshold, + compaction_upper_limit, + compaction_algorithm, + compaction_l0_first, + compaction_l0_semaphore, + l0_flush_delay_threshold, + l0_flush_stall_threshold, + l0_flush_wait_upload, + gc_horizon, + gc_period, + image_creation_threshold, + pitr_interval, + walreceiver_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + eviction_policy, + min_resident_size_override, + evictions_low_residence_duration_metric_threshold, + heatmap_period, + lazy_slru_download, + timeline_get_throttle, + image_layer_creation_check_threshold, + image_creation_preempt_threshold, + lsn_lease_length, + lsn_lease_length_for_ts, + timeline_offloading, + wal_receiver_protocol_override, + rel_size_v2_enabled, + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + }) + } } impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { @@ -449,46 +694,56 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { /// This is a conversion from our internal tenant config object to the one used /// in external APIs. impl From for models::TenantConfig { + // TODO(vlad): These are now the same, but they have different serialization logic. + // Can we merge them? 
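Aside (not part of the patch): the new `apply_patch` above destructures every field into a mutable local, applies the matching patch field, and reassembles the struct, so a field that is absent from `TenantConfigPatch` leaves the current override alone while an explicit null clears it and lets `merge` fall back to the global default. Below is a minimal self-contained sketch of that tri-state `.apply(&mut field)` behaviour; the `FieldPatch` enum here is a simplified stand-in for illustration, not the actual pageserver_api type.

// Sketch only: simplified stand-in for the patch field type that apply_patch relies on.
enum FieldPatch<T> {
    /// Field absent from the patch request: keep the current per-tenant override.
    Noop,
    /// Field present with a value: replace the current override.
    Upsert(T),
    /// Field explicitly set to null: drop the override so merge() falls back to the global default.
    Remove,
}

impl<T> FieldPatch<T> {
    fn apply(self, target: &mut Option<T>) {
        match self {
            FieldPatch::Noop => {}
            FieldPatch::Upsert(value) => *target = Some(value),
            FieldPatch::Remove => *target = None,
        }
    }
}

fn main() {
    let mut compaction_threshold: Option<usize> = Some(10);

    FieldPatch::Noop.apply(&mut compaction_threshold);
    assert_eq!(compaction_threshold, Some(10)); // untouched

    FieldPatch::Upsert(20).apply(&mut compaction_threshold);
    assert_eq!(compaction_threshold, Some(20)); // overridden

    FieldPatch::<usize>::Remove.apply(&mut compaction_threshold);
    assert_eq!(compaction_threshold, None); // back to the global default
}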
fn from(value: TenantConfOpt) -> Self { - fn humantime(d: Duration) -> String { - format!("{}s", d.as_secs()) - } Self { checkpoint_distance: value.checkpoint_distance, - checkpoint_timeout: value.checkpoint_timeout.map(humantime), + checkpoint_timeout: value.checkpoint_timeout, compaction_algorithm: value.compaction_algorithm, compaction_target_size: value.compaction_target_size, - compaction_period: value.compaction_period.map(humantime), + compaction_period: value.compaction_period, compaction_threshold: value.compaction_threshold, + compaction_upper_limit: value.compaction_upper_limit, + compaction_l0_first: value.compaction_l0_first, + compaction_l0_semaphore: value.compaction_l0_semaphore, + l0_flush_delay_threshold: value.l0_flush_delay_threshold, + l0_flush_stall_threshold: value.l0_flush_stall_threshold, + l0_flush_wait_upload: value.l0_flush_wait_upload, gc_horizon: value.gc_horizon, - gc_period: value.gc_period.map(humantime), + gc_period: value.gc_period, image_creation_threshold: value.image_creation_threshold, - pitr_interval: value.pitr_interval.map(humantime), - walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime), - lagging_wal_timeout: value.lagging_wal_timeout.map(humantime), + pitr_interval: value.pitr_interval, + walreceiver_connect_timeout: value.walreceiver_connect_timeout, + lagging_wal_timeout: value.lagging_wal_timeout, max_lsn_wal_lag: value.max_lsn_wal_lag, eviction_policy: value.eviction_policy, min_resident_size_override: value.min_resident_size_override, evictions_low_residence_duration_metric_threshold: value - .evictions_low_residence_duration_metric_threshold - .map(humantime), - heatmap_period: value.heatmap_period.map(humantime), + .evictions_low_residence_duration_metric_threshold, + heatmap_period: value.heatmap_period, lazy_slru_download: value.lazy_slru_download, - timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), + timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, - lsn_lease_length: value.lsn_lease_length.map(humantime), - lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), + image_creation_preempt_threshold: value.image_creation_preempt_threshold, + lsn_lease_length: value.lsn_lease_length, + lsn_lease_length_for_ts: value.lsn_lease_length_for_ts, timeline_offloading: value.timeline_offloading, wal_receiver_protocol_override: value.wal_receiver_protocol_override, + rel_size_v2_enabled: value.rel_size_v2_enabled, + gc_compaction_enabled: value.gc_compaction_enabled, + gc_compaction_initial_threshold_kb: value.gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent: value.gc_compaction_ratio_percent, } } } #[cfg(test)] mod tests { - use super::*; use models::TenantConfig; + use super::*; + #[test] fn de_serializing_pageserver_config_omits_empty_values() { let small_conf = TenantConfOpt { @@ -505,29 +760,10 @@ mod tests { assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap()); } - #[test] - fn test_try_from_models_tenant_config_err() { - let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5a".to_string()), - ..TenantConfig::default() - }; - - let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config); - - assert!( - tenant_conf_opt.is_err(), - "Suceeded to convert TenantConfig to TenantConfOpt" - ); - - let expected_error_str = - "lagging_wal_timeout: invalid value: string \"5a\", expected a duration"; - 
assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str); - } - #[test] fn test_try_from_models_tenant_config_success() { let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5s".to_string()), + lagging_wal_timeout: Some(Duration::from_secs(5)), ..TenantConfig::default() }; diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 49f7bd9a65..cdee42239f 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -18,28 +18,24 @@ //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! +use std::cmp::Ordering; +use std::iter::Rev; +use std::ops::{Range, RangeInclusive}; +use std::{io, result}; + use async_stream::try_stream; -use byteorder::{ReadBytesExt, BE}; +use byteorder::{BE, ReadBytesExt}; use bytes::BufMut; use either::Either; use futures::{Stream, StreamExt}; use hex; -use std::{ - cmp::Ordering, - io, - iter::Rev, - ops::{Range, RangeInclusive}, - result, -}; use thiserror::Error; use tracing::error; -use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - tenant::block_io::{BlockReader, BlockWriter}, - virtual_file::{owned_buffers_io::write::Buffer, IoBuffer, IoBufferMut}, -}; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::TaskKind; +use crate::tenant::block_io::{BlockReader, BlockWriter}; +use crate::virtual_file::{IoBuffer, IoBufferMut, owned_buffers_io::write::Buffer}; // The maximum size of a value stored in the B-tree. 5 bytes is enough currently. pub const VALUE_SZ: usize = 5; @@ -85,17 +81,17 @@ impl Value { fn to_u64(self) -> u64 { let b = &self.0; - (b[0] as u64) << 32 - | (b[1] as u64) << 24 - | (b[2] as u64) << 16 - | (b[3] as u64) << 8 + ((b[0] as u64) << 32) + | ((b[1] as u64) << 24) + | ((b[2] as u64) << 16) + | ((b[3] as u64) << 8) | b[4] as u64 } fn to_blknum(self) -> u32 { let b = &self.0; assert!(b[0] == 0x80); - (b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32 + ((b[1] as u32) << 24) | ((b[2] as u32) << 16) | ((b[3] as u32) << 8) | b[4] as u32 } } @@ -533,7 +529,7 @@ pub struct DiskBtreeIterator<'a> { >, } -impl<'a> DiskBtreeIterator<'a> { +impl DiskBtreeIterator<'_> { pub async fn next(&mut self) -> Option, u64), DiskBtreeError>> { self.stream.next().await } @@ -834,12 +830,14 @@ impl BuildNode { #[cfg(test)] pub(crate) mod tests { - use super::*; - use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; - use rand::Rng; use std::collections::BTreeMap; use std::sync::atomic::{AtomicUsize, Ordering}; + use rand::Rng; + + use super::*; + use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; + #[derive(Clone, Default)] pub(crate) struct TestDisk { blocks: Vec, @@ -1116,7 +1114,7 @@ pub(crate) mod tests { // Test get() operations on random keys, most of which will not exist for _ in 0..100000 { - let key_int = rand::thread_rng().gen::(); + let key_int = rand::thread_rng().r#gen::(); let search_key = u128::to_be_bytes(key_int); assert!(reader.get(&search_key, &ctx).await? == all_data.get(&key_int).cloned()); } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 60a10efc41..17d6acafd8 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -1,6 +1,17 @@ //! Implementation of append-only file data structure //! used to keep in-memory layers spilled on disk. 
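Aside (not part of the patch): two details of the `disk_btree.rs` hunk above. First, the `r#gen` spelling in the test is the raw-identifier form, needed because `gen` is a reserved keyword in the Rust 2024 edition. Second, the only change to `Value::to_u64` and `Value::to_blknum` is explicit parentheses around each shift; since `<<` binds tighter than `|`, the decoded value is unchanged and the parentheses presumably just satisfy a precedence lint. A standalone check of that equivalence (an assumed illustration, not taken from the patch):

// Decode a 5-byte big-endian integer both with the explicit shift/or expression
// used by Value::to_u64 and via u64::from_be_bytes on a zero-padded buffer.
fn to_u64(b: [u8; 5]) -> u64 {
    ((b[0] as u64) << 32)
        | ((b[1] as u64) << 24)
        | ((b[2] as u64) << 16)
        | ((b[3] as u64) << 8)
        | b[4] as u64
}

fn to_u64_via_be_bytes(b: [u8; 5]) -> u64 {
    let mut padded = [0u8; 8];
    padded[3..].copy_from_slice(&b); // place the 5 bytes in the low-order positions
    u64::from_be_bytes(padded)
}

fn main() {
    for b in [[0u8, 0, 0, 0, 0], [0x12, 0x34, 0x56, 0x78, 0x9a], [0xff; 5]] {
        assert_eq!(to_u64(b), to_u64_via_be_bytes(b));
    }
}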
+use std::io; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; + +use camino::Utf8PathBuf; +use num_traits::Num; +use pageserver_api::shard::TenantShardId; +use tokio_epoll_uring::{BoundedBuf, Slice}; +use tracing::error; +use utils::id::TimelineId; + use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::RequestContext; @@ -9,17 +20,7 @@ use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; use crate::virtual_file::owned_buffers_io::write::Buffer; -use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; -use camino::Utf8PathBuf; -use num_traits::Num; -use pageserver_api::shard::TenantShardId; -use tokio_epoll_uring::{BoundedBuf, Slice}; -use tracing::error; - -use std::io; -use std::sync::atomic::AtomicU64; -use std::sync::Arc; -use utils::id::TimelineId; +use crate::virtual_file::{self, IoBufferMut, VirtualFile, owned_buffers_io}; pub struct EphemeralFile { _tenant_shard_id: TenantShardId, @@ -175,11 +176,11 @@ impl EphemeralFile { } impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, dst: tokio_epoll_uring::Slice, - ctx: &'a RequestContext, + ctx: &RequestContext, ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { let submitted_offset = self.buffered_writer.submit_offset(); @@ -320,13 +321,14 @@ pub fn is_ephemeral_file(filename: &str) -> bool { #[cfg(test)] mod tests { + use std::fs; + use std::str::FromStr; + use rand::Rng; use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - use std::fs; - use std::str::FromStr; fn harness( test_name: &str, diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs index af73acb2be..7aa920c953 100644 --- a/pageserver/src/tenant/gc_block.rs +++ b/pageserver/src/tenant/gc_block.rs @@ -1,4 +1,5 @@ -use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; +use std::sync::Arc; use utils::id::TimelineId; diff --git a/pageserver/src/tenant/gc_result.rs b/pageserver/src/tenant/gc_result.rs index c805aafeab..7a7d6d19cb 100644 --- a/pageserver/src/tenant/gc_result.rs +++ b/pageserver/src/tenant/gc_result.rs @@ -1,8 +1,9 @@ -use anyhow::Result; -use serde::Serialize; use std::ops::AddAssign; use std::time::Duration; +use anyhow::Result; +use serde::Serialize; + /// /// Result of performing GC /// diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 7f15baed10..59f5a6bd90 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -46,28 +46,28 @@ mod historic_layer_coverage; mod layer_coverage; -use crate::context::RequestContext; -use crate::keyspace::KeyPartitioning; -use crate::tenant::storage_layer::InMemoryLayer; -use anyhow::Result; -use pageserver_api::key::Key; -use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; -use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use std::collections::{HashMap, VecDeque}; use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; -use utils::lsn::Lsn; +use anyhow::Result; use historic_layer_coverage::BufferedHistoricLayerCoverage; pub use historic_layer_coverage::LayerKey; +use pageserver_api::key::Key; +use 
pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; +use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; +use tokio::sync::watch; +use utils::lsn::Lsn; use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc}; +use crate::context::RequestContext; +use crate::keyspace::KeyPartitioning; +use crate::tenant::storage_layer::InMemoryLayer; /// /// LayerMap tracks what layers exist on a timeline. /// -#[derive(Default)] pub struct LayerMap { // // 'open_layer' holds the current InMemoryLayer that is accepting new @@ -93,7 +93,25 @@ pub struct LayerMap { /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. + /// + /// NB: make sure to notify `watch_l0_deltas` on changes. l0_delta_layers: Vec>, + + /// Notifies about L0 delta layer changes, sending the current number of L0 layers. + watch_l0_deltas: watch::Sender, +} + +impl Default for LayerMap { + fn default() -> Self { + Self { + open_layer: Default::default(), + next_open_layer_at: Default::default(), + frozen_layers: Default::default(), + historic: Default::default(), + l0_delta_layers: Default::default(), + watch_l0_deltas: watch::channel(0).0, + } + } } /// The primary update API for the layer map. @@ -392,8 +410,8 @@ impl LayerMap { image_layer: Option>, end_lsn: Lsn, ) -> Option { - assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta())); - assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta())); + assert!(delta_layer.as_ref().is_none_or(|l| l.is_delta())); + assert!(image_layer.as_ref().is_none_or(|l| !l.is_delta())); match (delta_layer, image_layer) { (None, None) => None, @@ -466,6 +484,8 @@ impl LayerMap { if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) { self.l0_delta_layers.push(layer_desc.clone().into()); + self.watch_l0_deltas + .send_replace(self.l0_delta_layers.len()); } self.historic.insert( @@ -488,6 +508,8 @@ impl LayerMap { let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers); l0_delta_layers.retain(|other| other.key() != layer_key); self.l0_delta_layers = l0_delta_layers; + self.watch_l0_deltas + .send_replace(self.l0_delta_layers.len()); // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers, // there's a chance that the comparison fails at runtime due to it comparing (pointer, // vtable) pairs. @@ -850,6 +872,11 @@ impl LayerMap { &self.l0_delta_layers } + /// Subscribes to L0 delta layer changes, sending the current number of L0 delta layers. 
+ pub fn watch_level0_deltas(&self) -> watch::Receiver { + self.watch_l0_deltas.subscribe() + } + /// debugging function to print out the contents of the layer map #[allow(unused)] pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { @@ -1039,18 +1066,17 @@ impl LayerMap { #[cfg(test)] mod tests { - use crate::tenant::{storage_layer::LayerName, IndexPart}; - use pageserver_api::{ - key::DBDIR_KEY, - keyspace::{KeySpace, KeySpaceRandomAccum}, - }; - use std::{collections::HashMap, path::PathBuf}; - use utils::{ - id::{TenantId, TimelineId}, - shard::TenantShardId, - }; + use std::collections::HashMap; + use std::path::PathBuf; + + use pageserver_api::key::DBDIR_KEY; + use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; + use utils::id::{TenantId, TimelineId}; + use utils::shard::TenantShardId; use super::*; + use crate::tenant::IndexPart; + use crate::tenant::storage_layer::LayerName; #[derive(Clone)] struct LayerDesc { @@ -1390,9 +1416,11 @@ mod tests { assert!(!shadow.ranges.is_empty()); // At least some layers should be marked covered - assert!(layer_visibilities - .iter() - .any(|i| matches!(i.1, LayerVisibilityHint::Covered))); + assert!( + layer_visibilities + .iter() + .any(|i| matches!(i.1, LayerVisibilityHint::Covered)) + ); let layer_visibilities = layer_visibilities.into_iter().collect::>(); diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 136f68bc36..f8bec48886 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -3,9 +3,8 @@ use std::ops::Range; use tracing::info; -use crate::tenant::storage_layer::PersistentLayerDesc; - use super::layer_coverage::LayerCoverageTuple; +use crate::tenant::storage_layer::PersistentLayerDesc; /// Layers in this module are identified and indexed by this data. /// diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 24440d4b35..77f9a3579d 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -19,8 +19,9 @@ use anyhow::ensure; use serde::{Deserialize, Serialize}; -use utils::bin_ser::SerializeError; -use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; +use utils::bin_ser::{BeSer, SerializeError}; +use utils::id::TimelineId; +use utils::lsn::Lsn; /// Use special format number to enable backward compatibility. const METADATA_FORMAT_VERSION: u16 = 4; @@ -130,7 +131,10 @@ struct TimelineMetadataBodyV2 { prev_record_lsn: Option, ancestor_timeline: Option, ancestor_lsn: Lsn, + + // The LSN at which GC was last executed. Synonym of [`Timeline::applied_gc_cutoff_lsn`]. latest_gc_cutoff_lsn: Lsn, + initdb_lsn: Lsn, pg_version: u32, } @@ -320,7 +324,6 @@ impl TimelineMetadata { // Checksums make it awkward to build a valid instance by hand. This helper // provides a TimelineMetadata with a valid checksum in its header. 
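Aside (not part of the patch): the `LayerMap` changes above introduce a `tokio::sync::watch` channel whose value is always the current number of L0 delta layers; every insert and removal publishes the new length via `send_replace`, and `watch_level0_deltas()` hands out receivers. A minimal sketch of the same pattern, with plain integers standing in for the real `Arc`-wrapped layer descriptors:

// Sketch only: a stand-in for LayerMap that publishes its L0 layer count
// through a tokio watch channel, mirroring watch_l0_deltas above.
use tokio::sync::watch;

struct L0Tracker {
    l0_delta_layers: Vec<u64>, // stand-in for the real layer descriptors
    watch_l0_deltas: watch::Sender<usize>,
}

impl L0Tracker {
    fn new() -> Self {
        Self {
            l0_delta_layers: Vec::new(),
            // Keep only the Sender; receivers are created on demand via subscribe().
            watch_l0_deltas: watch::channel(0).0,
        }
    }

    fn insert_l0(&mut self, layer: u64) {
        self.l0_delta_layers.push(layer);
        // send_replace succeeds even when no receiver is currently subscribed.
        self.watch_l0_deltas.send_replace(self.l0_delta_layers.len());
    }

    fn watch_level0_deltas(&self) -> watch::Receiver<usize> {
        self.watch_l0_deltas.subscribe()
    }
}

fn main() {
    let mut map = L0Tracker::new();
    let rx = map.watch_level0_deltas();
    map.insert_l0(1);
    map.insert_l0(2);
    // A consumer (e.g. an L0-aware backpressure or compaction loop) would
    // normally await rx.changed(); here we just read the latest value.
    assert_eq!(*rx.borrow(), 2);
}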
- #[cfg(test)] pub fn example() -> Self { let instance = Self::new( "0/16960E8".parse::().unwrap(), @@ -343,9 +346,10 @@ impl TimelineMetadata { } pub(crate) mod modern_serde { - use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; use serde::{Deserialize, Serialize}; + use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; + pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result where D: serde::de::Deserializer<'de>, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index e8b0d1d4dd..003f84e640 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1,34 +1,42 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. -use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; -use futures::StreamExt; -use itertools::Itertools; -use pageserver_api::key::Key; -use pageserver_api::models::LocationConfigMode; -use pageserver_api::shard::{ - ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, -}; -use pageserver_api::upcall_api::ReAttachResponseTenant; -use rand::{distributions::Alphanumeric, Rng}; -use remote_storage::TimeoutOrCancel; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; use std::ops::Deref; use std::sync::Arc; use std::time::Duration; -use sysinfo::SystemExt; -use tokio::fs; use anyhow::Context; +use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use itertools::Itertools; use once_cell::sync::Lazy; +use pageserver_api::key::Key; +use pageserver_api::models::LocationConfigMode; +use pageserver_api::shard::{ + ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, +}; +use pageserver_api::upcall_api::ReAttachResponseTenant; +use rand::Rng; +use rand::distributions::Alphanumeric; +use remote_storage::TimeoutOrCancel; +use sysinfo::SystemExt; +use tokio::fs; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; - +use utils::crashsafe::path_with_suffix_extension; +use utils::fs_ext::PathExt; +use utils::generation::Generation; +use utils::id::{TenantId, TimelineId}; use utils::{backoff, completion, crashsafe}; +use super::remote_timeline_client::remote_tenant_path; +use super::secondary::SecondaryTenant; +use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; +use super::{GlobalShutDown, TenantSharedResources}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::controller_upcall_client::{ @@ -37,7 +45,7 @@ use crate::controller_upcall_client::{ use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; -use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, }; @@ -48,16 +56,6 @@ use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Ten use crate::virtual_file::MaybeFatalIo; use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; -use utils::crashsafe::path_with_suffix_extension; -use utils::fs_ext::PathExt; -use utils::generation::Generation; -use utils::id::{TenantId, TimelineId}; - -use super::remote_timeline_client::remote_tenant_path; -use super::secondary::SecondaryTenant; -use super::timeline::detach_ancestor::{self, 
PreparedTimelineDetach}; -use super::{GlobalShutDown, TenantSharedResources}; - /// For a tenant that appears in TenantsMap, it may either be /// - `Attached`: has a full Tenant object, is elegible to service /// reads and ingest WAL. @@ -140,7 +138,7 @@ impl TenantStartupMode { /// If this returns None, the re-attach struct is in an invalid state and /// should be ignored in the response. fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option { - match (rart.mode, rart.gen) { + match (rart.mode, rart.r#gen) { (LocationConfigMode::Detached, _) => None, (LocationConfigMode::Secondary, _) => Some(Self::Secondary), (LocationConfigMode::AttachedMulti, Some(g)) => { @@ -376,7 +374,7 @@ async fn init_load_generations( TenantStartupMode::Attached((_mode, generation)) => Some(generation), TenantStartupMode::Secondary => None, } - .map(|gen| (*id, *gen)) + .map(|gen_| (*id, *gen_)) }) .collect(); resources.deletion_queue_client.recover(attached_tenants)?; @@ -502,7 +500,9 @@ pub async fn init_tenant_mgr( .total_memory(); let max_ephemeral_layer_bytes = conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024); - tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory"); + tracing::info!( + "Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory" + ); inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store( max_ephemeral_layer_bytes, std::sync::atomic::Ordering::Relaxed, @@ -700,10 +700,11 @@ fn tenant_spawn( // to avoid impacting prod runtime performance. assert!(!crate::is_temporary(tenant_path)); debug_assert!(tenant_path.is_dir()); - debug_assert!(conf - .tenant_location_config_path(&tenant_shard_id) - .try_exists() - .unwrap()); + debug_assert!( + conf.tenant_location_config_path(&tenant_shard_id) + .try_exists() + .unwrap() + ); Tenant::spawn( conf, @@ -791,7 +792,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { (total_in_progress, total_attached) } TenantsMap::ShuttingDown(_) => { - error!("already shutting down, this function isn't supposed to be called more than once"); + error!( + "already shutting down, this function isn't supposed to be called more than once" + ); return; } } @@ -1016,9 +1019,9 @@ impl TenantManager { Ok(Ok(_)) => return Ok(Some(tenant)), Err(_) => { tracing::warn!( - timeout_ms = flush_timeout.as_millis(), - "Timed out waiting for flush to remote storage, proceeding anyway." - ) + timeout_ms = flush_timeout.as_millis(), + "Timed out waiting for flush to remote storage, proceeding anyway." 
+ ) } } } @@ -1194,7 +1197,9 @@ impl TenantManager { } TenantSlot::Attached(tenant) => { let (_guard, progress) = utils::completion::channel(); - info!("Shutting down just-spawned tenant, because tenant manager is shut down"); + info!( + "Shutting down just-spawned tenant, because tenant manager is shut down" + ); match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { info!("Finished shutting down just-spawned tenant"); @@ -1643,6 +1648,7 @@ impl TenantManager { .wait_lsn( *target_lsn, crate::tenant::timeline::WaitLsnWaiter::Tenant, + crate::tenant::timeline::WaitLsnTimeout::Default, ctx, ) .await @@ -1783,7 +1789,7 @@ impl TenantManager { _ => { return Err(anyhow::anyhow!(e).context(format!( "Hard linking {relative_layer} into {child_prefix}" - ))) + ))); } } } @@ -2024,8 +2030,8 @@ impl TenantManager { .wait_to_become_active(std::time::Duration::from_secs(9999)) .await .map_err(|e| { - use pageserver_api::models::TenantState; use GetActiveTenantError::{Cancelled, WillNotBecomeActive}; + use pageserver_api::models::TenantState; match e { Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => { Error::ShuttingDown @@ -2088,7 +2094,7 @@ impl TenantManager { match selector { ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { - return ShardResolveResult::Found(tenant.clone()) + return ShardResolveResult::Found(tenant.clone()); } ShardSelector::Page(key) => { // First slot we see for this tenant, calculate the expected shard number @@ -2485,7 +2491,7 @@ impl SlotGuard { TenantsMap::Initializing => { return Err(TenantSlotUpsertError::MapState( TenantMapError::StillInitializing, - )) + )); } TenantsMap::ShuttingDown(_) => { return Err(TenantSlotUpsertError::ShuttingDown(( @@ -2814,21 +2820,22 @@ where } } -use { - crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest, - utils::http::error::ApiError, -}; +use http_utils::error::ApiError; +use pageserver_api::models::TimelineGcRequest; + +use crate::tenant::gc_result::GcResult; #[cfg(test)] mod tests { use std::collections::BTreeMap; use std::sync::Arc; + use tracing::Instrument; + use super::super::harness::TenantHarness; + use super::TenantsMap; use crate::tenant::mgr::TenantSlot; - use super::{super::harness::TenantHarness, TenantsMap}; - #[tokio::test(start_paused = true)] async fn shutdown_awaits_in_progress_tenant() { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 20e0536a00..4ba5844fea 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -63,22 +63,18 @@ //! The contract between client and its user is that the user is responsible of //! scheduling operations in an order that keeps the remote consistent as //! described above. +//! //! From the user's perspective, the operations are executed sequentially. //! Internally, the client knows which operations can be performed in parallel, //! and which operations act like a "barrier" that require preceding operations //! to finish. The calling code just needs to call the schedule-functions in the //! correct order, and the client will parallelize the operations in a way that -//! is safe. -//! -//! The caller should be careful with deletion, though. They should not delete -//! local files that have been scheduled for upload but not yet finished uploading. -//! Otherwise the upload will fail. 
To wait for an upload to finish, use -//! the 'wait_completion' function (more on that later.) +//! is safe. For more details, see `UploadOp::can_bypass`. //! //! All of this relies on the following invariants: //! //! - We rely on read-after write consistency in the remote storage. -//! - Layer files are immutable +//! - Layer files are immutable. //! //! NB: Pageserver assumes that it has exclusive write access to the tenant in remote //! storage. Different tenants can be attached to different pageservers, but if the @@ -183,77 +179,64 @@ pub mod index; pub mod manifest; pub(crate) mod upload; -use anyhow::Context; -use camino::Utf8Path; -use chrono::{NaiveDateTime, Utc}; - -pub(crate) use download::download_initdb_tar_zst; -use pageserver_api::models::TimelineArchivalState; -use pageserver_api::shard::{ShardIndex, TenantShardId}; -use regex::Regex; -use scopeguard::ScopeGuard; -use tokio_util::sync::CancellationToken; -use utils::backoff::{ - self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, -}; -use utils::pausable_failpoint; -use utils::shard::ShardNumber; - use std::collections::{HashMap, HashSet, VecDeque}; +use std::ops::DerefMut; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::time::Duration; +use anyhow::Context; +use camino::Utf8Path; +use chrono::{NaiveDateTime, Utc}; +pub(crate) use download::{ + download_index_part, download_initdb_tar_zst, download_tenant_manifest, is_temp_download_file, + list_remote_tenant_shards, list_remote_timelines, +}; +use index::GcCompactionState; +pub(crate) use index::LayerFileMetadata; +use pageserver_api::models::TimelineArchivalState; +use pageserver_api::shard::{ShardIndex, TenantShardId}; +use regex::Regex; use remote_storage::{ DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel, }; -use std::ops::DerefMut; -use tracing::{debug, error, info, instrument, warn}; -use tracing::{info_span, Instrument}; -use utils::lsn::Lsn; - -use crate::context::RequestContext; -use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; -use crate::metrics::{ - MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, - RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, - REMOTE_ONDEMAND_DOWNLOADED_LAYERS, +use scopeguard::ScopeGuard; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; +pub(crate) use upload::upload_initdb_dir; +use utils::backoff::{ + self, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff, }; -use crate::task_mgr::shutdown_token; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::remote_timeline_client::download::download_retry; -use crate::tenant::storage_layer::AsLayerDesc; -use crate::tenant::upload_queue::{Delete, OpType, UploadQueueStoppedDeletable}; -use crate::tenant::TIMELINES_SEGMENT_NAME; -use crate::{ - config::PageServerConf, - task_mgr, - task_mgr::TaskKind, - task_mgr::BACKGROUND_RUNTIME, - tenant::metadata::TimelineMetadata, - tenant::upload_queue::{ - UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, - }, - TENANT_HEATMAP_BASENAME, -}; - use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::pausable_failpoint; +use utils::shard::ShardNumber; use self::index::IndexPart; - use super::config::AttachedLocationConfig; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, 
LayerName, ResidentLayer}; use super::timeline::import_pgdata; use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::{DeleteTimelineError, Generation}; - -pub(crate) use download::{ - download_index_part, download_tenant_manifest, is_temp_download_file, - list_remote_tenant_shards, list_remote_timelines, +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; +use crate::metrics::{ + MeasureRemoteOp, REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS, + RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, + RemoteTimelineClientMetricsCallTrackSize, }; -pub(crate) use index::LayerFileMetadata; -pub(crate) use upload::upload_initdb_dir; +use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind, shutdown_token}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::download::download_retry; +use crate::tenant::storage_layer::AsLayerDesc; +use crate::tenant::upload_queue::{ + Delete, OpType, UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, + UploadQueueStoppedDeletable, UploadTask, +}; +use crate::tenant::{TIMELINES_SEGMENT_NAME, debug_assert_current_span_has_tenant_and_timeline_id}; +use crate::{TENANT_HEATMAP_BASENAME, task_mgr}; // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a download fails, we log it at info-level, and retry. @@ -304,6 +287,15 @@ pub enum WaitCompletionError { #[derive(Debug, thiserror::Error)] #[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")] pub struct UploadQueueNotReadyError; + +#[derive(Debug, thiserror::Error)] +pub enum ShutdownIfArchivedError { + #[error(transparent)] + NotInitialized(NotInitialized), + #[error("timeline is not archived")] + NotArchived, +} + /// Behavioral modes that enable seamless live migration. /// /// See docs/rfcs/028-pageserver-migration.md to understand how these fit in. @@ -377,6 +369,12 @@ pub(crate) struct RemoteTimelineClient { cancel: CancellationToken, } +impl Drop for RemoteTimelineClient { + fn drop(&mut self) { + debug!("dropping RemoteTimelineClient"); + } +} + impl RemoteTimelineClient { /// /// Create a remote storage client for given timeline @@ -420,8 +418,15 @@ impl RemoteTimelineClient { /// an index file upload, i.e., it's not empty. /// The given `index_part` must be the one on the remote. pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> { + // Set the maximum number of inprogress tasks to the remote storage concurrency. There's + // certainly no point in starting more upload tasks than this. + let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; + upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); info!( "initialized upload queue from remote index with {} layer files", @@ -436,8 +441,15 @@ impl RemoteTimelineClient { &self, local_metadata: &TimelineMetadata, ) -> anyhow::Result<()> { + // Set the maximum number of inprogress tasks to the remote storage concurrency. There's + // certainly no point in starting more upload tasks than this. 
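Aside (not part of the patch): the comment above is repeated at each upload-queue initialization site; the in-progress limit is taken from the remote storage client's configured concurrency and falls back to 0 when no remote storage is configured. The sketch below shows one way such a limit could bound task launching; it assumes the limit is enforced when picking ready ops (the real enforcement lives in the upload-queue code outside this diff), and `RemoteStorageConfigSketch` is an illustrative stand-in for the real config type.

use std::collections::VecDeque;

struct RemoteStorageConfigSketch {
    concurrency_limit: usize,
}

struct UploadQueueSketch {
    inprogress_limit: usize,
    inprogress: usize,
    queued: VecDeque<&'static str>,
}

impl UploadQueueSketch {
    fn new(remote_storage_config: Option<&RemoteStorageConfigSketch>) -> Self {
        Self {
            // No point launching more upload tasks than the storage client
            // will actually run in parallel.
            inprogress_limit: remote_storage_config.map_or(0, |r| r.concurrency_limit),
            inprogress: 0,
            queued: VecDeque::new(),
        }
    }

    /// Pop ops until the in-progress limit is reached; the caller spawns them.
    fn launch_ready(&mut self) -> Vec<&'static str> {
        let mut launched = Vec::new();
        while self.inprogress < self.inprogress_limit {
            let Some(op) = self.queued.pop_front() else { break };
            self.inprogress += 1;
            launched.push(op);
        }
        launched
    }
}

fn main() {
    let config = RemoteStorageConfigSketch { concurrency_limit: 2 };
    let mut queue = UploadQueueSketch::new(Some(&config));
    queue.queued.extend(["layer-a", "layer-b", "layer-c"]);
    assert_eq!(queue.launch_ready(), vec!["layer-a", "layer-b"]);
}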
+ let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_empty_remote(local_metadata)?; + upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; self.update_remote_physical_size_gauge(None); info!("initialized upload queue as empty"); Ok(()) @@ -453,9 +465,14 @@ impl RemoteTimelineClient { let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!( "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted" ))?; + let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .map_or(0, |r| r.concurrency_limit()); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; + upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); self.stop_impl(&mut upload_queue); @@ -487,7 +504,7 @@ impl RemoteTimelineClient { if let Ok(queue) = queue_locked.initialized_mut() { let blocked_deletions = std::mem::take(&mut queue.blocked_deletions); for d in blocked_deletions { - if let Err(e) = self.deletion_queue_client.push_layers_sync( + if let Err(e) = self.deletion_queue_client.push_layers( self.tenant_shard_id, self.timeline_id, self.generation, @@ -749,7 +766,7 @@ impl RemoteTimelineClient { // ahead of what's _actually_ on the remote during index upload. upload_queue.dirty.metadata = metadata.clone(); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Ok(()) } @@ -770,7 +787,13 @@ impl RemoteTimelineClient { upload_queue.dirty.metadata.apply(update); - self.schedule_index_upload(upload_queue)?; + // Defense in depth: if we somehow generated invalid metadata, do not persist it. + upload_queue + .dirty + .validate() + .map_err(|e| anyhow::anyhow!(e))?; + + self.schedule_index_upload(upload_queue); Ok(()) } @@ -809,13 +832,62 @@ impl RemoteTimelineClient { if let Some(archived_at_set) = need_upload_scheduled { let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc()); upload_queue.dirty.archived_at = intended_archived_at; - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); } let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some(); Ok(need_wait) } + /// Shuts the timeline client down, but only if the timeline is archived. + /// + /// This function and [`Self::schedule_index_upload_for_timeline_archival_state`] use the + /// same lock to prevent races between unarchival and offloading: unarchival requires the + /// upload queue to be initialized, and leaves behind an upload queue where either dirty + /// or clean has archived_at of `None`. offloading leaves behind an uninitialized upload + /// queue. 
+ pub(crate) async fn shutdown_if_archived( + self: &Arc, + ) -> Result<(), ShutdownIfArchivedError> { + { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard + .initialized_mut() + .map_err(ShutdownIfArchivedError::NotInitialized)?; + + match ( + upload_queue.dirty.archived_at.is_none(), + upload_queue.clean.0.archived_at.is_none(), + ) { + // The expected case: the timeline is archived and we don't want to unarchive + (false, false) => {} + (true, false) => { + tracing::info!("can't shut down timeline: timeline slated for unarchival"); + return Err(ShutdownIfArchivedError::NotArchived); + } + (dirty_archived, true) => { + tracing::info!(%dirty_archived, "can't shut down timeline: timeline not archived in remote storage"); + return Err(ShutdownIfArchivedError::NotArchived); + } + } + + // Set the shutting_down flag while the guard from the archival check is held. + // This prevents a race with unarchival, as initialized_mut will not return + // an upload queue from this point. + // Also launch the queued tasks like shutdown() does. + if !upload_queue.shutting_down { + upload_queue.shutting_down = true; + upload_queue.queued_operations.push_back(UploadOp::Shutdown); + // this operation is not counted similar to Barrier + self.launch_queued_tasks(upload_queue); + } + } + + self.shutdown().await; + + Ok(()) + } + /// Launch an index-file upload operation in the background, setting `import_pgdata` field. pub(crate) fn schedule_index_upload_for_import_pgdata_state_update( self: &Arc, @@ -824,7 +896,19 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; upload_queue.dirty.import_pgdata = state; - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); + Ok(()) + } + + /// Launch an index-file upload operation in the background, setting `import_pgdata` field. + pub(crate) fn schedule_index_upload_for_gc_compaction_state_update( + self: &Arc, + gc_compaction_state: GcCompactionState, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.gc_compaction = Some(gc_compaction_state); + self.schedule_index_upload(upload_queue); Ok(()) } @@ -843,17 +927,14 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); } Ok(()) } /// Launch an index-file upload operation in the background (internal function) - fn schedule_index_upload( - self: &Arc, - upload_queue: &mut UploadQueueInitialized, - ) -> Result<(), NotInitialized> { + fn schedule_index_upload(self: &Arc, upload_queue: &mut UploadQueueInitialized) { let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); // fix up the duplicated field upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn; @@ -880,7 +961,6 @@ impl RemoteTimelineClient { // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); - Ok(()) } /// Reparent this timeline to a new parent. 
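Aside (not part of the patch): `shutdown_if_archived` above consults both the `dirty` index (what we intend to upload next) and the `clean` index (what remote storage last confirmed); offloading is only allowed when both agree the timeline is archived. A compact decision table of the three outcomes, with `Option<&str>` standing in for the real `archived_at` timestamps:

#[derive(Debug, PartialEq)]
enum ArchivedCheck {
    CanOffload,
    SlatedForUnarchival,
    NotArchivedInRemoteStorage,
}

fn check(dirty_archived_at: Option<&str>, clean_archived_at: Option<&str>) -> ArchivedCheck {
    match (dirty_archived_at.is_none(), clean_archived_at.is_none()) {
        // Both dirty and clean say "archived": offloading may proceed.
        (false, false) => ArchivedCheck::CanOffload,
        // Dirty says "unarchived" while clean still says "archived": an
        // unarchival is in flight, so do not shut the timeline down.
        (true, false) => ArchivedCheck::SlatedForUnarchival,
        // Clean says "unarchived": the archived state was never persisted remotely.
        (_, true) => ArchivedCheck::NotArchivedInRemoteStorage,
    }
}

fn main() {
    assert_eq!(check(Some("t1"), Some("t1")), ArchivedCheck::CanOffload);
    assert_eq!(check(None, Some("t1")), ArchivedCheck::SlatedForUnarchival);
    assert_eq!(check(Some("t1"), None), ArchivedCheck::NotArchivedInRemoteStorage);
    assert_eq!(check(None, None), ArchivedCheck::NotArchivedInRemoteStorage);
}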
@@ -909,7 +989,7 @@ impl RemoteTimelineClient { upload_queue.dirty.metadata.reparent(new_parent); upload_queue.dirty.lineage.record_previous_ancestor(&prev); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } @@ -948,7 +1028,7 @@ impl RemoteTimelineClient { assert!(prev.is_none(), "copied layer existed already {layer}"); } - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } @@ -997,14 +1077,18 @@ impl RemoteTimelineClient { if !wanted(x) && wanted(y) { // this could be avoided by having external in-memory synchronization, like // timeline detach ancestor - warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason"); + warn!( + ?reason, + op = "insert", + "unexpected: two racing processes to enable and disable a gc blocking reason" + ); } // at this point, the metadata must always show that there is a parent upload_queue.dirty.gc_blocking = current .map(|x| x.with_reason(reason)) .or_else(|| Some(index::GcBlocking::started_now_for(reason))); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } } @@ -1051,14 +1135,17 @@ impl RemoteTimelineClient { (x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)), (x, y) => { if !wanted(x) && wanted(y) { - warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)"); + warn!( + ?reason, + op = "remove", + "unexpected: two racing processes to enable and disable a gc blocking reason (remove)" + ); } upload_queue.dirty.gc_blocking = current.as_ref().and_then(|x| x.without_reason(reason)); assert!(wanted(upload_queue.dirty.gc_blocking.as_ref())); - // FIXME: bogus ? 
- self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } } @@ -1125,8 +1212,8 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - let with_metadata = self - .schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?; + let with_metadata = + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned()); self.schedule_deletion_of_unlinked0(upload_queue, with_metadata); @@ -1153,7 +1240,7 @@ impl RemoteTimelineClient { let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); @@ -1166,7 +1253,7 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Result, NotInitialized> + ) -> Vec<(LayerName, LayerFileMetadata)> where I: IntoIterator, { @@ -1194,12 +1281,14 @@ impl RemoteTimelineClient { #[cfg(feature = "testing")] for (name, metadata) in &with_metadata { - let gen = metadata.generation; - if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen) { - if unexpected == gen { + let gen_ = metadata.generation; + if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen_) { + if unexpected == gen_ { tracing::error!("{name} was unlinked twice with same generation"); } else { - tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}"); + tracing::error!( + "{name} was unlinked twice with different generations {gen_:?} and {unexpected:?}" + ); } } } @@ -1208,10 +1297,10 @@ impl RemoteTimelineClient { // index_part update, because that needs to be uploaded before we can actually delete the // files. if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); } - Ok(with_metadata) + with_metadata } /// Schedules deletion for layer files which have previously been unlinked from the @@ -1261,11 +1350,11 @@ impl RemoteTimelineClient { #[cfg(feature = "testing")] for (name, meta) in &with_metadata { - let gen = meta.generation; + let gen_ = meta.generation; match upload_queue.dangling_files.remove(name) { - Some(same) if same == gen => { /* expected */ } + Some(same) if same == gen_ => { /* expected */ } Some(other) => { - tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}"); + tracing::error!("{name} was unlinked with {other:?} but deleted with {gen_:?}"); } None => { tracing::error!("{name} was unlinked but was not dangling"); @@ -1302,7 +1391,7 @@ impl RemoteTimelineClient { let names = compacted_from.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); Ok(()) @@ -1362,7 +1451,9 @@ impl RemoteTimelineClient { // proper stop is yet to be called. On cancel the original or some later task must call // `stop` or `shutdown`. 
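Aside (not part of the patch): the `launch_queued_tasks` rewrite in the hunks below drops the inline `can_run_now` rules in favour of `UploadQueue::next_ready()` and `UploadOp::can_bypass`, which are outside this diff. The removed code spells the old rules out: layer uploads can always start, deletions only need earlier uploads to have finished, and index uploads, barriers, and shutdown wait for everything in progress. A simplified sketch of those rules as a pure function:

// Sketch of the old inline scheduling rules; `InProgress` is a simplified
// summary of the currently running tasks.
enum UploadOpKind {
    UploadLayer,
    UploadMetadata,
    Delete,
    Barrier,
    Shutdown,
}

struct InProgress {
    tasks: usize,
    deletions: usize,
}

fn can_run_now(next: &UploadOpKind, inprogress: &InProgress) -> bool {
    match next {
        // Layer uploads are independent of each other and can always start.
        UploadOpKind::UploadLayer => true,
        // Deletions only need earlier uploads to be done; running concurrently
        // with other deletions is fine.
        UploadOpKind::Delete => inprogress.deletions == inprogress.tasks,
        // Index uploads, barriers and shutdown wait for everything in front of them.
        UploadOpKind::UploadMetadata | UploadOpKind::Barrier | UploadOpKind::Shutdown => {
            inprogress.tasks == 0
        }
    }
}

fn main() {
    let two_uploads = InProgress { tasks: 2, deletions: 0 };
    assert!(can_run_now(&UploadOpKind::UploadLayer, &two_uploads));
    assert!(!can_run_now(&UploadOpKind::Delete, &two_uploads));
    assert!(!can_run_now(&UploadOpKind::UploadMetadata, &two_uploads));

    let two_deletions = InProgress { tasks: 2, deletions: 2 };
    assert!(can_run_now(&UploadOpKind::Delete, &two_deletions));
}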
let sg = scopeguard::guard((), |_| { - tracing::error!("RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error") + tracing::error!( + "RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error" + ) }); let fut = { @@ -1378,7 +1469,7 @@ impl RemoteTimelineClient { scopeguard::ScopeGuard::into_inner(sg); return; } - UploadQueue::Initialized(ref mut init) => init, + UploadQueue::Initialized(init) => init, }; // if the queue is already stuck due to a shutdown operation which was cancelled, then @@ -1738,7 +1829,9 @@ impl RemoteTimelineClient { .map(|n| n.starts_with(IndexPart::FILE_NAME)) .unwrap_or(false) }) - .filter_map(|o| parse_remote_index_path(o.key.clone()).map(|gen| (o.key.clone(), gen))) + .filter_map(|o| { + parse_remote_index_path(o.key.clone()).map(|gen_| (o.key.clone(), gen_)) + }) .max_by_key(|i| i.1) .map(|i| i.0.clone()) .unwrap_or( @@ -1802,57 +1895,17 @@ impl RemoteTimelineClient { Ok(()) } - /// /// Pick next tasks from the queue, and start as many of them as possible without violating /// the ordering constraints. /// - /// The caller needs to already hold the `upload_queue` lock. + /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does. + /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has + /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks. fn launch_queued_tasks(self: &Arc, upload_queue: &mut UploadQueueInitialized) { - while let Some(next_op) = upload_queue.queued_operations.front() { - // Can we run this task now? - let can_run_now = match next_op { - UploadOp::UploadLayer(..) => { - // Can always be scheduled. - true - } - UploadOp::UploadMetadata { .. } => { - // These can only be performed after all the preceding operations - // have finished. - upload_queue.inprogress_tasks.is_empty() - } - UploadOp::Delete(..) => { - // Wait for preceding uploads to finish. Concurrent deletions are OK, though. - upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len() - } + while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() { + debug!("starting op: {next_op}"); - UploadOp::Barrier(_) | UploadOp::Shutdown => { - upload_queue.inprogress_tasks.is_empty() - } - }; - - // If we cannot launch this task, don't look any further. - // - // In some cases, we could let some non-frontmost tasks to "jump the queue" and launch - // them now, but we don't try to do that currently. For example, if the frontmost task - // is an index-file upload that cannot proceed until preceding uploads have finished, we - // could still start layer uploads that were scheduled later. - if !can_run_now { - break; - } - - if let UploadOp::Shutdown = next_op { - // leave the op in the queue but do not start more tasks; it will be dropped when - // the stop is called. - upload_queue.shutdown_ready.close(); - break; - } - - // We can launch this task. Remove it from the queue first. - let mut next_op = upload_queue.queued_operations.pop_front().unwrap(); - - debug!("starting op: {}", next_op); - - // Update the counters and prepare + // Prepare upload. match &mut next_op { UploadOp::UploadLayer(layer, meta, mode) => { if upload_queue @@ -1863,18 +1916,14 @@ impl RemoteTimelineClient { } else { *mode = Some(OpType::MayReorder) } - upload_queue.num_inprogress_layer_uploads += 1; - } - UploadOp::UploadMetadata { .. 
} => { - upload_queue.num_inprogress_metadata_uploads += 1; } + UploadOp::UploadMetadata { .. } => {} UploadOp::Delete(Delete { layers }) => { for (name, meta) in layers { upload_queue .recently_deleted .insert((name.clone(), meta.generation)); } - upload_queue.num_inprogress_deletions += 1; } UploadOp::Barrier(sender) => { sender.send_replace(()); @@ -1891,6 +1940,7 @@ impl RemoteTimelineClient { let task = Arc::new(UploadTask { task_id: upload_task_id, op: next_op, + coalesced_ops, retries: AtomicU32::new(0), }); upload_queue @@ -1948,8 +1998,34 @@ impl RemoteTimelineClient { return; } + // Assert that we don't modify a layer that's referenced by the current index. + if cfg!(debug_assertions) { + let modified = match &task.op { + UploadOp::UploadLayer(layer, layer_metadata, _) => { + vec![(layer.layer_desc().layer_name(), layer_metadata)] + } + UploadOp::Delete(delete) => { + delete.layers.iter().map(|(n, m)| (n.clone(), m)).collect() + } + // These don't modify layers. + UploadOp::UploadMetadata { .. } => Vec::new(), + UploadOp::Barrier(_) => Vec::new(), + UploadOp::Shutdown => Vec::new(), + }; + if let Ok(queue) = self.upload_queue.lock().unwrap().initialized_mut() { + for (ref name, metadata) in modified { + debug_assert!( + !queue.clean.0.references(name, metadata), + "layer {name} modified while referenced by index", + ); + } + } + } + let upload_result: anyhow::Result<()> = match &task.op { - UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => { + UploadOp::UploadLayer(layer, layer_metadata, mode) => { + // TODO: check if this mechanism can be removed now that can_bypass() performs + // conflict checks during scheduling. if let Some(OpType::FlushDeletion) = mode { if self.config.read().unwrap().block_deletions { // Of course, this is not efficient... but usually the queue should be empty. @@ -2037,7 +2113,7 @@ impl RemoteTimelineClient { ) .await } - UploadOp::UploadMetadata { ref uploaded } => { + UploadOp::UploadMetadata { uploaded } => { let res = upload::upload_index_part( &self.storage_impl, &self.tenant_shard_id, @@ -2088,7 +2164,6 @@ impl RemoteTimelineClient { self.generation, delete.layers.clone(), ) - .await .map_err(|e| anyhow::anyhow!(e)) } } @@ -2154,11 +2229,11 @@ impl RemoteTimelineClient { let lsn_update = { let mut upload_queue_guard = self.upload_queue.lock().unwrap(); let upload_queue = match upload_queue_guard.deref_mut() { - UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"), - UploadQueue::Stopped(_stopped) => { - None - }, - UploadQueue::Initialized(qi) => { Some(qi) } + UploadQueue::Uninitialized => panic!( + "callers are responsible for ensuring this is only called on an initialized queue" + ), + UploadQueue::Stopped(_stopped) => None, + UploadQueue::Initialized(qi) => Some(qi), }; let upload_queue = match upload_queue { @@ -2172,20 +2247,19 @@ impl RemoteTimelineClient { upload_queue.inprogress_tasks.remove(&task.task_id); let lsn_update = match task.op { - UploadOp::UploadLayer(_, _, _) => { - upload_queue.num_inprogress_layer_uploads -= 1; - None - } + UploadOp::UploadLayer(_, _, _) => None, UploadOp::UploadMetadata { ref uploaded } => { - upload_queue.num_inprogress_metadata_uploads -= 1; - // the task id is reused as a monotonicity check for storing the "clean" // IndexPart. 
let last_updater = upload_queue.clean.1; let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id); let monotone = is_later || last_updater.is_none(); - assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id); + assert!( + monotone, + "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", + task.task_id + ); // not taking ownership is wasteful upload_queue.clean.0.clone_from(uploaded); @@ -2212,10 +2286,7 @@ impl RemoteTimelineClient { None } } - UploadOp::Delete(_) => { - upload_queue.num_inprogress_deletions -= 1; - None - } + UploadOp::Delete(_) => None, UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(), }; @@ -2240,6 +2311,9 @@ impl RemoteTimelineClient { } self.metric_end(&task.op); + for coalesced_op in &task.coalesced_ops { + self.metric_end(coalesced_op); + } } fn metric_impl( @@ -2332,6 +2406,7 @@ impl RemoteTimelineClient { // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point. // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it. let upload_queue_for_deletion = UploadQueueInitialized { + inprogress_limit: initialized.inprogress_limit, task_counter: 0, dirty: initialized.dirty.clone(), clean: initialized.clean.clone(), @@ -2339,9 +2414,6 @@ impl RemoteTimelineClient { visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn .clone(), - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::default(), queued_operations: VecDeque::default(), #[cfg(feature = "testing")] @@ -2368,14 +2440,6 @@ impl RemoteTimelineClient { } }; - // consistency check - assert_eq!( - qi.num_inprogress_layer_uploads - + qi.num_inprogress_metadata_uploads - + qi.num_inprogress_deletions, - qi.inprogress_tasks.len() - ); - // We don't need to do anything here for in-progress tasks. They will finish // on their own, decrement the unfinished-task counter themselves, and observe // that the queue is Stopped. @@ -2514,6 +2578,21 @@ pub fn remote_layer_path( RemotePath::from_string(&path).expect("Failed to construct path") } +/// Returns true if a and b have the same layer path within a tenant/timeline. This is essentially +/// remote_layer_path(a) == remote_layer_path(b) without the string allocations. +/// +/// TODO: there should be a variant of LayerName for the physical path that contains information +/// about the shard and generation, such that this could be replaced by a simple comparison. +pub fn is_same_remote_layer_path( + aname: &LayerName, + ameta: &LayerFileMetadata, + bname: &LayerName, + bmeta: &LayerFileMetadata, +) -> bool { + // NB: don't assert remote_layer_path(a) == remote_layer_path(b); too expensive even for debug. 
+ aname == bname && ameta.shard == bmeta.shard && ameta.generation == bmeta.generation +} + pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath { RemotePath::from_string(&format!( "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PATH}" @@ -2579,20 +2658,16 @@ pub fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option #[cfg(test)] mod tests { - use super::*; - use crate::{ - context::RequestContext, - tenant::{ - config::AttachmentMode, - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::layer::local_layer_path, - Tenant, Timeline, - }, - DEFAULT_PG_VERSION, - }; - use std::collections::HashSet; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::context::RequestContext; + use crate::tenant::config::AttachmentMode; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::layer::local_layer_path; + use crate::tenant::{Tenant, Timeline}; + pub(super) fn dummy_contents(name: &str) -> Vec { format!("contents for {name}").into() } @@ -2807,8 +2882,8 @@ mod tests { let mut guard = client.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut().unwrap(); assert!(upload_queue.queued_operations.is_empty()); - assert!(upload_queue.inprogress_tasks.len() == 2); - assert!(upload_queue.num_inprogress_layer_uploads == 2); + assert_eq!(upload_queue.inprogress_tasks.len(), 2); + assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2); // also check that `latest_file_changes` was updated assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2); @@ -2878,8 +2953,8 @@ mod tests { // Deletion schedules upload of the index file, and the file deletion itself assert_eq!(upload_queue.queued_operations.len(), 2); assert_eq!(upload_queue.inprogress_tasks.len(), 1); - assert_eq!(upload_queue.num_inprogress_layer_uploads, 1); - assert_eq!(upload_queue.num_inprogress_deletions, 0); + assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1); + assert_eq!(upload_queue.num_inprogress_deletions(), 0); assert_eq!( upload_queue.latest_files_changes_since_metadata_upload_scheduled, 0 diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index caa884653d..9b3c4ee243 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -8,41 +8,39 @@ use std::future::Future; use std::str::FromStr; use std::time::SystemTime; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; +use remote_storage::{ + DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, +}; use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::warn; -use utils::backoff; +use utils::crashsafe::path_with_suffix_extension; +use utils::id::{TenantId, TimelineId}; +use utils::{backoff, pausable_failpoint}; +use super::index::{IndexPart, LayerFileMetadata}; +use super::manifest::TenantManifest; +use super::{ + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, parse_remote_index_path, + parse_remote_tenant_manifest_path, remote_index_path, remote_initdb_archive_path, + remote_initdb_preserved_archive_path, remote_tenant_manifest_path, + remote_tenant_manifest_prefix, remote_tenant_path, +}; +use 
crate::TEMP_FILE_SUFFIX; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id, }; +use crate::tenant::Generation; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; -use crate::tenant::Generation; -use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; -use crate::TEMP_FILE_SUFFIX; -use remote_storage::{ - DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, -}; -use utils::crashsafe::path_with_suffix_extension; -use utils::id::{TenantId, TimelineId}; -use utils::pausable_failpoint; - -use super::index::{IndexPart, LayerFileMetadata}; -use super::manifest::TenantManifest; -use super::{ - parse_remote_index_path, parse_remote_tenant_manifest_path, remote_index_path, - remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path, - remote_tenant_manifest_prefix, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, INITDB_PATH, -}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error}; /// /// If 'metadata' is given, we will validate that the downloaded file's size matches that @@ -145,8 +143,8 @@ pub async fn download_layer_file<'a>( /// /// If Err() is returned, there was some error. The file at `dst_path` has been unlinked. /// The unlinking has _not_ been made durable. -async fn download_object<'a>( - storage: &'a GenericRemoteStorage, +async fn download_object( + storage: &GenericRemoteStorage, src_path: &RemotePath, dst_path: &Utf8PathBuf, #[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate, @@ -207,9 +205,9 @@ async fn download_object<'a>( } #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { - use crate::virtual_file::owned_buffers_io; - use crate::virtual_file::IoBufferMut; use std::sync::Arc; + + use crate::virtual_file::{IoBufferMut, owned_buffers_io}; async { let destination_file = Arc::new( VirtualFile::create(dst_path, ctx) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 506990fb2f..ceaed58bbd 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -7,16 +7,16 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::shard::ShardIndex; use serde::{Deserialize, Serialize}; use utils::id::TimelineId; +use utils::lsn::Lsn; +use super::is_same_remote_layer_path; +use crate::tenant::Generation; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::import_pgdata; -use crate::tenant::Generation; -use pageserver_api::shard::ShardIndex; - -use utils::lsn::Lsn; /// In-memory representation of an `index_part.json` file /// @@ -45,10 +45,8 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none")] pub import_pgdata: Option, - /// Per layer file name metadata, which can be present for a present or missing layer file. - /// - /// Older versions of `IndexPart` will not have this property or have only a part of metadata - /// that latest version stores. + /// Layer filenames and metadata. For an index persisted in remote storage, all layers must + /// exist in remote storage. 
pub layer_metadata: HashMap, /// Because of the trouble of eyeballing the legacy "metadata" field, we copied the @@ -79,8 +77,59 @@ pub struct IndexPart { /// /// None means no aux files have been written to the storage before the point /// when this flag is introduced. + /// + /// This flag is not used any more as all tenants have been transitioned to the new aux file policy. #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) last_aux_file_policy: Option, + + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) rel_size_migration: Option, + + /// Not used anymore -- kept here for backwards compatibility. Merged into the `gc_compaction` field. + #[serde(skip_serializing_if = "Option::is_none", default)] + l2_lsn: Option, + + /// State for the garbage-collecting compaction pass. + /// + /// Garbage-collecting compaction (gc-compaction) prunes `Value`s that are outside + /// the PITR window and not needed by child timelines. + /// + /// A commonly used synonym for this compaction pass is + /// "bottommost-compaction" because the affected LSN range + /// is the "bottom" of the (key,lsn) map. + /// + /// Gc-compaction is a quite expensive operation; that's why we use + /// trigger condition. + /// This field here holds the state pertaining to that trigger condition + /// and (in future) to the progress of the gc-compaction, so that it's + /// resumable across restarts & migrations. + /// + /// Note that the underlying algorithm is _also_ called `gc-compaction` + /// in most places & design docs; but in fact it is more flexible than + /// just the specific use case here; it needs a new name. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) gc_compaction: Option, +} + +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +pub struct GcCompactionState { + /// The upper bound of the last completed garbage-collecting compaction, aka. L2 LSN. + pub(crate) last_completed_lsn: Lsn, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum RelSizeMigration { + /// The tenant is using the old rel_size format. + /// Note that this enum is persisted as `Option` in the index part, so + /// `None` is the same as `Some(RelSizeMigration::Legacy)`. + Legacy, + /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are + /// persisted in the index part. The read path will read both formats and merge them. + Migrating, + /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted + /// in the index part, and the read path will not read the old format. + Migrated, } impl IndexPart { @@ -99,14 +148,17 @@ impl IndexPart { /// - 8: added `archived_at` /// - 9: +gc_blocking /// - 10: +import_pgdata - const LATEST_VERSION: usize = 10; + /// - 11: +rel_size_migration + /// - 12: +l2_lsn + /// - 13: +gc_compaction + const LATEST_VERSION: usize = 13; // Versions we may see when reading from a bucket. 
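`RelSizeMigration` is persisted through an `Option` field with `#[serde(rename_all = "camelCase")]`, so the variants appear as lowercase strings and an absent `rel_size_migration` field reads back as `None`, which readers treat the same as `Legacy`. A small round-trip sketch, assuming `serde_json` as used by the index tests below:

```rust
// Unit variants serialize as plain strings: "legacy" / "migrating" / "migrated".
let json = serde_json::to_string(&RelSizeMigration::Migrating).unwrap();
assert_eq!(json, r#""migrating""#);

// The v11 test below parses `"rel_size_migration": "legacy"` into Some(Legacy);
// with `#[serde(default)]`, older indexes without the field yield None.
let parsed: RelSizeMigration = serde_json::from_str(r#""legacy""#).unwrap();
assert_eq!(parsed, RelSizeMigration::Legacy);
```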
- pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; pub const FILE_NAME: &'static str = "index_part.json"; - pub(crate) fn empty(metadata: TimelineMetadata) -> Self { + pub fn empty(metadata: TimelineMetadata) -> Self { IndexPart { version: Self::LATEST_VERSION, layer_metadata: Default::default(), @@ -118,6 +170,9 @@ impl IndexPart { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, + gc_compaction: None, } } @@ -143,6 +198,32 @@ impl IndexPart { pub(crate) fn example() -> Self { Self::empty(TimelineMetadata::example()) } + + /// Returns true if the index contains a reference to the given layer (i.e. file path). + /// + /// TODO: there should be a variant of LayerName for the physical remote path that contains + /// information about the shard and generation, to avoid passing in metadata. + pub fn references(&self, name: &LayerName, metadata: &LayerFileMetadata) -> bool { + let Some(index_metadata) = self.layer_metadata.get(name) else { + return false; + }; + is_same_remote_layer_path(name, metadata, name, index_metadata) + } + + /// Check for invariants in the index: this is useful when uploading an index to ensure that if + /// we encounter a bug, we do not persist buggy metadata. + pub(crate) fn validate(&self) -> Result<(), String> { + if self.import_pgdata.is_none() + && self.metadata.ancestor_timeline().is_none() + && self.layer_metadata.is_empty() + { + // Unless we're in the middle of a raw pgdata import, or this is a child timeline,the index must + // always have at least one layer. + return Err("Index has no ancestor and no layers".to_string()); + } + + Ok(()) + } } /// Metadata gathered for each of the layer files. @@ -170,6 +251,10 @@ impl LayerFileMetadata { shard, } } + /// Helper to get both generation and file size in a tuple + pub fn generation_file_size(&self) -> (Generation, u64) { + (self.generation, self.file_size) + } } /// Limited history of earlier ancestors. 
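The two helpers added above are the safety rails for the queue rework: `references` is built on `is_same_remote_layer_path`, so it compares layer name, shard and generation without allocating the remote path strings, and it backs the debug assertion against modifying a layer that the current ("clean") index still points at; `validate` is called by `upload_index_part` so an obviously broken index is rejected before it reaches remote storage. A usage sketch with hypothetical call-site names:

```rust
// Hypothetical wrappers; the real call sites are in remote_timeline_client.rs and upload.rs.
fn assert_not_referenced(clean: &IndexPart, name: &LayerName, meta: &LayerFileMetadata) {
    // Re-uploading or deleting a layer the last-uploaded index still references
    // would change data out from under readers of that index.
    debug_assert!(
        !clean.references(name, meta),
        "layer {name} modified while referenced by index"
    );
}

fn check_index_before_upload(index: &IndexPart) -> anyhow::Result<()> {
    // Refuse to persist an index with no ancestor, no layers and no import in progress.
    index.validate().map_err(|e| anyhow::anyhow!(e))
}
```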
@@ -350,10 +435,12 @@ impl GcBlocking { #[cfg(test)] mod tests { - use super::*; use std::str::FromStr; + use utils::id::TimelineId; + use super::*; + #[test] fn v1_indexpart_is_parsed() { let example = r#"{ @@ -392,6 +479,9 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -437,6 +527,9 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -483,6 +576,9 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -532,6 +628,9 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, + gc_compaction: None, }; let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -576,6 +675,9 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -623,6 +725,9 @@ mod tests { gc_blocking: None, last_aux_file_policy: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -675,6 +780,9 @@ mod tests { gc_blocking: None, last_aux_file_policy: Some(AuxFilePolicy::V2), import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -732,6 +840,9 @@ mod tests { gc_blocking: None, last_aux_file_policy: Default::default(), import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -790,6 +901,9 @@ mod tests { gc_blocking: None, last_aux_file_policy: Default::default(), import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -853,6 +967,9 @@ mod tests { last_aux_file_policy: Default::default(), archived_at: None, import_pgdata: None, + rel_size_migration: None, + l2_lsn: None, + gc_compaction: None, }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); @@ -928,7 +1045,176 @@ mod tests { started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), - }))) + }))), + rel_size_migration: None, + l2_lsn: None, + gc_compaction: None, + }; + + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v11_rel_size_migration_is_parsed() { + let example = r#"{ + "version": 11, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { 
"file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + }, + "import_pgdata": { + "V1": { + "Done": { + "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", + "started_at": "2024-11-13T09:23:42.123", + "finished_at": "2024-11-13T09:42:23.123" + } + } + }, + "rel_size_migration": "legacy" + }"#; + + let expected = IndexPart { + version: 11, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ + started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), + finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), + }))), + rel_size_migration: Some(RelSizeMigration::Legacy), + l2_lsn: None, + gc_compaction: None, + }; + + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v12_v13_l2_gc_ompaction_is_parsed() { + let example = r#"{ + "version": 12, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "gc_blocking": { + "started_at": "2024-07-19T09:00:00.123", + "reasons": ["DetachAncestor"] + }, + "import_pgdata": { + "V1": { + "Done": { + "idempotency_key": 
"specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5", + "started_at": "2024-11-13T09:23:42.123", + "finished_at": "2024-11-13T09:42:23.123" + } + } + }, + "rel_size_migration": "legacy", + "l2_lsn": "0/16960E8", + "gc_compaction": { + "last_completed_lsn": "0/16960E8" + } + }"#; + + let expected = IndexPart { + version: 12, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: None, + lineage: Default::default(), + gc_blocking: Some(GcBlocking { + started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"), + reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]), + }), + last_aux_file_policy: Default::default(), + archived_at: None, + import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{ + started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"), + finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"), + idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()), + }))), + rel_size_migration: Some(RelSizeMigration::Legacy), + l2_lsn: Some("0/16960E8".parse::().unwrap()), + gc_compaction: Some(GcCompactionState { + last_completed_lsn: "0/16960E8".parse::().unwrap(), + }), }; let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index 2029847a12..543ccc219d 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -1,6 +1,7 @@ use chrono::NaiveDateTime; use serde::{Deserialize, Serialize}; -use utils::{id::TimelineId, lsn::Lsn}; +use utils::id::TimelineId; +use utils::lsn::Lsn; /// Tenant-shard scoped manifest #[derive(Clone, Serialize, Deserialize, PartialEq, Eq)] diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 0cd5d05aa2..7d9f47665a 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -1,32 +1,32 @@ //! 
Helper functions to upload files to remote storage with a RemoteStorage -use anyhow::{bail, Context}; +use std::io::{ErrorKind, SeekFrom}; +use std::time::SystemTime; + +use anyhow::{Context, bail}; use bytes::Bytes; use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; -use std::io::{ErrorKind, SeekFrom}; -use std::time::SystemTime; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use tokio::fs::{self, File}; use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; +use tracing::info; +use utils::id::{TenantId, TimelineId}; use utils::{backoff, pausable_failpoint}; +use super::Generation; use super::index::IndexPart; use super::manifest::TenantManifest; -use super::Generation; use crate::tenant::remote_timeline_client::{ remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path, }; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; -use utils::id::{TenantId, TimelineId}; - -use tracing::info; /// Serializes and uploads the given index part data to the remote storage. -pub(crate) async fn upload_index_part<'a>( - storage: &'a GenericRemoteStorage, +pub(crate) async fn upload_index_part( + storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, generation: Generation, @@ -40,6 +40,10 @@ pub(crate) async fn upload_index_part<'a>( }); pausable_failpoint!("before-upload-index-pausable"); + // Safety: refuse to persist invalid index metadata, to mitigate the impact of any bug that produces this + // (this should never happen) + index_part.validate().map_err(|e| anyhow::anyhow!(e))?; + // FIXME: this error comes too late let serialized = index_part.to_json_bytes()?; let serialized = Bytes::from(serialized); @@ -130,7 +134,9 @@ pub(super) async fn upload_timeline_layer<'a>( .len(); if metadata_size != fs_size { - bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); + bail!( + "File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}" + ); } let fs_size = usize::try_from(fs_size) diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 4bc208331b..8f8622c796 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -3,40 +3,31 @@ pub mod heatmap; mod heatmap_uploader; mod scheduler; -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; -use crate::{ - context::RequestContext, - disk_usage_eviction_task::DiskUsageEvictionInfo, - metrics::SECONDARY_HEATMAP_TOTAL_SIZE, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, -}; - -use self::{ - downloader::{downloader_task, SecondaryDetail}, - heatmap_uploader::heatmap_uploader_task, -}; - -use super::{ - config::{SecondaryLocationConfig, TenantConfOpt}, - mgr::TenantManager, - span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerName, - GetTenantError, -}; - -use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; use metrics::UIntGauge; -use pageserver_api::{ - models, - shard::{ShardIdentity, TenantShardId}, -}; +use pageserver_api::models; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use remote_storage::GenericRemoteStorage; - use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::instrument; -use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate}; +use utils::completion::Barrier; +use 
utils::id::TimelineId; +use utils::sync::gate::Gate; + +use self::downloader::{SecondaryDetail, downloader_task}; +use self::heatmap_uploader::heatmap_uploader_task; +use super::GetTenantError; +use super::config::{SecondaryLocationConfig, TenantConfOpt}; +use super::mgr::TenantManager; +use super::span::debug_assert_current_span_has_tenant_id; +use super::storage_layer::LayerName; +use crate::context::RequestContext; +use crate::disk_usage_eviction_task::DiskUsageEvictionInfo; +use crate::metrics::{SECONDARY_HEATMAP_TOTAL_SIZE, SECONDARY_RESIDENT_PHYSICAL_SIZE}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; enum DownloadCommand { Download(TenantShardId), diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 395e34e404..a13b9323ac 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -1,47 +1,8 @@ -use std::{ - collections::{HashMap, HashSet}, - pin::Pin, - str::FromStr, - sync::Arc, - time::{Duration, Instant, SystemTime}, -}; - -use crate::{ - config::PageServerConf, - context::RequestContext, - disk_usage_eviction_task::{ - finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, - }, - metrics::SECONDARY_MODE, - tenant::{ - config::SecondaryLocationConfig, - debug_assert_current_span_has_tenant_and_timeline_id, - ephemeral_file::is_ephemeral_file, - remote_timeline_client::{ - index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - }, - span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerName, LayerVisibilityHint}, - tasks::{warn_when_period_overrun, BackgroundLoopKind}, - }, - virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, - TEMP_FILE_SUFFIX, -}; - -use super::{ - heatmap::HeatMapLayer, - scheduler::{ - self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, - TenantBackgroundJobs, - }, - GetTenantError, SecondaryTenant, SecondaryTenantError, -}; - -use crate::tenant::{ - mgr::TenantManager, - remote_timeline_client::{download::download_layer_file, remote_heatmap_path}, -}; +use std::collections::{HashMap, HashSet}; +use std::pin::Pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime}; use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; @@ -50,18 +11,43 @@ use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage}; - use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, warn, Instrument}; -use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, pausable_failpoint, serde_system_time, -}; +use tracing::{Instrument, info_span, instrument, warn}; +use utils::completion::Barrier; +use utils::crashsafe::path_with_suffix_extension; +use utils::id::TimelineId; +use utils::{backoff, failpoint_support, fs_ext, pausable_failpoint, serde_system_time}; -use super::{ - heatmap::{HeatMapTenant, HeatMapTimeline}, - CommandRequest, DownloadCommand, +use super::heatmap::{HeatMapLayer, HeatMapTenant, HeatMapTimeline}; +use super::scheduler::{ + self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs, period_jitter, + period_warmup, }; +use super::{ + CommandRequest, DownloadCommand, 
GetTenantError, SecondaryTenant, SecondaryTenantError, +}; +use crate::TEMP_FILE_SUFFIX; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::disk_usage_eviction_task::{ + DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, finite_f32, +}; +use crate::metrics::SECONDARY_MODE; +use crate::tenant::config::SecondaryLocationConfig; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::ephemeral_file::is_ephemeral_file; +use crate::tenant::mgr::TenantManager; +use crate::tenant::remote_timeline_client::download::download_layer_file; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::remote_timeline_client::{ + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, is_temp_download_file, + remote_heatmap_path, +}; +use crate::tenant::span::debug_assert_current_span_has_tenant_id; +use crate::tenant::storage_layer::layer::local_layer_path; +use crate::tenant::storage_layer::{LayerName, LayerVisibilityHint}; +use crate::tenant::tasks::{BackgroundLoopKind, warn_when_period_overrun}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error}; /// For each tenant, default period for how long must have passed since the last download_tenant call before /// calling it again. This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first @@ -559,6 +545,13 @@ impl JobGenerator { @@ -666,12 +659,30 @@ impl<'a> TenantDownloader<'a> { HeatMapDownload::Modified(m) => m, }; - let heatmap = serde_json::from_slice::(&heatmap_bytes)?; - - // Save the heatmap: this will be useful on restart, allowing us to reconstruct - // layer metadata without having to re-download it. + // Heatmap storage location let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id); + let last_heatmap = if last_download.is_none() { + match load_heatmap(&heatmap_path, ctx).await { + Ok(htm) => htm, + Err(e) => { + tracing::warn!("Couldn't load heatmap from {heatmap_path}: {e:?}"); + None + } + } + } else { + None + }; + + let last_heatmap_timelines = last_heatmap.as_ref().map(|htm| { + htm.timelines + .iter() + .map(|tl| (tl.timeline_id, tl)) + .collect::>() + }); + + let heatmap = serde_json::from_slice::(&heatmap_bytes)?; + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); let heatmap_path_bg = heatmap_path.clone(); @@ -700,10 +711,17 @@ impl<'a> TenantDownloader<'a> { let timeline_state = match timeline_state { Some(t) => t, None => { + let last_heatmap = + last_heatmap_timelines + .as_ref() + .and_then(|last_heatmap_timelines| { + last_heatmap_timelines.get(&timeline.timeline_id).copied() + }); // We have no existing state: need to scan local disk for layers first. let timeline_state = init_timeline_state( self.conf, tenant_shard_id, + last_heatmap, timeline, &self.secondary_state.resident_size_metric, ) @@ -1008,69 +1026,17 @@ impl<'a> TenantDownloader<'a> { return (Err(UpdateError::Restart), touched); } - // Existing on-disk layers: just update their access time. - if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { - tracing::debug!("Layer {} is already on disk", layer.name); - - if cfg!(debug_assertions) { - // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think - // are already present on disk are really there. 
- match tokio::fs::metadata(&on_disk.local_path).await { - Ok(meta) => { - tracing::debug!( - "Layer {} present at {}, size {}", - layer.name, - on_disk.local_path, - meta.len(), - ); - } - Err(e) => { - tracing::warn!( - "Layer {} not found at {} ({})", - layer.name, - on_disk.local_path, - e - ); - debug_assert!(false); - } - } - } - - if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time { - // We already have this layer on disk. Update its access time. - tracing::debug!( - "Access time updated for layer {}: {} -> {}", - layer.name, - strftime(&on_disk.access_time), - strftime(&layer.access_time) - ); - touched.push(layer); - } - continue; - } else { - tracing::debug!("Layer {} not present on disk yet", layer.name); - } - - // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more - // recently than it was evicted. - if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) { - if &layer.access_time > evicted_at { - tracing::info!( - "Re-downloading evicted layer {}, accessed at {}, evicted at {}", - layer.name, - strftime(&layer.access_time), - strftime(evicted_at) - ); - } else { - tracing::trace!( - "Not re-downloading evicted layer {}, accessed at {}, evicted at {}", - layer.name, - strftime(&layer.access_time), - strftime(evicted_at) - ); + match self.layer_action(&timeline_state, &layer).await { + LayerAction::Download => (), + LayerAction::NoAction => continue, + LayerAction::Skip => { self.skip_layer(layer); continue; } + LayerAction::Touch => { + touched.push(layer); + continue; + } } match self @@ -1091,6 +1057,86 @@ impl<'a> TenantDownloader<'a> { (Ok(()), touched) } + async fn layer_action( + &self, + timeline_state: &SecondaryDetailTimeline, + layer: &HeatMapLayer, + ) -> LayerAction { + // Existing on-disk layers: just update their access time. + if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { + tracing::debug!("Layer {} is already on disk", layer.name); + + if cfg!(debug_assertions) { + // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think + // are already present on disk are really there. + match tokio::fs::metadata(&on_disk.local_path).await { + Ok(meta) => { + tracing::debug!( + "Layer {} present at {}, size {}", + layer.name, + on_disk.local_path, + meta.len(), + ); + } + Err(e) => { + tracing::warn!( + "Layer {} not found at {} ({})", + layer.name, + on_disk.local_path, + e + ); + debug_assert!(false); + } + } + } + + if on_disk.metadata.generation_file_size() != layer.metadata.generation_file_size() { + tracing::info!( + "Re-downloading layer {} with changed size or generation: {:?}->{:?}", + layer.name, + on_disk.metadata.generation_file_size(), + layer.metadata.generation_file_size() + ); + return LayerAction::Download; + } + if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time { + // We already have this layer on disk. Update its access time. + tracing::debug!( + "Access time updated for layer {}: {} -> {}", + layer.name, + strftime(&on_disk.access_time), + strftime(&layer.access_time) + ); + return LayerAction::Touch; + } + return LayerAction::NoAction; + } else { + tracing::debug!("Layer {} not present on disk yet", layer.name); + } + + // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more + // recently than it was evicted. 
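`download_timeline` now delegates the per-layer decision to `layer_action`, and the match arms above turn its result into control flow. The enum definition itself is not part of this hunk; a sketch of the shape the match arms imply, with the meaning of each variant restated:

```rust
// Assumed definition (not shown in this diff), matching the four arms handled above.
enum LayerAction {
    /// Not on disk, or its generation/file size changed remotely: download it.
    Download,
    /// Already on disk with matching metadata and access time: nothing to do.
    NoAction,
    /// Evicted more recently than it was accessed: do not re-download, count as skipped.
    Skip,
    /// On disk, but the heatmap access time (or other metadata) moved: record a touch.
    Touch,
}
```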
+ if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) { + if &layer.access_time > evicted_at { + tracing::info!( + "Re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + } else { + tracing::trace!( + "Not re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + return LayerAction::Skip; + } + } + LayerAction::Download + } + async fn download_timeline( &self, timeline: HeatMapTimeline, @@ -1242,6 +1288,7 @@ impl<'a> TenantDownloader<'a> { async fn init_timeline_state( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, + last_heatmap: Option<&HeatMapTimeline>, heatmap: &HeatMapTimeline, resident_metric: &UIntGauge, ) -> SecondaryDetailTimeline { @@ -1271,6 +1318,13 @@ async fn init_timeline_state( let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = heatmap.layers.iter().map(|l| (&l.name, l)).collect(); + let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = + if let Some(last_heatmap) = last_heatmap { + last_heatmap.layers.iter().map(|l| (&l.name, l)).collect() + } else { + HashMap::new() + }; + while let Some(dentry) = dir .next_entry() .await @@ -1304,18 +1358,32 @@ async fn init_timeline_state( match LayerName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); + let last_meta = last_heatmap_metadata.get(&name); + let mut remove = false; match remote_meta { Some(remote_meta) => { + let last_meta_generation_file_size = last_meta + .map(|m| m.metadata.generation_file_size()) + .unwrap_or(remote_meta.metadata.generation_file_size()); // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784) - if local_meta.len() != remote_meta.metadata.file_size { - // This should not happen, because we do crashsafe write-then-rename when downloading - // layers, and layers in remote storage are immutable. Remove the local file because - // we cannot trust it. - tracing::warn!( + if remote_meta.metadata.generation_file_size() + != last_meta_generation_file_size + { + tracing::info!( + "Removing local layer {name} as on-disk json metadata has different generation or file size from remote: {:?} -> {:?}", + last_meta_generation_file_size, + remote_meta.metadata.generation_file_size() + ); + remove = true; + } else if local_meta.len() != remote_meta.metadata.file_size { + // This can happen in the presence of race conditions: the remote and on-disk metadata have changed, but we haven't had + // the chance yet to download the new layer to disk, before the process restarted. + tracing::info!( "Removing local layer {name} with unexpected local size {} != {}", local_meta.len(), remote_meta.metadata.file_size ); + remove = true; } else { // We expect the access time to be initialized immediately afterwards, when // the latest heatmap is applied to the state. @@ -1337,15 +1405,18 @@ async fn init_timeline_state( "Removing secondary local layer {} because it's absent in heatmap", name ); - tokio::fs::remove_file(&dentry.path()) - .await - .or_else(fs_ext::ignore_not_found) - .fatal_err(&format!( - "Removing layer {}", - dentry.path().to_string_lossy() - )); + remove = true; } } + if remove { + tokio::fs::remove_file(&dentry.path()) + .await + .or_else(fs_ext::ignore_not_found) + .fatal_err(&format!( + "Removing layer {}", + dentry.path().to_string_lossy() + )); + } } Err(_) => { // Ignore it. 
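With the last persisted heatmap plumbed into `init_timeline_state`, each local file is now judged against both the current heatmap and the previous one, using the `generation_file_size()` tuple added to `LayerFileMetadata`. A minimal sketch condensing the inline keep-or-remove rule above into a hypothetical helper:

```rust
/// Hypothetical helper: remove the local file if the remote generation or size
/// moved on since the last persisted heatmap, or if the on-disk length no
/// longer matches the remote metadata.
fn should_remove_local_layer(
    local_len: u64,
    remote: &LayerFileMetadata,
    last: Option<&LayerFileMetadata>,
) -> bool {
    let last_gen_size = last
        .map(|m| m.generation_file_size())
        .unwrap_or(remote.generation_file_size());
    remote.generation_file_size() != last_gen_size || local_len != remote.file_size
}
```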
@@ -1356,3 +1427,18 @@ async fn init_timeline_state( detail } + +/// Loads a json-encoded heatmap file from the provided on-disk path +async fn load_heatmap( + path: &Utf8PathBuf, + ctx: &RequestContext, +) -> Result, anyhow::Error> { + let mut file = match VirtualFile::open(path, ctx).await { + Ok(file) => file, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None), + Err(e) => Err(e)?, + }; + let st = file.read_to_string(ctx).await?; + let htm = serde_json::from_str(&st)?; + Ok(Some(htm)) +} diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 4a8e66d38a..4a938e9095 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,14 +1,16 @@ +use std::collections::HashMap; use std::time::SystemTime; -use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName}; - use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; +use serde_with::{DisplayFromStr, TimestampSeconds, serde_as}; +use utils::generation::Generation; +use utils::id::TimelineId; -use utils::{generation::Generation, id::TimelineId}; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::storage_layer::LayerName; #[derive(Serialize, Deserialize)] -pub(super) struct HeatMapTenant { +pub(crate) struct HeatMapTenant { /// Generation of the attached location that uploaded the heatmap: this is not required /// for correctness, but acts as a hint to secondary locations in order to detect thrashing /// in the unlikely event that two attached locations are both uploading conflicting heatmaps. @@ -25,8 +27,17 @@ pub(super) struct HeatMapTenant { pub(super) upload_period_ms: Option, } +impl HeatMapTenant { + pub(crate) fn into_timelines_index(self) -> HashMap { + self.timelines + .into_iter() + .map(|htl| (htl.timeline_id, htl)) + .collect() + } +} + #[serde_as] -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub(crate) struct HeatMapTimeline { #[serde_as(as = "DisplayFromStr")] pub(crate) timeline_id: TimelineId, @@ -35,13 +46,13 @@ pub(crate) struct HeatMapTimeline { } #[serde_as] -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub(crate) struct HeatMapLayer { pub(crate) name: LayerName, pub(crate) metadata: LayerFileMetadata, #[serde_as(as = "TimestampSeconds")] - pub(super) access_time: SystemTime, + pub(crate) access_time: SystemTime, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary. 
} diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index c5e5e04945..3375714a66 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -1,38 +1,33 @@ -use std::{ - collections::HashMap, - pin::Pin, - sync::{Arc, Weak}, - time::{Duration, Instant}, -}; - -use crate::{ - metrics::SECONDARY_MODE, - tenant::{ - config::AttachmentMode, - mgr::GetTenantError, - mgr::TenantManager, - remote_timeline_client::remote_heatmap_path, - span::debug_assert_current_span_has_tenant_id, - tasks::{warn_when_period_overrun, BackgroundLoopKind}, - Tenant, - }, -}; +use std::collections::HashMap; +use std::pin::Pin; +use std::sync::{Arc, Weak}; +use std::time::{Duration, Instant}; use futures::Future; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; - -use super::{ - heatmap::HeatMapTenant, - scheduler::{ - self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, - TenantBackgroundJobs, - }, - CommandRequest, SecondaryTenantError, UploadCommand, -}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, Instrument}; -use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; +use tracing::{Instrument, info_span, instrument}; +use utils::backoff; +use utils::completion::Barrier; +use utils::crashsafe::path_with_suffix_extension; +use utils::yielding_loop::yielding_loop; + +use super::heatmap::HeatMapTenant; +use super::scheduler::{ + self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs, period_jitter, + period_warmup, +}; +use super::{CommandRequest, SecondaryTenantError, UploadCommand}; +use crate::TEMP_FILE_SUFFIX; +use crate::metrics::SECONDARY_MODE; +use crate::tenant::Tenant; +use crate::tenant::config::AttachmentMode; +use crate::tenant::mgr::{GetTenantError, TenantManager}; +use crate::tenant::remote_timeline_client::remote_heatmap_path; +use crate::tenant::span::debug_assert_current_span_has_tenant_id; +use crate::tenant::tasks::{BackgroundLoopKind, warn_when_period_overrun}; +use crate::virtual_file::VirtualFile; pub(super) async fn heatmap_uploader_task( tenant_manager: Arc, @@ -461,6 +456,18 @@ async fn upload_tenant_heatmap( } } + // After a successful upload persist the fresh heatmap to disk. + // When restarting, the tenant will read the heatmap from disk + // and additively generate a new heatmap (see [`Timeline::generate_heatmap`]). + // If the heatmap is stale, the additive generation can lead to keeping previously + // evicted timelines on the secondarie's disk. 
+ let tenant_shard_id = tenant.get_tenant_shard_id(); + let heatmap_path = tenant.conf.tenant_heatmap_path(tenant_shard_id); + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); + if let Err(err) = VirtualFile::crashsafe_overwrite(heatmap_path, temp_path, bytes).await { + tracing::warn!("Non fatal IO error writing to disk after heatmap upload: {err}"); + } + tracing::info!("Successfully uploaded {size} byte heatmap to {path}"); Ok(UploadHeatmapOutcome::Uploaded(LastUploadState { diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index e963c722b9..f948f9114f 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -1,16 +1,15 @@ -use futures::Future; -use rand::Rng; -use std::{ - collections::HashMap, - marker::PhantomData, - pin::Pin, - time::{Duration, Instant}, -}; +use std::collections::HashMap; +use std::marker::PhantomData; +use std::pin::Pin; +use std::time::{Duration, Instant}; +use futures::Future; use pageserver_api::shard::TenantShardId; +use rand::Rng; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use utils::{completion::Barrier, yielding_loop::yielding_loop}; +use utils::completion::Barrier; +use utils::yielding_loop::yielding_loop; use super::{CommandRequest, CommandResponse, SecondaryTenantError}; diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 6c3276ea3c..ed6b351c75 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -4,21 +4,18 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; use tenant_size_model::svg::SvgBranchKind; -use tokio::sync::oneshot::error::RecvError; +use tenant_size_model::{Segment, StorageModel}; use tokio::sync::Semaphore; +use tokio::sync::oneshot::error::RecvError; use tokio_util::sync::CancellationToken; - -use crate::context::RequestContext; -use crate::pgdatadir_mapping::CalculateLogicalSizeError; - -use super::{GcError, LogicalSizeCalculationCause, Tenant}; -use crate::tenant::{MaybeOffloaded, Timeline}; +use tracing::*; use utils::id::TimelineId; use utils::lsn::Lsn; -use tracing::*; - -use tenant_size_model::{Segment, StorageModel}; +use super::{GcError, LogicalSizeCalculationCause, Tenant}; +use crate::context::RequestContext; +use crate::pgdatadir_mapping::CalculateLogicalSizeError; +use crate::tenant::{MaybeOffloaded, Timeline}; /// Inputs to the actual tenant sizing model /// @@ -394,7 +391,7 @@ pub(super) async fn gather_inputs( ancestor_lsn, last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough - latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), + latest_gc_cutoff: *timeline.get_applied_gc_cutoff_lsn(), next_pitr_cutoff, retention_param_cutoff, lease_points, @@ -498,7 +495,9 @@ async fn fill_logical_sizes( } Err(join_error) => { // cannot really do anything, as this panic is likely a bug - error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}"); + error!( + "task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}" + ); have_any_error = Some(CalculateSyntheticSizeError::Fatal( anyhow::anyhow!(join_error) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9e3a25cbbc..7f313f46a2 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -10,33 +10,39 @@ mod layer_desc; mod layer_name; pub mod merge_iterator; -use 
crate::context::{AccessStatsBehavior, RequestContext}; +use std::cmp::Ordering; +use std::collections::hash_map::Entry; +use std::collections::{BinaryHeap, HashMap}; +use std::future::Future; +use std::ops::Range; +use std::pin::Pin; +use std::sync::Arc; +use std::sync::atomic::AtomicUsize; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter}; use bytes::Bytes; -use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE}; +pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; +pub use image_layer::{ImageLayer, ImageLayerWriter}; +pub use inmemory_layer::InMemoryLayer; +pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; +pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; +pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; +use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; -use std::cmp::{Ordering, Reverse}; -use std::collections::hash_map::Entry; -use std::collections::{BinaryHeap, HashMap}; -use std::ops::Range; -use std::sync::Arc; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; - +use tracing::{Instrument, trace}; use utils::lsn::Lsn; - -pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; -pub use image_layer::{ImageLayer, ImageLayerWriter}; -pub use inmemory_layer::InMemoryLayer; -pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; -pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; - -pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; +use utils::sync::gate::GateGuard; use self::inmemory_layer::InMemoryLayerFileId; - -use super::timeline::GetVectoredError; use super::PageReconstructError; +use super::timeline::{GetVectoredError, ReadPath}; +use crate::config::PageServerConf; +use crate::context::{AccessStatsBehavior, RequestContext}; pub fn range_overlaps(a: &Range, b: &Range) -> bool where @@ -71,6 +77,16 @@ pub(crate) struct ValueReconstructState { pub(crate) img: Option<(Lsn, Bytes)>, } +impl ValueReconstructState { + /// Returns the number of page deltas applied to the page image. + pub fn num_deltas(&self) -> usize { + match self.img { + Some(_) => self.records.len(), + None => self.records.len() - 1, // omit will_init record + } + } +} + #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] pub(crate) enum ValueReconstructSituation { Complete, @@ -78,30 +94,151 @@ pub(crate) enum ValueReconstructSituation { Continue, } -/// Reconstruct data accumulated for a single key during a vectored get -#[derive(Debug, Default, Clone)] -pub(crate) struct VectoredValueReconstructState { - pub(crate) records: Vec<(Lsn, NeonWalRecord)>, - pub(crate) img: Option<(Lsn, Bytes)>, - - situation: ValueReconstructSituation, +/// On disk representation of a value loaded in a buffer +#[derive(Debug)] +pub(crate) enum OnDiskValue { + /// Unencoded [`Value::Image`] + RawImage(Bytes), + /// Encoded [`Value`]. 
Can deserialize into an image or a WAL record + WalRecordOrImage(Bytes), } -impl VectoredValueReconstructState { - fn get_cached_lsn(&self) -> Option { - self.img.as_ref().map(|img| img.0) +/// Reconstruct data accumulated for a single key during a vectored get +#[derive(Debug, Default)] +pub(crate) struct VectoredValueReconstructState { + pub(crate) on_disk_values: Vec<(Lsn, OnDiskValueIoWaiter)>, + + pub(crate) situation: ValueReconstructSituation, +} + +#[derive(Debug)] +pub(crate) struct OnDiskValueIoWaiter { + rx: tokio::sync::oneshot::Receiver, +} + +#[derive(Debug)] +#[must_use] +pub(crate) enum OnDiskValueIo { + /// Traversal identified this IO as required to complete the vectored get. + Required { + num_active_ios: Arc, + tx: tokio::sync::oneshot::Sender, + }, + /// Sparse keyspace reads always read all the values for a given key, + /// even though only the first value is needed. + /// + /// This variant represents the unnecessary IOs for those values at lower LSNs + /// that aren't needed, but are currently still being done. + /// + /// The execution of unnecessary IOs was a pre-existing behavior before concurrent IO. + /// We added this explicit representation here so that we can drop + /// unnecessary IO results immediately, instead of buffering them in + /// `oneshot` channels inside [`VectoredValueReconstructState`] until + /// [`VectoredValueReconstructState::collect_pending_ios`] gets called. + Unnecessary, +} + +type OnDiskValueIoResult = Result; + +impl OnDiskValueIo { + pub(crate) fn complete(self, res: OnDiskValueIoResult) { + match self { + OnDiskValueIo::Required { num_active_ios, tx } => { + num_active_ios.fetch_sub(1, std::sync::atomic::Ordering::Release); + let _ = tx.send(res); + } + OnDiskValueIo::Unnecessary => { + // Nobody cared, see variant doc comment. + } + } } } -impl From for ValueReconstructState { - fn from(mut state: VectoredValueReconstructState) -> Self { - // walredo expects the records to be descending in terms of Lsn - state.records.sort_by_key(|(lsn, _)| Reverse(*lsn)); +#[derive(Debug, thiserror::Error)] +pub(crate) enum WaitCompletionError { + #[error("OnDiskValueIo was dropped without completing, likely the sidecar task panicked")] + IoDropped, +} - ValueReconstructState { - records: state.records, - img: state.img, +impl OnDiskValueIoWaiter { + pub(crate) async fn wait_completion(self) -> Result { + // NB: for Unnecessary IOs, this method never gets called because we don't add them to `on_disk_values`. + self.rx.await.map_err(|_| WaitCompletionError::IoDropped) + } +} + +impl VectoredValueReconstructState { + /// # Cancel-Safety + /// + /// Technically fine to stop polling this future, but, the IOs will still + /// be executed to completion by the sidecar task and hold on to / consume resources. + /// Better not do it to make reasonsing about the system easier. + pub(crate) async fn collect_pending_ios( + self, + ) -> Result { + use utils::bin_ser::BeSer; + + let mut res = Ok(ValueReconstructState::default()); + + // We should try hard not to bail early, so that by the time we return from this + // function, all IO for this value is done. It's not required -- we could totally + // stop polling the IO futures in the sidecar task, they need to support that, + // but just stopping to poll doesn't reduce the IO load on the disk. It's easier + // to reason about the system if we just wait for all IO to complete, even if + // we're no longer interested in the result. 
+ // + // Revisit this when IO futures are replaced with a more sophisticated IO system + // and an IO scheduler, where we know which IOs were submitted and which ones + // just queued. Cf the comment on IoConcurrency::spawn_io. + for (lsn, waiter) in self.on_disk_values { + let value_recv_res = waiter + .wait_completion() + // we rely on the caller to poll us to completion, so this is not a bail point + .await; + // Force not bailing early by wrapping the code into a closure. + #[allow(clippy::redundant_closure_call)] + let _: () = (|| { + match (&mut res, value_recv_res) { + (Err(_), _) => { + // We've already failed, no need to process more. + } + (Ok(_), Err(wait_err)) => { + // This shouldn't happen - likely the sidecar task panicked. + res = Err(PageReconstructError::Other(wait_err.into())); + } + (Ok(_), Ok(Err(err))) => { + let err: std::io::Error = err; + // TODO: returning IO error here will fail a compute query. + // Probably not what we want, we're not doing `maybe_fatal_err` + // in the IO futures. + // But it's been like that for a long time, not changing it + // as part of concurrent IO. + // => https://github.com/neondatabase/neon/issues/10454 + res = Err(PageReconstructError::Other(err.into())); + } + (Ok(ok), Ok(Ok(OnDiskValue::RawImage(img)))) => { + assert!(ok.img.is_none()); + ok.img = Some((lsn, img)); + } + (Ok(ok), Ok(Ok(OnDiskValue::WalRecordOrImage(buf)))) => { + match Value::des(&buf) { + Ok(Value::WalRecord(rec)) => { + ok.records.push((lsn, rec)); + } + Ok(Value::Image(img)) => { + assert!(ok.img.is_none()); + ok.img = Some((lsn, img)); + } + Err(err) => { + res = Err(PageReconstructError::Other(err.into())); + } + } + } + } + })(); } + + res } } @@ -109,7 +246,7 @@ impl From for ValueReconstructState { pub(crate) struct ValuesReconstructState { /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline` /// should not expect to get anything from this hashmap. - pub(crate) keys: HashMap>, + pub(crate) keys: HashMap, /// The keys which are already retrieved keys_done: KeySpaceRandomAccum, @@ -119,27 +256,369 @@ pub(crate) struct ValuesReconstructState { // Statistics that are still accessible as a caller of `get_vectored_impl`. layers_visited: u32, delta_layers_visited: u32, + + pub(crate) io_concurrency: IoConcurrency, + num_active_ios: Arc, + + pub(crate) read_path: Option, +} + +/// The level of IO concurrency to be used on the read path +/// +/// The desired end state is that we always do parallel IO. +/// This struct and the dispatching in the impl will be removed once +/// we've built enough confidence. +pub(crate) enum IoConcurrency { + Sequential, + SidecarTask { + task_id: usize, + ios_tx: tokio::sync::mpsc::UnboundedSender, + }, +} + +type IoFuture = Pin>>; + +pub(crate) enum SelectedIoConcurrency { + Sequential, + SidecarTask(GateGuard), +} + +impl std::fmt::Debug for IoConcurrency { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + IoConcurrency::Sequential => write!(f, "Sequential"), + IoConcurrency::SidecarTask { .. } => write!(f, "SidecarTask"), + } + } +} + +impl std::fmt::Debug for SelectedIoConcurrency { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SelectedIoConcurrency::Sequential => write!(f, "Sequential"), + SelectedIoConcurrency::SidecarTask(_) => write!(f, "SidecarTask"), + } + } +} + +impl IoConcurrency { + /// Force sequential IO. 
This is a temporary workaround until we have + /// moved plumbing-through-the-call-stack + /// of IoConcurrency into `RequestContextq. + /// + /// DO NOT USE for new code. + /// + /// Tracking issue: . + pub(crate) fn sequential() -> Self { + Self::spawn(SelectedIoConcurrency::Sequential) + } + + pub(crate) fn spawn_from_conf( + conf: &'static PageServerConf, + gate_guard: GateGuard, + ) -> IoConcurrency { + use pageserver_api::config::GetVectoredConcurrentIo; + let selected = match conf.get_vectored_concurrent_io { + GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential, + GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard), + }; + Self::spawn(selected) + } + + pub(crate) fn spawn(io_concurrency: SelectedIoConcurrency) -> Self { + match io_concurrency { + SelectedIoConcurrency::Sequential => IoConcurrency::Sequential, + SelectedIoConcurrency::SidecarTask(gate_guard) => { + let (ios_tx, ios_rx) = tokio::sync::mpsc::unbounded_channel(); + static TASK_ID: AtomicUsize = AtomicUsize::new(0); + let task_id = TASK_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + // TODO: enrich the span with more context (tenant,shard,timeline) + (basebackup|pagestream|...) + let span = + tracing::info_span!(parent: None, "IoConcurrency_sidecar", task_id = task_id); + trace!(task_id, "spawning sidecar task"); + tokio::spawn(async move { + trace!("start"); + scopeguard::defer!{ trace!("end") }; + type IosRx = tokio::sync::mpsc::UnboundedReceiver; + enum State { + Waiting { + // invariant: is_empty(), but we recycle the allocation + empty_futures: FuturesUnordered, + ios_rx: IosRx, + }, + Executing { + futures: FuturesUnordered, + ios_rx: IosRx, + }, + ShuttingDown { + futures: FuturesUnordered, + }, + } + let mut state = State::Waiting { + empty_futures: FuturesUnordered::new(), + ios_rx, + }; + loop { + match state { + State::Waiting { + empty_futures, + mut ios_rx, + } => { + assert!(empty_futures.is_empty()); + tokio::select! { + fut = ios_rx.recv() => { + if let Some(fut) = fut { + trace!("received new io future"); + empty_futures.push(fut); + state = State::Executing { futures: empty_futures, ios_rx }; + } else { + state = State::ShuttingDown { futures: empty_futures } + } + } + } + } + State::Executing { + mut futures, + mut ios_rx, + } => { + tokio::select! { + res = futures.next() => { + trace!("io future completed"); + assert!(res.is_some()); + if futures.is_empty() { + state = State::Waiting { empty_futures: futures, ios_rx}; + } else { + state = State::Executing { futures, ios_rx }; + } + } + fut = ios_rx.recv() => { + if let Some(fut) = fut { + trace!("received new io future"); + futures.push(fut); + state = State::Executing { futures, ios_rx}; + } else { + state = State::ShuttingDown { futures }; + } + } + } + } + State::ShuttingDown { + mut futures, + } => { + trace!("shutting down"); + while let Some(()) = futures.next().await { + trace!("io future completed (shutdown)"); + // drain + } + trace!("shutdown complete"); + break; + } + } + } + drop(gate_guard); // drop it right before we exit + }.instrument(span)); + IoConcurrency::SidecarTask { task_id, ios_tx } + } + } + } + + pub(crate) fn clone(&self) -> Self { + match self { + IoConcurrency::Sequential => IoConcurrency::Sequential, + IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask { + task_id: *task_id, + ios_tx: ios_tx.clone(), + }, + } + } + + /// Submit an IO to be executed in the background. DEADLOCK RISK, read the full doc string. 
+ /// + /// The IO is represented as an opaque future. + /// IO completion must be handled inside the future, e.g., through a oneshot channel. + /// + /// The API seems simple but there are multiple **pitfalls** involving + /// DEADLOCK RISK. + /// + /// First, there are no guarantees about the exexecution of the IO. + /// It may be `await`ed in-place before this function returns. + /// It may be polled partially by this task and handed off to another task to be finished. + /// It may be polled and then dropped before returning ready. + /// + /// This means that submitted IOs must not be interedependent. + /// Interdependence may be through shared limited resources, e.g., + /// - VirtualFile file descriptor cache slot acquisition + /// - tokio-epoll-uring slot + /// + /// # Why current usage is safe from deadlocks + /// + /// Textbook condition for a deadlock is that _all_ of the following be given + /// - Mutual exclusion + /// - Hold and wait + /// - No preemption + /// - Circular wait + /// + /// The current usage is safe because: + /// - Mutual exclusion: IO futures definitely use mutexes, no way around that for now + /// - Hold and wait: IO futures currently hold two kinds of locks/resources while waiting + /// for acquisition of other resources: + /// - VirtualFile file descriptor cache slot tokio mutex + /// - tokio-epoll-uring slot (uses tokio notify => wait queue, much like mutex) + /// - No preemption: there's no taking-away of acquired locks/resources => given + /// - Circular wait: this is the part of the condition that isn't met: all IO futures + /// first acquire VirtualFile mutex, then tokio-epoll-uring slot. + /// There is no IO future that acquires slot before VirtualFile. + /// Hence there can be no circular waiting. + /// Hence there cannot be a deadlock. + /// + /// This is a very fragile situation and must be revisited whenver any code called from + /// inside the IO futures is changed. + /// + /// We will move away from opaque IO futures towards well-defined IOs at some point in + /// the future when we have shipped this first version of concurrent IO to production + /// and are ready to retire the Sequential mode which runs the futures in place. + /// Right now, while brittle, the opaque IO approach allows us to ship the feature + /// with minimal changes to the code and minimal changes to existing behavior in Sequential mode. + /// + /// Also read the comment in `collect_pending_ios`. + pub(crate) async fn spawn_io(&mut self, fut: F) + where + F: std::future::Future + Send + 'static, + { + match self { + IoConcurrency::Sequential => fut.await, + IoConcurrency::SidecarTask { ios_tx, .. } => { + let fut = Box::pin(fut); + // NB: experiments showed that doing an opportunistic poll of `fut` here was bad for throughput + // while insignificant for latency. + // It would make sense to revisit the tokio-epoll-uring API in the future such that we can try + // a submission here, but never poll the future. That way, io_uring can make proccess while + // the future sits in the ios_tx queue. + match ios_tx.send(fut) { + Ok(()) => {} + Err(_) => { + unreachable!("the io task must have exited, likely it panicked") + } + } + } + } + } + + #[cfg(test)] + pub(crate) fn spawn_for_test() -> impl std::ops::DerefMut { + use std::ops::{Deref, DerefMut}; + + use tracing::info; + use utils::sync::gate::Gate; + + // Spawn needs a Gate, give it one. 
+ struct Wrapper { + inner: IoConcurrency, + #[allow(dead_code)] + gate: Box, + } + impl Deref for Wrapper { + type Target = IoConcurrency; + + fn deref(&self) -> &Self::Target { + &self.inner + } + } + impl DerefMut for Wrapper { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } + } + let gate = Box::new(Gate::default()); + + // The default behavior when running Rust unit tests without any further + // flags is to use the new behavior. + // The CI uses the following environment variable to unit test both old + // and new behavior. + // NB: the Python regression & perf tests take the `else` branch + // below and have their own defaults management. + let selected = { + // The pageserver_api::config type is unsuitable because it's internally tagged. + #[derive(serde::Deserialize)] + #[serde(rename_all = "kebab-case")] + enum TestOverride { + Sequential, + SidecarTask, + } + use once_cell::sync::Lazy; + static TEST_OVERRIDE: Lazy = Lazy::new(|| { + utils::env::var_serde_json_string( + "NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO", + ) + .unwrap_or(TestOverride::SidecarTask) + }); + + match *TEST_OVERRIDE { + TestOverride::Sequential => SelectedIoConcurrency::Sequential, + TestOverride::SidecarTask => { + SelectedIoConcurrency::SidecarTask(gate.enter().expect("just created it")) + } + } + }; + + info!(?selected, "get_vectored_concurrent_io test"); + + Wrapper { + inner: Self::spawn(selected), + gate, + } + } +} + +/// Make noise in case the [`ValuesReconstructState`] gets dropped while +/// there are still IOs in flight. +/// Refer to `collect_pending_ios` for why we prefer not to do that. +// +/// We log from here instead of from the sidecar task because the [`ValuesReconstructState`] +/// gets dropped in a tracing span with more context. +/// We repeat the sidecar tasks's `task_id` so we can correlate what we emit here with +/// the logs / panic handler logs from the sidecar task, which also logs the `task_id`. +impl Drop for ValuesReconstructState { + fn drop(&mut self) { + let num_active_ios = self + .num_active_ios + .load(std::sync::atomic::Ordering::Acquire); + if num_active_ios == 0 { + return; + } + let sidecar_task_id = match &self.io_concurrency { + IoConcurrency::Sequential => None, + IoConcurrency::SidecarTask { task_id, .. } => Some(*task_id), + }; + tracing::warn!( + num_active_ios, + ?sidecar_task_id, + backtrace=%std::backtrace::Backtrace::force_capture(), + "dropping ValuesReconstructState while some IOs have not been completed", + ); + } } impl ValuesReconstructState { - pub(crate) fn new() -> Self { + pub(crate) fn new(io_concurrency: IoConcurrency) -> Self { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), keys_with_image_coverage: None, layers_visited: 0, delta_layers_visited: 0, + io_concurrency, + num_active_ios: Arc::new(AtomicUsize::new(0)), + read_path: None, } } - /// Associate a key with the error which it encountered and mark it as done - pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) { - let previous = self.keys.insert(key, Err(err)); - if let Some(Ok(state)) = previous { - if state.situation == ValueReconstructSituation::Continue { - self.keys_done.add_key(key); - } - } + /// Absolutely read [`IoConcurrency::spawn_io`] to learn about assumptions & pitfalls. 
+ pub(crate) async fn spawn_io(&mut self, fut: F) + where + F: std::future::Future + Send + 'static, + { + self.io_concurrency.spawn_io(fut).await; } pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) { @@ -159,29 +638,6 @@ impl ValuesReconstructState { self.layers_visited } - /// This function is called after reading a keyspace from a layer. - /// It checks if the read path has now moved past the cached Lsn for any keys. - /// - /// Implementation note: We intentionally iterate over the keys for which we've - /// already collected some reconstruct data. This avoids scaling complexity with - /// the size of the search space. - pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) { - for (key, value) in self.keys.iter_mut() { - if !keyspace.contains(key) { - continue; - } - - if let Ok(state) = value { - if state.situation != ValueReconstructSituation::Complete - && state.get_cached_lsn() >= Some(advanced_to) - { - state.situation = ValueReconstructSituation::Complete; - self.keys_done.add_key(*key); - } - } - } - } - /// On hitting image layer, we can mark all keys in this range as done, because /// if the image layer does not contain a key, it is deleted/never added. pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range) { @@ -199,70 +655,42 @@ impl ValuesReconstructState { /// /// If the key is in the sparse keyspace (i.e., aux files), we do not track them in /// `key_done`. - pub(crate) fn update_key( - &mut self, - key: &Key, - lsn: Lsn, - value: Value, - ) -> ValueReconstructSituation { - let state = self - .keys - .entry(*key) - .or_insert(Ok(VectoredValueReconstructState::default())); - let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key); - if let Ok(state) = state { - let key_done = match state.situation { - ValueReconstructSituation::Complete => { - if is_sparse_key { - // Sparse keyspace might be visited multiple times because - // we don't track unmapped keyspaces. - return ValueReconstructSituation::Complete; - } else { - unreachable!() - } - } - ValueReconstructSituation::Continue => match value { - Value::Image(img) => { - state.img = Some((lsn, img)); - true - } - Value::WalRecord(rec) => { - debug_assert!( - Some(lsn) > state.get_cached_lsn(), - "Attempt to collect a record below cached LSN for walredo: {} < {}", - lsn, - state - .get_cached_lsn() - .expect("Assertion can only fire if a cached lsn is present") - ); + // TODO: rename this method & update description. + pub(crate) fn update_key(&mut self, key: &Key, lsn: Lsn, completes: bool) -> OnDiskValueIo { + let state = self.keys.entry(*key).or_default(); - let will_init = rec.will_init(); - state.records.push((lsn, rec)); - will_init - } - }, - }; + let is_sparse_key = key.is_sparse(); - if key_done && state.situation == ValueReconstructSituation::Continue { - state.situation = ValueReconstructSituation::Complete; - if !is_sparse_key { - self.keys_done.add_key(*key); + let required_io = match state.situation { + ValueReconstructSituation::Complete => { + if is_sparse_key { + // Sparse keyspace might be visited multiple times because + // we don't track unmapped keyspaces. 
+ return OnDiskValueIo::Unnecessary; + } else { + unreachable!() } } + ValueReconstructSituation::Continue => { + self.num_active_ios + .fetch_add(1, std::sync::atomic::Ordering::Release); + let (tx, rx) = tokio::sync::oneshot::channel(); + state.on_disk_values.push((lsn, OnDiskValueIoWaiter { rx })); + OnDiskValueIo::Required { + tx, + num_active_ios: Arc::clone(&self.num_active_ios), + } + } + }; - state.situation - } else { - ValueReconstructSituation::Complete + if completes && state.situation == ValueReconstructSituation::Continue { + state.situation = ValueReconstructSituation::Complete; + if !is_sparse_key { + self.keys_done.add_key(*key); + } } - } - /// Returns the Lsn at which this key is cached if one exists. - /// The read path should go no further than this Lsn for the given key. - pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option { - self.keys - .get(key) - .and_then(|k| k.as_ref().ok()) - .and_then(|state| state.get_cached_lsn()) + required_io } /// Returns the key space describing the keys that have @@ -276,12 +704,6 @@ impl ValuesReconstructState { } } -impl Default for ValuesReconstructState { - fn default() -> Self { - Self::new() - } -} - /// A key that uniquely identifies a layer in a timeline #[derive(Debug, PartialEq, Eq, Clone, Hash)] pub(crate) enum LayerId { @@ -345,10 +767,7 @@ impl LayerFringe { } pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range)> { - let read_desc = match self.planned_visits_by_lsn.pop() { - Some(desc) => desc, - None => return None, - }; + let read_desc = self.planned_visits_by_lsn.pop()?; let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id); @@ -723,3 +1142,78 @@ impl std::fmt::Debug for RangeDisplayDebug<'_, T> { write!(f, "{}..{}", self.0.start, self.0.end) } } + +#[cfg(test)] +mod tests2 { + use pageserver_api::key::DBDIR_KEY; + use tracing::info; + + use super::*; + use crate::tenant::storage_layer::IoConcurrency; + + /// TODO: currently this test relies on manual visual inspection of the --no-capture output. + /// Should look like so: + /// ```text + /// RUST_LOG=trace cargo nextest run --features testing --no-capture test_io_concurrency_noise + /// running 1 test + /// 2025-01-21T17:42:01.335679Z INFO get_vectored_concurrent_io test selected=SidecarTask + /// 2025-01-21T17:42:01.335680Z TRACE spawning sidecar task task_id=0 + /// 2025-01-21T17:42:01.335937Z TRACE IoConcurrency_sidecar{task_id=0}: start + /// 2025-01-21T17:42:01.335972Z TRACE IoConcurrency_sidecar{task_id=0}: received new io future + /// 2025-01-21T17:42:01.335999Z INFO IoConcurrency_sidecar{task_id=0}: waiting for signal to complete IO + /// 2025-01-21T17:42:01.336229Z WARN dropping ValuesReconstructState while some IOs have not been completed num_active_ios=1 sidecar_task_id=Some(0) backtrace= 0: ::drop + /// at ./src/tenant/storage_layer.rs:553:24 + /// 1: core::ptr::drop_in_place + /// at /home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/ptr/mod.rs:521:1 + /// 2: core::mem::drop + /// at /home/christian/.rustup/toolchains/1.84.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/mem/mod.rs:942:24 + /// 3: pageserver::tenant::storage_layer::tests2::test_io_concurrency_noise::{{closure}} + /// at ./src/tenant/storage_layer.rs:1159:9 + /// ... 
+ /// 49: + /// 2025-01-21T17:42:01.452293Z INFO IoConcurrency_sidecar{task_id=0}: completing IO + /// 2025-01-21T17:42:01.452357Z TRACE IoConcurrency_sidecar{task_id=0}: io future completed + /// 2025-01-21T17:42:01.452473Z TRACE IoConcurrency_sidecar{task_id=0}: end + /// test tenant::storage_layer::tests2::test_io_concurrency_noise ... ok + /// + /// ``` + #[tokio::test] + async fn test_io_concurrency_noise() { + crate::tenant::harness::setup_logging(); + + let io_concurrency = IoConcurrency::spawn_for_test(); + match *io_concurrency { + IoConcurrency::Sequential => { + // This test asserts behavior in sidecar mode, doesn't make sense in sequential mode. + return; + } + IoConcurrency::SidecarTask { .. } => {} + } + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); + + let (io_fut_is_waiting_tx, io_fut_is_waiting) = tokio::sync::oneshot::channel(); + let (do_complete_io, should_complete_io) = tokio::sync::oneshot::channel(); + let (io_fut_exiting_tx, io_fut_exiting) = tokio::sync::oneshot::channel(); + + let io = reconstruct_state.update_key(&DBDIR_KEY, Lsn(8), true); + reconstruct_state + .spawn_io(async move { + info!("waiting for signal to complete IO"); + io_fut_is_waiting_tx.send(()).unwrap(); + should_complete_io.await.unwrap(); + info!("completing IO"); + io.complete(Ok(OnDiskValue::RawImage(Bytes::new()))); + io_fut_exiting_tx.send(()).unwrap(); + }) + .await; + + io_fut_is_waiting.await.unwrap(); + + // this is what makes the noise + drop(reconstruct_state); + + do_complete_io.send(()).unwrap(); + + io_fut_exiting.await.unwrap(); + } +} diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 67e18350fc..84db79a83e 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -1,17 +1,22 @@ -use std::{future::Future, ops::Range, sync::Arc}; +use std::future::Future; +use std::ops::Range; +use std::sync::Arc; use bytes::Bytes; -use pageserver_api::key::{Key, KEY_SIZE}; -use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; - -use crate::tenant::storage_layer::Layer; -use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline}; +use pageserver_api::key::{KEY_SIZE, Key}; use pageserver_api::value::Value; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::shard::TenantShardId; use super::layer::S3_UPLOAD_LIMIT; use super::{ DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, }; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::tenant::Timeline; +use crate::tenant::storage_layer::Layer; pub(crate) enum BatchWriterResult { Produced(ResidentLayer), @@ -87,6 +92,23 @@ impl BatchLayerWriter { )); } + pub(crate) async fn finish( + self, + tline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result> { + let res = self + .finish_with_discard_fn(tline, ctx, |_| async { false }) + .await?; + let mut output = Vec::new(); + for r in res { + if let BatchWriterResult::Produced(layer) = r { + output.push(layer); + } + } + Ok(output) + } + pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, @@ -149,6 +171,10 @@ impl BatchLayerWriter { // END: catch every error and do the recovery in the above section Ok(generated_layers) } + + pub fn pending_layer_num(&self) -> usize { + self.generated_layer_writers.len() + } } /// An image writer that takes images and produces multiple image layers. 
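The `storage_layer.rs` hunks earlier in this diff replace in-place value reconstruction with a completion-channel scheme: `update_key` hands out one `OnDiskValueIo` per `(key, lsn)`, the layer read completes it from a future submitted to the sidecar task, and `collect_pending_ios` awaits the matching receivers. The sketch below shows that shape with plain `tokio`/`futures` types only; the names (`spawn_sidecar`, `waiters`) and the `main` wiring are illustrative assumptions, not the pageserver's API.

```rust
// Standalone sketch of the completion-channel pattern: register a oneshot handle per
// value, submit the actual read to a sidecar task as an opaque boxed future, collect
// all results afterwards. Not the crate's API; simplified types throughout.
use std::pin::Pin;

use futures::stream::{FuturesUnordered, StreamExt};
use tokio::sync::{mpsc, oneshot};

type BoxedIo = Pin<Box<dyn std::future::Future<Output = ()> + Send>>;

fn spawn_sidecar() -> mpsc::UnboundedSender<BoxedIo> {
    let (tx, mut rx) = mpsc::unbounded_channel::<BoxedIo>();
    tokio::spawn(async move {
        // Drive all submitted IO futures concurrently until every sender is dropped.
        let mut in_flight = FuturesUnordered::new();
        loop {
            tokio::select! {
                new = rx.recv() => match new {
                    Some(fut) => in_flight.push(fut),
                    None => break, // all senders dropped: drain remaining IOs and exit
                },
                Some(()) = in_flight.next(), if !in_flight.is_empty() => {}
            }
        }
        while in_flight.next().await.is_some() {}
    });
    tx
}

#[tokio::main]
async fn main() {
    let sidecar = spawn_sidecar();

    // One waiter per (key, lsn), registered while traversing layers.
    let mut waiters = Vec::new();
    for key in 0u32..4 {
        let (done_tx, done_rx) = oneshot::channel::<std::io::Result<Vec<u8>>>();
        waiters.push((key, done_rx));
        // The submitted future owns everything it needs and reports back through the
        // oneshot sender, like `OnDiskValueIo::complete`.
        sidecar
            .send(Box::pin(async move {
                let _ = done_tx.send(Ok(format!("value for key {key}").into_bytes()));
            }))
            .expect("sidecar task exited");
    }

    // Collection happens only after all IOs were submitted, mirroring
    // `collect_pending_ios`: a dropped sender surfaces as an error here.
    for (key, rx) in waiters {
        let value = rx.await.expect("IO dropped without completing");
        println!("{key}: {:?}", value.map(|v| v.len()));
    }
}
```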
@@ -410,15 +436,10 @@ mod tests { use itertools::Itertools; use rand::{RngCore, SeedableRng}; - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::AsLayerDesc, - }, - DEFAULT_PG_VERSION, - }; - use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::AsLayerDesc; fn get_key(id: u32) -> Key { let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index df7451716b..b061bfab34 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -27,6 +27,36 @@ //! "values" part. The actual page images and WAL records are stored in the //! "values" part. //! +use anyhow::{Context, Result, bail, ensure}; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; +use serde::{Deserialize, Serialize}; +use std::collections::{HashMap, VecDeque}; +use std::fs::File; +use std::ops::Range; +use std::os::unix::fs::FileExt; +use std::str::FromStr; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; +use tokio::sync::OnceCell; +use tokio_epoll_uring::IoBuf; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::bin_ser::SerializeError; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; @@ -41,44 +71,10 @@ use crate::tenant::vectored_blob_io::{ BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::owned_buffers_io::write::Buffer; -use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; -use crate::virtual_file::{IoBuffer, IoBufferMut}; -use crate::TEMP_FILE_SUFFIX; -use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; -use anyhow::{anyhow, bail, ensure, Context, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use futures::StreamExt; -use itertools::Itertools; -use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::DBDIR_KEY; -use pageserver_api::key::{Key, KEY_SIZE}; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::ImageCompressionAlgorithm; -use pageserver_api::shard::TenantShardId; -use pageserver_api::value::Value; -use serde::{Deserialize, Serialize}; -use std::collections::VecDeque; -use std::fs::File; -use std::ops::Range; -use std::os::unix::fs::FileExt; -use std::str::FromStr; -use std::sync::atomic::AtomicU64; -use std::sync::Arc; -use tokio::sync::OnceCell; -use tokio_epoll_uring::IoBuf; -use tracing::*; -use utils::bin_ser::SerializeError; - -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; +use 
crate::virtual_file::{self, IoBuffer, IoBufferMut, MaybeFatalIo, VirtualFile}; +use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; /// /// Header stored in the beginning of the file @@ -236,7 +232,7 @@ pub struct DeltaLayerInner { index_start_blk: u32, index_root_blk: u32, - file: VirtualFile, + file: Arc, file_id: FileId, layer_key_range: Range, @@ -848,9 +844,11 @@ impl DeltaLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open_v2(path, ctx) - .await - .context("open layer file")?; + let file = Arc::new( + VirtualFile::open_v2(path, ctx) + .await + .context("open layer file")?, + ); let file_id = page_cache::next_file_id(); @@ -895,12 +893,11 @@ impl DeltaLayerInner { // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. // - // If the key is cached, go no further than the cached Lsn. - // // Currently, the index is visited for each range, but this // can be further optimised to visit the index only once. pub(super) async fn get_values_reconstruct_data( &self, + this: ResidentLayer, keyspace: KeySpace, lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, @@ -928,17 +925,14 @@ impl DeltaLayerInner { data_end_offset, index_reader, planner, - reconstruct_state, ctx, ) .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state, ctx) + self.do_reads_and_update_state(this, reads, reconstruct_state, ctx) .await; - reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start); - Ok(()) } @@ -948,7 +942,6 @@ impl DeltaLayerInner { data_end_offset: u64, index_reader: DiskBtreeReader, mut planner: VectoredReadPlanner, - reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> anyhow::Result> where @@ -975,10 +968,9 @@ impl DeltaLayerInner { assert!(key >= range.start); let outside_lsn_range = !lsn_range.contains(&lsn); - let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn); let flag = { - if outside_lsn_range || below_cached_lsn { + if outside_lsn_range { BlobFlag::Ignore } else if blob_ref.will_init() { BlobFlag::ReplaceAll @@ -1022,7 +1014,10 @@ impl DeltaLayerInner { .as_slice() .iter() .filter_map(|(_, blob_meta)| { - if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + if blob_meta.key.is_rel_dir_key() + || blob_meta.key == DBDIR_KEY + || blob_meta.key.is_aux_file_key() + { // The size of values for these keys is unbounded and can // grow very large in pathological cases. None @@ -1047,98 +1042,78 @@ impl DeltaLayerInner { async fn do_reads_and_update_state( &self, + this: ResidentLayer, reads: Vec, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) { - let vectored_blob_reader = VectoredBlobReader::new(&self.file); - let mut ignore_key_with_err = None; - let max_vectored_read_bytes = self .max_vectored_read_bytes .expect("Layer is loaded with max vectored bytes config") .0 .into(); let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes); - let mut buf = Some(IoBufferMut::with_capacity(buf_size)); // Note that reads are processed in reverse order (from highest key+lsn). // This is the order that `ReconstructState` requires such that it can // track when a key is done. 
for read in reads.into_iter().rev() { - let res = vectored_blob_reader - .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx) - .await; - - let blobs_buf = match res { - Ok(blobs_buf) => blobs_buf, - Err(err) => { - let kind = err.kind(); - for (_, blob_meta) in read.blobs_at.as_slice() { - reconstruct_state.on_key_error( - blob_meta.key, - PageReconstructError::Other(anyhow!( - "Failed to read blobs from virtual file {}: {}", - self.file.path(), - kind - )), - ); - } - - // We have "lost" the buffer since the lower level IO api - // doesn't return the buffer on error. Allocate a new one. - buf = Some(IoBufferMut::with_capacity(buf_size)); - - continue; - } - }; - let view = BufView::new_slice(&blobs_buf.buf); - for meta in blobs_buf.blobs.iter().rev() { - if Some(meta.meta.key) == ignore_key_with_err { - continue; - } - let blob_read = meta.read(&view).await; - let blob_read = match blob_read { - Ok(buf) => buf, - Err(e) => { - reconstruct_state.on_key_error( - meta.meta.key, - PageReconstructError::Other(anyhow!(e).context(format!( - "Failed to decompress blob from virtual file {}", - self.file.path(), - ))), - ); - - ignore_key_with_err = Some(meta.meta.key); - continue; - } - }; - - let value = Value::des(&blob_read); - - let value = match value { - Ok(v) => v, - Err(e) => { - reconstruct_state.on_key_error( - meta.meta.key, - PageReconstructError::Other(anyhow!(e).context(format!( - "Failed to deserialize blob from virtual file {}", - self.file.path(), - ))), - ); - - ignore_key_with_err = Some(meta.meta.key); - continue; - } - }; - - // Invariant: once a key reaches [`ValueReconstructSituation::Complete`] - // state, no further updates shall be made to it. The call below will - // panic if the invariant is violated. - reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value); + let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); + for (_, blob_meta) in read.blobs_at.as_slice().iter().rev() { + let io = reconstruct_state.update_key( + &blob_meta.key, + blob_meta.lsn, + blob_meta.will_init, + ); + ios.insert((blob_meta.key, blob_meta.lsn), io); } - buf = Some(blobs_buf.buf); + let read_extend_residency = this.clone(); + let read_from = self.file.clone(); + let read_ctx = ctx.attached_child(); + reconstruct_state + .spawn_io(async move { + let vectored_blob_reader = VectoredBlobReader::new(&read_from); + let buf = IoBufferMut::with_capacity(buf_size); + + let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await; + match res { + Ok(blobs_buf) => { + let view = BufView::new_slice(&blobs_buf.buf); + for meta in blobs_buf.blobs.iter().rev() { + let io = ios.remove(&(meta.meta.key, meta.meta.lsn)).unwrap(); + + let blob_read = meta.read(&view).await; + let blob_read = match blob_read { + Ok(buf) => buf, + Err(e) => { + io.complete(Err(e)); + continue; + } + }; + + io.complete(Ok(OnDiskValue::WalRecordOrImage( + blob_read.into_bytes(), + ))); + } + + assert!(ios.is_empty()); + } + Err(err) => { + for (_, sender) in ios { + sender.complete(Err(std::io::Error::new( + err.kind(), + "vec read failed", + ))); + } + } + } + + // keep layer resident until this IO is done; this spawned IO future generally outlives the + // call to `self` / the `Arc` / the `ResidentLayer` that guarantees residency + drop(read_extend_residency); + }) + .await; } } @@ -1203,10 +1178,11 @@ impl DeltaLayerInner { until: Lsn, ctx: &RequestContext, ) -> anyhow::Result { + use futures::stream::TryStreamExt; + use crate::tenant::vectored_blob_io::{ BlobMeta, 
ChunkedVectoredReadBuilder, VectoredReadExtended, }; - use futures::stream::TryStreamExt; #[derive(Debug)] enum Item { @@ -1277,7 +1253,14 @@ impl DeltaLayerInner { let actionable = if let Some((key, lsn, start_offset)) = prev.take() { let end_offset = offset; - Some((BlobMeta { key, lsn }, start_offset..end_offset)) + Some(( + BlobMeta { + key, + lsn, + will_init: false, + }, + start_offset..end_offset, + )) } else { None }; @@ -1539,7 +1522,7 @@ pub struct ValueRef<'a> { layer: &'a DeltaLayerInner, } -impl<'a> ValueRef<'a> { +impl ValueRef<'_> { /// Loads the value from disk pub async fn load(&self, ctx: &RequestContext) -> Result { let buf = self.load_raw(ctx).await?; @@ -1596,7 +1579,7 @@ pub struct DeltaLayerIterator<'a> { is_end: bool, } -impl<'a> DeltaLayerIterator<'a> { +impl DeltaLayerIterator<'_> { pub(crate) fn layer_dbg_info(&self) -> String { self.delta_layer.layer_dbg_info() } @@ -1613,7 +1596,9 @@ impl<'a> DeltaLayerIterator<'a> { let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); let blob_ref = BlobRef(value); let offset = blob_ref.pos(); - if let Some(batch_plan) = self.planner.handle(key, lsn, offset) { + if let Some(batch_plan) = + self.planner.handle(key, lsn, offset, blob_ref.will_init()) + { break batch_plan; } } else { @@ -1663,23 +1648,21 @@ impl<'a> DeltaLayerIterator<'a> { pub(crate) mod test { use std::collections::BTreeMap; + use bytes::Bytes; use itertools::MinMaxResult; + use pageserver_api::value::Value; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use rand::{Rng, RngCore}; use super::*; - use crate::tenant::harness::TIMELINE_ID; + use crate::DEFAULT_PG_VERSION; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::tenant::disk_btree::tests::TestDisk; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{Tenant, Timeline}; - use crate::{ - context::DownloadBehavior, - task_mgr::TaskKind, - tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, - DEFAULT_PG_VERSION, - }; - use bytes::Bytes; - use pageserver_api::value::Value; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. 
Finally, @@ -1726,7 +1709,6 @@ pub(crate) mod test { .expect("In memory disk finish should never fail"); let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk); let planner = VectoredReadPlanner::new(100); - let mut reconstruct_state = ValuesReconstructState::new(); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let keyspace = KeySpace { @@ -1744,7 +1726,6 @@ pub(crate) mod test { disk_offset, reader, planner, - &mut reconstruct_state, &ctx, ) .await @@ -1989,7 +1970,6 @@ pub(crate) mod test { ); let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES); - let mut reconstruct_state = ValuesReconstructState::new(); let keyspace = pick_random_keyspace(rng, &entries_meta.key_range); let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; @@ -1999,7 +1979,6 @@ pub(crate) mod test { data_end_offset, index_reader, planner, - &mut reconstruct_state, &ctx, ) .await?; diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index 8660be1fcc..8d172a1c19 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -1,18 +1,14 @@ -use std::{ops::Range, sync::Arc}; +use std::ops::Range; +use std::sync::Arc; use anyhow::bail; -use pageserver_api::{ - key::Key, - keyspace::{KeySpace, SparseKeySpace}, -}; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, SparseKeySpace}; +use pageserver_api::value::Value; use utils::lsn::Lsn; -use pageserver_api::value::Value; - -use super::{ - merge_iterator::{MergeIterator, MergeIteratorItem}, - PersistentLayerKey, -}; +use super::PersistentLayerKey; +use super::merge_iterator::{MergeIterator, MergeIteratorItem}; /// A filter iterator over merge iterators (and can be easily extended to other types of iterators). /// @@ -98,19 +94,14 @@ impl<'a> FilterIterator<'a> { #[cfg(test)] mod tests { - use super::*; - use itertools::Itertools; use pageserver_api::key::Key; use utils::lsn::Lsn; - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::produce_delta_layer, - }, - DEFAULT_PG_VERSION, - }; + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::delta_layer::test::produce_delta_layer; async fn assert_filter_iter_equal( filter_iter: &mut FilterIterator<'_>, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index abd987e258..4d66843718 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -25,6 +25,38 @@ //! layer, and offsets to the other parts. The "index" is a B-tree, //! mapping from Key to an offset in the "values" part. The //! actual page images are stored in the "values" part. 
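Both the delta-layer hunk above and the image-layer hunks that follow use the same per-read shape: register a handle for every `(key, lsn)` up front, then spawn one future per vectored read that completes each handle on success, fans a single I/O error out to every remaining waiter, and keeps a clone of the `ResidentLayer` alive until the read is done. A simplified sketch of that future body, with stand-in types rather than the pageserver's own:

```rust
// Illustrative sketch of the spawned per-read IO future: it owns the registered
// completion handles, fans one read failure out to every remaining waiter, and holds a
// residency guard until it is done. Types here are simplified assumptions.
use std::collections::HashMap;
use std::sync::Arc;

use tokio::sync::oneshot;

type Key = u32;
type Lsn = u64;
type IoResult = std::io::Result<Vec<u8>>;

async fn do_read_and_complete(
    mut ios: HashMap<(Key, Lsn), oneshot::Sender<IoResult>>,
    read_result: std::io::Result<Vec<((Key, Lsn), Vec<u8>)>>,
    residency_guard: Arc<()>, // stands in for the cloned ResidentLayer
) {
    match read_result {
        Ok(blobs) => {
            for (meta, bytes) in blobs {
                // Every blob in the read must have been registered up front.
                let tx = ios.remove(&meta).expect("handle registered for each blob");
                let _ = tx.send(Ok(bytes));
            }
            assert!(ios.is_empty(), "all registered handles must be completed");
        }
        Err(err) => {
            // One vectored read failed: every waiter gets the same error kind.
            for (_, tx) in ios {
                let _ = tx.send(Err(std::io::Error::new(err.kind(), "vectored read failed")));
            }
        }
    }
    // The guard is dropped only after completion, so the layer stays resident for as
    // long as the spawned IO can still touch its file.
    drop(residency_guard);
}
```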
+use std::collections::{HashMap, VecDeque}; +use std::fs::File; +use std::ops::Range; +use std::os::unix::prelude::FileExt; +use std::str::FromStr; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; + +use anyhow::{Context, Result, bail, ensure}; +use bytes::Bytes; +use camino::{Utf8Path, Utf8PathBuf}; +use hex; +use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key}; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; +use pageserver_api::value::Value; +use serde::{Deserialize, Serialize}; +use tokio::sync::OnceCell; +use tokio_stream::StreamExt; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::bin_ser::SerializeError; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use super::layer_name::ImageLayerName; +use super::{ + AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; @@ -38,44 +70,10 @@ use crate::tenant::vectored_blob_io::{ BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; -use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::owned_buffers_io::write::Buffer; -use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; -use crate::virtual_file::{IoBuffer, IoBufferMut}; +use crate::virtual_file::{self, IoBuffer, IoBufferMut, MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; -use anyhow::{anyhow, bail, ensure, Context, Result}; -use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; -use hex; -use itertools::Itertools; -use pageserver_api::config::MaxVectoredReadBytes; -use pageserver_api::key::DBDIR_KEY; -use pageserver_api::key::{Key, KEY_SIZE}; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::shard::{ShardIdentity, TenantShardId}; -use pageserver_api::value::Value; -use serde::{Deserialize, Serialize}; -use std::collections::VecDeque; -use std::fs::File; -use std::ops::Range; -use std::os::unix::prelude::FileExt; -use std::str::FromStr; -use std::sync::atomic::AtomicU64; -use std::sync::Arc; -use tokio::sync::OnceCell; -use tokio_stream::StreamExt; -use tracing::*; -use utils::bin_ser::SerializeError; - -use utils::{ - bin_ser::BeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use super::layer_name::ImageLayerName; -use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState}; /// /// Header stored in the beginning of the file @@ -175,7 +173,7 @@ pub struct ImageLayerInner { key_range: Range, lsn: Lsn, - file: VirtualFile, + file: Arc, file_id: FileId, max_vectored_read_bytes: Option, @@ -404,9 +402,11 @@ impl ImageLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open_v2(path, ctx) - .await - .context("open layer file")?; + let file = Arc::new( + VirtualFile::open_v2(path, ctx) + .await + .context("open layer file")?, + ); let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader @@ -452,6 +452,7 @@ impl ImageLayerInner { // the reconstruct state with whatever is found. 
pub(super) async fn get_values_reconstruct_data( &self, + this: ResidentLayer, keyspace: KeySpace, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, @@ -461,7 +462,7 @@ impl ImageLayerInner { .await .map_err(GetVectoredError::Other)?; - self.do_reads_and_update_state(reads, reconstruct_state, ctx) + self.do_reads_and_update_state(this, reads, reconstruct_state, ctx) .await; reconstruct_state.on_image_layer_visited(&self.key_range); @@ -583,6 +584,7 @@ impl ImageLayerInner { async fn do_reads_and_update_state( &self, + this: ResidentLayer, reads: Vec, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, @@ -593,8 +595,13 @@ impl ImageLayerInner { .0 .into(); - let vectored_blob_reader = VectoredBlobReader::new(&self.file); for read in reads.into_iter() { + let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); + for (_, blob_meta) in read.blobs_at.as_slice() { + let io = reconstruct_state.update_key(&blob_meta.key, blob_meta.lsn, true); + ios.insert((blob_meta.key, blob_meta.lsn), io); + } + let buf_size = read.size(); if buf_size > max_vectored_read_bytes { @@ -604,7 +611,10 @@ impl ImageLayerInner { .as_slice() .iter() .filter_map(|(_, blob_meta)| { - if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + if blob_meta.key.is_rel_dir_key() + || blob_meta.key == DBDIR_KEY + || blob_meta.key.is_aux_file_key() + { // The size of values for these keys is unbounded and can // grow very large in pathological cases. None @@ -624,50 +634,51 @@ impl ImageLayerInner { } } - let buf = IoBufferMut::with_capacity(buf_size); - let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await; + let read_extend_residency = this.clone(); + let read_from = self.file.clone(); + let read_ctx = ctx.attached_child(); + reconstruct_state + .spawn_io(async move { + let buf = IoBufferMut::with_capacity(buf_size); + let vectored_blob_reader = VectoredBlobReader::new(&read_from); + let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await; - match res { - Ok(blobs_buf) => { - let view = BufView::new_slice(&blobs_buf.buf); - for meta in blobs_buf.blobs.iter() { - let img_buf = meta.read(&view).await; + match res { + Ok(blobs_buf) => { + let view = BufView::new_slice(&blobs_buf.buf); + for meta in blobs_buf.blobs.iter() { + let io: OnDiskValueIo = + ios.remove(&(meta.meta.key, meta.meta.lsn)).unwrap(); + let img_buf = meta.read(&view).await; - let img_buf = match img_buf { - Ok(img_buf) => img_buf, - Err(e) => { - reconstruct_state.on_key_error( - meta.meta.key, - PageReconstructError::Other(anyhow!(e).context(format!( - "Failed to decompress blob from virtual file {}", - self.file.path(), - ))), - ); + let img_buf = match img_buf { + Ok(img_buf) => img_buf, + Err(e) => { + io.complete(Err(e)); + continue; + } + }; - continue; + io.complete(Ok(OnDiskValue::RawImage(img_buf.into_bytes()))); } - }; - reconstruct_state.update_key( - &meta.meta.key, - self.lsn, - Value::Image(img_buf.into_bytes()), - ); + + assert!(ios.is_empty()); + } + Err(err) => { + for (_, io) in ios { + io.complete(Err(std::io::Error::new( + err.kind(), + "vec read failed", + ))); + } + } } - } - Err(err) => { - let kind = err.kind(); - for (_, blob_meta) in read.blobs_at.as_slice() { - reconstruct_state.on_key_error( - blob_meta.key, - PageReconstructError::from(anyhow!( - "Failed to read blobs from virtual file {}: {}", - self.file.path(), - kind - )), - ); - } - } - }; + + // keep layer resident until this IO is done; this spawned IO future generally outlives the + // 
call to `self` / the `Arc` / the `ResidentLayer` that guarantees residency + drop(read_extend_residency); + }) + .await; } } @@ -1122,7 +1133,7 @@ pub struct ImageLayerIterator<'a> { is_end: bool, } -impl<'a> ImageLayerIterator<'a> { +impl ImageLayerIterator<'_> { pub(crate) fn layer_dbg_info(&self) -> String { self.image_layer.layer_dbg_info() } @@ -1139,6 +1150,7 @@ impl<'a> ImageLayerIterator<'a> { Key::from_slice(&raw_key[..KEY_SIZE]), self.image_layer.lsn, offset, + true, ) { break batch_plan; } @@ -1189,34 +1201,26 @@ impl<'a> ImageLayerIterator<'a> { #[cfg(test)] mod test { - use std::{sync::Arc, time::Duration}; + use std::sync::Arc; + use std::time::Duration; use bytes::Bytes; use itertools::Itertools; - use pageserver_api::{ - key::Key, - shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, - value::Value, - }; - use utils::{ - generation::Generation, - id::{TenantId, TimelineId}, - lsn::Lsn, - }; - - use crate::{ - context::RequestContext, - tenant::{ - config::TenantConf, - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::{Layer, ResidentLayer}, - vectored_blob_io::StreamingVectoredReadPlanner, - Tenant, Timeline, - }, - DEFAULT_PG_VERSION, - }; + use pageserver_api::key::Key; + use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; + use pageserver_api::value::Value; + use utils::generation::Generation; + use utils::id::{TenantId, TimelineId}; + use utils::lsn::Lsn; use super::{ImageLayerIterator, ImageLayerWriter}; + use crate::DEFAULT_PG_VERSION; + use crate::context::RequestContext; + use crate::tenant::config::TenantConf; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::{Layer, ResidentLayer}; + use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; + use crate::tenant::{Tenant, Timeline}; #[tokio::test] async fn image_layer_rewrite() { @@ -1226,10 +1230,10 @@ mod test { ..TenantConf::default() }; let tenant_id = TenantId::generate(); - let mut gen = Generation::new(0xdead0001); + let mut gen_ = Generation::new(0xdead0001); let mut get_next_gen = || { - let ret = gen; - gen = gen.next(); + let ret = gen_; + gen_ = gen_.next(); ret }; // The LSN at which we will create an image layer to filter diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index a353bf564d..54c82914d5 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -4,41 +4,39 @@ //! held in an ephemeral file, not in memory. The metadata for each page version, i.e. //! its position in the file, is kept in memory, though. //! 
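Further down in this file's hunks, `get_values_reconstruct_data` drops the `inner` read guard before calling `spawn_io` and re-acquires it inside the spawned future, because in sequential mode the future runs inline. A minimal sketch of that locking rule using plain `tokio` types (the `Layer` struct here is an assumption, not the real one):

```rust
// Sketch only: release the guard before handing the IO off, re-acquire inside the IO.
use std::sync::Arc;

use tokio::sync::RwLock;

struct Layer {
    inner: RwLock<Vec<u8>>,
}

async fn read_path(layer: Arc<Layer>) {
    // Plan the reads under the guard ...
    let inner = layer.inner.read().await;
    let planned = inner.len();
    // ... then release it before submitting the IO. With tokio's write-preferring RwLock,
    // re-acquiring a read lock while still holding one can deadlock once a writer queues
    // up in between -- and in sequential mode the IO future runs right here, inline.
    drop(inner);

    let layer_for_io = Arc::clone(&layer);
    let io = async move {
        // Re-acquire inside the IO future; works whether it runs inline or on a sidecar task.
        let inner = layer_for_io.inner.read().await;
        println!("planned {planned} entries, file now holds {}", inner.len());
    };
    io.await; // stands in for `reconstruct_state.spawn_io(io).await`
}
```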
-use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; -use crate::config::PageServerConf; -use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::tenant::ephemeral_file::EphemeralFile; -use crate::tenant::timeline::GetVectoredError; -use crate::tenant::PageReconstructError; -use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; -use crate::{l0_flush, page_cache}; -use anyhow::{anyhow, Result}; +use std::cmp::Ordering; +use std::collections::{BTreeMap, HashMap}; +use std::fmt::Write; +use std::ops::Range; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering as AtomicOrdering}; +use std::sync::{Arc, OnceLock}; +use std::time::Instant; + +use anyhow::Result; use camino::Utf8PathBuf; -use pageserver_api::key::CompactKey; -use pageserver_api::key::Key; +use pageserver_api::key::{CompactKey, Key}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use pageserver_api::value::Value; -use std::collections::{BTreeMap, HashMap}; -use std::sync::{Arc, OnceLock}; -use std::time::Instant; +use tokio::sync::RwLock; use tracing::*; -use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::vec_map::VecMap; use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta}; + +use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState}; +use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64, u64_to_usize}; +use crate::config::PageServerConf; +use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods use crate::metrics::TIMELINE_EPHEMERAL_BYTES; -use std::cmp::Ordering; -use std::fmt::Write; -use std::ops::Range; -use std::sync::atomic::Ordering as AtomicOrdering; -use std::sync::atomic::{AtomicU64, AtomicUsize}; -use tokio::sync::RwLock; - -use super::{ - DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, -}; +use crate::tenant::ephemeral_file::EphemeralFile; +use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo}; +use crate::tenant::timeline::GetVectoredError; +use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; +use crate::{l0_flush, page_cache}; pub(crate) mod vectored_dio_read; @@ -112,8 +110,8 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = { /// /// Layout: /// - 1 bit: `will_init` -/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len` -/// - [`MAX_SUPPORTED_POS_BITS`]: `pos` +/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len` +/// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos` #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct IndexEntry(u64); @@ -415,10 +413,8 @@ impl InMemoryLayer { // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. - // - // If the key is cached, go no further than the cached Lsn. 
pub(crate) async fn get_values_reconstruct_data( - &self, + self: &Arc, keyspace: KeySpace, end_lsn: Lsn, reconstruct_state: &mut ValuesReconstructState, @@ -435,6 +431,9 @@ impl InMemoryLayer { read: vectored_dio_read::LogicalRead>, } let mut reads: HashMap> = HashMap::new(); + let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); + + let lsn_range = self.start_lsn..end_lsn; for range in keyspace.ranges.iter() { for (key, vec_map) in inner @@ -442,12 +441,7 @@ impl InMemoryLayer { .range(range.start.to_compact()..range.end.to_compact()) { let key = Key::from_compact(*key); - let lsn_range = match reconstruct_state.get_cached_lsn(&key) { - Some(cached_lsn) => (cached_lsn + 1)..end_lsn, - None => self.start_lsn..end_lsn, - }; - - let slice = vec_map.slice_range(lsn_range); + let slice = vec_map.slice_range(lsn_range.clone()); for (entry_lsn, index_entry) in slice.iter().rev() { let IndexEntryUnpacked { @@ -463,55 +457,59 @@ impl InMemoryLayer { Vec::with_capacity(len as usize), ), }); + + let io = reconstruct_state.update_key(&key, *entry_lsn, will_init); + ios.insert((key, *entry_lsn), io); + if will_init { break; } } } } + drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below + let read_from = Arc::clone(self); + let read_ctx = ctx.attached_child(); + reconstruct_state + .spawn_io(async move { + let inner = read_from.inner.read().await; + let f = vectored_dio_read::execute( + &inner.file, + reads + .iter() + .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), + &read_ctx, + ); + send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865 + .await; - // Execute the reads. - - let f = vectored_dio_read::execute( - &inner.file, - reads - .iter() - .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), - &ctx, - ); - send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865 - .await; - - // Process results into the reconstruct state - 'next_key: for (key, value_reads) in reads { - for ValueRead { entry_lsn, read } in value_reads { - match read.into_result().expect("we run execute() above") { - Err(e) => { - reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); - continue 'next_key; - } - Ok(value_buf) => { - let value = Value::des(&value_buf); - if let Err(e) = value { - reconstruct_state - .on_key_error(key, PageReconstructError::from(anyhow!(e))); - continue 'next_key; + for (key, value_reads) in reads { + for ValueRead { entry_lsn, read } in value_reads { + let io = ios.remove(&(key, entry_lsn)).expect("sender must exist"); + match read.into_result().expect("we run execute() above") { + Err(e) => { + io.complete(Err(std::io::Error::new( + e.kind(), + "dio vec read failed", + ))); + } + Ok(value_buf) => { + io.complete(Ok(OnDiskValue::WalRecordOrImage(value_buf.into()))); + } } - - let key_situation = - reconstruct_state.update_key(&key, entry_lsn, value.unwrap()); - if key_situation == ValueReconstructSituation::Complete { - // TODO: metric to see if we fetched more values than necessary - continue 'next_key; - } - - // process the next value in the next iteration of the loop } } - } - } - reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); + assert!(ios.is_empty()); + + // Keep layer existent until this IO is done; + // This is kinda forced for InMemoryLayer because we need to inner.read() anyway, + // but it's less obvious for DeltaLayer and ImageLayer. 
So, keep this explicit + // drop for consistency among all three layer types. + drop(inner); + drop(read_from); + }) + .await; Ok(()) } @@ -558,7 +556,9 @@ impl InMemoryLayer { gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> Result { - trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); + trace!( + "initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}" + ); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); @@ -606,6 +606,7 @@ impl InMemoryLayer { // Write the batch to the file inner.file.write_raw(&raw, ctx).await?; let new_size = inner.file.len(); + let expected_new_len = base_offset .checked_add(raw.len().into_u64()) // write_raw would error if we were to overflow u64. @@ -820,8 +821,7 @@ mod tests { #[test] fn test_index_entry() { const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS; - use IndexEntryNewArgs as Args; - use IndexEntryUnpacked as Unpacked; + use {IndexEntryNewArgs as Args, IndexEntryUnpacked as Unpacked}; let roundtrip = |args, expect: Unpacked| { let res = IndexEntry::new(args).expect("this tests expects no errors"); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index a4bb3a6bfc..90455fd0ca 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -1,16 +1,13 @@ -use std::{ - collections::BTreeMap, - sync::{Arc, RwLock}, -}; +use std::collections::BTreeMap; +use std::sync::{Arc, RwLock}; use itertools::Itertools; use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; -use crate::{ - assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}, - context::RequestContext, - virtual_file::{owned_buffers_io::io_buf_aligned::IoBufAlignedMut, IoBufferMut}, -}; +use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; +use crate::context::RequestContext; +use crate::virtual_file::IoBufferMut; +use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; /// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`]. pub trait File: Send { @@ -25,11 +22,11 @@ pub trait File: Send { /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`. /// /// No guarantees are made about the remaining bytes in `dst` in case of a short read. 
- async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, dst: Slice, - ctx: &'a RequestContext, + ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)>; } @@ -132,7 +129,9 @@ where let req_len = match cur { LogicalReadState::NotStarted(buf) => { if buf.len() != 0 { - panic!("The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`"); + panic!( + "The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`" + ); } // buf.cap() == 0 is ok @@ -141,7 +140,9 @@ where *state = LogicalReadState::Ongoing(buf); req_len } - x => panic!("must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}"), + x => panic!( + "must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}" + ), }; // plan which chunks we need to read from @@ -422,15 +423,15 @@ impl Buffer for Vec { #[cfg(test)] #[allow(clippy::assertions_on_constants)] mod tests { + use std::cell::RefCell; + use std::collections::VecDeque; + use rand::Rng; - use crate::{ - context::DownloadBehavior, task_mgr::TaskKind, - virtual_file::owned_buffers_io::slice::SliceMutExt, - }; - use super::*; - use std::{cell::RefCell, collections::VecDeque}; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; struct InMemoryFile { content: Vec, @@ -479,11 +480,11 @@ mod tests { } impl File for InMemoryFile { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, mut dst: Slice, - _ctx: &'a RequestContext, + _ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); let nread = { @@ -609,12 +610,12 @@ mod tests { } } - impl<'x> File for RecorderFile<'x> { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( - &'b self, + impl File for RecorderFile<'_> { + async fn read_exact_at_eof_ok( + &self, start: u64, dst: Slice, - ctx: &'a RequestContext, + ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let (dst, nread) = self.file.read_exact_at_eof_ok(start, dst, ctx).await?; self.recorded.borrow_mut().push(RecordedRead { @@ -740,11 +741,11 @@ mod tests { } impl File for MockFile { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, mut dst: Slice, - _ctx: &'a RequestContext, + _ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let ExpectedRead { expect_pos, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8933e8ceb1..ae06aca63b 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1,32 +1,32 @@ +use std::ops::Range; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::time::{Duration, SystemTime}; + use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::HistoricLayerInfo; use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; -use std::ops::Range; -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; -use std::sync::{Arc, Weak}; -use std::time::{Duration, SystemTime}; use tracing::Instrument; +use utils::generation::Generation; use utils::id::TimelineId; use 
utils::lsn::Lsn; use utils::sync::{gate, heavier_once_cell}; -use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::task_mgr::TaskKind; -use crate::tenant::timeline::{CompactionError, GetVectoredError}; -use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; - use super::delta_layer::{self}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState, }; - -use utils::generation::Generation; +use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; +use crate::tenant::remote_timeline_client::LayerFileMetadata; +use crate::tenant::timeline::{CompactionError, GetVectoredError}; #[cfg(test)] mod tests; @@ -136,6 +136,22 @@ pub(crate) fn local_layer_path( } } +pub(crate) enum LastEviction { + Never, + At(std::time::Instant), + Evicting, +} + +impl LastEviction { + pub(crate) fn happened_after(&self, timepoint: std::time::Instant) -> bool { + match self { + LastEviction::Never => false, + LastEviction::At(evicted_at) => evicted_at > &timepoint, + LastEviction::Evicting => true, + } + } +} + impl Layer { /// Creates a layer value for a file we know to not be resident. pub(crate) fn for_evicted( @@ -308,7 +324,7 @@ impl Layer { reconstruct_data: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let layer = self + let downloaded = self .0 .get_or_maybe_download(true, Some(ctx)) .await @@ -318,11 +334,15 @@ impl Layer { } other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; + let this = ResidentLayer { + downloaded: downloaded.clone(), + owner: self.clone(), + }; self.record_access(ctx); - layer - .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) + downloaded + .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx) .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) .await .map_err(|err| match err { @@ -336,7 +356,7 @@ impl Layer { /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. - pub(crate) async fn download(&self) -> anyhow::Result<()> { + pub(crate) async fn download(&self) -> Result<(), DownloadError> { self.0.get_or_maybe_download(true, None).await?; Ok(()) } @@ -349,7 +369,6 @@ impl Layer { /// while the guard exists. /// /// Returns None if the layer is currently evicted or becoming evicted. 
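For context on how the `LastEviction` tri-state above is meant to be consumed (the heatmap use case referenced in the `last_evicted_at` doc comment further down), here is a small sketch; `keep_in_previous_heatmap` and `previous_heatmap_at` are hypothetical names used only for illustration:

use std::time::Instant;

// Trimmed mirror of the enum above, just enough for the example.
enum LastEviction {
    Never,
    At(Instant),
    Evicting,
}

impl LastEviction {
    fn happened_after(&self, timepoint: Instant) -> bool {
        match self {
            LastEviction::Never => false, // never evicted
            LastEviction::At(evicted_at) => evicted_at > &timepoint, // evicted since `timepoint`?
            LastEviction::Evicting => true, // eviction in flight counts as "after"
        }
    }
}

// Hypothetical caller: a layer from the previous heatmap is only worth keeping if it has
// not been evicted (or started evicting) since that heatmap was generated.
fn keep_in_previous_heatmap(last_eviction: &LastEviction, previous_heatmap_at: Instant) -> bool {
    !last_eviction.happened_after(previous_heatmap_at)
}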
- #[cfg(test)] pub(crate) async fn keep_resident(&self) -> Option { let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?; @@ -402,6 +421,17 @@ impl Layer { self.0.metadata() } + pub(crate) fn last_evicted_at(&self) -> LastEviction { + match self.0.last_evicted_at.try_lock() { + Ok(lock) => match *lock { + None => LastEviction::Never, + Some(at) => LastEviction::At(at), + }, + Err(std::sync::TryLockError::WouldBlock) => LastEviction::Evicting, + Err(std::sync::TryLockError::Poisoned(p)) => panic!("Lock poisoned: {p}"), + } + } + pub(crate) fn get_timeline_id(&self) -> Option { self.0 .timeline @@ -526,7 +556,6 @@ impl ResidentOrWantedEvicted { /// This is not used on the read path (anything that calls /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`]. - #[cfg(test)] fn get(&self) -> Option> { match self { ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()), @@ -654,7 +683,9 @@ struct LayerInner { /// When the Layer was last evicted but has not been downloaded since. /// - /// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`]. + /// This is used for skipping evicted layers from the previous heatmap (see + /// `[Timeline::generate_heatmap]`) and for updating metrics + /// (see [`LayerImplMetrics::redownload_after`]). last_evicted_at: std::sync::Mutex>, #[cfg(test)] @@ -697,13 +728,7 @@ impl Drop for LayerInner { if let Some(timeline) = timeline.as_ref() { // Only need to decrement metrics if the timeline still exists: otherwise // it will have already de-registered these metrics via TimelineMetrics::shutdown - if self.desc.is_delta() { - timeline.metrics.layer_count_delta.dec(); - timeline.metrics.layer_size_delta.sub(self.desc.file_size); - } else { - timeline.metrics.layer_count_image.dec(); - timeline.metrics.layer_size_image.sub(self.desc.file_size); - } + timeline.metrics.dec_layer(&self.desc); if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) { debug_assert!( @@ -813,13 +838,7 @@ impl LayerInner { }; // This object acts as a RAII guard on these metrics: increment on construction - if desc.is_delta() { - timeline.metrics.layer_count_delta.inc(); - timeline.metrics.layer_size_delta.add(desc.file_size); - } else { - timeline.metrics.layer_count_image.inc(); - timeline.metrics.layer_size_image.add(desc.file_size); - } + timeline.metrics.inc_layer(&desc); // New layers are visible by default. This metric is later updated on drop or in set_visibility timeline @@ -1768,25 +1787,25 @@ impl DownloadedLayer { async fn get_values_reconstruct_data( &self, + this: ResidentLayer, keyspace: KeySpace, lsn_range: Range, reconstruct_data: &mut ValuesReconstructState, - owner: &Arc, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { use LayerKind::*; match self - .get(owner, ctx) + .get(&this.owner.0, ctx) .await .map_err(GetVectoredError::Other)? { Delta(d) => { - d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) + d.get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx) .await } Image(i) => { - i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx) + i.get_values_reconstruct_data(this, keyspace, reconstruct_data, ctx) .await } } @@ -1812,7 +1831,7 @@ enum LayerKind { /// Guard for forcing a layer be resident while it exists. 
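The `inc_layer`/`dec_layer` helpers that replace the inline branching above live in the metrics module and their bodies are not part of this hunk; judging from the removed code, they presumably fold the delta/image distinction into one place. A sketch of that shape, with plain integer fields standing in for the real prometheus counters and for `TimelineMetrics`:

// Illustrative stand-ins for the per-timeline layer count/size metrics.
struct LayerMetricsSketch {
    layer_count_delta: i64,
    layer_size_delta: u64,
    layer_count_image: i64,
    layer_size_image: u64,
}

struct LayerDescSketch {
    is_delta: bool,
    file_size: u64,
}

impl LayerMetricsSketch {
    // RAII-style pairing: inc_layer when the Layer value is constructed, dec_layer in Drop.
    fn inc_layer(&mut self, desc: &LayerDescSketch) {
        if desc.is_delta {
            self.layer_count_delta += 1;
            self.layer_size_delta += desc.file_size;
        } else {
            self.layer_count_image += 1;
            self.layer_size_image += desc.file_size;
        }
    }

    fn dec_layer(&mut self, desc: &LayerDescSketch) {
        if desc.is_delta {
            self.layer_count_delta -= 1;
            self.layer_size_delta -= desc.file_size;
        } else {
            self.layer_count_image -= 1;
            self.layer_size_image -= desc.file_size;
        }
    }
}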
#[derive(Clone)] -pub(crate) struct ResidentLayer { +pub struct ResidentLayer { owner: Layer, downloaded: Arc, } @@ -1854,8 +1873,8 @@ impl ResidentLayer { self.owner.record_access(ctx); let res = match inner { - Delta(ref d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await, - Image(ref i) => image_layer::ImageLayerInner::load_keys(i, ctx).await, + Delta(d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await, + Image(i) => image_layer::ImageLayerInner::load_keys(i, ctx).await, }; res.with_context(|| format!("Layer index is corrupted for {self}")) } @@ -1901,7 +1920,7 @@ impl ResidentLayer { let owner = &self.owner.0; match self.downloaded.get(owner, ctx).await? { - Delta(ref d) => d + Delta(d) => d .copy_prefix(writer, until, ctx) .await .with_context(|| format!("copy_delta_prefix until {until} of {self}")), @@ -1924,7 +1943,7 @@ impl ResidentLayer { ) -> anyhow::Result<&delta_layer::DeltaLayerInner> { use LayerKind::*; match self.downloaded.get(&self.owner.0, ctx).await? { - Delta(ref d) => Ok(d), + Delta(d) => Ok(d), Image(_) => Err(anyhow::anyhow!("image layer")), } } @@ -1936,7 +1955,7 @@ impl ResidentLayer { ) -> anyhow::Result<&image_layer::ImageLayerInner> { use LayerKind::*; match self.downloaded.get(&self.owner.0, ctx).await? { - Image(ref d) => Ok(d), + Image(d) => Ok(d), Delta(_) => Err(anyhow::anyhow!("delta layer")), } } diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 36dcc8d805..724150d27f 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,16 +1,16 @@ use std::time::UNIX_EPOCH; -use pageserver_api::key::CONTROLFILE_KEY; +use pageserver_api::key::{CONTROLFILE_KEY, Key}; use tokio::task::JoinSet; -use utils::{ - completion::{self, Completion}, - id::TimelineId, -}; +use utils::completion::{self, Completion}; +use utils::id::TimelineId; use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint}; -use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; +use crate::context::DownloadBehavior; +use crate::task_mgr::TaskKind; +use crate::tenant::harness::{TenantHarness, test_img}; +use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint}; /// Used in tests to advance a future to wanted await point, and not futher. const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600); @@ -28,23 +28,55 @@ async fn smoke_test() { let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let (tenant, _) = h.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + let image_layers = vec![( + Lsn(0x40), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + test_img("foo"), + )], + )]; + + // Create a test timeline with one real layer, and one synthetic test layer. The synthetic + // one is only there so that we can GC the real one without leaving the timeline's metadata + // empty, which is an illegal state (see [`IndexPart::validate`]). 
let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline_with_layers( + TimelineId::generate(), + Lsn(0x10), + 14, + &ctx, + Default::default(), + image_layers, + Lsn(0x100), + ) .await .unwrap(); - let layer = { + // Grab one of the timeline's layers to exercise in the test, and the other layer that is just + // there to avoid the timeline being illegally empty + let (layer, dummy_layer) = { let mut layers = { let layers = timeline.layers.read().await; layers.likely_resident_layers().cloned().collect::>() }; - assert_eq!(layers.len(), 1); + assert_eq!(layers.len(), 2); - layers.swap_remove(0) + layers.sort_by_key(|l| l.layer_desc().get_key_range().start); + let synthetic_layer = layers.pop().unwrap(); + let real_layer = layers.pop().unwrap(); + tracing::info!( + "real_layer={:?} ({}), synthetic_layer={:?} ({})", + real_layer, + real_layer.layer_desc().file_size, + synthetic_layer, + synthetic_layer.layer_desc().file_size + ); + (real_layer, synthetic_layer) }; // all layers created at pageserver are like `layer`, initialized with strong @@ -55,7 +87,7 @@ async fn smoke_test() { }; let img_before = { - let mut data = ValuesReconstructState::default(); + let mut data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( controlfile_keyspace.clone(), @@ -65,10 +97,13 @@ async fn smoke_test() { ) .await .unwrap(); + data.keys .remove(&CONTROLFILE_KEY) .expect("must be present") - .expect("should not error") + .collect_pending_ios() + .await + .expect("must not error") .img .take() .expect("tenant harness writes the control file") @@ -87,7 +122,7 @@ async fn smoke_test() { // on accesses when the layer is evicted, it will automatically be downloaded. let img_after = { - let mut data = ValuesReconstructState::default(); + let mut data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( controlfile_keyspace.clone(), @@ -101,7 +136,9 @@ async fn smoke_test() { data.keys .remove(&CONTROLFILE_KEY) .expect("must be present") - .expect("should not error") + .collect_pending_ios() + .await + .expect("must not error") .img .take() .expect("tenant harness writes the control file") @@ -173,10 +210,13 @@ async fn smoke_test() { let rtc = &timeline.remote_client; + // Simulate GC removing our test layer. 
{ - let layers = &[layer]; let mut g = timeline.layers.write().await; + + let layers = &[layer]; g.open_mut().unwrap().finish_gc_timeline(layers); + // this just updates the remote_physical_size for demonstration purposes rtc.schedule_gc_update(layers).unwrap(); } @@ -191,7 +231,10 @@ async fn smoke_test() { rtc.wait_completion().await.unwrap(); - assert_eq!(rtc.get_remote_physical_size(), 0); + assert_eq!( + rtc.get_remote_physical_size(), + dummy_layer.metadata().file_size + ); assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) } @@ -722,10 +765,12 @@ async fn evict_and_wait_does_not_wait_for_download() { let (arrival, _download_arrived) = utils::completion::channel(); layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier)); - let mut download = std::pin::pin!(layer - .0 - .get_or_maybe_download(true, None) - .instrument(download_span)); + let mut download = std::pin::pin!( + layer + .0 + .get_or_maybe_download(true, None) + .instrument(download_span) + ); assert!( !layer.is_likely_resident(), diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index 2097e90764..ed16dcaa0d 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -1,16 +1,15 @@ use core::fmt::Display; -use pageserver_api::shard::TenantShardId; use std::ops::Range; -use utils::{id::TimelineId, lsn::Lsn}; use pageserver_api::key::Key; - -use super::{DeltaLayerName, ImageLayerName, LayerName}; - +use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; - #[cfg(test)] use utils::id::TenantId; +use utils::id::TimelineId; +use utils::lsn::Lsn; + +use super::{DeltaLayerName, ImageLayerName, LayerName}; /// A unique identifier of a persistent layer. /// diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index addf3b85d9..0f7995f87b 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -1,12 +1,12 @@ //! //! Helper functions for dealing with filenames of the image and delta layer files. //! 
-use pageserver_api::key::Key; use std::cmp::Ordering; use std::fmt; use std::ops::Range; use std::str::FromStr; +use pageserver_api::key::Key; use utils::lsn::Lsn; use super::PersistentLayerDesc; @@ -305,7 +305,7 @@ impl FromStr for LayerName { (None, None) => { return Err(format!( "neither delta nor image layer file name: {value:?}" - )) + )); } (Some(delta), None) => Self::Delta(delta), (None, Some(image)) => Self::Image(image), diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 19cfcb0867..76cdddd06a 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -1,21 +1,16 @@ -use std::{ - cmp::Ordering, - collections::{binary_heap, BinaryHeap}, - sync::Arc, -}; +use std::cmp::Ordering; +use std::collections::{BinaryHeap, binary_heap}; +use std::sync::Arc; use anyhow::bail; use pageserver_api::key::Key; +use pageserver_api::value::Value; use utils::lsn::Lsn; +use super::delta_layer::{DeltaLayerInner, DeltaLayerIterator}; +use super::image_layer::{ImageLayerInner, ImageLayerIterator}; +use super::{PersistentLayerDesc, PersistentLayerKey}; use crate::context::RequestContext; -use pageserver_api::value::Value; - -use super::{ - delta_layer::{DeltaLayerInner, DeltaLayerIterator}, - image_layer::{ImageLayerInner, ImageLayerIterator}, - PersistentLayerDesc, PersistentLayerKey, -}; #[derive(Clone, Copy)] pub(crate) enum LayerRef<'a> { @@ -349,24 +344,18 @@ impl<'a> MergeIterator<'a> { #[cfg(test)] mod tests { - use super::*; - use itertools::Itertools; use pageserver_api::key::Key; - use utils::lsn::Lsn; - - use crate::{ - tenant::{ - harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, - }, - DEFAULT_PG_VERSION, - }; - - #[cfg(feature = "testing")] - use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; #[cfg(feature = "testing")] use pageserver_api::record::NeonWalRecord; + use utils::lsn::Lsn; + + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + #[cfg(feature = "testing")] + use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; + use crate::tenant::storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}; async fn assert_merge_iter_equal( merge_iter: &mut MergeIterator<'_>, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 0118a5ce5f..670f9ad87f 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -1,52 +1,83 @@ -//! This module contains functions to serve per-tenant background processes, -//! such as compaction and GC +//! This module contains per-tenant background processes, e.g. compaction and GC. 
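The three loops rewritten below (compaction, GC, tenant housekeeping) share one skeleton: wait for the tenant to become active, sleep for the period or an error backoff, run one iteration, and reset or bump the consecutive-error counter. A condensed, hedged sketch of that shape, leaving out the config rechecks, L0 trigger, and metrics the real loops add:

use std::time::Duration;
use tokio_util::sync::CancellationToken;

// Hypothetical work function standing in for a compaction/GC/housekeeping iteration.
async fn run_one_iteration() -> Result<(), String> {
    Ok(())
}

fn backoff(error_run: u32, base_secs: f64, max_secs: f64) -> Duration {
    // Simplified exponential backoff; the real code delegates to utils::backoff.
    Duration::from_secs_f64((base_secs * 2f64.powi(error_run as i32)).min(max_secs))
}

async fn background_loop(period: Duration, cancel: CancellationToken) {
    let mut error_run = 0u32; // consecutive errors
    loop {
        // Sleep for the period, or for the backoff after errors; bail out on shutdown.
        let sleep_for = if error_run > 0 { backoff(error_run, 1.0, 300.0) } else { period };
        tokio::select! {
            _ = cancel.cancelled() => return,
            _ = tokio::time::sleep(sleep_for) => {}
        }
        match run_one_iteration().await {
            Ok(()) => error_run = 0,
            Err(err) => {
                error_run += 1;
                eprintln!("iteration failed {error_run} times: {err}");
            }
        }
    }
}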
-use std::ops::ControlFlow; -use std::str::FromStr; +use std::cmp::max; +use std::future::Future; +use std::ops::{ControlFlow, RangeInclusive}; +use std::pin::pin; use std::sync::Arc; use std::time::{Duration, Instant}; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::TENANT_TASK_EVENTS; -use crate::task_mgr; -use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::throttle::Stats; -use crate::tenant::timeline::CompactionError; -use crate::tenant::{Tenant, TenantState}; +use once_cell::sync::Lazy; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD; use rand::Rng; +use scopeguard::defer; +use tokio::sync::{Semaphore, SemaphorePermit}; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{backoff, completion, pausable_failpoint}; +use utils::backoff::exponential_backoff_duration; +use utils::completion::Barrier; +use utils::pausable_failpoint; -static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = - once_cell::sync::Lazy::new(|| { - let total_threads = task_mgr::TOKIO_WORKER_THREADS.get(); - let permits = usize::max( - 1, - // while a lot of the work is done on spawn_blocking, we still do - // repartitioning in the async context. this should give leave us some workers - // unblocked to be blocked on other work, hopefully easing any outside visible - // effects of restarts. - // - // 6/8 is a guess; previously we ran with unlimited 8 and more from - // spawn_blocking. - (total_threads * 3).checked_div(4).unwrap_or(0), - ); - assert_ne!(permits, 0, "we will not be adding in permits later"); - assert!( - permits < total_threads, - "need threads avail for shorter work" - ); - tokio::sync::Semaphore::new(permits) - }); +use crate::context::{DownloadBehavior, RequestContext}; +use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind}; +use crate::tenant::throttle::Stats; +use crate::tenant::timeline::CompactionError; +use crate::tenant::timeline::compaction::CompactionOutcome; +use crate::tenant::{Tenant, TenantState}; -#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr, enum_map::Enum)] +/// Semaphore limiting concurrent background tasks (across all tenants). +/// +/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. +static CONCURRENT_BACKGROUND_TASKS: Lazy = Lazy::new(|| { + let total_threads = TOKIO_WORKER_THREADS.get(); + let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0)); + assert_ne!(permits, 0, "we will not be adding in permits later"); + assert!(permits < total_threads, "need threads for other work"); + Semaphore::new(permits) +}); + +/// Semaphore limiting concurrent L0 compaction tasks (across all tenants). This is only used if +/// both `compaction_l0_semaphore` and `compaction_l0_first` are enabled. +/// +/// This is a separate semaphore from background tasks, because L0 compaction needs to be responsive +/// to avoid high read amp during heavy write workloads. Regular image/GC compaction is less +/// important (e.g. due to page images in delta layers) and can wait for other background tasks. +/// +/// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. Note +/// that this runs on the same Tokio runtime as `CONCURRENT_BACKGROUND_TASKS`, and shares the same +/// thread pool. 
+static CONCURRENT_L0_COMPACTION_TASKS: Lazy = Lazy::new(|| { + let total_threads = TOKIO_WORKER_THREADS.get(); + let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0)); + assert_ne!(permits, 0, "we will not be adding in permits later"); + assert!(permits < total_threads, "need threads for other work"); + Semaphore::new(permits) +}); + +/// Background jobs. +/// +/// NB: not all of these acquire a CONCURRENT_BACKGROUND_TASKS semaphore permit, only the ones that +/// do any significant IO or CPU work. +#[derive( + Debug, + PartialEq, + Eq, + Clone, + Copy, + strum_macros::IntoStaticStr, + strum_macros::Display, + enum_map::Enum, +)] #[strum(serialize_all = "snake_case")] pub(crate) enum BackgroundLoopKind { + /// L0Compaction runs as a separate pass within the Compaction loop, not a separate loop. It is + /// used to request the `CONCURRENT_L0_COMPACTION_TASKS` semaphore and associated metrics. + L0Compaction, Compaction, Gc, Eviction, - IngestHouseKeeping, + TenantHouseKeeping, ConsumptionMetricsCollectMetrics, ConsumptionMetricsSyntheticSizeWorker, InitialLogicalSizeCalculation, @@ -54,37 +85,41 @@ pub(crate) enum BackgroundLoopKind { SecondaryDownload, } -impl BackgroundLoopKind { - fn as_static_str(&self) -> &'static str { - self.into() - } +pub struct BackgroundLoopSemaphorePermit<'a> { + _permit: SemaphorePermit<'static>, + _recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>, } -/// Cancellation safe. -pub(crate) async fn concurrent_background_tasks_rate_limit_permit( +/// Acquires a semaphore permit, to limit concurrent background jobs. +pub(crate) async fn acquire_concurrency_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, -) -> tokio::sync::SemaphorePermit<'static> { - let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind); +) -> BackgroundLoopSemaphorePermit<'static> { + let mut recorder = metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind); - pausable_failpoint!( - "initial-size-calculation-permit-pause", - loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation - ); + if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation { + pausable_failpoint!("initial-size-calculation-permit-pause"); + } // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); - match CONCURRENT_BACKGROUND_TASKS.acquire().await { - Ok(permit) => permit, - Err(_closed) => unreachable!("we never close the semaphore"), + let semaphore = match loop_kind { + BackgroundLoopKind::L0Compaction => &CONCURRENT_L0_COMPACTION_TASKS, + _ => &CONCURRENT_BACKGROUND_TASKS, + }; + let permit = semaphore.acquire().await.expect("should never close"); + + recorder.acquired(); + + BackgroundLoopSemaphorePermit { + _permit: permit, + _recorder: recorder, } } -/// Start per tenant background loops: compaction and gc. -pub fn start_background_loops( - tenant: &Arc, - background_jobs_can_start: Option<&completion::Barrier>, -) { +/// Start per tenant background loops: compaction, GC, and ingest housekeeping. 
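Both semaphores above are sized the same way, and `acquire_concurrency_permit` simply routes L0 compaction to its own semaphore so it is not starved by other background work. A condensed sketch of that sizing and dispatch using a plain `tokio::sync::Semaphore`, without the metrics recorder; the hard-coded `8` stands in for the runtime's worker thread count:

use std::cmp::max;
use once_cell::sync::Lazy;
use tokio::sync::{Semaphore, SemaphorePermit};

fn permits(total_threads: usize) -> usize {
    // 3/4 of the Tokio worker threads, but always at least one permit.
    // e.g. 8 worker threads -> 6 permits, 2 -> 1, 1 -> 1.
    max(1, total_threads * 3 / 4)
}

static BACKGROUND: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(permits(8)));
static L0_COMPACTION: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(permits(8)));

enum LoopKind {
    L0Compaction,
    Other,
}

async fn acquire(kind: LoopKind) -> SemaphorePermit<'static> {
    // L0 compaction gets a dedicated semaphore so it stays responsive under write load.
    let semaphore = match kind {
        LoopKind::L0Compaction => &L0_COMPACTION,
        LoopKind::Other => &BACKGROUND,
    };
    semaphore.acquire().await.expect("semaphore is never closed")
}

With the 3/4 ratio, 6 permits out of 8 worker threads leave a couple of workers free for other, latency-sensitive work, which is the stated intent of capping the permits below the thread count.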
+pub fn start_background_loops(tenant: &Arc, can_start: Option<&Barrier>) { let tenant_shard_id = tenant.tenant_shard_id; + task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, @@ -93,13 +128,15 @@ pub fn start_background_loops( &format!("compactor for tenant {tenant_shard_id}"), { let tenant = Arc::clone(tenant); - let background_jobs_can_start = background_jobs_can_start.cloned(); + let can_start = can_start.cloned(); async move { - let cancel = task_mgr::shutdown_token(); + let cancel = task_mgr::shutdown_token(); // NB: must be in async context tokio::select! { - _ = cancel.cancelled() => { return Ok(()) }, - _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + _ = cancel.cancelled() => return Ok(()), + _ = Barrier::maybe_wait(can_start) => {} }; + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc()); compaction_loop(tenant, cancel) // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) @@ -108,6 +145,7 @@ pub fn start_background_loops( } }, ); + task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::GarbageCollector, @@ -116,13 +154,15 @@ pub fn start_background_loops( &format!("garbage collector for tenant {tenant_shard_id}"), { let tenant = Arc::clone(tenant); - let background_jobs_can_start = background_jobs_can_start.cloned(); + let can_start = can_start.cloned(); async move { - let cancel = task_mgr::shutdown_token(); + let cancel = task_mgr::shutdown_token(); // NB: must be in async context tokio::select! { - _ = cancel.cancelled() => { return Ok(()) }, - _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + _ = cancel.cancelled() => return Ok(()), + _ = Barrier::maybe_wait(can_start) => {} }; + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc()); gc_loop(tenant, cancel) .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; @@ -133,21 +173,23 @@ pub fn start_background_loops( task_mgr::spawn( BACKGROUND_RUNTIME.handle(), - TaskKind::IngestHousekeeping, + TaskKind::TenantHousekeeping, tenant_shard_id, None, - &format!("ingest housekeeping for tenant {tenant_shard_id}"), + &format!("housekeeping for tenant {tenant_shard_id}"), { let tenant = Arc::clone(tenant); - let background_jobs_can_start = background_jobs_can_start.cloned(); + let can_start = can_start.cloned(); async move { - let cancel = task_mgr::shutdown_token(); + let cancel = task_mgr::shutdown_token(); // NB: must be in async context tokio::select! 
{ - _ = cancel.cancelled() => { return Ok(()) }, - _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + _ = cancel.cancelled() => return Ok(()), + _ = Barrier::maybe_wait(can_start) => {} }; - ingest_housekeeping_loop(tenant, cancel) - .instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc()); + tenant_housekeeping_loop(tenant, cancel) + .instrument(info_span!("tenant_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) } @@ -155,372 +197,297 @@ pub fn start_background_loops( ); } -/// -/// Compaction task's main loop -/// +/// Compaction task's main loop. async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { + const BASE_BACKOFF_SECS: f64 = 1.0; const MAX_BACKOFF_SECS: f64 = 300.0; - // How many errors we have seen consequtively - let mut error_run_count = 0; + const RECHECK_CONFIG_INTERVAL: Duration = Duration::from_secs(10); - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - async { - let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); - let mut first = true; - loop { + let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); + let mut period = tenant.get_compaction_period(); + let mut error_run = 0; // consecutive errors + + // Stagger the compaction loop across tenants. + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } + if sleep_random(period, &cancel).await.is_err() { + return; + } + + loop { + // Recheck that we're still active. + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } + + // Refresh the period. If compaction is disabled, check again in a bit. + period = tenant.get_compaction_period(); + if period == Duration::ZERO { + #[cfg(not(feature = "testing"))] + info!("automatic compaction is disabled"); tokio::select! { - _ = cancel.cancelled() => { - return; - }, - tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { - ControlFlow::Break(()) => return, - ControlFlow::Continue(()) => (), - }, + _ = tokio::time::sleep(RECHECK_CONFIG_INTERVAL) => {}, + _ = cancel.cancelled() => return, } + continue; + } - let period = tenant.get_compaction_period(); + // Wait for the next compaction run. + let backoff = exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS); + tokio::select! { + _ = tokio::time::sleep(backoff), if error_run > 0 => {}, + _ = tokio::time::sleep(period), if error_run == 0 => {}, + _ = tenant.l0_compaction_trigger.notified(), if error_run == 0 => {}, + _ = cancel.cancelled() => return, + } - // TODO: we shouldn't need to await to find tenant and this could be moved outside of - // loop, #3501. There are also additional "allowed_errors" in tests. - if first { - first = false; - if random_init_delay(period, &cancel).await.is_err() { - break; + // Run compaction. + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::Compaction, + }; + let IterationResult { output, elapsed } = iteration + .run(tenant.compaction_iteration(&cancel, &ctx)) + .await; + + match output { + Ok(outcome) => { + error_run = 0; + // If there's more compaction work, L0 or not, schedule an immediate run. 
+ match outcome { + CompactionOutcome::Done => {} + CompactionOutcome::Skipped => {} + CompactionOutcome::YieldForL0 => tenant.l0_compaction_trigger.notify_one(), + CompactionOutcome::Pending => tenant.l0_compaction_trigger.notify_one(), } } - let sleep_duration; - if period == Duration::ZERO { - #[cfg(not(feature = "testing"))] - info!("automatic compaction is disabled"); - // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10) - } else { - let iteration = Iteration { - started_at: Instant::now(), - period, - kind: BackgroundLoopKind::Compaction, - }; - - // Run compaction - let IterationResult { output, elapsed } = iteration - .run(tenant.compaction_iteration(&cancel, &ctx)) - .await; - match output { - Ok(has_pending_task) => { - error_run_count = 0; - // schedule the next compaction immediately in case there is a pending compaction task - sleep_duration = if has_pending_task { - Duration::ZERO - } else { - period - }; - } - Err(e) => { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - log_compaction_error( - &e, - error_run_count, - &wait_duration, - cancel.is_cancelled(), - ); - sleep_duration = wait_duration; - } - } - - // the duration is recorded by performance tests by enabling debug in this function - tracing::debug!( - elapsed_ms = elapsed.as_millis(), - "compaction iteration complete" - ); - }; - - // Perhaps we did no work and the walredo process has been idle for some time: - // give it a chance to shut down to avoid leaving walredo process running indefinitely. - // TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off, - // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. - if let Some(walredo_mgr) = &tenant.walredo_mgr { - walredo_mgr.maybe_quiesce(period * 10); - } - - // Sleep - if tokio::time::timeout(sleep_duration, cancel.cancelled()) - .await - .is_ok() - { - break; + Err(err) => { + error_run += 1; + let backoff = + exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS); + log_compaction_error(&err, error_run, backoff, cancel.is_cancelled()); + continue; } } + + // NB: this log entry is recorded by performance tests. 
+ debug!( + elapsed_ms = elapsed.as_millis(), + "compaction iteration complete" + ); } - .await; - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); } fn log_compaction_error( - e: &CompactionError, - error_run_count: u32, - sleep_duration: &std::time::Duration, + err: &CompactionError, + error_count: u32, + sleep_duration: Duration, task_cancelled: bool, ) { - use crate::tenant::upload_queue::NotInitialized; - use crate::tenant::PageReconstructError; use CompactionError::*; - enum LooksLike { - Info, - Error, - } + use crate::pgdatadir_mapping::CollectKeySpaceError; + use crate::tenant::PageReconstructError; + use crate::tenant::upload_queue::NotInitialized; - let decision = match e { - ShuttingDown => None, - Offload(_) => Some(LooksLike::Error), - _ if task_cancelled => Some(LooksLike::Info), - Other(e) => { - let root_cause = e.root_cause(); + let level = match err { + ShuttingDown => return, + Offload(_) => Level::ERROR, + AlreadyRunning(_) => Level::ERROR, + CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO, + CollectKeySpaceError(_) => Level::ERROR, + _ if task_cancelled => Level::INFO, + Other(err) => { + let root_cause = err.root_cause(); - let is_stopping = { - let upload_queue = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_stopping()); - - let timeline = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_stopping()); - - upload_queue || timeline - }; + let upload_queue = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + let timeline = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + let is_stopping = upload_queue || timeline; if is_stopping { - Some(LooksLike::Info) + Level::INFO } else { - Some(LooksLike::Error) + Level::ERROR } } }; - match decision { - Some(LooksLike::Info) => info!( - "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}", - ), - Some(LooksLike::Error) => error!( - "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}", - ), - None => {} + match level { + Level::ERROR => { + error!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}") + } + Level::INFO => { + info!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}") + } + level => unimplemented!("unexpected level {level:?}"), } } -/// -/// GC task's main loop -/// +/// GC task's main loop. async fn gc_loop(tenant: Arc, cancel: CancellationToken) { const MAX_BACKOFF_SECS: f64 = 300.0; - // How many errors we have seen consequtively - let mut error_run_count = 0; + let mut error_run = 0; // consecutive errors - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - async { - // GC might require downloading, to find the cutoff LSN that corresponds to the - // cutoff specified as time. - let ctx = - RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + // GC might require downloading, to find the cutoff LSN that corresponds to the + // cutoff specified as time. + let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + let mut first = true; - let mut first = true; - loop { - tokio::select! 
{ - _ = cancel.cancelled() => { - return; - }, - tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { - ControlFlow::Break(()) => return, - ControlFlow::Continue(()) => (), - }, - } + loop { + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } - let period = tenant.get_gc_period(); + let period = tenant.get_gc_period(); - if first { - first = false; - - let delays = async { - random_init_delay(period, &cancel).await?; - Ok::<_, Cancelled>(()) - }; - - if delays.await.is_err() { - break; - } - } - - let gc_horizon = tenant.get_gc_horizon(); - let sleep_duration; - if period == Duration::ZERO || gc_horizon == 0 { - #[cfg(not(feature = "testing"))] - info!("automatic GC is disabled"); - // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10); - } else { - let iteration = Iteration { - started_at: Instant::now(), - period, - kind: BackgroundLoopKind::Gc, - }; - // Run gc - let IterationResult { output, elapsed: _ } = - iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)) - .await; - match output { - Ok(_) => { - error_run_count = 0; - sleep_duration = period; - } - Err(crate::tenant::GcError::TenantCancelled) => { - return; - } - Err(e) => { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - - if matches!(e, crate::tenant::GcError::TimelineCancelled) { - // Timeline was cancelled during gc. We might either be in an event - // that affects the entire tenant (tenant deletion, pageserver shutdown), - // or in one that affects the timeline only (timeline deletion). - // Therefore, don't exit the loop. - info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); - } else { - error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); - } - - sleep_duration = wait_duration; - } - } - }; - - if tokio::time::timeout(sleep_duration, cancel.cancelled()) - .await - .is_ok() - { + if first { + first = false; + if sleep_random(period, &cancel).await.is_err() { break; } } - } - .await; - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); -} - -async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); - async { - let mut last_throttle_flag_reset_at = Instant::now(); - loop { - tokio::select! { - _ = cancel.cancelled() => { - return; - }, - tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { - ControlFlow::Break(()) => return, - ControlFlow::Continue(()) => (), - }, - } - - // We run ingest housekeeping with the same frequency as compaction: it is not worth - // having a distinct setting. But we don't run it in the same task, because compaction - // blocks on acquiring the background job semaphore. 
- let period = tenant.get_compaction_period(); - - // If compaction period is set to zero (to disable it), then we will use a reasonable default - let period = if period == Duration::ZERO { - humantime::Duration::from_str( - pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD, - ) - .unwrap() - .into() - } else { - period - }; - - // Jitter the period by +/- 5% - let period = - rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100); - - // Always sleep first: we do not need to do ingest housekeeping early in the lifetime of - // a tenant, since it won't have started writing any ephemeral files yet. - if tokio::time::timeout(period, cancel.cancelled()) - .await - .is_ok() - { - break; - } + let gc_horizon = tenant.get_gc_horizon(); + let sleep_duration; + if period == Duration::ZERO || gc_horizon == 0 { + #[cfg(not(feature = "testing"))] + info!("automatic GC is disabled"); + // check again in 10 seconds, in case it's been enabled again. + sleep_duration = Duration::from_secs(10); + } else { let iteration = Iteration { started_at: Instant::now(), period, - kind: BackgroundLoopKind::IngestHouseKeeping, + kind: BackgroundLoopKind::Gc, }; - iteration.run(tenant.ingest_housekeeping()).await; - - // TODO: rename the background loop kind to something more generic, like, tenant housekeeping. - // Or just spawn another background loop for this throttle, it's not like it's super costly. - info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { - let now = Instant::now(); - let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); - let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats(); - if count_throttled == 0 { + // Run gc + let IterationResult { output, elapsed: _ } = iteration + .run(tenant.gc_iteration( + None, + gc_horizon, + tenant.get_pitr_interval(), + &cancel, + &ctx, + )) + .await; + match output { + Ok(_) => { + error_run = 0; + sleep_duration = period; + } + Err(crate::tenant::GcError::TenantCancelled) => { return; } - let allowed_rps = tenant.pagestream_throttle.steady_rps(); - let delta = now - prev; - info!( - n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), - count_accounted = count_accounted_finish, // don't break existing log scraping - count_throttled, - sum_throttled_usecs, - count_accounted_start, // log after pre-existing fields to not break existing log scraping - allowed_rps=%format_args!("{allowed_rps:.0}"), - "shard was throttled in the last n_seconds" - ); - }); - } - } - .await; - TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); -} + Err(e) => { + error_run += 1; + let wait_duration = + exponential_backoff_duration(error_run, 1.0, MAX_BACKOFF_SECS); -async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { - // if the tenant has a proper status already, no need to wait for anything - if tenant.current_state() == TenantState::Active { - ControlFlow::Continue(()) - } else { - let mut tenant_state_updates = tenant.subscribe_for_state_updates(); - loop { - match tenant_state_updates.changed().await { - Ok(()) => { - let new_state = &*tenant_state_updates.borrow(); - match new_state { - TenantState::Active => { - debug!("Tenant state changed to active, continuing the task loop"); - return ControlFlow::Continue(()); - } - state => { - debug!("Not running the task loop, tenant is not active: {state:?}"); - continue; - } + if 
matches!(e, crate::tenant::GcError::TimelineCancelled) { + // Timeline was cancelled during gc. We might either be in an event + // that affects the entire tenant (tenant deletion, pageserver shutdown), + // or in one that affects the timeline only (timeline deletion). + // Therefore, don't exit the loop. + info!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}"); + } else { + error!("Gc failed {error_run} times, retrying in {wait_duration:?}: {e:?}"); } - } - Err(_sender_dropped_error) => { - return ControlFlow::Break(()); + + sleep_duration = wait_duration; } } + }; + + if tokio::time::timeout(sleep_duration, cancel.cancelled()) + .await + .is_ok() + { + break; + } + } +} + +/// Tenant housekeeping's main loop. +async fn tenant_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { + let mut last_throttle_flag_reset_at = Instant::now(); + loop { + if wait_for_active_tenant(&tenant, &cancel).await.is_break() { + return; + } + + // Use the same period as compaction; it's not worth a separate setting. But if it's set to + // zero (to disable compaction), then use a reasonable default. Jitter it by 5%. + let period = match tenant.get_compaction_period() { + Duration::ZERO => humantime::parse_duration(DEFAULT_COMPACTION_PERIOD).unwrap(), + period => period, + }; + + let Ok(period) = sleep_jitter(period, period * 5 / 100, &cancel).await else { + break; + }; + + // Do tenant housekeeping. + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::TenantHouseKeeping, + }; + iteration.run(tenant.housekeeping()).await; + + // Log any getpage throttling. + info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + let now = Instant::now(); + let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); + let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats(); + if count_throttled == 0 { + return; + } + let allowed_rps = tenant.pagestream_throttle.steady_rps(); + let delta = now - prev; + info!( + n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), + count_accounted = count_accounted_finish, // don't break existing log scraping + count_throttled, + sum_throttled_usecs, + count_accounted_start, // log after pre-existing fields to not break existing log scraping + allowed_rps=%format_args!("{allowed_rps:.0}"), + "shard was throttled in the last n_seconds" + ); + }); + } +} + +/// Waits until the tenant becomes active, or returns `ControlFlow::Break()` to shut down. +async fn wait_for_active_tenant( + tenant: &Arc, + cancel: &CancellationToken, +) -> ControlFlow<()> { + if tenant.current_state() == TenantState::Active { + return ControlFlow::Continue(()); + } + + let mut update_rx = tenant.subscribe_for_state_updates(); + loop { + tokio::select! { + _ = cancel.cancelled() => return ControlFlow::Break(()), + result = update_rx.changed() => if result.is_err() { + return ControlFlow::Break(()); + } + } + + match &*update_rx.borrow() { + TenantState::Active => { + debug!("Tenant state changed to active, continuing the task loop"); + return ControlFlow::Continue(()); + } + state => debug!("Not running the task loop, tenant is not active: {state:?}"), } } } @@ -529,26 +496,41 @@ async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { #[error("cancelled")] pub(crate) struct Cancelled; -/// Provide a random delay for background task initialization. 
+/// Sleeps for a random interval up to the given max value. /// /// This delay prevents a thundering herd of background tasks and will likely keep them running on /// different periods for more stable load. -pub(crate) async fn random_init_delay( - period: Duration, +pub(crate) async fn sleep_random( + max: Duration, cancel: &CancellationToken, -) -> Result<(), Cancelled> { - if period == Duration::ZERO { - return Ok(()); - } +) -> Result { + sleep_random_range(Duration::ZERO..=max, cancel).await +} - let d = { - let mut rng = rand::thread_rng(); - rng.gen_range(Duration::ZERO..=period) - }; - match tokio::time::timeout(d, cancel.cancelled()).await { - Ok(_) => Err(Cancelled), - Err(_) => Ok(()), +/// Sleeps for a random interval in the given range. Returns the duration. +pub(crate) async fn sleep_random_range( + interval: RangeInclusive, + cancel: &CancellationToken, +) -> Result { + let delay = rand::thread_rng().gen_range(interval); + if delay == Duration::ZERO { + return Ok(delay); } + tokio::select! { + _ = cancel.cancelled() => Err(Cancelled), + _ = tokio::time::sleep(delay) => Ok(delay), + } +} + +/// Sleeps for an interval with a random jitter. +pub(crate) async fn sleep_jitter( + duration: Duration, + jitter: Duration, + cancel: &CancellationToken, +) -> Result { + let from = duration.saturating_sub(jitter); + let to = duration.saturating_add(jitter); + sleep_random_range(from..=to, cancel).await } struct Iteration { @@ -564,42 +546,25 @@ struct IterationResult { impl Iteration { #[instrument(skip_all)] - pub(crate) async fn run(self, fut: Fut) -> IterationResult - where - Fut: std::future::Future, - { - let Self { - started_at, - period, - kind, - } = self; - - let mut fut = std::pin::pin!(fut); + pub(crate) async fn run, O>(self, fut: F) -> IterationResult { + let mut fut = pin!(fut); // Wrap `fut` into a future that logs a message every `period` so that we get a // very obvious breadcrumb in the logs _while_ a slow iteration is happening. - let liveness_logger = async move { - loop { - match tokio::time::timeout(period, &mut fut).await { - Ok(x) => return x, - Err(_) => { - // info level as per the same rationale why warn_when_period_overrun is info - // => https://github.com/neondatabase/neon/pull/5724 - info!("still running"); - } - } + let output = loop { + match tokio::time::timeout(self.period, &mut fut).await { + Ok(r) => break r, + Err(_) => info!("still running"), } }; - - let output = liveness_logger.await; - - let elapsed = started_at.elapsed(); - warn_when_period_overrun(elapsed, period, kind); + let elapsed = self.started_at.elapsed(); + warn_when_period_overrun(elapsed, self.period, self.kind); IterationResult { output, elapsed } } } -/// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric. + +// NB: the `task` and `period` are used for metrics labels. 
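To make the jitter arithmetic above concrete (values purely illustrative): `sleep_jitter` sleeps for a uniformly random duration in `duration ± jitter`, so the housekeeping loop's `period * 5 / 100` jitter turns a 20-second period into a 19–21 second sleep.

use std::time::Duration;

// Same range computation as sleep_jitter above, pulled out for illustration.
fn jitter_range(duration: Duration, jitter: Duration) -> std::ops::RangeInclusive<Duration> {
    duration.saturating_sub(jitter)..=duration.saturating_add(jitter)
}

fn main() {
    let period = Duration::from_secs(20);
    let range = jitter_range(period, period * 5 / 100); // ±5%, as used by tenant housekeeping
    assert_eq!(range, Duration::from_secs(19)..=Duration::from_secs(21));
}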
pub(crate) fn warn_when_period_overrun( elapsed: Duration, period: Duration, @@ -616,8 +581,8 @@ pub(crate) fn warn_when_period_overrun( ?task, "task iteration took longer than the configured period" ); - crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT - .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())]) + metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT + .with_label_values(&[task.into(), &format!("{}", period.as_secs())]) .inc(); } } diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 54c0e59daa..6c37c3771b 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -1,10 +1,6 @@ -use std::{ - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, - time::{Duration, Instant}, -}; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::Instant; use arc_swap::ArcSwap; use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; @@ -16,9 +12,8 @@ use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; /// To share a throttle among multiple entities, wrap it in an [`Arc`]. /// /// The intial use case for this is tenant-wide throttling of getpage@lsn requests. -pub struct Throttle { +pub struct Throttle { inner: ArcSwap, - metric: M, /// will be turned into [`Stats::count_accounted_start`] count_accounted_start: AtomicU64, /// will be turned into [`Stats::count_accounted_finish`] @@ -36,15 +31,6 @@ pub struct Inner { pub type Config = pageserver_api::models::ThrottleConfig; -pub struct Observation { - pub wait_time: Duration, -} -pub trait Metric { - fn accounting_start(&self); - fn accounting_finish(&self); - fn observe_throttling(&self, observation: &Observation); -} - /// See [`Throttle::reset_stats`]. pub struct Stats { /// Number of requests that started [`Throttle::throttle`] calls. @@ -58,14 +44,15 @@ pub struct Stats { pub sum_throttled_usecs: u64, } -impl Throttle -where - M: Metric, -{ - pub fn new(config: Config, metric: M) -> Self { +pub enum ThrottleResult { + NotThrottled { end: Instant }, + Throttled { end: Instant }, +} + +impl Throttle { + pub fn new(config: Config) -> Self { Self { inner: ArcSwap::new(Arc::new(Self::new_inner(config))), - metric, count_accounted_start: AtomicU64::new(0), count_accounted_finish: AtomicU64::new(0), count_throttled: AtomicU64::new(0), @@ -122,32 +109,27 @@ where self.inner.load().rate_limiter.steady_rps() } - pub async fn throttle(&self, key_count: usize) -> Option { + /// `start` must be [`Instant::now`] or earlier. 
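With the `Metric` trait gone from `Throttle`, timing moves to the caller: it passes `start` in and gets an `end` instant back, from which it can derive the wait time itself. A hedged sketch of the caller side; the throttle call itself is stubbed out, and `record_throttled` is a hypothetical callback, not part of this change:

use std::time::{Duration, Instant};

// Trimmed mirror of the result type defined below.
enum ThrottleResult {
    NotThrottled { end: Instant },
    Throttled { end: Instant },
}

// Hypothetical caller: `result` would come from `throttle.throttle(key_count, start).await`.
fn handle_throttle_result(
    start: Instant,
    result: ThrottleResult,
    record_throttled: impl Fn(Duration),
) -> Instant {
    match result {
        ThrottleResult::Throttled { end } => {
            record_throttled(end - start); // wait time is now computed by the caller
            end
        }
        ThrottleResult::NotThrottled { end } => end,
    }
    // The returned instant can be reused by the caller as the start timestamp of the
    // work it was about to do, so the throttled wait is not counted against it.
}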
+ pub async fn throttle(&self, key_count: usize, start: Instant) -> ThrottleResult { let inner = self.inner.load_full(); // clones the `Inner` Arc if !inner.enabled { - return None; + return ThrottleResult::NotThrottled { end: start }; } - let start = std::time::Instant::now(); - - self.metric.accounting_start(); self.count_accounted_start.fetch_add(1, Ordering::Relaxed); let did_throttle = inner.rate_limiter.acquire(key_count).await; self.count_accounted_finish.fetch_add(1, Ordering::Relaxed); - self.metric.accounting_finish(); if did_throttle { self.count_throttled.fetch_add(1, Ordering::Relaxed); - let now = Instant::now(); - let wait_time = now - start; + let end = Instant::now(); + let wait_time = end - start; self.sum_throttled_usecs .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); - let observation = Observation { wait_time }; - self.metric.observe_throttling(&observation); - Some(wait_time) + ThrottleResult::Throttled { end } } else { - None + ThrottleResult::NotThrottled { end: start } } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 98a32d519d..abcce23d83 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4,6 +4,7 @@ pub mod delete; pub(crate) mod detach_ancestor; mod eviction_task; pub(crate) mod handle; +mod heatmap_layers_downloader; pub(crate) mod import_pgdata; mod init; pub mod layer_manager; @@ -13,154 +14,116 @@ pub mod span; pub mod uninit; mod walreceiver; -use anyhow::{anyhow, bail, ensure, Context, Result}; -use arc_swap::ArcSwap; +use std::array; +use std::cmp::{max, min}; +use std::collections::btree_map::Entry; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::ops::{ControlFlow, Deref, Range}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}; +use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; +use std::time::{Duration, Instant, SystemTime}; + +use anyhow::{Context, Result, anyhow, bail, ensure}; +use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; use camino::Utf8Path; use chrono::{DateTime, Utc}; +use compaction::{CompactionOutcome, GcCompactionCombinedSettings}; use enumset::EnumSet; use fail::fail_point; +use futures::stream::FuturesUnordered; +use futures::{FutureExt, StreamExt}; use handle::ShardTimelineId; +use layer_manager::Shutdown; use offload::OffloadError; use once_cell::sync::Lazy; -use pageserver_api::{ - config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD, - key::{ - KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - NON_INHERITED_SPARSE_RANGE, - }, - keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, - models::{ - CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo, - DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, - LsnLease, TimelineState, - }, - reltag::BlockNumber, - shard::{ShardIdentity, ShardNumber, TenantShardId}, +use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; +use pageserver_api::key::{ + KEY_SIZE, Key, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + SPARSE_RANGE, }; +use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}; +use pageserver_api::models::{ + CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, + InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, 
TimelineState, +}; +use pageserver_api::reltag::{BlockNumber, RelTag}; +use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; +#[cfg(test)] +use pageserver_api::value::Value; +use postgres_connection::PgConnectionConfig; +use postgres_ffi::v14::xlog_utils; +use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp}; use rand::Rng; use remote_storage::DownloadError; use serde_with::serde_as; use storage_broker::BrokerClientChannel; -use tokio::{ - runtime::Handle, - sync::{oneshot, watch}, -}; +use tokio::runtime::Handle; +use tokio::sync::mpsc::Sender; +use tokio::sync::{Notify, oneshot, watch}; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{ - fs_ext, pausable_failpoint, - postgres_client::PostgresClientProtocol, - sync::gate::{Gate, GateGuard}, -}; +use utils::generation::Generation; +use utils::guard_arc_swap::GuardArcSwap; +use utils::id::TimelineId; +use utils::lsn::{AtomicLsn, Lsn, RecordLsn}; +use utils::postgres_client::PostgresClientProtocol; +use utils::rate_limit::RateLimit; +use utils::seqwait::SeqWait; +use utils::simple_rcu::{Rcu, RcuReadGuard}; +use utils::sync::gate::{Gate, GateGuard}; +use utils::{completion, critical, fs_ext, pausable_failpoint}; use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; -use std::sync::atomic::Ordering as AtomicOrdering; -use std::sync::{Arc, Mutex, RwLock, Weak}; -use std::time::{Duration, Instant, SystemTime}; -use std::{ - array, - collections::{BTreeMap, HashMap, HashSet}, - sync::atomic::AtomicU64, -}; -use std::{cmp::min, ops::ControlFlow}; -use std::{ - collections::btree_map::Entry, - ops::{Deref, Range}, -}; -use std::{pin::pin, sync::OnceLock}; - -use crate::{ - aux_file::AuxFileSizeEstimator, - tenant::{ - config::AttachmentMode, - layer_map::{LayerMap, SearchResult}, - metadata::TimelineMetadata, - storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc}, - }, - walingest::WalLagCooldown, - walredo, -}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - disk_usage_eviction_task::DiskUsageEvictionInfo, - pgdatadir_mapping::CollectKeySpaceError, -}; -use crate::{ - disk_usage_eviction_task::finite_f32, - tenant::storage_layer::{ - AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState, - ValuesReconstructState, - }, -}; -use crate::{ - disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, -}; -use crate::{ - l0_flush::{self, L0FlushGlobalState}, - metrics::GetKind, -}; -use crate::{ - metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, -}; -use crate::{ - pgdatadir_mapping::DirectoryKind, - virtual_file::{MaybeFatalIo, VirtualFile}, -}; -use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; -use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; -use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; - -use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::TimelineMetrics; -use crate::pgdatadir_mapping::CalculateLogicalSizeError; -use crate::tenant::config::TenantConfOpt; -use pageserver_api::reltag::RelTag; -use pageserver_api::shard::ShardIndex; - -use postgres_connection::PgConnectionConfig; -use postgres_ffi::{to_pg_timestamp, v14::xlog_utils, WAL_SEGMENT_SIZE}; -use utils::{ - completion, - generation::Generation, - 
id::TimelineId, - lsn::{AtomicLsn, Lsn, RecordLsn}, - seqwait::SeqWait, - simple_rcu::{Rcu, RcuReadGuard}, -}; - -use crate::task_mgr; -use crate::task_mgr::TaskKind; -use crate::tenant::gc_result::GcResult; -use crate::ZERO_PAGE; -use pageserver_api::key::Key; - use self::delete::DeleteTimelineFlow; pub(super) use self::eviction_task::EvictionTaskTenantState; use self::eviction_task::EvictionTaskTimelineState; use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; - +use super::config::TenantConf; +use super::remote_timeline_client::index::{GcCompactionState, IndexPart}; +use super::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; +use super::secondary::heatmap::HeatMapLayer; +use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; +use super::upload_queue::NotInitialized; use super::{ - config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, - MaybeOffloaded, + AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded, + debug_assert_current_span_has_tenant_and_timeline_id, }; -use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; -use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; -use super::{ - remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, - storage_layer::ReadableLayer, +use crate::aux_file::AuxFileSizeEstimator; +use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32}; +use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::l0_flush::{self, L0FlushGlobalState}; +use crate::metrics::{ + DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics, }; -use super::{ - secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, - GcError, +use crate::page_service::TenantManagerTypes; +use crate::pgdatadir_mapping::{ + CalculateLogicalSizeError, CollectKeySpaceError, DirectoryKind, LsnForTimestamp, + MAX_AUX_FILE_V2_DELTAS, MetricsUpdate, }; - -#[cfg(test)] -use pageserver_api::value::Value; +use crate::task_mgr::TaskKind; +use crate::tenant::config::{AttachmentMode, TenantConfOpt}; +use crate::tenant::gc_result::GcResult; +use crate::tenant::layer_map::{LayerMap, SearchResult}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::storage_layer::delta_layer::DeltaEntry; +use crate::tenant::storage_layer::inmemory_layer::IndexEntry; +use crate::tenant::storage_layer::{ + AsLayerDesc, BatchLayerWriter, DeltaLayerWriter, EvictionError, ImageLayerName, + ImageLayerWriter, InMemoryLayer, IoConcurrency, Layer, LayerAccessStatsReset, LayerName, + PersistentLayerDesc, PersistentLayerKey, ResidentLayer, ValueReconstructSituation, + ValueReconstructState, ValuesReconstructState, +}; +use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::timeline::logical_size::CurrentLogicalSize; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; +use crate::walingest::WalLagCooldown; +use crate::{ZERO_PAGE, task_mgr, walredo}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(crate) enum FlushLoopState { @@ -187,6 +150,19 @@ pub enum ImageLayerCreationMode { Initial, } +#[derive(Clone, Debug, Default)] +pub enum LastImageLayerCreationStatus { + Incomplete { + /// The last key of the partition (exclusive) that was processed in the last + /// image layer creation attempt. 
We will continue from this key in the next + /// attempt. + last_key: Key, + }, + Complete, + #[default] + Initial, +} + impl std::fmt::Display for ImageLayerCreationMode { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?}", self) @@ -208,8 +184,9 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: RemoteTimelineClient, - pub pagestream_throttle: - Arc>, + pub pagestream_throttle: Arc, + pub pagestream_throttle_metrics: Arc, + pub l0_compaction_trigger: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } @@ -306,12 +283,16 @@ pub struct Timeline { ancestor_timeline: Option>, ancestor_lsn: Lsn, + // The LSN of gc-compaction that was last applied to this timeline. + gc_compaction_state: ArcSwap>, + pub(super) metrics: TimelineMetrics, // `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code // in `crate::page_service` writes these metrics. pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline, + directory_metrics_inited: [AtomicBool; DirectoryKind::KINDS_NUM], directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM], /// Ensures layers aren't frozen by checkpointer between @@ -336,13 +317,20 @@ pub struct Timeline { /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>, - // Needed to ensure that we can't create a branch at a point that was already garbage collected - pub latest_gc_cutoff_lsn: Rcu, + // The LSN at which we have executed GC: whereas [`Self::gc_info`] records the LSN at which + // we _intend_ to GC (i.e. the PITR cutoff), this LSN records where we actually last did it. + // Because PITR interval is mutable, it's possible for this LSN to be earlier or later than + // the planned GC cutoff. + pub applied_gc_cutoff_lsn: Rcu, + + pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>, // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. pub(crate) gc_info: std::sync::RwLock, + pub(crate) last_image_layer_creation_status: ArcSwap, + // It may change across major versions so for simplicity // keep it after running initdb for a timeline. // It is needed in checks when we want to error on some operations @@ -351,8 +339,8 @@ pub struct Timeline { // though let's keep them both for better error visibility. pub initdb_lsn: Lsn, - /// When did we last calculate the partitioning? Make it pub to test cases. - pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, + /// The repartitioning result. Allows a single writer and multiple readers. + pub(crate) partitioning: GuardArcSwap<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, @@ -402,6 +390,12 @@ pub struct Timeline { /// Timeline deletion will acquire both compaction and gc locks in whatever order. compaction_lock: tokio::sync::Mutex<()>, + /// If true, the last compaction failed. + compaction_failed: AtomicBool, + + /// Notifies the tenant compaction loop that there is pending L0 compaction work. + l0_compaction_trigger: Arc, + /// Make sure we only have one running gc at a time. 
/// /// Must only be taken in two places: @@ -412,8 +406,7 @@ pub struct Timeline { gc_lock: tokio::sync::Mutex<()>, /// Cloned from [`super::Tenant::pagestream_throttle`] on construction. - pub(crate) pagestream_throttle: - Arc>, + pub(crate) pagestream_throttle: Arc, /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, @@ -428,12 +421,29 @@ pub struct Timeline { pub(crate) l0_flush_global_state: L0FlushGlobalState, - pub(crate) handles: handle::PerTimelineState, + pub(crate) handles: handle::PerTimelineState, pub(crate) attach_wal_lag_cooldown: Arc>, /// Cf. [`crate::tenant::CreateTimelineIdempotency`]. pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency, + + /// If Some, collects GetPage metadata for an ongoing PageTrace. + pub(crate) page_trace: ArcSwapOption>, + + pub(super) previous_heatmap: ArcSwapOption, + + /// May host a background Tokio task which downloads all the layers from the current + /// heatmap on demand. + heatmap_layers_downloader: Mutex>, +} + +pub(crate) enum PreviousHeatmap { + Active { + heatmap: HeatMapTimeline, + read_at: std::time::Instant, + }, + Obsolete, } pub type TimelineDeleteProgress = Arc>; @@ -506,6 +516,9 @@ impl GcInfo { pub(super) fn remove_child_offloaded(&mut self, child_id: TimelineId) -> bool { self.remove_child_maybe_offloaded(child_id, MaybeOffloaded::Yes) } + pub(crate) fn lsn_covered_by_lease(&self, lsn: Lsn) -> bool { + self.leases.contains_key(&lsn) + } } /// The `GcInfo` component describing which Lsns need to be retained. Functionally, this @@ -597,6 +610,71 @@ impl From for GetVectoredError { } } +/// A layer identifier when used in the [`ReadPath`] structure. This enum is for observability purposes +/// only and not used by the "real read path". 
+pub enum ReadPathLayerId { + PersistentLayer(PersistentLayerKey), + InMemoryLayer(Range), +} + +impl std::fmt::Display for ReadPathLayerId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ReadPathLayerId::PersistentLayer(key) => write!(f, "{}", key), + ReadPathLayerId::InMemoryLayer(range) => { + write!(f, "in-mem {}..{}", range.start, range.end) + } + } + } +} +pub struct ReadPath { + keyspace: KeySpace, + lsn: Lsn, + path: Vec<(ReadPathLayerId, KeySpace, Range)>, +} + +impl ReadPath { + pub fn new(keyspace: KeySpace, lsn: Lsn) -> Self { + Self { + keyspace, + lsn, + path: Vec::new(), + } + } + + pub fn record_layer_visit( + &mut self, + layer_to_read: &ReadableLayer, + keyspace_to_read: &KeySpace, + lsn_range: &Range, + ) { + let id = match layer_to_read { + ReadableLayer::PersistentLayer(layer) => { + ReadPathLayerId::PersistentLayer(layer.layer_desc().key()) + } + ReadableLayer::InMemoryLayer(layer) => { + ReadPathLayerId::InMemoryLayer(layer.get_lsn_range()) + } + }; + self.path + .push((id, keyspace_to_read.clone(), lsn_range.clone())); + } +} + +impl std::fmt::Display for ReadPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Read path for {} at lsn {}:", self.keyspace, self.lsn)?; + for (idx, (layer_id, keyspace, lsn_range)) in self.path.iter().enumerate() { + writeln!( + f, + "{}: {} {}..{} {}", + idx, layer_id, lsn_range.start, lsn_range.end, keyspace + )?; + } + Ok(()) + } +} + #[derive(thiserror::Error)] pub struct MissingKeyError { key: Key, @@ -604,6 +682,8 @@ pub struct MissingKeyError { cont_lsn: Lsn, request_lsn: Lsn, ancestor_lsn: Option, + /// Debug information about the read path if there's an error + read_path: Option, backtrace: Option, } @@ -620,10 +700,15 @@ impl std::fmt::Display for MissingKeyError { "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}", self.key, self.shard, self.cont_lsn, self.request_lsn )?; + if let Some(ref ancestor_lsn) = self.ancestor_lsn { write!(f, ", ancestor {}", ancestor_lsn)?; } + if let Some(ref read_path) = self.read_path { + write!(f, "\n{}", read_path)?; + } + if let Some(ref backtrace) = self.backtrace { write!(f, "\n{}", backtrace)?; } @@ -773,53 +858,44 @@ pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, ForceL0Compaction, + OnlyL0Compaction, EnhancedGcBottomMostCompaction, DryRun, + /// Disables compaction yielding e.g. due to high L0 count. This is set e.g. when requesting + /// compaction via HTTP API. + NoYield, } #[serde_with::serde_as] #[derive(Debug, Clone, serde::Deserialize)] pub(crate) struct CompactRequest { - pub compact_range: Option, - pub compact_below_lsn: Option, + pub compact_key_range: Option, + pub compact_lsn_range: Option, /// Whether the compaction job should be scheduled. #[serde(default)] pub scheduled: bool, /// Whether the compaction job should be split across key ranges. #[serde(default)] pub sub_compaction: bool, -} - -#[serde_with::serde_as] -#[derive(Debug, Clone, serde::Deserialize)] -pub(crate) struct CompactRange { - #[serde_as(as = "serde_with::DisplayFromStr")] - pub start: Key, - #[serde_as(as = "serde_with::DisplayFromStr")] - pub end: Key, -} - -impl From> for CompactRange { - fn from(range: Range) -> Self { - CompactRange { - start: range.start, - end: range.end, - } - } + /// Max job size for each subcompaction job. 
+ pub sub_compaction_max_job_size_mb: Option, } #[derive(Debug, Clone, Default)] pub(crate) struct CompactOptions { pub flags: EnumSet, /// If set, the compaction will only compact the key range specified by this option. - /// This option is only used by GC compaction. - pub compact_range: Option, - /// If set, the compaction will only compact the LSN below this value. - /// This option is only used by GC compaction. - pub compact_below_lsn: Option, + /// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`]. + pub compact_key_range: Option, + /// If set, the compaction will only compact the LSN within this value. + /// This option is only used by GC compaction. For the full explanation, see [`compaction::GcCompactJob`]. + pub compact_lsn_range: Option, /// Enable sub-compaction (split compaction job across key ranges). /// This option is only used by GC compaction. pub sub_compaction: bool, + /// Set job size for the GC compaction. + /// This option is only used by GC compaction. + pub sub_compaction_max_job_size_mb: Option, } impl std::fmt::Debug for Timeline { @@ -909,10 +985,17 @@ impl From for PageReconstructError { } } +pub(crate) enum WaitLsnTimeout { + Custom(Duration), + // Use the [`PageServerConf::wait_lsn_timeout`] default + Default, +} + pub(crate) enum WaitLsnWaiter<'a> { Timeline(&'a Timeline), Tenant, PageService, + HttpEndpoint, } /// Argument to [`Timeline::shutdown`]. @@ -933,9 +1016,16 @@ pub(crate) enum ShutdownMode { Hard, } -struct ImageLayerCreationOutcome { - image: Option, - next_start_key: Key, +enum ImageLayerCreationOutcome { + /// We generated an image layer + Generated { + unfinished_image_layer: ImageLayerWriter, + }, + /// The key range is empty + Empty, + /// (Only used in metadata image layer creation), after reading the metadata keys, we decide to skip + /// the image layer creation. + Skip, } /// Public interface functions @@ -969,9 +1059,15 @@ impl Timeline { (history, gc_info.within_ancestor_pitr) } - /// Lock and get timeline's GC cutoff - pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { - self.latest_gc_cutoff_lsn.read() + /// Read timeline's GC cutoff: this is the LSN at which GC has started to happen + pub(crate) fn get_applied_gc_cutoff_lsn(&self) -> RcuReadGuard { + self.applied_gc_cutoff_lsn.read() + } + + /// Read timeline's planned GC cutoff: this is the logical end of history that users + /// are allowed to read (based on configured PITR), even if physically we have more history. + pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn { + self.gc_info.read().unwrap().cutoffs.time } /// Look up given page version. @@ -1011,9 +1107,7 @@ impl Timeline { ranges: vec![key..key.next()], }; - // Initialise the reconstruct state for the key with the cache - // entry returned above. 
- let mut reconstruct_state = ValuesReconstructState::new(); + let mut reconstruct_state = ValuesReconstructState::new(IoConcurrency::sequential()); let vectored_res = self .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) @@ -1041,12 +1135,13 @@ impl Timeline { request_lsn: lsn, ancestor_lsn: None, backtrace: None, + read_path: None, })), } } pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; - pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; + pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100; /// Look up multiple page versions at a given LSN /// @@ -1056,6 +1151,7 @@ impl Timeline { &self, keyspace: KeySpace, lsn: Lsn, + io_concurrency: super::storage_layer::IoConcurrency, ctx: &RequestContext, ) -> Result>, GetVectoredError> { if !lsn.is_valid() { @@ -1090,7 +1186,7 @@ impl Timeline { .get_vectored_impl( keyspace.clone(), lsn, - &mut ValuesReconstructState::new(), + &mut ValuesReconstructState::new(io_concurrency), ctx, ) .await; @@ -1115,6 +1211,7 @@ impl Timeline { keyspace: KeySpace, lsn: Lsn, ctx: &RequestContext, + io_concurrency: super::storage_layer::IoConcurrency, ) -> Result>, GetVectoredError> { if !lsn.is_valid() { return Err(GetVectoredError::InvalidLsn(lsn)); @@ -1146,7 +1243,7 @@ impl Timeline { .get_vectored_impl( keyspace.clone(), lsn, - &mut ValuesReconstructState::default(), + &mut ValuesReconstructState::new(io_concurrency), ctx, ) .await; @@ -1165,64 +1262,92 @@ impl Timeline { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - let get_kind = if keyspace.total_raw_size() == 1 { - GetKind::Singular + let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() { + Some(ReadPath::new(keyspace.clone(), lsn)) } else { - GetKind::Vectored + None + }; + reconstruct_state.read_path = read_path; + + let traversal_res: Result<(), _> = self + .get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) + .await; + if let Err(err) = traversal_res { + // Wait for all the spawned IOs to complete. + // See comments on `spawn_io` inside `storage_layer` for more details. 
+ let mut collect_futs = std::mem::take(&mut reconstruct_state.keys) + .into_values() + .map(|state| state.collect_pending_ios()) + .collect::>(); + while collect_futs.next().await.is_some() {} + return Err(err); }; - let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME - .for_get_kind(get_kind) - .start_timer(); - self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) - .await?; - get_data_timer.stop_and_record(); - - let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME - .for_get_kind(get_kind) - .start_timer(); - let mut results: BTreeMap> = BTreeMap::new(); let layers_visited = reconstruct_state.get_layers_visited(); - for (key, res) in std::mem::take(&mut reconstruct_state.keys) { - match res { - Err(err) => { - results.insert(key, Err(err)); - } - Ok(state) => { - let state = ValueReconstructState::from(state); + let futs = FuturesUnordered::new(); + for (key, state) in std::mem::take(&mut reconstruct_state.keys) { + futs.push({ + let walredo_self = self.myself.upgrade().expect("&self method holds the arc"); + async move { + assert_eq!(state.situation, ValueReconstructSituation::Complete); - let reconstruct_res = self.reconstruct_value(key, lsn, state).await; - results.insert(key, reconstruct_res); + let converted = match state.collect_pending_ios().await { + Ok(ok) => ok, + Err(err) => { + return (key, Err(err)); + } + }; + DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64); + + // The walredo module expects the records to be descending in terms of Lsn. + // And we submit the IOs in that order, so there should be no need to sort here. + debug_assert!( + converted + .records + .is_sorted_by_key(|(lsn, _)| std::cmp::Reverse(*lsn)), + "{converted:?}" + ); + + ( + key, + walredo_self.reconstruct_value(key, lsn, converted).await, + ) } - } + }); } - reconstruct_timer.stop_and_record(); + + let results = futs + .collect::>>() + .await; // For aux file keys (v1 or v2) the vectored read path does not return an error // when they're missing. Instead they are omitted from the resulting btree // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { - let avg = layers_visited as f64 / results.len() as f64; - if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = + // Record the total number of layers visited towards each key in the batch. While some + // layers may not intersect with a given read, and the cost of layer visits is + // amortized across the batch, each visited layer contributes directly to the observed + // latency for every read in the batch, which is what we care about. + if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD { + static LOG_PACER: Lazy> = Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { + LOG_PACER.lock().unwrap().call(|| { + let num_keys = keyspace.total_raw_size(); + let num_pages = results.len(); tracing::info!( shard_id = %self.tenant_shard_id.shard_slug(), lsn = %lsn, - "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", - keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.", + ); }); } - // Note that this is an approximation.
Tracking the exact number of layers visited - // per key requires virtually unbounded memory usage and is inefficient - // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg); + for _ in &results { + self.metrics.layers_per_read.observe(layers_visited as f64); + LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); + } } Ok(results) @@ -1292,6 +1417,7 @@ impl Timeline { &self, lsn: Lsn, who_is_waiting: WaitLsnWaiter<'_>, + timeout: WaitLsnTimeout, ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { let state = self.current_state(); @@ -1307,13 +1433,22 @@ impl Timeline { | TaskKind::WalReceiverConnectionHandler | TaskKind::WalReceiverConnectionPoller => { let is_myself = match who_is_waiting { - WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), - WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + WaitLsnWaiter::Timeline(waiter) => { + Weak::ptr_eq(&waiter.myself, &self.myself) + } + WaitLsnWaiter::Tenant + | WaitLsnWaiter::PageService + | WaitLsnWaiter::HttpEndpoint => unreachable!( + "tenant or page_service context are not expected to have task kind {:?}", + ctx.task_kind() + ), }; if is_myself { if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here - panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock"); + panic!( + "this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock" + ); } } else { // if another timeline's is waiting for us, there's no deadlock risk because @@ -1324,13 +1459,14 @@ impl Timeline { } } + let timeout = match timeout { + WaitLsnTimeout::Custom(t) => t, + WaitLsnTimeout::Default => self.conf.wait_lsn_timeout, + }; + let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); - match self - .last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .await - { + match self.last_record_lsn.wait_for_timeout(lsn, timeout).await { Ok(()) => Ok(()), Err(e) => { use utils::seqwait::SeqWaitError::*; @@ -1341,12 +1477,12 @@ impl Timeline { drop(_timer); let walreceiver_status = self.walreceiver_status(); Err(WaitLsnError::Timeout(format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", - lsn, - self.get_last_record_lsn(), - self.get_disk_consistent_lsn(), - walreceiver_status, - ))) + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", + lsn, + self.get_last_record_lsn(), + self.get_disk_consistent_lsn(), + walreceiver_status, + ))) } } } @@ -1418,6 +1554,7 @@ impl Timeline { let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE); let mut gc_info = self.gc_info.write().unwrap(); + let planned_cutoff = gc_info.min_cutoff(); let valid_until = SystemTime::now() + length; @@ -1438,7 +1575,7 @@ impl Timeline { existing_lease.clone() } Entry::Vacant(vacant) => { - // Reject already GC-ed LSN (lsn < latest_gc_cutoff) if we are in AttachedSingle and + // Reject already GC-ed LSN if we are in AttachedSingle and // not blocked by the lsn lease deadline. 
let validate = { let conf = self.tenant_conf.load(); @@ -1447,9 +1584,20 @@ impl Timeline { }; if init || validate { - let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); if lsn < *latest_gc_cutoff_lsn { - bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + bail!( + "tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", + lsn, + *latest_gc_cutoff_lsn + ); + } + if lsn < planned_cutoff { + bail!( + "tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", + lsn, + planned_cutoff + ); } } @@ -1573,7 +1721,9 @@ impl Timeline { // This is not harmful, but it only happens in relatively rare cases where // time-based checkpoints are not happening fast enough to keep the amount of // ephemeral data within configured limits. It's a sign of stress on the system. - tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure"); + tracing::info!( + "Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure" + ); } } @@ -1636,66 +1786,97 @@ impl Timeline { cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, - ) -> Result { + ) -> Result { self.compact_with_options( cancel, CompactOptions { flags, - compact_range: None, - compact_below_lsn: None, + compact_key_range: None, + compact_lsn_range: None, sub_compaction: false, + sub_compaction_max_job_size_mb: None, }, ctx, ) .await } - /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending - /// compaction tasks. + /// Outermost timeline compaction operation; downloads needed layers. + /// + /// NB: the cancellation token is usually from a background task, but can also come from a + /// request task. pub(crate) async fn compact_with_options( self: &Arc, cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> Result { - // most likely the cancellation token is from background task, but in tests it could be the - // request task as well. + ) -> Result { + // Acquire the compaction lock and task semaphore. + // + // L0-only compaction uses a separate semaphore (if enabled) to make sure it isn't starved + // out by other background tasks (including image compaction). We request this via + // `BackgroundLoopKind::L0Compaction`. + // + // If this is a regular compaction pass, and L0-only compaction is enabled in the config, + // then we should yield for immediate L0 compaction if necessary while we're waiting for the + // background task semaphore. There's no point yielding otherwise, since we'd just end up + // right back here. 
+ let is_l0_only = options.flags.contains(CompactFlags::OnlyL0Compaction); + let semaphore_kind = match is_l0_only && self.get_compaction_l0_semaphore() { + true => BackgroundLoopKind::L0Compaction, + false => BackgroundLoopKind::Compaction, + }; + let yield_for_l0 = !is_l0_only + && self.get_compaction_l0_first() + && !options.flags.contains(CompactFlags::NoYield); - let prepare = async move { + let acquire = async move { let guard = self.compaction_lock.lock().await; - - let permit = super::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::Compaction, - ctx, - ) - .await; - + let permit = super::tasks::acquire_concurrency_permit(semaphore_kind, ctx).await; (guard, permit) }; - // this wait probably never needs any "long time spent" logging, because we already nag if - // compaction task goes over it's period (20s) which is quite often in production. let (_guard, _permit) = tokio::select! { - tuple = prepare => { tuple }, - _ = self.cancel.cancelled() => return Ok(false), - _ = cancel.cancelled() => return Ok(false), + (guard, permit) = acquire => (guard, permit), + _ = self.l0_compaction_trigger.notified(), if yield_for_l0 => { + return Ok(CompactionOutcome::YieldForL0); + } + _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Skipped), + _ = cancel.cancelled() => return Ok(CompactionOutcome::Skipped), }; let last_record_lsn = self.get_last_record_lsn(); // Last record Lsn could be zero in case the timeline was just created if !last_record_lsn.is_valid() { - warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); - return Ok(false); + warn!( + "Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}" + ); + return Ok(CompactionOutcome::Skipped); } - match self.get_compaction_algorithm_settings().kind { + let result = match self.get_compaction_algorithm_settings().kind { CompactionAlgorithm::Tiered => { self.compact_tiered(cancel, ctx).await?; - Ok(false) + Ok(CompactionOutcome::Done) } CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await, - } + }; + + // Signal compaction failure to avoid L0 flush stalls when it's broken. + match result { + Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), + Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => { + self.compaction_failed.store(true, AtomicOrdering::Relaxed) + } + // Don't change the current value on offload failure or shutdown. We don't want to + // abruptly stall nor resume L0 flushes in these cases. + Err(CompactionError::Offload(_)) => {} + Err(CompactionError::ShuttingDown) => {} + Err(CompactionError::AlreadyRunning(_)) => {} + }; + + result } /// Mutate the timeline with a [`TimelineWriter`]. 
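To illustrate the new return type: a minimal, hypothetical sketch of how a periodic caller could react to `CompactionOutcome`, assuming it reruns immediately with an L0-only pass on `YieldForL0`. The enum is re-declared locally (with only the variants referenced in the diff above) just to keep the sketch self-contained; this is not the tenant compaction loop implemented elsewhere in this PR.

// Sketch only: illustrates the scheduling decision implied by `CompactionOutcome`.
use std::time::Duration;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CompactionOutcome {
    Done,
    Skipped,
    YieldForL0,
}

/// How long to wait before the next compaction pass, given the outcome of the last one.
fn next_pass_delay(outcome: CompactionOutcome, period: Duration) -> Duration {
    match outcome {
        // L0 debt is urgent: schedule an immediate L0-only pass instead of sleeping.
        CompactionOutcome::YieldForL0 => Duration::ZERO,
        // Nothing urgent (or the pass was skipped due to cancellation/shutdown): wait a full period.
        CompactionOutcome::Done | CompactionOutcome::Skipped => period,
    }
}

fn main() {
    let period = Duration::from_secs(20);
    assert_eq!(next_pass_delay(CompactionOutcome::YieldForL0, period), Duration::ZERO);
    assert_eq!(next_pass_delay(CompactionOutcome::Done, period), period);
}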
@@ -1777,7 +1958,7 @@ impl Timeline { self.last_record_lsn.shutdown(); if let ShutdownMode::FreezeAndFlush = mode { - if let Some((open, frozen)) = self + let do_flush = if let Some((open, frozen)) = self .layers .read() .await @@ -1786,43 +1967,56 @@ impl Timeline { .ok() .filter(|(open, frozen)| *open || *frozen > 0) { - tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + if self.remote_client.is_archived() == Some(true) { + // No point flushing on shutdown for an archived timeline: it is not important + // to have it nice and fresh after our restart, and trying to flush here might + // race with trying to offload it (which also stops the flush loop) + false + } else { + tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + true + } } else { - // this is double-shutdown, ignore it - } + // this is double-shutdown, it'll be a no-op + true + }; // we shut down walreceiver above, so, we won't add anything more // to the InMemoryLayer; freeze it and wait for all frozen layers // to reach the disk & upload queue, then shut the upload queue and // wait for it to drain. - match self.freeze_and_flush().await { - Ok(_) => { - // drain the upload queue - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - self.remote_client.shutdown().await; + if do_flush { + match self.freeze_and_flush().await { + Ok(_) => { + // drain the upload queue + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? + self.remote_client.shutdown().await; + } + Err(FlushLayerError::Cancelled) => { + // this is likely the second shutdown, ignore silently. + // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + debug_assert!(self.cancel.is_cancelled()); + } + Err(e) => { + // Non-fatal. Shutdown is infallible. Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to freeze and flush: {e:#}"); + } } - Err(FlushLayerError::Cancelled) => { - // this is likely the second shutdown, ignore silently. - // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 - debug_assert!(self.cancel.is_cancelled()); - } - Err(e) => { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. - warn!("failed to freeze and flush: {e:#}"); - } - } - // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but - // we also do a final check here to ensure that the queue is empty. - if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but + // we also do a final check here to ensure that the queue is empty. 
+ if !self.remote_client.no_pending_work() { + warn!( + "still have pending work in remote upload queue, but continuing shutting down anyways" + ); + } } } @@ -1830,7 +2024,9 @@ impl Timeline { // drain the upload queue self.remote_client.shutdown().await; if !self.remote_client.no_pending_work() { - warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + warn!( + "still have pending work in remote upload queue, but continuing shutting down anyways" + ); } } @@ -1838,6 +2034,11 @@ impl Timeline { tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); + // If we have a background task downloading heatmap layers stop it. + // The background downloads are sensitive to timeline cancellation (done above), + // so the drain will be immediate. + self.stop_and_drain_heatmap_layers_download().await; + // Ensure Prevent new page service requests from starting. self.handles.shutdown(); @@ -1987,8 +2188,16 @@ impl Timeline { pub(crate) async fn download_layer( &self, layer_file_name: &LayerName, - ) -> anyhow::Result> { - let Some(layer) = self.find_layer(layer_file_name).await? else { + ) -> Result, super::storage_layer::layer::DownloadError> { + let Some(layer) = self + .find_layer(layer_file_name) + .await + .map_err(|e| match e { + layer_manager::Shutdown => { + super::storage_layer::layer::DownloadError::TimelineShutdown + } + })? + else { return Ok(None); }; @@ -2124,6 +2333,13 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } + fn get_compaction_period(&self) -> Duration { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_period + .unwrap_or(self.conf.default_tenant_conf.compaction_period) + } + fn get_compaction_target_size(&self) -> u64 { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2140,6 +2356,123 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub(crate) fn get_rel_size_v2_enabled(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .rel_size_v2_enabled + .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) + } + + fn get_compaction_upper_limit(&self) -> usize { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .compaction_upper_limit + .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) + } + + pub fn get_compaction_l0_first(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_l0_first + .unwrap_or(self.conf.default_tenant_conf.compaction_l0_first) + } + + pub fn get_compaction_l0_semaphore(&self) -> bool { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_l0_semaphore + .unwrap_or(self.conf.default_tenant_conf.compaction_l0_semaphore) + } + + fn get_l0_flush_delay_threshold(&self) -> Option { + // Disable L0 flushes by default. This and compaction needs further tuning. + const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3 + + // If compaction is disabled, don't delay. + if self.get_compaction_period() == Duration::ZERO { + return None; + } + + let compaction_threshold = self.get_compaction_threshold(); + let tenant_conf = self.tenant_conf.load(); + let l0_flush_delay_threshold = tenant_conf + .tenant_conf + .l0_flush_delay_threshold + .or(self.conf.default_tenant_conf.l0_flush_delay_threshold) + .unwrap_or(DEFAULT_L0_FLUSH_DELAY_FACTOR * compaction_threshold); + + // 0 disables backpressure. 
+ if l0_flush_delay_threshold == 0 { + return None; + } + + // Clamp the flush delay threshold to the compaction threshold; it doesn't make sense to + // backpressure flushes below this. + // TODO: the tenant config should have validation to prevent this instead. + debug_assert!(l0_flush_delay_threshold >= compaction_threshold); + Some(max(l0_flush_delay_threshold, compaction_threshold)) + } + + fn get_l0_flush_stall_threshold(&self) -> Option { + // Disable L0 stalls by default. In ingest benchmarks, we see image compaction take >10 + // minutes, blocking L0 compaction, and we can't stall L0 flushes for that long. + const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 0; // TODO: default to e.g. 5 + + // If compaction is disabled, don't stall. + if self.get_compaction_period() == Duration::ZERO { + return None; + } + + // If compaction is failing, don't stall and try to keep the tenant alive. This may not be a + // good idea: read amp can grow unbounded, leading to terrible performance, and we may take + // on unbounded compaction debt that can take a long time to fix once compaction comes back + // online. At least we'll delay flushes, slowing down the growth and buying some time. + if self.compaction_failed.load(AtomicOrdering::Relaxed) { + return None; + } + + let compaction_threshold = self.get_compaction_threshold(); + let tenant_conf = self.tenant_conf.load(); + let l0_flush_stall_threshold = tenant_conf + .tenant_conf + .l0_flush_stall_threshold + .or(self.conf.default_tenant_conf.l0_flush_stall_threshold); + + // Tests sometimes set compaction_threshold=1 to generate lots of layer files, and don't + // handle the 20-second compaction delay. Some (e.g. `test_backward_compatibility`) can't + // easily adjust the L0 backpressure settings, so just disable stalls in this case. + if cfg!(feature = "testing") + && compaction_threshold == 1 + && l0_flush_stall_threshold.is_none() + { + return None; + } + + let l0_flush_stall_threshold = l0_flush_stall_threshold + .unwrap_or(DEFAULT_L0_FLUSH_STALL_FACTOR * compaction_threshold); + + // 0 disables backpressure. + if l0_flush_stall_threshold == 0 { + return None; + } + + // Clamp the flush stall threshold to the compaction threshold; it doesn't make sense to + // backpressure flushes below this. + // TODO: the tenant config should have validation to prevent this instead. 
+ debug_assert!(l0_flush_stall_threshold >= compaction_threshold); + Some(max(l0_flush_stall_threshold, compaction_threshold)) + } + + fn get_l0_flush_wait_upload(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .l0_flush_wait_upload + .unwrap_or(self.conf.default_tenant_conf.l0_flush_wait_upload) + } + fn get_image_creation_threshold(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2187,6 +2520,43 @@ impl Timeline { ) } + fn get_gc_compaction_settings(&self) -> GcCompactionCombinedSettings { + let tenant_conf = &self.tenant_conf.load(); + let gc_compaction_enabled = tenant_conf + .tenant_conf + .gc_compaction_enabled + .unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled); + let gc_compaction_initial_threshold_kb = tenant_conf + .tenant_conf + .gc_compaction_initial_threshold_kb + .unwrap_or( + self.conf + .default_tenant_conf + .gc_compaction_initial_threshold_kb, + ); + let gc_compaction_ratio_percent = tenant_conf + .tenant_conf + .gc_compaction_ratio_percent + .unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent); + GcCompactionCombinedSettings { + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + } + } + + fn get_image_creation_preempt_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .image_creation_preempt_threshold + .unwrap_or( + self.conf + .default_tenant_conf + .image_creation_preempt_threshold, + ) + } + /// Resolve the effective WAL receiver protocol to use for this tenant. /// /// Priority order is: @@ -2241,6 +2611,7 @@ impl Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, metadata: &TimelineMetadata, + previous_heatmap: Option, ancestor: Option>, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -2252,6 +2623,7 @@ impl Timeline { state: TimelineState, attach_wal_lag_cooldown: Arc>, create_idempotency: crate::tenant::CreateTimelineIdempotency, + gc_compaction_state: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2296,6 +2668,7 @@ impl Timeline { shard_identity, pg_version, layers: Default::default(), + gc_compaction_layer_update_lock: tokio::sync::RwLock::new(()), walredo_mgr, walreceiver: Mutex::new(None), @@ -2309,6 +2682,8 @@ impl Timeline { }), disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), + gc_compaction_state: ArcSwap::new(Arc::new(gc_compaction_state)), + last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), last_freeze_ts: RwLock::new(Instant::now()), @@ -2322,9 +2697,11 @@ impl Timeline { query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new( &tenant_shard_id, &timeline_id, + resources.pagestream_throttle_metrics, ), directory_metrics: array::from_fn(|_| AtomicU64::new(0)), + directory_metrics_inited: array::from_fn(|_| AtomicBool::new(false)), flush_loop_state: Mutex::new(FlushLoopState::NotStarted), @@ -2335,7 +2712,11 @@ impl Timeline { gc_info: std::sync::RwLock::new(GcInfo::default()), - latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), + last_image_layer_creation_status: ArcSwap::new(Arc::new( + LastImageLayerCreationStatus::default(), + )), + + applied_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), current_logical_size: if disk_consistent_lsn.is_valid() { @@ -2347,7 +2728,8 @@ impl Timeline { // initial logical size is 0. 
LogicalSize::empty_initial() }, - partitioning: tokio::sync::Mutex::new(( + + partitioning: GuardArcSwap::new(( (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()), Lsn(0), )), @@ -2374,6 +2756,8 @@ impl Timeline { gate: Gate::default(), compaction_lock: tokio::sync::Mutex::default(), + compaction_failed: AtomicBool::default(), + l0_compaction_trigger: resources.l0_compaction_trigger, gc_lock: tokio::sync::Mutex::default(), standby_horizon: AtomicLsn::new(0), @@ -2392,6 +2776,12 @@ impl Timeline { attach_wal_lag_cooldown, create_idempotency, + + page_trace: Default::default(), + + previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap), + + heatmap_layers_downloader: Mutex::new(None), }; result.repartition_threshold = @@ -2421,7 +2811,7 @@ impl Timeline { return; } FlushLoopState::Exited => { - warn!( + info!( "ignoring attempt to restart exited flush_loop {}/{}", self.tenant_shard_id, self.timeline_id ); @@ -2458,6 +2848,20 @@ impl Timeline { ); } + pub(crate) fn update_gc_compaction_state( + &self, + gc_compaction_state: GcCompactionState, + ) -> anyhow::Result<()> { + self.gc_compaction_state + .store(Arc::new(Some(gc_compaction_state.clone()))); + self.remote_client + .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state) + } + + pub(crate) fn get_gc_compaction_state(&self) -> Option { + self.gc_compaction_state.load_full().as_ref().clone() + } + /// Creates and starts the wal receiver. /// /// This function is expected to be called at most once per Timeline's lifecycle @@ -2501,6 +2905,7 @@ impl Timeline { auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), availability_zone: self.conf.availability_zone.clone(), ingest_batch_size: self.conf.ingest_batch_size, + validate_wal_contiguity: self.conf.validate_wal_contiguity, }, broker_client, ctx, @@ -2525,8 +2930,9 @@ impl Timeline { disk_consistent_lsn: Lsn, index_part: IndexPart, ) -> anyhow::Result<()> { - use init::{Decision::*, Discovered, DismissedLayer}; use LayerName::*; + use init::Decision::*; + use init::{Discovered, DismissedLayer}; let mut guard = self.layers.write().await; @@ -2741,11 +3147,15 @@ impl Timeline { } TimelineState::Loading => { // Import does not return an activated timeline. - info!("discarding priority boost for logical size calculation because timeline is not yet active"); + info!( + "discarding priority boost for logical size calculation because timeline is not yet active" + ); } TimelineState::Active => { // activation should be setting the once cell - warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + warn!( + "unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work" + ); debug_assert!(false); } } @@ -2805,12 +3215,10 @@ impl Timeline { "initial size calculation", // NB: don't log errors here, task_mgr will do that. async move { - let cancel = task_mgr::shutdown_token(); self_clone .initial_logical_size_calculation_task( initial_part_end, cancel_wait_for_background_loop_concurrency_limit_semaphore, - cancel, background_ctx, ) .await; @@ -2820,11 +3228,21 @@ impl Timeline { ); } + /// # Cancellation + /// + /// This method is sensitive to `Timeline::cancel`. + /// + /// It is _not_ sensitive to task_mgr::shutdown_token(). 
+ /// + /// # Cancel-Safety + /// + /// It does Timeline IO, hence this should be polled to completion because + /// we could be leaving in-flight IOs behind, which is safe, but annoying + /// to reason about. async fn initial_logical_size_calculation_task( self: Arc, initial_part_end: Lsn, skip_concurrency_limiter: CancellationToken, - cancel: CancellationToken, background_ctx: RequestContext, ) { scopeguard::defer! { @@ -2837,8 +3255,7 @@ impl Timeline { let self_ref = &self; let skip_concurrency_limiter = &skip_concurrency_limiter; async move { - let cancel = task_mgr::shutdown_token(); - let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit( + let wait_for_permit = super::tasks::acquire_concurrency_permit( BackgroundLoopKind::InitialLogicalSizeCalculation, background_ctx, ); @@ -2851,9 +3268,6 @@ impl Timeline { _ = self_ref.cancel.cancelled() => { return Err(CalculateLogicalSizeError::Cancelled); } - _ = cancel.cancelled() => { - return Err(CalculateLogicalSizeError::Cancelled); - }, () = skip_concurrency_limiter.cancelled() => { // Some action that is part of a end user interaction requested logical size // => break out of the rate limit @@ -2870,6 +3284,14 @@ impl Timeline { crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances) }; + let io_concurrency = IoConcurrency::spawn_from_conf( + self_ref.conf, + self_ref + .gate + .enter() + .map_err(|_| CalculateLogicalSizeError::Cancelled)?, + ); + let calculated_size = self_ref .logical_size_calculation_task( initial_part_end, @@ -2879,7 +3301,11 @@ impl Timeline { .await?; self_ref - .trigger_aux_file_size_computation(initial_part_end, background_ctx) + .trigger_aux_file_size_computation( + initial_part_end, + background_ctx, + io_concurrency, + ) .await?; // TODO: add aux file size to logical size @@ -2912,22 +3338,18 @@ impl Timeline { ) .expect("10min < 1hour"), ); - tokio::time::sleep(sleep_duration).await; + tokio::select! { + _ = tokio::time::sleep(sleep_duration) => {} + _ = self.cancel.cancelled() => return ControlFlow::Break(()), + } } } } }; - let (calculated_size, metrics_guard) = tokio::select! { - res = retrying => { - match res { - ControlFlow::Continue(calculated_size) => calculated_size, - ControlFlow::Break(()) => return, - } - } - _ = cancel.cancelled() => { - return; - } + let (calculated_size, metrics_guard) = match retrying.await { + ControlFlow::Continue(calculated_size) => calculated_size, + ControlFlow::Break(()) => return, }; // we cannot query current_logical_size.current_size() to know the current @@ -2983,9 +3405,6 @@ impl Timeline { receiver } - /// # Cancel-Safety - /// - /// This method is cancellation-safe. #[instrument(skip_all)] async fn logical_size_calculation_task( self: &Arc, @@ -3003,32 +3422,13 @@ impl Timeline { .enter() .map_err(|_| CalculateLogicalSizeError::Cancelled)?; - let self_calculation = Arc::clone(self); - - let mut calculation = pin!(async { - let ctx = ctx.attached_child(); - self_calculation - .calculate_logical_size(lsn, cause, &guard, &ctx) - .await - }); - - tokio::select! { - res = &mut calculation => { res } - _ = self.cancel.cancelled() => { - debug!("cancelling logical size calculation for timeline shutdown"); - calculation.await - } - } + self.calculate_logical_size(lsn, cause, &guard, ctx).await } /// Calculate the logical size of the database at the latest LSN. /// /// NOTE: counted incrementally, includes ancestors. This can be a slow operation, /// especially if we need to download remote layers. 
- /// - /// # Cancel-Safety - /// - /// This method is cancellation-safe. async fn calculate_logical_size( &self, up_to_lsn: Lsn, @@ -3041,7 +3441,10 @@ impl Timeline { self.timeline_id, up_to_lsn ); - pausable_failpoint!("timeline-calculate-logical-size-pause"); + if let Err(()) = pausable_failpoint!("timeline-calculate-logical-size-pause", &self.cancel) + { + return Err(CalculateLogicalSizeError::Cancelled); + } // See if we've already done the work for initial size calculation. // This is a short-cut for timelines that are mostly unused. @@ -3087,8 +3490,42 @@ impl Timeline { } } - pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) { - self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) { + // TODO: this directory metric is not correct -- we could have multiple reldirs in the system + // for each database, but we only store one value, and therefore each pgdirmodification + // would overwrite the previous value if they modify different databases. + + match count { + MetricsUpdate::Set(count) => { + self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + self.directory_metrics_inited[kind.offset()].store(true, AtomicOrdering::Relaxed); + } + MetricsUpdate::Add(count) => { + // TODO: these operations are not atomic; but we only have one writer to the metrics, so + // it's fine. + if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) { + // The metric has been initialized with `MetricsUpdate::Set` before, so we can add/sub + // the value reliably. + self.directory_metrics[kind.offset()].fetch_add(count, AtomicOrdering::Relaxed); + } + // Otherwise, ignore this update + } + MetricsUpdate::Sub(count) => { + // TODO: these operations are not atomic; but we only have one writer to the metrics, so + // it's fine. + if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) { + // The metric has been initialized with `MetricsUpdate::Set` before. + // The operation could overflow so we need to normalize the value. + let prev_val = + self.directory_metrics[kind.offset()].load(AtomicOrdering::Relaxed); + let res = prev_val.saturating_sub(count); + self.directory_metrics[kind.offset()].store(res, AtomicOrdering::Relaxed); + } + // Otherwise, ignore this update + } + }; + + // TODO: remove this, there's no place in the code that updates this aux metric. let aux_metric = self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed); @@ -3123,6 +3560,14 @@ impl Timeline { Ok(layer) } + pub(super) fn is_previous_heatmap_active(&self) -> bool { + self.previous_heatmap + .load() + .as_ref() + .map(|prev| matches!(**prev, PreviousHeatmap::Active { .. })) + .unwrap_or(false) + } + /// The timeline heatmap is a hint to secondary locations from the primary location, /// indicating which layers are currently on-disk on the primary. /// @@ -3137,12 +3582,52 @@ impl Timeline { let guard = self.layers.read().await; + // Firstly, if there's any heatmap left over from when this location + // was a secondary, take that into account. Keep layers that are: + // * present in the layer map + // * visible + // * non-resident + // * not evicted since we read the heatmap + // + // Without this, a new cold, attached location would clobber the previous + // heatmap.
+ let previous_heatmap = self.previous_heatmap.load(); + let visible_non_resident = match previous_heatmap.as_deref() { + Some(PreviousHeatmap::Active { heatmap, read_at }) => { + Some(heatmap.layers.iter().filter_map(|hl| { + let desc: PersistentLayerDesc = hl.name.clone().into(); + let layer = guard.try_get_from_key(&desc.key())?; + + if layer.visibility() == LayerVisibilityHint::Covered { + return None; + } + + if layer.is_likely_resident() { + return None; + } + + if layer.last_evicted_at().happened_after(*read_at) { + return None; + } + + Some((desc, hl.metadata.clone(), hl.access_time)) + })) + } + Some(PreviousHeatmap::Obsolete) => None, + None => None, + }; + + // Secondly, all currently visible, resident layers are included. let resident = guard.likely_resident_layers().filter_map(|layer| { match layer.visibility() { LayerVisibilityHint::Visible => { // Layer is visible to one or more read LSNs: elegible for inclusion in layer map let last_activity_ts = layer.latest_activity(); - Some((layer.layer_desc(), layer.metadata(), last_activity_ts)) + Some(( + layer.layer_desc().clone(), + layer.metadata(), + last_activity_ts, + )) } LayerVisibilityHint::Covered => { // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap. @@ -3151,7 +3636,19 @@ impl Timeline { } }); - let mut layers = resident.collect::>(); + let mut layers = match visible_non_resident { + Some(non_resident) => { + let mut non_resident = non_resident.peekable(); + if non_resident.peek().is_none() { + tracing::info!(timeline_id=%self.timeline_id, "Previous heatmap now obsolete"); + self.previous_heatmap + .store(Some(PreviousHeatmap::Obsolete.into())); + } + + non_resident.chain(resident).collect::>() + } + None => resident.collect::>(), + }; // Sort layers in order of which to download first. For a large set of layers to download, we // want to prioritize those layers which are most likely to still be in the resident many minutes @@ -3175,6 +3672,36 @@ impl Timeline { Some(HeatMapTimeline::new(self.timeline_id, layers)) } + pub(super) async fn generate_unarchival_heatmap(&self, end_lsn: Lsn) -> PreviousHeatmap { + let guard = self.layers.read().await; + + let now = SystemTime::now(); + let mut heatmap_layers = Vec::default(); + for vl in guard.visible_layers() { + if vl.layer_desc().get_lsn_range().start >= end_lsn { + continue; + } + + let hl = HeatMapLayer { + name: vl.layer_desc().layer_name(), + metadata: vl.metadata(), + access_time: now, + }; + heatmap_layers.push(hl); + } + + tracing::info!( + "Generating unarchival heatmap with {} layers", + heatmap_layers.len() + ); + + let heatmap = HeatMapTimeline::new(self.timeline_id, heatmap_layers); + PreviousHeatmap::Active { + heatmap, + read_at: Instant::now(), + } + } + /// Returns true if the given lsn is or was an ancestor branchpoint. pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool { // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original @@ -3233,7 +3760,7 @@ impl Timeline { // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid // stalling compaction. keyspace.remove_overlapping_with(&KeySpace { - ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], + ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()], }); // Keyspace is fully retrieved @@ -3254,7 +3781,13 @@ impl Timeline { // keys from `keyspace`, we expect there to be no overlap between it and the image covered key // space. 
If that's not the case, we had at least one key encounter a gap in the image layer // and stop the search as a result of that. - let removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + // Do not fire missing key error and end early for sparse keys. Note that we have already removed + // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of + // figuring out what is the inherited key range and do a fine-grained pruning. + removed.remove_overlapping_with(&KeySpace { + ranges: vec![SPARSE_RANGE], + }); if !removed.is_empty() { break Some(removed); } @@ -3269,6 +3802,21 @@ impl Timeline { timeline = &*timeline_owned; }; + // Remove sparse keys from the keyspace so that it doesn't fire errors. + let missing_keyspace = if let Some(missing_keyspace) = missing_keyspace { + let mut missing_keyspace = missing_keyspace; + missing_keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![SPARSE_RANGE], + }); + if missing_keyspace.is_empty() { + None + } else { + Some(missing_keyspace) + } + } else { + None + }; + if let Some(missing_keyspace) = missing_keyspace { return Err(GetVectoredError::MissingKey(MissingKeyError { key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ @@ -3279,6 +3827,7 @@ impl Timeline { request_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), backtrace: None, + read_path: std::mem::take(&mut reconstruct_state.read_path), })); } @@ -3315,6 +3864,16 @@ impl Timeline { let mut completed_keyspace = KeySpace::default(); let mut image_covered_keyspace = KeySpaceRandomAccum::new(); + // Prevent GC from progressing while visiting the current timeline. + // If we are GC-ing because a new image layer was added while traversing + // the timeline, then it will remove layers that are required for fulfilling + // the current get request (read-path cannot "look back" and notice the new + // image layer). + let _gc_cutoff_holder = timeline.get_applied_gc_cutoff_lsn(); + + // See `compaction::compact_with_gc` for why we need this. + let _guard = timeline.gc_compaction_layer_update_lock.read().await; + loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } @@ -3387,6 +3946,9 @@ impl Timeline { } if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { + if let Some(ref mut read_path) = reconstruct_state.read_path { + read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range); + } let next_cont_lsn = lsn_range.start; layer_to_read .get_values_reconstruct_data( @@ -3455,7 +4017,12 @@ impl Timeline { } } ancestor - .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx) + .wait_lsn( + self.ancestor_lsn, + WaitLsnWaiter::Timeline(self), + WaitLsnTimeout::Default, + ctx, + ) .await .map_err(|e| match e { e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), @@ -3538,7 +4105,7 @@ impl Timeline { let mut guard = self.layers.write().await; guard .open_mut()? - .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) + .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock, &self.metrics) .await }; @@ -3575,6 +4142,12 @@ impl Timeline { mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>, ctx: &RequestContext, ) { + // Subscribe to L0 delta layer updates, for compaction backpressure.
+ let mut watch_l0 = match self.layers.read().await.layer_map() { + Ok(lm) => lm.watch_level0_deltas(), + Err(Shutdown) => return, + }; + info!("started flush loop"); loop { tokio::select! { @@ -3599,43 +4172,68 @@ impl Timeline { return; } - let timer = self.metrics.flush_time_histo.start_timer(); + // Break to notify potential waiters as soon as we've flushed the requested LSN. If + // more requests have arrived in the meanwhile, we'll resume flushing afterwards. + if flushed_to_lsn >= frozen_to_lsn { + break Ok(()); + } - let num_frozen_layers; - let frozen_layer_total_size; - let layer_to_flush = { - let guard = self.layers.read().await; - let Ok(lm) = guard.layer_map() else { + // Fetch the next layer to flush, if any. + let (layer, l0_count, frozen_count, frozen_size) = { + let layers = self.layers.read().await; + let Ok(lm) = layers.layer_map() else { info!("dropping out of flush loop for timeline shutdown"); return; }; - num_frozen_layers = lm.frozen_layers.len(); - frozen_layer_total_size = lm + let l0_count = lm.level0_deltas().len(); + let frozen_count = lm.frozen_layers.len(); + let frozen_size: u64 = lm .frozen_layers .iter() .map(|l| l.estimated_in_mem_size()) - .sum::(); - lm.frozen_layers.front().cloned() - // drop 'layers' lock to allow concurrent reads and writes + .sum(); + let layer = lm.frozen_layers.front().cloned(); + (layer, l0_count, frozen_count, frozen_size) + // drop 'layers' lock }; - let Some(layer_to_flush) = layer_to_flush else { + let Some(layer) = layer else { break Ok(()); }; - if num_frozen_layers - > std::cmp::max( - self.get_compaction_threshold(), - DEFAULT_COMPACTION_THRESHOLD, - ) - && frozen_layer_total_size >= /* 128 MB */ 128000000 - { - tracing::warn!( - "too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes", - ); - } - match self.flush_frozen_layer(layer_to_flush, ctx).await { - Ok(this_layer_to_lsn) => { - flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn); + + // Stall flushes to backpressure if compaction can't keep up. This is propagated up + // to WAL ingestion by having ephemeral layer rolls wait for flushes. + // + // NB: the compaction loop only checks `compaction_threshold` every 20 seconds, so + // we can end up stalling before compaction even starts. Consider making it more + // responsive (e.g. via `watch_level0_deltas`). + if let Some(stall_threshold) = self.get_l0_flush_stall_threshold() { + if l0_count >= stall_threshold { + warn!( + "stalling layer flushes for compaction backpressure at {l0_count} \ + L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)" + ); + let stall_timer = self + .metrics + .flush_delay_histo + .start_timer() + .record_on_drop(); + tokio::select! { + result = watch_l0.wait_for(|l0| *l0 < stall_threshold) => { + if let Ok(l0) = result.as_deref() { + let delay = stall_timer.elapsed().as_secs_f64(); + info!("resuming layer flushes at {l0} L0 layers after {delay:.3}s"); + } + }, + _ = self.cancel.cancelled() => {}, + } + continue; // check again } + } + + // Flush the layer. 
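The stall above is built on tokio's watch channel: the flush side parks until the published L0 count drops back below the threshold, or until shutdown. A minimal, self-contained sketch of that pattern (plain tokio types rather than the pageserver structs; `stall_threshold` is an arbitrary illustrative value):

```rust
use tokio::sync::watch;
use tokio_util::sync::CancellationToken;

/// Park until the published L0 count drops below `stall_threshold`,
/// or until `cancel` fires (shutdown).
async fn stall_until_compacted(
    mut l0_count: watch::Receiver<usize>,
    stall_threshold: usize,
    cancel: CancellationToken,
) {
    if *l0_count.borrow() < stall_threshold {
        return; // no backpressure needed right now
    }
    tokio::select! {
        // Resolves as soon as the sender publishes a value below the threshold.
        res = l0_count.wait_for(|l0| *l0 < stall_threshold) => {
            if let Ok(l0) = res.as_deref() {
                tracing::info!("resuming at {l0} L0 layers");
            }
        }
        // Do not stall forever across shutdown.
        _ = cancel.cancelled() => {}
    }
}
```

The production loop additionally `continue`s after waking so it re-reads the frozen-layer queue before flushing.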
+ let flush_timer = self.metrics.flush_time_histo.start_timer(); + match self.flush_frozen_layer(layer, ctx).await { + Ok(layer_lsn) => flushed_to_lsn = max(flushed_to_lsn, layer_lsn), Err(FlushLayerError::Cancelled) => { info!("dropping out of flush loop for timeline shutdown"); return; @@ -3649,7 +4247,36 @@ impl Timeline { break err.map(|_| ()); } } - timer.stop_and_record(); + let flush_duration = flush_timer.stop_and_record(); + + // Notify the tenant compaction loop if L0 compaction is needed. + let l0_count = *watch_l0.borrow(); + if l0_count >= self.get_compaction_threshold() { + self.l0_compaction_trigger.notify_one(); + } + + // Delay the next flush to backpressure if compaction can't keep up. We delay by the + // flush duration such that the flush takes 2x as long. This is propagated up to WAL + // ingestion by having ephemeral layer rolls wait for flushes. + if let Some(delay_threshold) = self.get_l0_flush_delay_threshold() { + if l0_count >= delay_threshold { + let delay = flush_duration.as_secs_f64(); + info!( + "delaying layer flush by {delay:.3}s for compaction backpressure at \ + {l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)" + ); + let _delay_timer = self + .metrics + .flush_delay_histo + .start_timer() + .record_on_drop(); + tokio::select! { + _ = tokio::time::sleep(flush_duration) => {}, + _ = watch_l0.wait_for(|l0| *l0 < delay_threshold) => {}, + _ = self.cancel.cancelled() => {}, + } + } + } }; // Unsharded tenants should never advance their LSN beyond the end of the @@ -3668,10 +4295,14 @@ impl Timeline { // This path is only taken for tenants with multiple shards: single sharded tenants should // never encounter a gap in the wal. let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); - tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}"); + tracing::debug!( + "Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}" + ); if self.set_disk_consistent_lsn(frozen_to_lsn) { if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) { - tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}"); + tracing::warn!( + "Failed to schedule metadata upload after updating disk_consistent_lsn: {e}" + ); } } } @@ -3774,36 +4405,41 @@ impl Timeline { return Err(FlushLayerError::Cancelled); } - let mut layers_to_upload = Vec::new(); - layers_to_upload.extend( - self.create_image_layers( - &rel_partition, - self.initdb_lsn, - ImageLayerCreationMode::Initial, - ctx, - ) - .await?, - ); + // Ensure that we have a single call to `create_image_layers` with a combined dense keyspace. + // So that the key ranges don't overlap. + let mut partitions = KeyPartitioning::default(); + partitions.parts.extend(rel_partition.parts); if !metadata_partition.parts.is_empty() { assert_eq!( metadata_partition.parts.len(), 1, "currently sparse keyspace should only contain a single metadata keyspace" ); - layers_to_upload.extend( - self.create_image_layers( - // Safety: create_image_layers treat sparse keyspaces differently that it does not scan - // every single key within the keyspace, and therefore, it's safe to force converting it - // into a dense keyspace before calling this function. 
- &metadata_partition.into_dense(), - self.initdb_lsn, - ImageLayerCreationMode::Initial, - ctx, - ) - .await?, - ); + // Safety: create_image_layers treat sparse keyspaces differently that it does not scan + // every single key within the keyspace, and therefore, it's safe to force converting it + // into a dense keyspace before calling this function. + partitions + .parts + .extend(metadata_partition.into_dense().parts); } + let mut layers_to_upload = Vec::new(); + let (generated_image_layers, is_complete) = self + .create_image_layers( + &partitions, + self.initdb_lsn, + ImageLayerCreationMode::Initial, + ctx, + LastImageLayerCreationStatus::Initial, + false, // don't yield for L0, we're flushing L0 + ) + .await?; + debug_assert!( + matches!(is_complete, LastImageLayerCreationStatus::Complete), + "init image generation mode must fully cover the keyspace" + ); + layers_to_upload.extend(generated_image_layers); + (layers_to_upload, None) } else { // Normal case, write out a L0 delta layer file. @@ -3854,21 +4490,24 @@ impl Timeline { // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. // This makes us refuse ingest until the new layers have been persisted to the remote - let start = Instant::now(); - self.remote_client - .wait_completion() - .await - .map_err(|e| match e { - WaitCompletionError::UploadQueueShutDownOrStopped - | WaitCompletionError::NotInitialized( - NotInitialized::ShuttingDown | NotInitialized::Stopped, - ) => FlushLayerError::Cancelled, - WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { - FlushLayerError::Other(anyhow!(e).into()) - } - })?; - let duration = start.elapsed().as_secs_f64(); - self.metrics.flush_wait_upload_time_gauge_add(duration); + // TODO: remove this, and rely on l0_flush_{delay,stall}_threshold instead. + if self.get_l0_flush_wait_upload() { + let start = Instant::now(); + self.remote_client + .wait_completion() + .await + .map_err(|e| match e { + WaitCompletionError::UploadQueueShutDownOrStopped + | WaitCompletionError::NotInitialized( + NotInitialized::ShuttingDown | NotInitialized::Stopped, + ) => FlushLayerError::Cancelled, + WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { + FlushLayerError::Other(anyhow!(e).into()) + } + })?; + let duration = start.elapsed().as_secs_f64(); + self.metrics.flush_wait_upload_time_gauge_add(duration); + } // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. @@ -3888,7 +4527,10 @@ impl Timeline { /// This function must only be used from the layer flush task. 
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { let old_value = self.disk_consistent_lsn.fetch_max(new_value); - assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"); + assert!( + new_value >= old_value, + "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}" + ); self.metrics .disk_consistent_lsn_gauge @@ -3921,7 +4563,7 @@ impl Timeline { let update = crate::tenant::metadata::MetadataUpdate::new( disk_consistent_lsn, ondisk_prev_record_lsn, - *self.latest_gc_cutoff_lsn.read(), + *self.applied_gc_cutoff_lsn.read(), ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -4020,15 +4662,15 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> { - let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { + let Ok(mut guard) = self.partitioning.try_write_guard() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. return Err(CompactionError::Other(anyhow!( - "repartition() called concurrently, this should not happen" + "repartition() called concurrently" ))); }; - let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; + let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read(); if lsn < *partition_lsn { return Err(CompactionError::Other(anyhow!( "repartition() called with LSN going backwards, this should not happen" @@ -4051,17 +4693,20 @@ impl Timeline { )); } - let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let (dense_ks, sparse_ks) = self + .collect_keyspace(lsn, ctx) + .await + .map_err(CompactionError::CollectKeySpaceError)?; let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], }; // no partitioning for metadata keys for now - *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn); - - Ok((partitioning_guard.0.clone(), partitioning_guard.1)) + let result = ((dense_partitioning, sparse_partitioning), lsn); + guard.write(result.clone()); + Ok(result) } - // Is it time to create a new image layer for the given partition? + // Is it time to create a new image layer for the given partition? True if we want to generate. async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { let threshold = self.get_image_creation_threshold(); @@ -4116,6 +4761,7 @@ impl Timeline { /// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large, /// so that at most one image layer will be produced from this function. 
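The `fetch_max` idiom in `set_disk_consistent_lsn` above is the standard way to keep a shared watermark monotonic; a minimal sketch with a plain `AtomicU64` standing in for the pageserver's LSN wrapper:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

/// A watermark that may only move forwards.
struct Watermark(AtomicU64);

impl Watermark {
    /// Returns true if the stored value actually advanced.
    fn advance(&self, new: u64) -> bool {
        // fetch_max returns the previous value and never lowers the slot.
        let old = self.0.fetch_max(new, Ordering::AcqRel);
        assert!(new >= old, "watermark must not move backwards: {old} -> {new}");
        new > old
    }
}
```

The boolean result is what the flush loop keys off earlier in this section to decide whether a metadata upload needs to be scheduled.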
+ #[allow(clippy::too_many_arguments)] async fn create_image_layer_for_rel_blocks( self: &Arc, partition: &KeySpace, @@ -4123,7 +4769,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, img_range: Range, - start: Key, + io_concurrency: IoConcurrency, ) -> Result { let mut wrote_keys = false; @@ -4152,7 +4798,12 @@ impl Timeline { || (last_key_in_range && key_request_accum.raw_size() > 0) { let results = self - .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) + .get_vectored( + key_request_accum.consume_keyspace(), + lsn, + io_concurrency.clone(), + ctx, + ) .await?; if self.cancel.is_cancelled() { @@ -4179,7 +4830,9 @@ impl Timeline { // any metadata keys, keys, as that would lead to actual data // loss. if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() { - warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); + warn!( + "could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}" + ); ZERO_PAGE.clone() } else { return Err(CreateImageLayersError::from(err)); @@ -4198,29 +4851,30 @@ impl Timeline { if wrote_keys { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. - let (desc, path) = image_layer_writer.finish(ctx).await?; - let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; - info!("created image layer for rel {}", image_layer.local_path()); - Ok(ImageLayerCreationOutcome { - image: Some(image_layer), - next_start_key: img_range.end, + info!( + "produced image layer for rel {}", + ImageLayerName { + key_range: img_range.clone(), + lsn + }, + ); + Ok(ImageLayerCreationOutcome::Generated { + unfinished_image_layer: image_layer_writer, }) } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - Ok(ImageLayerCreationOutcome { - image: None, - next_start_key: start, - }) + Ok(ImageLayerCreationOutcome::Empty) } } /// Create an image layer for metadata keys. This function produces one image layer for all metadata /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it /// would not be too large to fit in a single image layer. + /// + /// Creating image layers for metadata keys are different from relational keys. Firstly, instead of + /// iterating each key and get an image for each of them, we do a `vectored_get` scan over the sparse + /// keyspace to get all images in one run. Secondly, we use a different image layer generation metrics + /// for metadata keys than relational keys, which is the number of delta files visited during the scan. #[allow(clippy::too_many_arguments)] async fn create_image_layer_for_metadata_keys( self: &Arc, @@ -4230,11 +4884,13 @@ impl Timeline { ctx: &RequestContext, img_range: Range, mode: ImageLayerCreationMode, - start: Key, + io_concurrency: IoConcurrency, ) -> Result { // Metadata keys image layer creation. - let mut reconstruct_state = ValuesReconstructState::default(); + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let begin = Instant::now(); + // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. 
Note that the keyspace should + // not contain too many keys, otherwise this takes a lot of memory. let data = self .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) .await?; @@ -4255,14 +4911,12 @@ impl Timeline { let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; info!( - "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", elapsed.as_secs_f64() + "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", + elapsed.as_secs_f64() ); if !trigger_generation && mode == ImageLayerCreationMode::Try { - return Ok(ImageLayerCreationOutcome { - image: None, - next_start_key: img_range.end, - }); + return Ok(ImageLayerCreationOutcome::Skip); } if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); @@ -4286,26 +4940,19 @@ impl Timeline { if wrote_any_image { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. - let (desc, path) = image_layer_writer.finish(ctx).await?; - let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?; info!( "created image layer for metadata {}", - image_layer.local_path() + ImageLayerName { + key_range: img_range.clone(), + lsn + } ); - Ok(ImageLayerCreationOutcome { - image: Some(image_layer), - next_start_key: img_range.end, + Ok(ImageLayerCreationOutcome::Generated { + unfinished_image_layer: image_layer_writer, }) } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - Ok(ImageLayerCreationOutcome { - image: None, - next_start_key: start, - }) + Ok(ImageLayerCreationOutcome::Empty) } } @@ -4361,6 +5008,8 @@ impl Timeline { decision } + /// Returns the image layers generated and an enum indicating whether the process is fully completed. + /// true = we have generate all image layers, false = we preempt the process for L0 compaction. #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4368,9 +5017,15 @@ impl Timeline { lsn: Lsn, mode: ImageLayerCreationMode, ctx: &RequestContext, - ) -> Result, CreateImageLayersError> { + last_status: LastImageLayerCreationStatus, + yield_for_l0: bool, + ) -> Result<(Vec, LastImageLayerCreationStatus), CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); - let mut image_layers = Vec::new(); + + if partitioning.parts.is_empty() { + warn!("no partitions to create image layers for"); + return Ok((vec![], LastImageLayerCreationStatus::Complete)); + } // We need to avoid holes between generated image layers. // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one @@ -4383,13 +5038,65 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. 
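The `start`-key bookkeeping implemented below is easiest to see with plain integers. A deliberately simplified sketch (`u64` stand-ins for `Key`, covering only the generated/empty outcomes of the real code):

```rust
/// Illustrative only: each produced image layer starts where the previous
/// one ended, so consecutive layers tile the keyspace with no holes even
/// when some partitions contain no data for this shard.
fn plan_image_ranges(
    partition_ends: &[u64],
    has_data: impl Fn(std::ops::Range<u64>) -> bool,
) -> Vec<std::ops::Range<u64>> {
    let mut start = 0; // Key::MIN stand-in
    let mut layers = Vec::new();
    for &end in partition_ends {
        let range = start..end;
        if has_data(range.clone()) {
            layers.push(range);
            start = end; // the next layer begins exactly where this one ended
        }
        // Empty partition: leave `start` in place so the next generated layer
        // also covers the skipped range instead of leaving a gap.
    }
    layers
}
```

With partition ends `[100, 200, 300]` and data only in the middle partition, this produces a single layer covering `0..200`, rather than leaving the `0..100` gap that the comment above warns about.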
let mut start = Key::MIN; - let check_for_image_layers = self.should_check_if_image_layers_required(lsn); + let check_for_image_layers = + if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status { + info!( + "resuming image layer creation: last_status=incomplete, continue from {}", + last_key + ); + true + } else { + self.should_check_if_image_layers_required(lsn) + }; - for partition in partitioning.parts.iter() { + let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; + + let mut all_generated = true; + + let mut partition_processed = 0; + let mut total_partitions = partitioning.parts.len(); + let mut last_partition_processed = None; + let mut partition_parts = partitioning.parts.clone(); + + if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status { + // We need to skip the partitions that have already been processed. + let mut found = false; + for (i, partition) in partition_parts.iter().enumerate() { + if last_key <= partition.end().unwrap() { + // ```plain + // |------|--------|----------|------| + // ^last_key + // ^start from this partition + // ``` + // Why `i+1` instead of `i`? + // It is possible that the user did some writes after the previous image layer creation attempt so that + // a relation grows in size, and the last_key is now in the middle of the partition. In this case, we + // still want to skip this partition, so that we can make progress and avoid generating image layers over + // the same partition. Doing a mod to ensure we don't end up with an empty vec. + if i + 1 >= total_partitions { + // In general, this case should not happen -- if last_key is on the last partition, the previous + // iteration of image layer creation should return a complete status. + break; // with found=false + } + partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements + total_partitions = partition_parts.len(); + // Update the start key to the partition start. + start = partition_parts[0].start().unwrap(); + found = true; + break; + } + } + if !found { + // Last key is within the last partition, or larger than all partitions. + return Ok((vec![], LastImageLayerCreationStatus::Complete)); + } + } + + for partition in partition_parts.iter() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); } - + partition_processed += 1; let img_range = start..partition.ranges.last().unwrap().end; let compact_metadata = partition.overlaps(&Key::metadata_key_range()); if compact_metadata { @@ -4424,6 +5131,8 @@ impl Timeline { lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), is_delta: false, }) { + // TODO: this can be processed with the BatchLayerWriter::finish_with_discard + // in the future. tracing::info!( "Skipping image layer at {lsn} {}..{}, already exists", img_range.start, @@ -4451,43 +5160,95 @@ impl Timeline { ))) }); - if !compact_metadata { - let ImageLayerCreationOutcome { - image, - next_start_key, - } = self - .create_image_layer_for_rel_blocks( - partition, - image_layer_writer, - lsn, - ctx, - img_range, - start, - ) - .await?; + let io_concurrency = IoConcurrency::spawn_from_conf( + self.conf, + self.gate + .enter() + .map_err(|_| CreateImageLayersError::Cancelled)?, + ); - start = next_start_key; - image_layers.extend(image); + let outcome = if !compact_metadata { + self.create_image_layer_for_rel_blocks( + partition, + image_layer_writer, + lsn, + ctx, + img_range.clone(), + io_concurrency, + ) + .await? 
} else { - let ImageLayerCreationOutcome { - image, - next_start_key, - } = self - .create_image_layer_for_metadata_keys( - partition, - image_layer_writer, + self.create_image_layer_for_metadata_keys( + partition, + image_layer_writer, + lsn, + ctx, + img_range.clone(), + mode, + io_concurrency, + ) + .await? + }; + match outcome { + ImageLayerCreationOutcome::Empty => { + // No data in this partition, so we don't need to create an image layer (for now). + // The next image layer should cover this key range, so we don't advance the `start` + // key. + } + ImageLayerCreationOutcome::Generated { + unfinished_image_layer, + } => { + batch_image_writer.add_unfinished_image_writer( + unfinished_image_layer, + img_range.clone(), lsn, - ctx, - img_range, - mode, - start, - ) - .await?; - start = next_start_key; - image_layers.extend(image); + ); + // The next image layer should be generated right after this one. + start = img_range.end; + } + ImageLayerCreationOutcome::Skip => { + // We don't need to create an image layer for this partition. + // The next image layer should NOT cover this range, otherwise + // the keyspace becomes empty (reads don't go past image layers). + start = img_range.end; + } + } + + if let ImageLayerCreationMode::Try = mode { + // We have at least made some progress + if yield_for_l0 && batch_image_writer.pending_layer_num() >= 1 { + // The `Try` mode is currently only used on the compaction path. We want to avoid + // image layer generation taking too long time and blocking L0 compaction. So in this + // mode, we also inspect the current number of L0 layers and skip image layer generation + // if there are too many of them. + let image_preempt_threshold = self.get_image_creation_preempt_threshold() + * self.get_compaction_threshold(); + // TODO: currently we do not respect `get_image_creation_preempt_threshold` and always yield + // when there is a single timeline with more than L0 threshold L0 layers. As long as the + // `get_image_creation_preempt_threshold` is set to a value greater than 0, we will yield for L0 compaction. + if image_preempt_threshold != 0 { + let should_yield = self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some(); + if should_yield { + tracing::info!( + "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers", + partition.start().unwrap(), + partition.end().unwrap() + ); + last_partition_processed = Some(partition.clone()); + all_generated = false; + break; + } + } + } } } + let image_layers = batch_image_writer.finish(self, ctx).await?; + let mut guard = self.layers.write().await; // FIXME: we could add the images to be uploaded *before* returning from here, but right @@ -4497,14 +5258,44 @@ impl Timeline { .open_mut()? 
.track_new_image_layers(&image_layers, &self.metrics); drop_wlock(guard); - timer.stop_and_record(); + let duration = timer.stop_and_record(); // Creating image layers may have caused some previously visible layers to be covered if !image_layers.is_empty() { self.update_layer_visibility().await?; } - Ok(image_layers) + let total_layer_size = image_layers + .iter() + .map(|l| l.metadata().file_size) + .sum::(); + + if !image_layers.is_empty() { + info!( + "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", + image_layers.len(), + total_layer_size, + duration.as_secs_f64(), + partition_processed, + total_partitions + ); + } + + Ok(( + image_layers, + if all_generated { + LastImageLayerCreationStatus::Complete + } else { + LastImageLayerCreationStatus::Incomplete { + last_key: if let Some(last_partition_processed) = last_partition_processed { + last_partition_processed.end().unwrap_or(Key::MIN) + } else { + // This branch should be unreachable, but in case it happens, we can just return the start key. + Key::MIN + }, + } + }, + )) } /// Wait until the background initial logical size calculation is complete, or @@ -4615,6 +5406,10 @@ impl Drop for Timeline { } } } + info!( + "Timeline {} for tenant {} is being dropped", + self.timeline_id, self.tenant_shard_id.tenant_id + ); } } @@ -4627,8 +5422,12 @@ pub(crate) enum CompactionError { #[error("Failed to offload timeline: {0}")] Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. + #[error("Failed to collect keyspace: {0}")] + CollectKeySpaceError(CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), + #[error("Compaction already running: {0}")] + AlreadyRunning(&'static str), } impl From for CompactionError { @@ -4640,12 +5439,6 @@ impl From for CompactionError { } } -impl CompactionError { - pub fn is_cancelled(&self) -> bool { - matches!(self, CompactionError::ShuttingDown) - } -} - impl From for CompactionError { fn from(err: CollectKeySpaceError) -> Self { match err { @@ -4801,7 +5594,9 @@ impl Timeline { // because we have not implemented L0 => L0 compaction. duplicated_layers.insert(l.layer_desc().key()); } else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) { - return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); + return Err(CompactionError::Other(anyhow::anyhow!( + "compaction generates a L0 layer file as output, which will cause infinite compaction." 
+ ))); } else { insert_layers.push(l.clone()); } @@ -4874,6 +5669,7 @@ impl Timeline { async fn find_gc_time_cutoff( &self, + now: SystemTime, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, @@ -4881,7 +5677,6 @@ impl Timeline { debug_assert_current_span_has_tenant_and_timeline_id(); if self.shard_identity.is_shard_zero() { // Shard Zero has SLRU data and can calculate the PITR time -> LSN mapping itself - let now = SystemTime::now(); let time_range = if pitr == Duration::ZERO { humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") } else { @@ -4925,8 +5720,10 @@ impl Timeline { .await { Ok((index_part, index_generation, _index_mtime)) => { - tracing::info!("GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}", - index_part.metadata.latest_gc_cutoff_lsn()); + tracing::info!( + "GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}", + index_part.metadata.latest_gc_cutoff_lsn() + ); Ok(Some(index_part.metadata.latest_gc_cutoff_lsn())) } Err(DownloadError::NotFound) => { @@ -4967,6 +5764,7 @@ impl Timeline { #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] pub(super) async fn find_gc_cutoffs( &self, + now: SystemTime, space_cutoff: Lsn, pitr: Duration, cancel: &CancellationToken, @@ -4994,7 +5792,7 @@ impl Timeline { // - if PITR interval is set, then this is our cutoff. // - if PITR interval is not set, then we do a lookup // based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases. - let time_cutoff = self.find_gc_time_cutoff(pitr, cancel, ctx).await?; + let time_cutoff = self.find_gc_time_cutoff(now, pitr, cancel, ctx).await?; Ok(match (pitr, time_cutoff) { (Duration::ZERO, Some(time_cutoff)) => { @@ -5016,7 +5814,7 @@ impl Timeline { // PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR // cannot advance beyond what was already GC'd, and respect space-based retention GcCutoffs { - time: *self.get_latest_gc_cutoff_lsn(), + time: *self.get_applied_gc_cutoff_lsn(), space: space_cutoff, } } @@ -5137,7 +5935,7 @@ impl Timeline { let mut result: GcResult = GcResult::default(); // Nothing to GC. Return early. - let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff = *self.get_applied_gc_cutoff_lsn(); if latest_gc_cutoff >= new_gc_cutoff { info!( "Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", @@ -5151,7 +5949,7 @@ impl Timeline { // // The GC cutoff should only ever move forwards. 
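As an aside on the time-based cutoff above: the PITR window is resolved against an injected `now` (matching the new `find_gc_time_cutoff(now, ...)` signature), with a default window substituted when PITR is disabled. A small sketch; the names are illustrative:

```rust
use std::time::{Duration, SystemTime};

/// Resolve the PITR setting into an absolute point in time. When PITR is
/// disabled we still apply a default window, so size-based retention on an
/// idle database does not end up keeping history forever.
fn pitr_cutoff_time(
    now: SystemTime,
    pitr: Duration,
    default_window: Duration,
) -> Option<SystemTime> {
    let window = if pitr == Duration::ZERO { default_window } else { pitr };
    // None only if the window reaches back before the representable epoch.
    now.checked_sub(window)
}
```

Passing `now` in, rather than calling `SystemTime::now()` inside as the old code did, presumably lets the cutoff computation be driven with a fixed clock in tests.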
let waitlist = { - let write_guard = self.latest_gc_cutoff_lsn.lock_for_write(); + let write_guard = self.applied_gc_cutoff_lsn.lock_for_write(); if *write_guard > new_gc_cutoff { return Err(GcError::BadLsn { why: format!( @@ -5334,9 +6132,7 @@ impl Timeline { if let Some((img_lsn, img)) = &data.img { trace!( "found page image for key {} at {}, no WAL redo required, req LSN {}", - key, - img_lsn, - request_lsn, + key, img_lsn, request_lsn, ); Ok(img.clone()) } else { @@ -5365,7 +6161,12 @@ impl Timeline { request_lsn ); } else { - trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); + trace!( + "found {} WAL records that will init the page for {} at {}, performing WAL redo", + data.records.len(), + key, + request_lsn + ); }; let res = self .walredo_mgr @@ -5377,10 +6178,11 @@ impl Timeline { let img = match res { Ok(img) => img, Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), - Err(walredo::Error::Other(e)) => { + Err(walredo::Error::Other(err)) => { + critical!("walredo failure during page reconstruction: {err:?}"); return Err(PageReconstructError::WalRedo( - e.context("reconstruct a page image"), - )) + err.context("reconstruct a page image"), + )); } }; Ok(img) @@ -5663,9 +6465,17 @@ impl Timeline { info!("force created image layer {}", image_layer.local_path()); { let mut guard = self.layers.write().await; - guard.open_mut().unwrap().force_insert_layer(image_layer); + guard + .open_mut() + .unwrap() + .force_insert_layer(image_layer.clone()); } + // Update remote_timeline_client state to reflect existence of this layer + self.remote_client + .schedule_layer_file_upload(image_layer) + .unwrap(); + Ok(()) } @@ -5717,9 +6527,17 @@ impl Timeline { info!("force created delta layer {}", delta_layer.local_path()); { let mut guard = self.layers.write().await; - guard.open_mut().unwrap().force_insert_layer(delta_layer); + guard + .open_mut() + .unwrap() + .force_insert_layer(delta_layer.clone()); } + // Update remote_timeline_client state to reflect existence of this layer + self.remote_client + .schedule_layer_file_upload(delta_layer) + .unwrap(); + Ok(()) } @@ -5729,13 +6547,14 @@ impl Timeline { self: &Arc, lsn: Lsn, ctx: &RequestContext, + io_concurrency: IoConcurrency, ) -> anyhow::Result> { let mut all_data = Vec::new(); let guard = self.layers.read().await; for layer in guard.layer_map()?.iter_historic_layers() { if !layer.is_delta() && layer.image_layer_lsn() == lsn { let layer = guard.get_from_desc(&layer); - let mut reconstruct_data = ValuesReconstructState::default(); + let mut reconstruct_data = ValuesReconstructState::new(io_concurrency.clone()); layer .get_values_reconstruct_data( KeySpace::single(Key::MIN..Key::MAX), @@ -5744,8 +6563,9 @@ impl Timeline { ctx, ) .await?; - for (k, v) in reconstruct_data.keys { - all_data.push((k, v?.img.unwrap().1)); + for (k, v) in std::mem::take(&mut reconstruct_data.keys) { + let v = v.collect_pending_ios().await?; + all_data.push((k, v.img.unwrap().1)); } } } @@ -5824,7 +6644,7 @@ enum OpenLayerAction { None, } -impl<'a> TimelineWriter<'a> { +impl TimelineWriter<'_> { async fn handle_open_layer_action( &mut self, at: Lsn, @@ -5866,13 +6686,39 @@ impl<'a> TimelineWriter<'a> { async fn roll_layer(&mut self, freeze_at: Lsn) -> Result<(), FlushLayerError> { let current_size = self.write_guard.as_ref().unwrap().current_size; + // If layer flushes are backpressured due to compaction not keeping up, wait for the flush + // to propagate the 
backpressure up into WAL ingestion. + let l0_count = self + .tl + .layers + .read() + .await + .layer_map()? + .level0_deltas() + .len(); + let wait_thresholds = [ + self.get_l0_flush_delay_threshold(), + self.get_l0_flush_stall_threshold(), + ]; + let wait_threshold = wait_thresholds.into_iter().flatten().min(); + // self.write_guard will be taken by the freezing - self.tl + let flush_id = self + .tl .freeze_inmem_layer_at(freeze_at, &mut self.write_guard) .await?; assert!(self.write_guard.is_none()); + if let Some(wait_threshold) = wait_threshold { + if l0_count >= wait_threshold { + debug!( + "layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers" + ); + self.tl.wait_flush_completion(flush_id).await?; + } + } + if current_size >= self.get_checkpoint_distance() * 2 { warn!("Flushed oversized open layer with size {}", current_size) } @@ -6050,17 +6896,29 @@ fn is_send() { #[cfg(test)] mod tests { + use std::sync::Arc; + use pageserver_api::key::Key; use pageserver_api::value::Value; - use utils::{id::TimelineId, lsn::Lsn}; + use tracing::Instrument; + use utils::id::TimelineId; + use utils::lsn::Lsn; - use crate::tenant::{ - harness::{test_img, TenantHarness}, - layer_map::LayerMap, - storage_layer::{Layer, LayerName}, - timeline::{DeltaLayerTestDesc, EvictionError}, - Timeline, - }; + use super::HeatMapTimeline; + use crate::tenant::harness::{TenantHarness, test_img}; + use crate::tenant::layer_map::LayerMap; + use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint}; + use crate::tenant::timeline::{DeltaLayerTestDesc, EvictionError}; + use crate::tenant::{PreviousHeatmap, Timeline}; + + fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) { + assert_eq!(lhs.layers.len(), rhs.layers.len()); + let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter()); + for (l, r) in lhs_rhs { + assert_eq!(l.name, r.name); + assert_eq!(l.metadata, r.metadata); + } + } #[tokio::test] async fn test_heatmap_generation() { @@ -6135,7 +6993,7 @@ mod tests { assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name()); let mut last_lsn = Lsn::MAX; - for layer in heatmap.layers { + for layer in &heatmap.layers { // Covered layer should be omitted assert!(layer.name != covered_delta.layer_name()); @@ -6150,6 +7008,144 @@ mod tests { last_lsn = layer_lsn; } } + + // Evict all the layers and stash the old heatmap in the timeline. + // This simulates a migration to a cold secondary location. + + let guard = timeline.layers.read().await; + let mut all_layers = Vec::new(); + let forever = std::time::Duration::from_secs(120); + for layer in guard.likely_resident_layers() { + all_layers.push(layer.clone()); + layer.evict_and_wait(forever).await.unwrap(); + } + drop(guard); + + timeline + .previous_heatmap + .store(Some(Arc::new(PreviousHeatmap::Active { + heatmap: heatmap.clone(), + read_at: std::time::Instant::now(), + }))); + + // Generate a new heatmap and assert that it contains the same layers as the old one. + let post_migration_heatmap = timeline.generate_heatmap().await.unwrap(); + assert_heatmaps_have_same_layers(&heatmap, &post_migration_heatmap); + + // Download each layer one by one. Generate the heatmap at each step and check + // that it's stable. 
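Stepping back to the `roll_layer` change earlier in this hunk: the delay and stall thresholds are both optional, and they are folded into a single effective limit with `flatten().min()`, so the tighter configured threshold wins and unset ones are ignored. A standalone illustration:

```rust
fn effective_threshold(delay: Option<usize>, stall: Option<usize>) -> Option<usize> {
    // Unset thresholds are skipped; the smaller configured one wins.
    [delay, stall].into_iter().flatten().min()
}

#[test]
fn combine_thresholds() {
    assert_eq!(effective_threshold(Some(30), Some(50)), Some(30));
    assert_eq!(effective_threshold(None, Some(50)), Some(50));
    assert_eq!(effective_threshold(None, None), None);
}
```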
+ for layer in all_layers { + if layer.visibility() == LayerVisibilityHint::Covered { + continue; + } + + eprintln!("Downloading {layer} and re-generating heatmap"); + + let _resident = layer + .download_and_keep_resident() + .instrument(tracing::info_span!( + parent: None, + "download_layer", + tenant_id = %timeline.tenant_shard_id.tenant_id, + shard_id = %timeline.tenant_shard_id.shard_slug(), + timeline_id = %timeline.timeline_id + )) + .await + .unwrap(); + + let post_download_heatmap = timeline.generate_heatmap().await.unwrap(); + assert_heatmaps_have_same_layers(&heatmap, &post_download_heatmap); + } + + // Everything from the post-migration heatmap is now resident. + // Check that we drop it from memory. + assert!(matches!( + timeline.previous_heatmap.load().as_deref(), + Some(PreviousHeatmap::Obsolete) + )); + } + + #[tokio::test] + async fn test_previous_heatmap_obsoletion() { + let harness = TenantHarness::create("heatmap_previous_heatmap_obsoletion") + .await + .unwrap(); + + let l0_delta = DeltaLayerTestDesc::new( + Lsn(0x20)..Lsn(0x30), + Key::from_hex("000000000000000000000000000000000000").unwrap() + ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(), + vec![( + Key::from_hex("720000000033333333444444445500000000").unwrap(), + Lsn(0x25), + Value::Image(test_img("foo")), + )], + ); + + let image_layer = ( + Lsn(0x40), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + test_img("bar"), + )], + ); + + let delta_layers = vec![l0_delta]; + let image_layers = vec![image_layer]; + + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline_with_layers( + TimelineId::generate(), + Lsn(0x10), + 14, + &ctx, + delta_layers, + image_layers, + Lsn(0x100), + ) + .await + .unwrap(); + + // Layer visibility is an input to heatmap generation, so refresh it first + timeline.update_layer_visibility().await.unwrap(); + + let heatmap = timeline + .generate_heatmap() + .await + .expect("Infallible while timeline is not shut down"); + + // Both layers should be in the heatmap + assert!(!heatmap.layers.is_empty()); + + // Now simulate a migration. + timeline + .previous_heatmap + .store(Some(Arc::new(PreviousHeatmap::Active { + heatmap: heatmap.clone(), + read_at: std::time::Instant::now(), + }))); + + // Evict all the layers in the previous heatmap + let guard = timeline.layers.read().await; + let forever = std::time::Duration::from_secs(120); + for layer in guard.likely_resident_layers() { + layer.evict_and_wait(forever).await.unwrap(); + } + drop(guard); + + // Generate a new heatmap and check that the previous heatmap + // has been marked obsolete. 
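The `previous_heatmap` slot exercised by these tests is used as an atomically swappable `Option<Arc<_>>`. A minimal sketch of the store/load/`as_deref` pattern, assuming the `arc-swap` crate (the field's declaration is not part of this hunk, so treat the exact type as an assumption):

```rust
use std::sync::Arc;

use arc_swap::ArcSwapOption;

enum Previous {
    Active { layers: usize },
    Obsolete,
}

fn demo() {
    // Starts empty: nothing to carry over.
    let slot: ArcSwapOption<Previous> = ArcSwapOption::from(None);

    // A migration stashes the old heatmap...
    slot.store(Some(Arc::new(Previous::Active { layers: 3 })));
    match slot.load().as_deref() {
        Some(Previous::Active { layers }) => println!("carry over {layers} layers"),
        Some(Previous::Obsolete) | None => println!("nothing to carry over"),
    }

    // ...and once everything it described is resident again, it is downgraded.
    slot.store(Some(Arc::new(Previous::Obsolete)));
    assert!(matches!(slot.load().as_deref(), Some(Previous::Obsolete)));
}
```

The generation path consumes `Active` entries to pull not-yet-resident layers into the next heatmap and flips the slot to `Obsolete` once that set becomes empty, which is what the assertion below checks.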
+ let post_eviction_heatmap = timeline + .generate_heatmap() + .await + .expect("Infallible while timeline is not shut down"); + + assert!(post_eviction_heatmap.layers.is_empty()); + assert!(matches!( + timeline.previous_heatmap.load().as_deref(), + Some(PreviousHeatmap::Obsolete) + )); } #[tokio::test] diff --git a/pageserver/src/tenant/timeline/analysis.rs b/pageserver/src/tenant/timeline/analysis.rs index 6009b0b79a..96864ec44b 100644 --- a/pageserver/src/tenant/timeline/analysis.rs +++ b/pageserver/src/tenant/timeline/analysis.rs @@ -1,4 +1,5 @@ -use std::{collections::BTreeSet, ops::Range}; +use std::collections::BTreeSet; +use std::ops::Range; use utils::lsn::Lsn; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 34c85ccdfd..1f746930d5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,32 +4,48 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. -use std::collections::{BinaryHeap, HashMap, HashSet}; +use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; -use super::layer_manager::LayerManager; -use super::{ - CompactFlags, CompactOptions, CompactRange, CreateImageLayersError, DurationRecorder, - ImageLayerCreationMode, RecordedDuration, Timeline, -}; - -use anyhow::{anyhow, bail, Context}; +use anyhow::{Context, anyhow, bail}; use bytes::Bytes; +use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; -use pageserver_api::key::KEY_SIZE; -use pageserver_api::keyspace::ShardedRange; +use once_cell::sync::Lazy; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; +use pageserver_api::key::{KEY_SIZE, Key}; +use pageserver_api::keyspace::{KeySpace, ShardedRange}; +use pageserver_api::models::CompactInfoResponse; +use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; +use pageserver_api::value::Value; +use pageserver_compaction::helpers::{fully_contains, overlaps_with}; +use pageserver_compaction::interface::*; use serde::Serialize; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, trace, warn, Instrument}; +use tracing::{Instrument, debug, error, info, info_span, trace, warn}; +use utils::critical; use utils::id::TimelineId; +use utils::lsn::Lsn; +use super::layer_manager::LayerManager; +use super::{ + CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder, + GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, + RecordedDuration, Timeline, +}; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::statvfs::Statvfs; +use crate::tenant::checks::check_valid_layermap; +use crate::tenant::gc_block::GcBlock; +use crate::tenant::layer_map::LayerMap; use crate::tenant::remote_timeline_client::WaitCompletionError; +use crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, }; @@ -38,49 +54,544 @@ use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{ AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; -use 
crate::tenant::timeline::ImageLayerCreationOutcome; -use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; -use crate::tenant::timeline::{Layer, ResidentLayer}; -use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded}; -use crate::virtual_file::{MaybeFatalIo, VirtualFile}; -use pageserver_api::config::tenant_conf_defaults::{ - DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, +use crate::tenant::timeline::{ + DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, + ResidentLayer, drop_rlock, }; - -use pageserver_api::key::Key; -use pageserver_api::keyspace::KeySpace; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::value::Value; - -use utils::lsn::Lsn; - -use pageserver_compaction::helpers::{fully_contains, overlaps_with}; -use pageserver_compaction::interface::*; - -use super::CompactionError; +use crate::tenant::{DeltaLayer, MaybeOffloaded, gc_block}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; /// Maximum number of deltas before generating an image layer in bottom-most compaction. const COMPACTION_DELTA_THRESHOLD: usize = 5; -/// A scheduled compaction task. -pub(crate) struct ScheduledCompactionTask { - pub options: CompactOptions, - /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender. - pub result_tx: Option>, - /// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard. - pub gc_block: Option, +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub struct GcCompactionJobId(pub usize); + +impl std::fmt::Display for GcCompactionJobId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } } +pub struct GcCompactionCombinedSettings { + pub gc_compaction_enabled: bool, + pub gc_compaction_initial_threshold_kb: u64, + pub gc_compaction_ratio_percent: u64, +} + +#[derive(Debug, Clone)] +pub enum GcCompactionQueueItem { + MetaJob { + /// Compaction options + options: CompactOptions, + /// Whether the compaction is triggered automatically (determines whether we need to update L2 LSN) + auto: bool, + }, + SubCompactionJob(CompactOptions), + Notify(GcCompactionJobId, Option), +} + +impl GcCompactionQueueItem { + pub fn into_compact_info_resp( + self, + id: GcCompactionJobId, + running: bool, + ) -> Option { + match self { + GcCompactionQueueItem::MetaJob { options, .. } => Some(CompactInfoResponse { + compact_key_range: options.compact_key_range, + compact_lsn_range: options.compact_lsn_range, + sub_compaction: options.sub_compaction, + running, + job_id: id.0, + }), + GcCompactionQueueItem::SubCompactionJob(options) => Some(CompactInfoResponse { + compact_key_range: options.compact_key_range, + compact_lsn_range: options.compact_lsn_range, + sub_compaction: options.sub_compaction, + running, + job_id: id.0, + }), + GcCompactionQueueItem::Notify(_, _) => None, + } + } +} + +#[derive(Default)] +struct GcCompactionGuardItems { + notify: Option>, + gc_guard: Option, + permit: Option, +} + +struct GcCompactionQueueInner { + running: Option<(GcCompactionJobId, GcCompactionQueueItem)>, + queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, + guards: HashMap, + last_id: GcCompactionJobId, +} + +impl GcCompactionQueueInner { + fn next_id(&mut self) -> GcCompactionJobId { + let id = self.last_id; + self.last_id = GcCompactionJobId(id.0 + 1); + id + } +} + +/// A structure to store gc_compaction jobs. 
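One building block of the queue defined below is a process-wide cap on concurrent gc-compaction runs, mirroring `CONCURRENT_GC_COMPACTION_TASKS` further down. The pattern in isolation (names here are illustrative):

```rust
use std::sync::Arc;

use once_cell::sync::Lazy;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};

// Two permits: at most two timelines per pageserver run gc-compaction at once.
static GC_COMPACTION_TASKS: Lazy<Arc<Semaphore>> =
    Lazy::new(|| Arc::new(Semaphore::new(2)));

/// Non-blocking: callers that lose the race simply skip this round instead
/// of queueing behind other timelines.
fn try_claim_gc_compaction_slot() -> Option<OwnedSemaphorePermit> {
    GC_COMPACTION_TASKS.clone().try_acquire_owned().ok()
}
```

Holding the returned `OwnedSemaphorePermit` inside the queued job means the slot is released automatically when the job is dropped.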
+pub struct GcCompactionQueue { + /// All items in the queue, and the currently-running job. + inner: std::sync::Mutex, + /// Ensure only one thread is consuming the queue. + consumer_lock: tokio::sync::Mutex<()>, +} + +static CONCURRENT_GC_COMPACTION_TASKS: Lazy> = Lazy::new(|| { + // Only allow two timelines on one pageserver to run gc compaction at a time. + Arc::new(Semaphore::new(2)) +}); + +impl GcCompactionQueue { + pub fn new() -> Self { + GcCompactionQueue { + inner: std::sync::Mutex::new(GcCompactionQueueInner { + running: None, + queued: VecDeque::new(), + guards: HashMap::new(), + last_id: GcCompactionJobId(0), + }), + consumer_lock: tokio::sync::Mutex::new(()), + } + } + + pub fn cancel_scheduled(&self) { + let mut guard = self.inner.lock().unwrap(); + guard.queued.clear(); + // TODO: if there is a running job, we should keep the gc guard. However, currently, the cancel + // API is only used for testing purposes, so we can drop everything here. + guard.guards.clear(); + } + + /// Schedule a manual compaction job. + pub fn schedule_manual_compaction( + &self, + options: CompactOptions, + notify: Option>, + ) -> GcCompactionJobId { + let mut guard = self.inner.lock().unwrap(); + let id = guard.next_id(); + guard.queued.push_back(( + id, + GcCompactionQueueItem::MetaJob { + options, + auto: false, + }, + )); + guard.guards.entry(id).or_default().notify = notify; + info!("scheduled compaction job id={}", id); + id + } + + /// Schedule an auto compaction job. + fn schedule_auto_compaction( + &self, + options: CompactOptions, + permit: OwnedSemaphorePermit, + ) -> GcCompactionJobId { + let mut guard = self.inner.lock().unwrap(); + let id = guard.next_id(); + guard.queued.push_back(( + id, + GcCompactionQueueItem::MetaJob { + options, + auto: true, + }, + )); + guard.guards.entry(id).or_default().permit = Some(permit); + id + } + + /// Trigger an auto compaction. + pub async fn trigger_auto_compaction(&self, timeline: &Arc) { + let GcCompactionCombinedSettings { + gc_compaction_enabled, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + } = timeline.get_gc_compaction_settings(); + if !gc_compaction_enabled { + return; + } + if self.remaining_jobs_num() > 0 { + // Only schedule auto compaction when the queue is empty + return; + } + if timeline.ancestor_timeline().is_some() { + // Do not trigger auto compaction for child timelines. We haven't tested + // it enough in staging yet. + return; + } + + let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else { + // Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure + // the fairness of the lock across timelines. We should listen for both `acquire` and `l0_compaction_trigger` + // to ensure the fairness while avoid starving other tasks. 
+ return; + }; + + let gc_compaction_state = timeline.get_gc_compaction_state(); + let l2_lsn = gc_compaction_state + .map(|x| x.last_completed_lsn) + .unwrap_or(Lsn::INVALID); + + let layers = { + let guard = timeline.layers.read().await; + let layer_map = guard.layer_map().unwrap(); + layer_map.iter_historic_layers().collect_vec() + }; + let mut l2_size: u64 = 0; + let mut l1_size = 0; + let gc_cutoff = *timeline.get_applied_gc_cutoff_lsn(); + for layer in layers { + if layer.lsn_range.start <= l2_lsn { + l2_size += layer.file_size(); + } else if layer.lsn_range.start <= gc_cutoff { + l1_size += layer.file_size(); + } + } + + fn trigger_compaction( + l1_size: u64, + l2_size: u64, + gc_compaction_initial_threshold_kb: u64, + gc_compaction_ratio_percent: u64, + ) -> bool { + const AUTO_TRIGGER_LIMIT: u64 = 150 * 1024 * 1024 * 1024; // 150GB + if l1_size >= AUTO_TRIGGER_LIMIT || l2_size >= AUTO_TRIGGER_LIMIT { + // Do not auto-trigger when physical size >= 150GB + return false; + } + // initial trigger + if l2_size == 0 && l1_size >= gc_compaction_initial_threshold_kb * 1024 { + info!( + "trigger auto-compaction because l1_size={} >= gc_compaction_initial_threshold_kb={}", + l1_size, gc_compaction_initial_threshold_kb + ); + return true; + } + // size ratio trigger + if l2_size == 0 { + return false; + } + if l1_size as f64 / l2_size as f64 >= (gc_compaction_ratio_percent as f64 / 100.0) { + info!( + "trigger auto-compaction because l1_size={} / l2_size={} > gc_compaction_ratio_percent={}", + l1_size, l2_size, gc_compaction_ratio_percent + ); + return true; + } + false + } + + if trigger_compaction( + l1_size, + l2_size, + gc_compaction_initial_threshold_kb, + gc_compaction_ratio_percent, + ) { + self.schedule_auto_compaction( + CompactOptions { + flags: { + let mut flags = EnumSet::new(); + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + flags + }, + sub_compaction: true, + compact_key_range: None, + compact_lsn_range: None, + sub_compaction_max_job_size_mb: None, + }, + permit, + ); + info!( + "scheduled auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", + l1_size, l2_size, l2_lsn, gc_cutoff + ); + } else { + info!( + "did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}", + l1_size, l2_size, l2_lsn, gc_cutoff + ); + } + } + + /// Notify the caller the job has finished and unblock GC. 
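To make the two triggers in `trigger_compaction` above concrete, here is a worked example under hypothetical settings (a 5 GiB initial threshold and a 100% ratio; illustrative numbers, not the shipped defaults):

```rust
fn trigger(l1: u64, l2: u64, initial_threshold_kb: u64, ratio_percent: u64) -> bool {
    const AUTO_TRIGGER_LIMIT: u64 = 150 * 1024 * 1024 * 1024; // 150GB, as above
    if l1 >= AUTO_TRIGGER_LIMIT || l2 >= AUTO_TRIGGER_LIMIT {
        return false; // too much physical data to auto-trigger
    }
    if l2 == 0 {
        // Initial trigger: no L2 yet, fire once enough L1 data has accumulated.
        return l1 >= initial_threshold_kb * 1024;
    }
    // Ratio trigger: fire once L1 has grown to the configured fraction of L2.
    l1 as f64 / l2 as f64 >= ratio_percent as f64 / 100.0
}

fn main() {
    let (initial_kb, ratio) = (5 * 1024 * 1024, 100); // hypothetical settings
    // 6 GiB of L1, no L2 yet: the initial trigger fires.
    assert!(trigger(6 << 30, 0, initial_kb, ratio));
    // 40 GiB of L1 over 100 GiB of L2: 40% < 100%, so not yet.
    assert!(!trigger(40 << 30, 100 << 30, initial_kb, ratio));
    // 110 GiB of L1 over 100 GiB of L2: 110% >= 100%, fires.
    assert!(trigger(110 << 30, 100 << 30, initial_kb, ratio));
}
```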
+ fn notify_and_unblock(&self, id: GcCompactionJobId) { + info!("compaction job id={} finished", id); + let mut guard = self.inner.lock().unwrap(); + if let Some(items) = guard.guards.remove(&id) { + drop(items.gc_guard); + if let Some(tx) = items.notify { + let _ = tx.send(()); + } + } + } + + async fn handle_sub_compaction( + &self, + id: GcCompactionJobId, + options: CompactOptions, + timeline: &Arc, + gc_block: &GcBlock, + auto: bool, + ) -> Result<(), CompactionError> { + info!( + "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); + let jobs = timeline + .gc_compaction_split_jobs( + GcCompactJob::from_compact_options(options.clone()), + options.sub_compaction_max_job_size_mb, + ) + .await + .map_err(CompactionError::Other)?; + if jobs.is_empty() { + info!("no jobs to run, skipping scheduled compaction task"); + self.notify_and_unblock(id); + } else { + let gc_guard = match gc_block.start().await { + Ok(guard) => guard, + Err(e) => { + return Err(CompactionError::Other(anyhow!( + "cannot run gc-compaction because gc is blocked: {}", + e + ))); + } + }; + + let jobs_len = jobs.len(); + let mut pending_tasks = Vec::new(); + // gc-compaction might pick more layers or fewer layers to compact. The L2 LSN does not need to be accurate. + // And therefore, we simply assume the maximum LSN of all jobs is the expected L2 LSN. + let expected_l2_lsn = jobs.iter().map(|job| job.compact_lsn_range.end).max(); + for job in jobs { + // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` + // until we do further refactors to allow directly call `compact_with_gc`. + let mut flags: EnumSet = EnumSet::default(); + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + if job.dry_run { + flags |= CompactFlags::DryRun; + } + let options = CompactOptions { + flags, + sub_compaction: false, + compact_key_range: Some(job.compact_key_range.into()), + compact_lsn_range: Some(job.compact_lsn_range.into()), + sub_compaction_max_job_size_mb: None, + }; + pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options)); + } + + if !auto { + pending_tasks.push(GcCompactionQueueItem::Notify(id, None)); + } else { + pending_tasks.push(GcCompactionQueueItem::Notify(id, expected_l2_lsn)); + } + + { + let mut guard = self.inner.lock().unwrap(); + guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); + let mut tasks = Vec::new(); + for task in pending_tasks { + let id = guard.next_id(); + tasks.push((id, task)); + } + tasks.reverse(); + for item in tasks { + guard.queued.push_front(item); + } + } + info!( + "scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", + jobs_len + ); + } + Ok(()) + } + + /// Take a job from the queue and process it. Returns if there are still pending tasks. + pub async fn iteration( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + gc_block: &GcBlock, + timeline: &Arc, + ) -> Result { + let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else { + return Err(CompactionError::AlreadyRunning( + "cannot run gc-compaction because another gc-compaction is running. 
This should not happen because we only call this function from the gc-compaction queue.", + )); + }; + let has_pending_tasks; + let Some((id, item)) = ({ + let mut guard = self.inner.lock().unwrap(); + if let Some((id, item)) = guard.queued.pop_front() { + guard.running = Some((id, item.clone())); + has_pending_tasks = !guard.queued.is_empty(); + Some((id, item)) + } else { + has_pending_tasks = false; + None + } + }) else { + self.trigger_auto_compaction(timeline).await; + // Always yield after triggering auto-compaction. Gc-compaction is a low-priority task and we + // have not implemented preemption mechanism yet. We always want to yield it to more important + // tasks if there is one. + return Ok(CompactionOutcome::Done); + }; + match item { + GcCompactionQueueItem::MetaJob { options, auto } => { + if !options + .flags + .contains(CompactFlags::EnhancedGcBottomMostCompaction) + { + warn!( + "ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", + options + ); + } else if options.sub_compaction { + info!( + "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); + self.handle_sub_compaction(id, options, timeline, gc_block, auto) + .await?; + } else { + // Auto compaction always enables sub-compaction so we don't need to handle update_l2_lsn + // in this branch. + let gc_guard = match gc_block.start().await { + Ok(guard) => guard, + Err(e) => { + return Err(CompactionError::Other(anyhow!( + "cannot run gc-compaction because gc is blocked: {}", + e + ))); + } + }; + { + let mut guard = self.inner.lock().unwrap(); + guard.guards.entry(id).or_default().gc_guard = Some(gc_guard); + } + let _ = timeline.compact_with_options(cancel, options, ctx).await?; + self.notify_and_unblock(id); + } + } + GcCompactionQueueItem::SubCompactionJob(options) => { + // TODO: error handling, clear the queue if any task fails? + let _ = timeline.compact_with_options(cancel, options, ctx).await?; + } + GcCompactionQueueItem::Notify(id, l2_lsn) => { + self.notify_and_unblock(id); + if let Some(l2_lsn) = l2_lsn { + let current_l2_lsn = timeline + .get_gc_compaction_state() + .map(|x| x.last_completed_lsn) + .unwrap_or(Lsn::INVALID); + if l2_lsn >= current_l2_lsn { + info!("l2_lsn updated to {}", l2_lsn); + timeline + .update_gc_compaction_state(GcCompactionState { + last_completed_lsn: l2_lsn, + }) + .map_err(CompactionError::Other)?; + } else { + warn!( + "l2_lsn updated to {} but it is less than the current l2_lsn {}", + l2_lsn, current_l2_lsn + ); + } + } + } + } + { + let mut guard = self.inner.lock().unwrap(); + guard.running = None; + } + Ok(if has_pending_tasks { + CompactionOutcome::Pending + } else { + CompactionOutcome::Done + }) + } + + #[allow(clippy::type_complexity)] + pub fn remaining_jobs( + &self, + ) -> ( + Option<(GcCompactionJobId, GcCompactionQueueItem)>, + VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, + ) { + let guard = self.inner.lock().unwrap(); + (guard.running.clone(), guard.queued.clone()) + } + + pub fn remaining_jobs_num(&self) -> usize { + let guard = self.inner.lock().unwrap(); + guard.queued.len() + if guard.running.is_some() { 1 } else { 0 } + } +} + +/// A job description for the gc-compaction job. This structure describes the rectangle range that the job will +/// process. The exact layers that need to be compacted/rewritten will be generated when `compact_with_gc` gets +/// called. 
+#[derive(Debug, Clone)]
+pub(crate) struct GcCompactJob {
+    pub dry_run: bool,
+    /// The key range to be compacted. The compaction algorithm will only regenerate key-value pairs within this range
+    /// [left inclusive, right exclusive), and other pairs will be rewritten into new files if necessary.
+    pub compact_key_range: Range<Key>,
+    /// The LSN range to be compacted. The compaction algorithm will use this range to determine the layers to be
+    /// selected for the compaction, and it does not guarantee the generated layers will have exactly the same LSN range
+    /// as specified here. The true range being compacted is `min_lsn/max_lsn` in [`GcCompactionJobDescription`].
+    /// min_lsn will always be <= the lower bound specified here, and max_lsn will always be >= the upper bound specified here.
+    pub compact_lsn_range: Range<Lsn>,
+}
+
+impl GcCompactJob {
+    pub fn from_compact_options(options: CompactOptions) -> Self {
+        GcCompactJob {
+            dry_run: options.flags.contains(CompactFlags::DryRun),
+            compact_key_range: options
+                .compact_key_range
+                .map(|x| x.into())
+                .unwrap_or(Key::MIN..Key::MAX),
+            compact_lsn_range: options
+                .compact_lsn_range
+                .map(|x| x.into())
+                .unwrap_or(Lsn::INVALID..Lsn::MAX),
+        }
+    }
+}
+
+/// A job description for the gc-compaction job. This structure is generated when `compact_with_gc` is called
+/// and contains the exact layers we want to compact.
 pub struct GcCompactionJobDescription {
     /// All layers to read in the compaction job
     selected_layers: Vec<Layer>,
-    /// GC cutoff of the job
+    /// GC cutoff of the job. This is the lowest LSN that will be accessed by the read/GC path and we need to
+    /// keep all deltas <= this LSN or generate an image == this LSN.
     gc_cutoff: Lsn,
-    /// LSNs to retain for the job
+    /// LSNs to retain for the job. The read path will use these LSNs, so we need to keep deltas <= this LSN or
+    /// generate an image == this LSN.
     retain_lsns_below_horizon: Vec<Lsn>,
-    /// Maximum layer LSN processed in this compaction
+    /// Maximum layer LSN processed in this compaction, that is max(end_lsn of layers). Exclusive. All data
+    /// \>= this LSN will be kept and will not be rewritten.
     max_layer_lsn: Lsn,
-    /// Only compact layers overlapping with this range
+    /// Minimum layer LSN processed in this compaction, that is min(start_lsn of layers). Inclusive.
+    /// All access below (strictly lower than `<`) this LSN will be routed through the normal read path instead of
+    /// k-merge within gc-compaction.
+    min_layer_lsn: Lsn,
+    /// Only compact layers overlapping with this range.
     compaction_key_range: Range<Key>,
     /// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap.
     /// This field is here solely for debugging. The field will not be read once the compaction
@@ -120,12 +631,19 @@ impl KeyHistoryRetention {
         if dry_run {
             return true;
         }
-        let guard = tline.layers.read().await;
-        if !guard.contains_key(key) {
-            return false;
+        if LayerMap::is_l0(&key.key_range, key.is_delta) {
+            // gc-compaction should not produce L0 deltas, otherwise it will break the layer order.
+            // We should ignore such layers.
+ return true; + } + let layer_generation; + { + let guard = tline.layers.read().await; + if !guard.contains_key(key) { + return false; + } + layer_generation = guard.get_from_key(key).metadata().generation; } - let layer_generation = guard.get_from_key(key).metadata().generation; - drop(guard); if layer_generation == tline.generation { info!( key=%key, @@ -274,6 +792,22 @@ impl CompactionStatistics { } } +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] +pub enum CompactionOutcome { + #[default] + /// No layers need to be compacted after this round. Compaction doesn't need + /// to be immediately scheduled. + Done, + /// Still has pending layers to be compacted after this round. Ideally, the scheduler + /// should immediately schedule another compaction. + Pending, + /// A timeline needs L0 compaction. Yield and schedule an immediate L0 compaction pass (only + /// guaranteed when `compaction_l0_first` is enabled). + YieldForL0, + /// Compaction was skipped, because the timeline is ineligible for compaction. + Skipped, +} + impl Timeline { /// TODO: cancellation /// @@ -283,7 +817,7 @@ impl Timeline { cancel: &CancellationToken, options: CompactOptions, ctx: &RequestContext, - ) -> Result { + ) -> Result { if options .flags .contains(CompactFlags::EnhancedGcBottomMostCompaction) @@ -291,7 +825,7 @@ impl Timeline { self.compact_with_gc(cancel, options, ctx) .await .map_err(CompactionError::Other)?; - return Ok(false); + return Ok(CompactionOutcome::Done); } if options.flags.contains(CompactFlags::DryRun) { @@ -300,7 +834,7 @@ impl Timeline { ))); } - if options.compact_range.is_some() { + if options.compact_key_range.is_some() || options.compact_lsn_range.is_some() { // maybe useful in the future? could implement this at some point return Err(CompactionError::Other(anyhow!( "compaction range is not supported for legacy compaction for now" @@ -309,7 +843,13 @@ impl Timeline { // High level strategy for compaction / image creation: // - // 1. First, calculate the desired "partitioning" of the + // 1. First, do a L0 compaction to ensure we move the L0 + // layers into the historic layer map get flat levels of + // layers. If we did not compact all L0 layers, we will + // prioritize compacting the timeline again and not do + // any of the compactions below. + // + // 2. Then, calculate the desired "partitioning" of the // currently in-use key space. The goal is to partition the // key space into roughly fixed-size chunks, but also take into // account any existing image layers, and try to align the @@ -323,7 +863,7 @@ impl Timeline { // identify a relation. This is just an optimization, // though. // - // 2. Once we know the partitioning, for each partition, + // 3. Once we know the partitioning, for each partition, // decide if it's time to create a new image layer. The // criteria is: there has been too much "churn" since the last // image layer? The "churn" is fuzzy concept, it's a @@ -331,15 +871,8 @@ impl Timeline { // total in the delta file. Or perhaps: if creating an image // file would allow to delete some older files. // - // 3. After that, we compact all level0 delta files if there - // are too many of them. While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. - // - // TODO: This high level strategy hasn't been implemented yet. - // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. 
Rewrite it. + // 4. In the end, if the tenant gets auto-sharded, we will run + // a shard-ancestor compaction. // Is the timeline being deleted? if self.is_stopping() { @@ -351,8 +884,34 @@ impl Timeline { // Define partitioning schema if needed - // FIXME: the match should only cover repartitioning, not the next steps - let (partition_count, has_pending_tasks) = match self + // 1. L0 Compact + let l0_outcome = { + let timer = self.metrics.compact_time_histo.start_timer(); + let l0_outcome = self + .compact_level0( + target_file_size, + options.flags.contains(CompactFlags::ForceL0Compaction), + ctx, + ) + .await?; + timer.stop_and_record(); + l0_outcome + }; + + if options.flags.contains(CompactFlags::OnlyL0Compaction) { + return Ok(l0_outcome); + } + + // Yield if we have pending L0 compaction. The scheduler will do another pass. + if (l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0) + && !options.flags.contains(CompactFlags::NoYield) + { + info!("image/ancestor compaction yielding for L0 compaction"); + return Ok(CompactionOutcome::YieldForL0); + } + + // 2. Repartition and create image layers if necessary + match self .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), @@ -367,61 +926,79 @@ impl Timeline { .access_stats_behavior(AccessStatsBehavior::Skip) .build(); - // 2. Compact - let timer = self.metrics.compact_time_histo.start_timer(); - let fully_compacted = self - .compact_level0( - target_file_size, - options.flags.contains(CompactFlags::ForceL0Compaction), - ctx, - ) - .await?; - timer.stop_and_record(); - let mut partitioning = dense_partitioning; partitioning .parts .extend(sparse_partitioning.into_dense().parts); - // 3. Create new image layers for partitions that have been modified - // "enough". Skip image layer creation if L0 compaction cannot keep up. - if fully_compacted { - let image_layers = self - .create_image_layers( - &partitioning, - lsn, - if options - .flags - .contains(CompactFlags::ForceImageLayerCreation) - { - ImageLayerCreationMode::Force - } else { - ImageLayerCreationMode::Try - }, - &image_ctx, - ) - .await?; + // 3. Create new image layers for partitions that have been modified "enough". + let (image_layers, outcome) = self + .create_image_layers( + &partitioning, + lsn, + if options + .flags + .contains(CompactFlags::ForceImageLayerCreation) + { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + self.last_image_layer_creation_status + .load() + .as_ref() + .clone(), + !options.flags.contains(CompactFlags::NoYield), + ) + .await + .inspect_err(|err| { + if let CreateImageLayersError::GetVectoredError( + GetVectoredError::MissingKey(_), + ) = err + { + critical!("missing key during compaction: {err:?}"); + } + })?; - self.upload_new_image_layers(image_layers)?; - } else { - info!("skipping image layer generation due to L0 compaction did not include all layers."); + self.last_image_layer_creation_status + .store(Arc::new(outcome.clone())); + + self.upload_new_image_layers(image_layers)?; + if let LastImageLayerCreationStatus::Incomplete { .. } = outcome { + // Yield and do not do any other kind of compaction. + info!( + "skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)." + ); + return Ok(CompactionOutcome::YieldForL0); } - (partitioning.parts.len(), !fully_compacted) - } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. 
Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() && !err.is_cancelled() { - tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); - } - (1, false) } + + // Suppress errors when cancelled. + Err(_) if self.cancel.is_cancelled() => {} + Err(CompactionError::ShuttingDown) => {} + Err(CompactionError::CollectKeySpaceError(CollectKeySpaceError::Cancelled)) => {} + + // Alert on critical errors that indicate data corruption. + Err( + err @ CompactionError::CollectKeySpaceError( + CollectKeySpaceError::Decode(_) + | CollectKeySpaceError::PageRead( + PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), + ), + ), + ) => critical!("could not compact, repartitioning keyspace failed: {err:?}"), + + // Log other errors. No partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline as a simple + // key-value store, ignoring the datadir layout. Log the error but continue. + Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"), }; + let partition_count = self.partitioning.read().0.0.parts.len(); + + // 4. Shard ancestor compaction + if self.shard_identity.count >= ShardCount::new(2) { // Limit the number of layer rewrites to the number of partitions: this means its // runtime should be comparable to a full round of image layer creations, rather than @@ -431,7 +1008,7 @@ impl Timeline { self.compact_shard_ancestors(rewrite_max, ctx).await?; } - Ok(has_pending_tasks) + Ok(CompactionOutcome::Done) } /// Check for layers that are elegible to be rewritten: @@ -456,7 +1033,7 @@ impl Timeline { // // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we // are rewriting layers. - let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn(); tracing::info!( "latest_gc_cutoff: {}, pitr cutoff {}", @@ -628,7 +1205,7 @@ impl Timeline { Ok(()) => (), Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { - return Err(CompactionError::ShuttingDown) + return Err(CompactionError::ShuttingDown); } } @@ -643,7 +1220,7 @@ impl Timeline { /// /// The result may be used as an input to eviction and secondary downloads to de-prioritize layers /// that we know won't be needed for reads. - pub(super) async fn update_layer_visibility( + pub(crate) async fn update_layer_visibility( &self, ) -> Result<(), super::layer_manager::Shutdown> { let head_lsn = self.get_last_record_lsn(); @@ -689,11 +1266,11 @@ impl Timeline { target_file_size: u64, force_compaction_ignore_threshold: bool, ctx: &RequestContext, - ) -> Result { + ) -> Result { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, - fully_compacted, + outcome, } = { let phase1_span = info_span!("compact_level0_phase1"); let ctx = ctx.attached_child(); @@ -722,12 +1299,12 @@ impl Timeline { if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do - return Ok(true); + return Ok(CompactionOutcome::Done); } self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) .await?; - Ok(fully_compacted) + Ok(outcome) } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. 
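For reference, the consumer-side meaning of the new `CompactionOutcome` variants (returned by `compact_legacy`, `compact_level0` and `compact_with_options` in the hunks above) can be summarized in a small self-contained sketch. The enum is re-declared locally and the scheduler reactions are illustrative only; the real driver lives in the tenant background-task code, which is not part of this patch.

// Self-contained Rust sketch (not part of the patch).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CompactionOutcome {
    Done,
    Pending,
    YieldForL0,
    Skipped,
}

// What a compaction scheduler would typically do with each outcome.
fn next_action(outcome: CompactionOutcome) -> &'static str {
    match outcome {
        // Nothing left to do, or the timeline was ineligible: wait for the
        // next periodic compaction tick.
        CompactionOutcome::Done | CompactionOutcome::Skipped => "sleep until the next tick",
        // Work remains (e.g. only part of the L0 stack fit into this pass):
        // schedule another pass right away.
        CompactionOutcome::Pending => "schedule another pass immediately",
        // L0 compaction is urgent; with `compaction_l0_first` enabled, run an
        // L0-only pass before resuming image/ancestor compaction.
        CompactionOutcome::YieldForL0 => "run an L0-only pass first",
    }
}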
@@ -798,16 +1375,15 @@ impl Timeline { // Accumulate the size of layers in `deltas_to_compact` let mut deltas_to_compact_bytes = 0; - // Under normal circumstances, we will accumulate up to compaction_interval L0s of size + // Under normal circumstances, we will accumulate up to compaction_upper_limit L0s of size // checkpoint_distance each. To avoid edge cases using extra system resources, bound our // work in this function to only operate on this much delta data at once. // - // Take the max of the configured value & the default, so that tests that configure tiny values - // can still use a sensible amount of memory, but if a deployed system configures bigger values we - // still let them compact a full stack of L0s in one go. + // In general, compaction_threshold should be <= compaction_upper_limit, but in case that + // the constraint is not respected, we use the larger of the two. let delta_size_limit = std::cmp::max( + self.get_compaction_upper_limit(), self.get_compaction_threshold(), - DEFAULT_COMPACTION_THRESHOLD, ) as u64 * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); @@ -924,7 +1500,7 @@ impl Timeline { let last_record_lsn = self.get_last_record_lsn(); let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; let min_hole_coverage_size = 3; // TODO: something more flexible? - // min-heap (reserve space for one more element added before eviction) + // min-heap (reserve space for one more element added before eviction) let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); let mut prev: Option = None; @@ -1068,7 +1644,7 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + let same_key = prev_key == Some(key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { let mut next_key_size = 0u64; @@ -1184,11 +1760,9 @@ impl Timeline { .await .map_err(CompactionError::Other)?; } else { - let shard = self.shard_identity.shard_index(); let owner = self.shard_identity.get_shard_number(&key); - if cfg!(debug_assertions) { - panic!("key {key} does not belong on shard {shard}, owned by {owner}"); - } + + // This happens after a shard split, when we're compacting an L0 created by our parent shard debug!("dropping key {key} during compaction (it belongs on shard {owner})"); } @@ -1273,7 +1847,11 @@ impl Timeline { .into_iter() .map(|x| x.drop_eviction_guard()) .collect::>(), - fully_compacted, + outcome: if fully_compacted { + CompactionOutcome::Done + } else { + CompactionOutcome::Pending + }, }) } } @@ -1284,7 +1862,7 @@ struct CompactLevel0Phase1Result { deltas_to_compact: Vec, // Whether we have included all L0 layers, or selected only part of them due to the // L0 compaction size limit. - fully_compacted: bool, + outcome: CompactionOutcome, } #[derive(Default)] @@ -1463,7 +2041,10 @@ impl Timeline { base_img_from_ancestor: Option<(Key, Lsn, Bytes)>, ) -> anyhow::Result { // Pre-checks for the invariants - if cfg!(debug_assertions) { + + let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing"); + + if debug_mode { for (log_key, _, _) in full_history { assert_eq!(log_key, &key, "mismatched key"); } @@ -1609,15 +2190,19 @@ impl Timeline { output } + let mut key_exists = false; for (i, split_for_lsn) in split_history.into_iter().enumerate() { // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly. 
records_since_last_image += split_for_lsn.len(); - let generate_image = if i == 0 && !has_ancestor { + // Whether to produce an image into the final layer files + let produce_image = if i == 0 && !has_ancestor { // We always generate images for the first batch (below horizon / lowest retain_lsn) true } else if i == batch_cnt - 1 { // Do not generate images for the last batch (above horizon) false + } else if records_since_last_image == 0 { + false } else if records_since_last_image >= delta_threshold_cnt { // Generate images when there are too many records true @@ -1632,29 +2217,45 @@ impl Timeline { break; } } - if let Some((_, _, val)) = replay_history.first() { - if !val.will_init() { - return Err(anyhow::anyhow!("invalid history, no base image")).with_context( - || { - generate_debug_trace( - Some(&replay_history), - full_history, - retain_lsn_below_horizon, - horizon, - ) - }, - ); - } + if replay_history.is_empty() && !key_exists { + // The key does not exist at earlier LSN, we can skip this iteration. + retention.push(Vec::new()); + continue; + } else { + key_exists = true; } - if generate_image && records_since_last_image > 0 { + let Some((_, _, val)) = replay_history.first() else { + unreachable!("replay history should not be empty once it exists") + }; + if !val.will_init() { + return Err(anyhow::anyhow!("invalid history, no base image")).with_context(|| { + generate_debug_trace( + Some(&replay_history), + full_history, + retain_lsn_below_horizon, + horizon, + ) + }); + } + // Whether to reconstruct the image. In debug mode, we will generate an image + // at every retain_lsn to ensure data is not corrupted, but we won't put the + // image into the final layer. + let generate_image = produce_image || debug_mode; + if produce_image { records_since_last_image = 0; - let replay_history_for_debug = if cfg!(debug_assertions) { + } + let img_and_lsn = if generate_image { + let replay_history_for_debug = if debug_mode { Some(replay_history.clone()) } else { None }; let replay_history_for_debug_ref = replay_history_for_debug.as_deref(); - let history = std::mem::take(&mut replay_history); + let history = if produce_image { + std::mem::take(&mut replay_history) + } else { + replay_history.clone() + }; let mut img = None; let mut records = Vec::with_capacity(history.len()); if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() { @@ -1691,8 +2292,20 @@ impl Timeline { } records.reverse(); let state = ValueReconstructState { img, records }; - let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range + // last batch does not generate image so i is always in range, unless we force generate + // an image during testing + let request_lsn = if i >= lsn_split_points.len() { + Lsn::MAX + } else { + lsn_split_points[i] + }; let img = self.reconstruct_value(key, request_lsn, state).await?; + Some((request_lsn, img)) + } else { + None + }; + if produce_image { + let (request_lsn, img) = img_and_lsn.unwrap(); replay_history.push((key, request_lsn, Value::Image(img.clone()))); retention.push(vec![(request_lsn, Value::Image(img))]); } else { @@ -1751,38 +2364,67 @@ impl Timeline { let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space { - return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, 
remote_layer_size={}, required_space={}",
-                available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size));
+            return Err(anyhow!(
+                "not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}",
+                available_space,
+                allocated_space,
+                all_layer_size,
+                remote_layer_size,
+                all_layer_size + remote_layer_size
+            ));
         }
         Ok(())
     }
 
-    /// Split a gc-compaction job into multiple compaction jobs. Optimally, this function should return a vector of
-    /// `GcCompactionJobDesc`. But we want to keep it simple on the tenant scheduling side without exposing too much
-    /// ad-hoc information about gc compaction itself.
+    /// Get a watermark for gc-compaction, that is the lowest LSN that we can use as the `gc_horizon` for
+    /// the compaction algorithm. It is min(space_cutoff, time_cutoff, latest_gc_cutoff, standby_horizon).
+    /// Leases and retain_lsns are considered in the gc-compaction job itself so we don't need to account for them
+    /// here.
+    pub(crate) fn get_gc_compaction_watermark(self: &Arc<Self>) -> Lsn {
+        let gc_cutoff_lsn = {
+            let gc_info = self.gc_info.read().unwrap();
+            gc_info.min_cutoff()
+        };
+
+        // TODO: standby horizon should use leases so we don't really need to consider it here.
+        // let watermark = watermark.min(self.standby_horizon.load());
+
+        // TODO: ensure the child branches will not use anything below the watermark, or consider
+        // them when computing the watermark.
+        gc_cutoff_lsn.min(*self.get_applied_gc_cutoff_lsn())
+    }
+
+    /// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job.
+    /// The function returns a list of compaction jobs that can be executed separately. If the upper bound of the compact LSN
+    /// range is not specified, we will use the latest gc_cutoff as the upper bound, so that all jobs in the jobset act
+    /// like a full compaction of the specified keyspace.
     pub(crate) async fn gc_compaction_split_jobs(
         self: &Arc<Self>,
-        options: CompactOptions,
-    ) -> anyhow::Result<Vec<CompactOptions>> {
-        if !options.sub_compaction {
-            return Ok(vec![options]);
-        }
-        let compact_range = options.compact_range.clone().unwrap_or(CompactRange {
-            start: Key::MIN,
-            end: Key::MAX,
-        });
-        let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn {
-            compact_below_lsn
+        job: GcCompactJob,
+        sub_compaction_max_job_size_mb: Option<u64>,
+    ) -> anyhow::Result<Vec<GcCompactJob>> {
+        let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
+            job.compact_lsn_range.end
         } else {
-            *self.get_latest_gc_cutoff_lsn() // use the real gc cutoff
+            self.get_gc_compaction_watermark()
         };
-        let mut compact_jobs = Vec::new();
+
+        if compact_below_lsn == Lsn::INVALID {
+            tracing::warn!(
+                "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"
+            );
+            return Ok(vec![]);
+        }
+
+        // Split the compaction job into sub-jobs of about 4GB each
+        const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024;
+        let sub_compaction_max_job_size_mb =
+            sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB);
+
+        let mut compact_jobs = Vec::<GcCompactJob>::new();
         // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
         // by estimating the amount of files read for a compaction job. We should also partition on LSN.
- let Ok(partition) = self.partitioning.try_lock() else { - bail!("failed to acquire partition lock"); - }; - let ((dense_ks, sparse_ks), _) = &*partition; + let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone(); // Truncate the key range to be within user specified compaction range. fn truncate_to( source_start: &Key, @@ -1811,19 +2453,20 @@ impl Timeline { let Some((start, end)) = truncate_to( &range.start, &range.end, - &compact_range.start, - &compact_range.end, + &job.compact_key_range.start, + &job.compact_key_range.end, ) else { continue; }; split_key_ranges.push((start, end)); } split_key_ranges.sort(); - let guard = self.layers.read().await; - let layer_map = guard.layer_map()?; + let all_layers = { + let guard = self.layers.read().await; + let layer_map = guard.layer_map()?; + layer_map.iter_historic_layers().collect_vec() + }; let mut current_start = None; - // Split compaction job to about 2GB each - const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; // 4GB, TODO: should be configuration in the future let ranges_num = split_key_ranges.len(); for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() { if current_start.is_none() { @@ -1834,15 +2477,23 @@ impl Timeline { // We have already processed this partition. continue; } - let res = layer_map.range_search(start..end, compact_below_lsn); - let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::(); - if total_size > GC_COMPACT_MAX_SIZE_MB * 1024 * 1024 || ranges_num == idx + 1 { - let mut compact_options = options.clone(); + let overlapping_layers = { + let mut desc = Vec::new(); + for layer in all_layers.iter() { + if overlaps_with(&layer.get_key_range(), &(start..end)) + && layer.get_lsn_range().start <= compact_below_lsn + { + desc.push(layer.clone()); + } + } + desc + }; + let total_size = overlapping_layers.iter().map(|x| x.file_size).sum::(); + if total_size > sub_compaction_max_job_size_mb * 1024 * 1024 || ranges_num == idx + 1 { // Try to extend the compaction range so that we include at least one full layer file. - let extended_end = res - .found - .keys() - .map(|layer| layer.layer.key_range.end) + let extended_end = overlapping_layers + .iter() + .map(|layer| layer.key_range.end) .min(); // It is possible that the search range does not contain any layer files when we reach the end of the loop. // In this case, we simply use the specified key range end. 
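The hunk above keeps only the head of the `truncate_to` helper as context, but its role follows from the call site: clamp each keyspace partition to the caller's `compact_key_range`, and skip partitions that do not overlap it. A self-contained sketch of that intersection logic is shown below, generic over `Ord` instead of operating on `Key`; the real helper's body is elided by the diff, so the exact implementation here is an assumption.

// Self-contained Rust sketch (not part of the patch): range intersection as
// used when splitting a gc-compaction job by key partition.
fn truncate_to<T: Ord + Copy>(
    source_start: T,
    source_end: T,
    target_start: T,
    target_end: T,
) -> Option<(T, T)> {
    let start = source_start.max(target_start);
    let end = source_end.min(target_end);
    if start >= end {
        // No overlap between the partition and the requested compaction
        // range: the caller skips this partition (`else { continue; }`).
        None
    } else {
        Some((start, end))
    }
}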
@@ -1851,18 +2502,33 @@ impl Timeline { } else { end }; - info!( - "splitting compaction job: {}..{}, estimated_size={}", - start, end, total_size - ); - compact_options.compact_range = Some(CompactRange { start, end }); - compact_options.compact_below_lsn = Some(compact_below_lsn); - compact_options.sub_compaction = false; - compact_jobs.push(compact_options); - current_start = Some(end); + let end = if ranges_num == idx + 1 { + // extend the compaction range to the end of the key range if it's the last partition + end.max(job.compact_key_range.end) + } else { + end + }; + if total_size == 0 && !compact_jobs.is_empty() { + info!( + "splitting compaction job: {}..{}, estimated_size={}, extending the previous job", + start, end, total_size + ); + compact_jobs.last_mut().unwrap().compact_key_range.end = end; + current_start = Some(end); + } else { + info!( + "splitting compaction job: {}..{}, estimated_size={}", + start, end, total_size + ); + compact_jobs.push(GcCompactJob { + dry_run: job.dry_run, + compact_key_range: start..end, + compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn, + }); + current_start = Some(end); + } } } - drop(guard); Ok(compact_jobs) } @@ -1880,7 +2546,7 @@ impl Timeline { /// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not /// part of the range. /// - /// If `options.compact_below_lsn` is provided, the compaction will only compact layers below or intersect with + /// If `options.compact_lsn_range.end` is provided, the compaction will only compact layers below or intersect with /// the LSN. Otherwise, it will use the gc cutoff by default. pub(crate) async fn compact_with_gc( self: &Arc, @@ -1888,9 +2554,15 @@ impl Timeline { options: CompactOptions, ctx: &RequestContext, ) -> anyhow::Result<()> { - if options.sub_compaction { - info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); - let jobs = self.gc_compaction_split_jobs(options).await?; + let sub_compaction = options.sub_compaction; + let job = GcCompactJob::from_compact_options(options.clone()); + if sub_compaction { + info!( + "running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs" + ); + let jobs = self + .gc_compaction_split_jobs(job, options.sub_compaction_max_job_size_mb) + .await?; let jobs_len = jobs.len(); for (idx, job) in jobs.into_iter().enumerate() { info!( @@ -1905,19 +2577,15 @@ impl Timeline { } return Ok(()); } - self.compact_with_gc_inner(cancel, options, ctx).await + self.compact_with_gc_inner(cancel, job, ctx).await } async fn compact_with_gc_inner( self: &Arc, cancel: &CancellationToken, - options: CompactOptions, + job: GcCompactJob, ctx: &RequestContext, ) -> anyhow::Result<()> { - assert!( - !options.sub_compaction, - "sub-compaction should be handled by the outer function" - ); // Block other compaction/GC tasks from running for now. GC-compaction could run along // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. 
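Since `compact_with_gc` performs the sub-compaction split itself, a caller only has to build a `CompactOptions`. A hedged sketch of such a call follows; the wrapper function and its name are hypothetical, it assumes the pageserver crate's own types (`Timeline`, `CompactOptions`, `CompactFlags`, `EnumSet`) are in scope, and the field set mirrors the construction used in `handle_sub_compaction` earlier in this patch.

// Illustrative Rust sketch (not part of the patch).
async fn request_full_gc_compaction(
    timeline: &Arc<Timeline>,
    cancel: &CancellationToken,
    ctx: &RequestContext,
) -> anyhow::Result<()> {
    // Same flag handling as `handle_sub_compaction` above.
    let mut flags: EnumSet<CompactFlags> = EnumSet::default();
    flags |= CompactFlags::EnhancedGcBottomMostCompaction;
    let options = CompactOptions {
        flags,
        // Split the work into sub-jobs; `None` falls back to the ~4GB default
        // (`GC_COMPACT_MAX_SIZE_MB`) in `gc_compaction_split_jobs`.
        sub_compaction: true,
        sub_compaction_max_job_size_mb: None,
        // `None` selects the full key range and "up to the gc cutoff" LSN
        // range, per `GcCompactJob::from_compact_options`.
        compact_key_range: None,
        compact_lsn_range: None,
    };
    timeline.compact_with_gc(cancel, options, ctx).await
}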
@@ -1937,19 +2605,19 @@ impl Timeline { ) .await?; - let flags = options.flags; - let compaction_key_range = options - .compact_range - .map(|range| range.start..range.end) - .unwrap_or_else(|| Key::MIN..Key::MAX); + let dry_run = job.dry_run; + let compact_key_range = job.compact_key_range; + let compact_lsn_range = job.compact_lsn_range; - let dry_run = flags.contains(CompactFlags::DryRun); + let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing"); - if compaction_key_range == (Key::MIN..Key::MAX) { - info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end); - } else { - info!("running enhanced gc bottom-most compaction, dry_run={dry_run}"); - } + info!( + "running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", + compact_key_range.start, + compact_key_range.end, + compact_lsn_range.start, + compact_lsn_range.end + ); scopeguard::defer! { info!("done enhanced gc bottom-most compaction"); @@ -1971,13 +2639,27 @@ impl Timeline { // Therefore, it can only clean up data that cannot be cleaned up with legacy gc, instead of // cleaning everything that theoritically it could. In the future, it should use `self.gc_info` // to get the truth data. - let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); + let real_gc_cutoff = self.get_gc_compaction_watermark(); // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for - // each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use + // each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use // the real cutoff. - let mut gc_cutoff = options.compact_below_lsn.unwrap_or(real_gc_cutoff); + let mut gc_cutoff = if compact_lsn_range.end == Lsn::MAX { + if real_gc_cutoff == Lsn::INVALID { + // If the gc_cutoff is not generated yet, we should not compact anything. + tracing::warn!( + "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction" + ); + return Ok(()); + } + real_gc_cutoff + } else { + compact_lsn_range.end + }; if gc_cutoff > real_gc_cutoff { - warn!("provided compact_below_lsn={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff); + warn!( + "provided compact_lsn_range.end={} is larger than the real_gc_cutoff={}, using the real gc cutoff", + gc_cutoff, real_gc_cutoff + ); gc_cutoff = real_gc_cutoff; } gc_cutoff @@ -1994,14 +2676,38 @@ impl Timeline { } let mut selected_layers: Vec = Vec::new(); drop(gc_info); - // Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers. + // Firstly, pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers. let Some(max_layer_lsn) = layers .iter_historic_layers() .filter(|desc| desc.get_lsn_range().start <= gc_cutoff) .map(|desc| desc.get_lsn_range().end) .max() else { - info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff); + info!( + "no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", + gc_cutoff + ); + return Ok(()); + }; + // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. 
All the layers (strictly) below + // the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if + // it is a branch. + let Some(min_layer_lsn) = layers + .iter_historic_layers() + .filter(|desc| { + if compact_lsn_range.start == Lsn::INVALID { + true // select all layers below if start == Lsn(0) + } else { + desc.get_lsn_range().end > compact_lsn_range.start // strictly larger than compact_above_lsn + } + }) + .map(|desc| desc.get_lsn_range().start) + .min() + else { + info!( + "no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}", + compact_lsn_range.end + ); return Ok(()); }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key @@ -2009,22 +2715,25 @@ impl Timeline { let mut rewrite_layers = Vec::new(); for desc in layers.iter_historic_layers() { if desc.get_lsn_range().end <= max_layer_lsn - && overlaps_with(&desc.get_key_range(), &compaction_key_range) + && desc.get_lsn_range().start >= min_layer_lsn + && overlaps_with(&desc.get_key_range(), &compact_key_range) { // If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range, // even if it might contain extra keys selected_layers.push(guard.get_from_desc(&desc)); // If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine // to overlap image layers) - if desc.is_delta() - && !fully_contains(&compaction_key_range, &desc.get_key_range()) + if desc.is_delta() && !fully_contains(&compact_key_range, &desc.get_key_range()) { rewrite_layers.push(desc); } } } if selected_layers.is_empty() { - info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compaction_key_range.start, compaction_key_range.end); + info!( + "no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", + gc_cutoff, compact_key_range.start, compact_key_range.end + ); return Ok(()); } retain_lsns_below_horizon.sort(); @@ -2032,20 +2741,27 @@ impl Timeline { selected_layers, gc_cutoff, retain_lsns_below_horizon, + min_layer_lsn, max_layer_lsn, - compaction_key_range, + compaction_key_range: compact_key_range, rewrite_layers, } }; - let lowest_retain_lsn = if self.ancestor_timeline.is_some() { - Lsn(self.ancestor_lsn.0 + 1) + let (has_data_below, lowest_retain_lsn) = if compact_lsn_range.start != Lsn::INVALID { + // If we only compact above some LSN, we should get the history from the current branch below the specified LSN. + // We use job_desc.min_layer_lsn as if it's the lowest branch point. + (true, job_desc.min_layer_lsn) + } else if self.ancestor_timeline.is_some() { + // In theory, we can also use min_layer_lsn here, but using ancestor LSN makes sure the delta layers cover the + // LSN ranges all the way to the ancestor timeline. 
+ (true, self.ancestor_lsn) } else { let res = job_desc .retain_lsns_below_horizon .first() .copied() .unwrap_or(job_desc.gc_cutoff); - if cfg!(debug_assertions) { + if debug_mode { assert_eq!( res, job_desc @@ -2056,17 +2772,19 @@ impl Timeline { .unwrap_or(job_desc.gc_cutoff) ); } - res + (false, res) }; info!( - "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}", + "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}", job_desc.selected_layers.len(), job_desc.rewrite_layers.len(), job_desc.max_layer_lsn, + job_desc.min_layer_lsn, job_desc.gc_cutoff, lowest_retain_lsn, job_desc.compaction_key_range.start, - job_desc.compaction_key_range.end + job_desc.compaction_key_range.end, + has_data_below, ); for layer in &job_desc.selected_layers { @@ -2091,15 +2809,17 @@ impl Timeline { // Step 1: construct a k-merge iterator over all layers. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. - // disable the check for now because we need to adjust the check for partial compactions, will enable later. - // let layer_names = job_desc - // .selected_layers - // .iter() - // .map(|layer| layer.layer_desc().layer_name()) - // .collect_vec(); - // if let Some(err) = check_valid_layermap(&layer_names) { - // warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err); - // } + let layer_names = job_desc + .selected_layers + .iter() + .map(|layer| layer.layer_desc().layer_name()) + .collect_vec(); + if let Some(err) = check_valid_layermap(&layer_names) { + bail!( + "gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", + err + ); + } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc .selected_layers @@ -2110,10 +2830,22 @@ impl Timeline { let mut delta_layers = Vec::new(); let mut image_layers = Vec::new(); let mut downloaded_layers = Vec::new(); + let mut total_downloaded_size = 0; + let mut total_layer_size = 0; for layer in &job_desc.selected_layers { + if layer.needs_download().await?.is_some() { + total_downloaded_size += layer.layer_desc().file_size; + } + total_layer_size += layer.layer_desc().file_size; let resident_layer = layer.download_and_keep_resident().await?; downloaded_layers.push(resident_layer); } + info!( + "finish downloading layers, downloaded={}, total={}, ratio={:.2}", + total_downloaded_size, + total_layer_size, + total_downloaded_size as f64 / total_layer_size as f64 + ); for resident_layer in &downloaded_layers { if resident_layer.layer_desc().is_delta() { let layer = resident_layer.get_as_delta(ctx).await?; @@ -2136,7 +2868,7 @@ impl Timeline { // Only create image layers when there is no ancestor branches. TODO: create covering image layer // when some condition meet. - let mut image_layer_writer = if self.ancestor_timeline.is_none() { + let mut image_layer_writer = if !has_data_below { Some( SplitImageLayerWriter::new( self.conf, @@ -2170,7 +2902,11 @@ impl Timeline { } let mut delta_layer_rewriters = HashMap::, RewritingLayers>::new(); - /// Returns None if there is no ancestor branch. Throw an error when the key is not found. + /// When compacting not at a bottom range (=`[0,X)`) of the root branch, we "have data below" (`has_data_below=true`). 
+ /// The two cases are compaction in ancestor branches and when `compact_lsn_range.start` is set. + /// In those cases, we need to pull up data from below the LSN range we're compaction. + /// + /// This function unifies the cases so that later code doesn't have to think about it. /// /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image /// is needed for reconstruction. This should be fixed in the future. @@ -2178,17 +2914,19 @@ impl Timeline { /// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor /// images. async fn get_ancestor_image( - tline: &Arc, + this_tline: &Arc, key: Key, ctx: &RequestContext, + has_data_below: bool, + history_lsn_point: Lsn, ) -> anyhow::Result> { - if tline.ancestor_timeline.is_none() { + if !has_data_below { return Ok(None); }; // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing // as much existing code as possible. - let img = tline.get(key, tline.ancestor_lsn, ctx).await?; - Ok(Some((key, tline.ancestor_lsn, img))) + let img = this_tline.get(key, history_lsn_point, ctx).await?; + Ok(Some((key, history_lsn_point, img))) } // Actually, we can decide not to write to the image layer at all at this point because @@ -2274,7 +3012,8 @@ impl Timeline { job_desc.gc_cutoff, &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, *last_key, ctx).await?, + get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn) + .await?, ) .await?; retention @@ -2304,7 +3043,7 @@ impl Timeline { job_desc.gc_cutoff, &job_desc.retain_lsns_below_horizon, COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, last_key, ctx).await?, + get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?, ) .await?; retention @@ -2463,19 +3202,113 @@ impl Timeline { "produced {} delta layers and {} image layers, {} layers are kept", produced_delta_layers_len, produced_image_layers_len, - layer_selection.len() + keep_layers.len() ); // Step 3: Place back to the layer map. + + // First, do a sanity check to ensure the newly-created layer map does not contain overlaps. + let all_layers = { + let guard = self.layers.read().await; + let layer_map = guard.layer_map()?; + layer_map.iter_historic_layers().collect_vec() + }; + + let mut final_layers = all_layers + .iter() + .map(|layer| layer.layer_name()) + .collect::>(); + for layer in &layer_selection { + final_layers.remove(&layer.layer_desc().layer_name()); + } + for layer in &compact_to { + final_layers.insert(layer.layer_desc().layer_name()); + } + let final_layers = final_layers.into_iter().collect_vec(); + + // TODO: move this check before we call `finish` on image layer writers. However, this will require us to get the layer name before we finish + // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are + // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails. + if let Some(err) = check_valid_layermap(&final_layers) { + bail!( + "gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", + err + ); + } + + // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only + // operate on L1 layers. 
{ - // TODO: sanity check if the layer map is valid (i.e., should not have overlaps) + // Gc-compaction will rewrite the history of a key. This could happen in two ways: + // + // 1. We create an image layer to replace all the deltas below the compact LSN. In this case, assume + // we have 2 delta layers A and B, both below the compact LSN. We create an image layer I to replace + // A and B at the compact LSN. If the read path finishes reading A, yields, and now we update the layer + // map, the read path then cannot find any keys below A, reporting a missing key error, while the key + // now gets stored in I at the compact LSN. + // + // --------------- --------------- + // delta1@LSN20 image1@LSN20 + // --------------- (read path collects delta@LSN20, => --------------- (read path cannot find anything + // delta1@LSN10 yields) below LSN 20) + // --------------- + // + // 2. We create a delta layer to replace all the deltas below the compact LSN, and in the delta layers, + // we combines the history of a key into a single image. For example, we have deltas at LSN 1, 2, 3, 4, + // Assume one delta layer contains LSN 1, 2, 3 and the other contains LSN 4. + // + // We let gc-compaction combine delta 2, 3, 4 into an image at LSN 4, which produces a delta layer that + // contains the delta at LSN 1, the image at LSN 4. If the read path finishes reading the original delta + // layer containing 4, yields, and we update the layer map to put the delta layer. + // + // --------------- --------------- + // delta1@LSN4 image1@LSN4 + // --------------- (read path collects delta@LSN4, => --------------- (read path collects LSN4 and LSN1, + // delta1@LSN1-3 yields) delta1@LSN1 which is an invalid history) + // --------------- --------------- + // + // Therefore, the gc-compaction layer update operation should wait for all ongoing reads, block all pending reads, + // and only allow reads to continue after the update is finished. + + let update_guard = self.gc_compaction_layer_update_lock.write().await; + // Acquiring the update guard ensures current read operations end and new read operations are blocked. + // TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect? let mut guard = self.layers.write().await; guard .open_mut()? - .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) + .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics); + drop(update_guard); // Allow new reads to start ONLY after we finished updating the layer map. + }; + + // Schedule an index-only upload to update the `latest_gc_cutoff` in the index_part.json. + // Otherwise, after restart, the index_part only contains the old `latest_gc_cutoff` and + // find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should + // be batched into `schedule_compaction_update`. + let disk_consistent_lsn = self.disk_consistent_lsn.load(); + self.schedule_uploads(disk_consistent_lsn, None)?; + // If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead + // of `compact_from`. 
+ let compact_from = { + let mut compact_from = Vec::new(); + let mut compact_to_set = HashMap::new(); + for layer in &compact_to { + compact_to_set.insert(layer.layer_desc().key(), layer); + } + for layer in &layer_selection { + if let Some(to) = compact_to_set.get(&layer.layer_desc().key()) { + tracing::info!( + "skipping delete {} because found same layer key at different generation {}", + layer, + to + ); + } else { + compact_from.push(layer.clone()); + } + } + compact_from }; self.remote_client - .schedule_compaction_update(&layer_selection, &compact_to)?; + .schedule_compaction_update(&compact_from, &compact_to)?; drop(gc_lock); @@ -2724,11 +3557,7 @@ impl TimelineAdaptor { ranges: self.get_keyspace(key_range, lsn, ctx).await?, }; // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly - let start = Key::MIN; - let ImageLayerCreationOutcome { - image, - next_start_key: _, - } = self + let outcome = self .timeline .create_image_layer_for_rel_blocks( &keyspace, @@ -2736,11 +3565,17 @@ impl TimelineAdaptor { lsn, ctx, key_range.clone(), - start, + IoConcurrency::sequential(), ) .await?; - if let Some(image_layer) = image { + if let ImageLayerCreationOutcome::Generated { + unfinished_image_layer, + } = outcome + { + let (desc, path) = unfinished_image_layer.finish(ctx).await?; + let image_layer = + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_images.push(image_layer); } @@ -2827,7 +3662,7 @@ impl CompactionLayer for ResidentDeltaLayer { impl CompactionDeltaLayer for ResidentDeltaLayer { type DeltaEntry<'a> = DeltaEntry<'a>; - async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result>> { + async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result>> { self.0.get_as_delta(ctx).await?.index_entries(ctx).await } } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 47a93b19d2..7cdc69e55f 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -1,28 +1,26 @@ -use std::{ - ops::{Deref, DerefMut}, - sync::Arc, -}; +use std::ops::{Deref, DerefMut}; +use std::sync::Arc; use anyhow::Context; -use pageserver_api::{models::TimelineState, shard::TenantShardId}; +use pageserver_api::models::TimelineState; +use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use tokio::sync::OwnedMutexGuard; -use tracing::{error, info, info_span, instrument, Instrument}; -use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; +use tracing::{Instrument, error, info, info_span, instrument}; +use utils::id::TimelineId; +use utils::{crashsafe, fs_ext, pausable_failpoint}; -use crate::{ - config::PageServerConf, - task_mgr::{self, TaskKind}, - tenant::{ - metadata::TimelineMetadata, - remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, - CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, - TenantManifestError, TimelineOrOffloaded, - }, - virtual_file::MaybeFatalIo, +use crate::config::PageServerConf; +use crate::task_mgr::{self, TaskKind}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::{ + PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, }; - -use super::{Timeline, TimelineResources}; +use crate::tenant::{ + CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, TenantManifestError, + Timeline, TimelineOrOffloaded, +}; +use crate::virtual_file::MaybeFatalIo; /// Mark 
timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. @@ -112,7 +110,7 @@ pub(super) async fn delete_local_timeline_directory( } /// It is important that this gets called when DeletionGuard is being held. -/// For more context see comments in [`DeleteTimelineFlow::prepare`] +/// For more context see comments in [`make_timeline_delete_guard`] async fn remove_maybe_offloaded_timeline_from_tenant( tenant: &Tenant, timeline: &TimelineOrOffloaded, @@ -139,6 +137,11 @@ async fn remove_maybe_offloaded_timeline_from_tenant( timelines.remove(&timeline.timeline_id).expect( "timeline that we were deleting was concurrently removed from 'timelines' map", ); + tenant + .scheduled_compaction_tasks + .lock() + .unwrap() + .remove(&timeline.timeline_id); } TimelineOrOffloaded::Offloaded(timeline) => { let offloaded_timeline = timelines_offloaded @@ -193,8 +196,8 @@ impl DeleteTimelineFlow { ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); - let allow_offloaded_children = false; - let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?; + let (timeline, mut guard) = + make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Delete)?; guard.mark_in_progress()?; @@ -296,15 +299,13 @@ impl DeleteTimelineFlow { timeline_id, local_metadata, None, // Ancestor is not needed for deletion. - TimelineResources { - remote_client, - pagestream_throttle: tenant.pagestream_throttle.clone(), - l0_flush_global_state: tenant.l0_flush_global_state.clone(), - }, + None, // Previous heatmap is not needed for deletion + tenant.get_timeline_resources_for(remote_client), // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. CreateTimelineCause::Delete, crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here + None, // doesn't matter what we put here ) .context("create_timeline_struct")?; @@ -330,72 +331,6 @@ impl DeleteTimelineFlow { Ok(()) } - pub(super) fn prepare( - tenant: &Tenant, - timeline_id: TimelineId, - allow_offloaded_children: bool, - ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { - // Note the interaction between this guard and deletion guard. - // Here we attempt to lock deletion guard when we're holding a lock on timelines. - // This is important because when you take into account `remove_timeline_from_tenant` - // we remove timeline from memory when we still hold the deletion guard. 
- // So here when timeline deletion is finished timeline wont be present in timelines map at all - // which makes the following sequence impossible: - // T1: get preempted right before the try_lock on `Timeline::delete_progress` - // T2: do a full deletion, acquire and drop `Timeline::delete_progress` - // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` - // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` - let timelines = tenant.timelines.lock().unwrap(); - let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); - - let timeline = match timelines.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), - None => match timelines_offloaded.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), - None => return Err(DeleteTimelineError::NotFound), - }, - }; - - // Ensure that there are no child timelines, because we are about to remove files, - // which will break child branches - let mut children = Vec::new(); - if !allow_offloaded_children { - children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { - (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) - })); - } - children.extend(timelines.iter().filter_map(|(id, entry)| { - (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) - })); - - if !children.is_empty() { - return Err(DeleteTimelineError::HasChildren(children)); - } - - // Note that using try_lock here is important to avoid a deadlock. - // Here we take lock on timelines and then the deletion guard. - // At the end of the operation we're holding the guard and need to lock timelines map - // to remove the timeline from it. - // Always if you have two locks that are taken in different order this can result in a deadlock. - - let delete_progress = Arc::clone(timeline.delete_progress()); - let delete_lock_guard = match delete_progress.try_lock_owned() { - Ok(guard) => DeletionGuard(guard), - Err(_) => { - // Unfortunately if lock fails arc is consumed. - return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( - timeline.delete_progress(), - ))); - } - }; - - if let TimelineOrOffloaded::Timeline(timeline) = &timeline { - timeline.set_state(TimelineState::Stopping); - } - - Ok((timeline, delete_lock_guard)) - } - fn schedule_background( guard: DeletionGuard, conf: &'static PageServerConf, @@ -406,6 +341,13 @@ impl DeleteTimelineFlow { let tenant_shard_id = timeline.tenant_shard_id(); let timeline_id = timeline.timeline_id(); + // Take a tenant gate guard, because timeline deletion needs access to the tenant to update its manifest. + let Ok(tenant_guard) = tenant.gate.enter() else { + // It is safe to simply skip here, because we only schedule background work once the timeline is durably marked for deletion. + info!("Tenant is shutting down, timeline deletion will be resumed when it next starts"); + return; + }; + task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, @@ -413,6 +355,8 @@ impl DeleteTimelineFlow { Some(timeline_id), "timeline_delete", async move { + let _guard = tenant_guard; + if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await { // Only log as an error if it's not a cancellation. 
if matches!(err, DeleteTimelineError::Cancelled) { @@ -477,6 +421,80 @@ impl DeleteTimelineFlow { } } +#[derive(Copy, Clone, PartialEq, Eq)] +pub(super) enum TimelineDeleteGuardKind { + Offload, + Delete, +} + +pub(super) fn make_timeline_delete_guard( + tenant: &Tenant, + timeline_id: TimelineId, + guard_kind: TimelineDeleteGuardKind, +) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { + // Note the interaction between this guard and deletion guard. + // Here we attempt to lock deletion guard when we're holding a lock on timelines. + // This is important because when you take into account `remove_timeline_from_tenant` + // we remove timeline from memory when we still hold the deletion guard. + // So here when timeline deletion is finished timeline wont be present in timelines map at all + // which makes the following sequence impossible: + // T1: get preempted right before the try_lock on `Timeline::delete_progress` + // T2: do a full deletion, acquire and drop `Timeline::delete_progress` + // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` + // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` + let timelines = tenant.timelines.lock().unwrap(); + let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); + + let timeline = match timelines.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), + None => match timelines_offloaded.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), + None => return Err(DeleteTimelineError::NotFound), + }, + }; + + // Ensure that there are no child timelines, because we are about to remove files, + // which will break child branches + let mut children = Vec::new(); + if guard_kind == TimelineDeleteGuardKind::Delete { + children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { + (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) + })); + } + children.extend(timelines.iter().filter_map(|(id, entry)| { + (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) + })); + + if !children.is_empty() { + return Err(DeleteTimelineError::HasChildren(children)); + } + + // Note that using try_lock here is important to avoid a deadlock. + // Here we take lock on timelines and then the deletion guard. + // At the end of the operation we're holding the guard and need to lock timelines map + // to remove the timeline from it. + // Always if you have two locks that are taken in different order this can result in a deadlock. + + let delete_progress = Arc::clone(timeline.delete_progress()); + let delete_lock_guard = match delete_progress.try_lock_owned() { + Ok(guard) => DeletionGuard(guard), + Err(_) => { + // Unfortunately if lock fails arc is consumed. 
+ return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( + timeline.delete_progress(), + ))); + } + }; + + if guard_kind == TimelineDeleteGuardKind::Delete { + if let TimelineOrOffloaded::Timeline(timeline) = &timeline { + timeline.set_state(TimelineState::Stopping); + } + } + + Ok((timeline, delete_lock_guard)) +} + pub(super) struct DeletionGuard(OwnedMutexGuard); impl Deref for DeletionGuard { diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 4e9cc837d0..cad4c3ac64 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -1,22 +1,27 @@ -use std::{collections::HashSet, sync::Arc}; +use std::collections::HashSet; +use std::sync::Arc; -use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - tenant::{ - remote_timeline_client::index::GcBlockingReason::DetachAncestor, - storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, - Tenant, - }, - virtual_file::{MaybeFatalIo, VirtualFile}, -}; use anyhow::Context; -use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity}; +use http_utils::error::ApiError; +use pageserver_api::models::detach_ancestor::AncestorDetached; +use pageserver_api::shard::ShardIdentity; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; +use utils::completion; +use utils::generation::Generation; +use utils::id::TimelineId; +use utils::lsn::Lsn; + +use super::layer_manager::LayerManager; +use super::{FlushLayerError, Timeline}; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::TaskKind; +use crate::tenant::Tenant; +use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor; +use crate::tenant::storage_layer::layer::local_layer_path; +use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; #[derive(Debug, thiserror::Error)] pub(crate) enum Error { @@ -61,9 +66,10 @@ impl Error { where F: Fn(anyhow::Error) -> Error, { + use remote_storage::TimeoutOrCancel; + use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::upload_queue::NotInitialized; - use remote_storage::TimeoutOrCancel; if e.is::() || TimeoutOrCancel::caused_by_cancel(&e) @@ -351,18 +357,7 @@ pub(super) async fn prepare( // FIXME: the fsync should be mandatory, after both rewrites and copies if wrote_any { - let timeline_dir = VirtualFile::open( - &detached - .conf - .timeline_path(&detached.tenant_shard_id, &detached.timeline_id), - ctx, - ) - .await - .fatal_err("VirtualFile::open for timeline dir fsync"); - timeline_dir - .sync_all() - .await - .fatal_err("VirtualFile::sync_all timeline dir"); + fsync_timeline_dir(detached, ctx).await; } } @@ -376,7 +371,7 @@ pub(super) async fn prepare( tasks.spawn( async move { let _permit = limiter.acquire().await; - let owned = remote_copy( + let (owned, did_hardlink) = remote_copy( &adopted, &timeline, timeline.generation, @@ -384,16 +379,20 @@ pub(super) async fn prepare( &timeline.cancel, ) .await?; - tracing::info!(layer=%owned, "remote copied"); - Ok(owned) + tracing::info!(layer=%owned, did_hard_link=%did_hardlink, "remote copied"); + Ok((owned, did_hardlink)) } 
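Abridged, hypothetical call sites for `make_timeline_delete_guard`, to spell out how the two guard kinds defined above differ (the helper is `pub(super)`, so this assumes a caller inside the `tenant::timeline` module tree):

```rust
use crate::tenant::timeline::delete::{TimelineDeleteGuardKind, make_timeline_delete_guard};
use crate::tenant::{DeleteTimelineError, Tenant};
use utils::id::TimelineId;

fn guard_kinds_example(
    tenant: &Tenant,
    timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
    // Offload: offloaded children are allowed (only live children block), and
    // the timeline is *not* flipped to Stopping here; the offload flow does
    // that itself after its own archival checks.
    let (_timeline, offload_guard) =
        make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Offload)?;
    drop(offload_guard);

    // Delete: any child, live or offloaded, is an error, and the timeline is
    // switched to Stopping as part of taking the guard.
    let (_timeline, _delete_guard) =
        make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Delete)?;
    Ok(())
}
```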
.in_current_span(), ); } + let mut should_fsync = false; while let Some(res) = tasks.join_next().await { match res { - Ok(Ok(owned)) => { + Ok(Ok((owned, did_hardlink))) => { + if did_hardlink { + should_fsync = true; + } new_layers.push(owned); } Ok(Err(failed)) => { @@ -403,7 +402,10 @@ pub(super) async fn prepare( } } - // TODO: fsync directory again if we hardlinked something + // fsync directory again if we hardlinked something + if should_fsync { + fsync_timeline_dir(detached, ctx).await; + } let prepared = PreparedTimelineDetach { layers: new_layers }; @@ -630,35 +632,52 @@ async fn copy_lsn_prefix( } } -/// Creates a new Layer instance for the adopted layer, and ensures it is found from the remote -/// storage on successful return without the adopted layer being added to `index_part.json`. +/// Creates a new Layer instance for the adopted layer, and ensures it is found in the remote +/// storage on successful return. without the adopted layer being added to `index_part.json`. +/// Returns (Layer, did hardlink) async fn remote_copy( adopted: &Layer, adoptee: &Arc, generation: Generation, shard_identity: ShardIdentity, cancel: &CancellationToken, -) -> Result { - // depending if Layer::keep_resident we could hardlink - +) -> Result<(Layer, bool), Error> { let mut metadata = adopted.metadata(); debug_assert!(metadata.generation <= generation); metadata.generation = generation; metadata.shard = shard_identity.shard_index(); - let owned = crate::tenant::storage_layer::Layer::for_evicted( - adoptee.conf, - adoptee, - adopted.layer_desc().layer_name(), - metadata, - ); + let conf = adoptee.conf; + let file_name = adopted.layer_desc().layer_name(); - adoptee + // depending if Layer::keep_resident, do a hardlink + let did_hardlink; + let owned = if let Some(adopted_resident) = adopted.keep_resident().await { + let adopted_path = adopted_resident.local_path(); + let adoptee_path = local_layer_path( + conf, + &adoptee.tenant_shard_id, + &adoptee.timeline_id, + &file_name, + &metadata.generation, + ); + std::fs::hard_link(adopted_path, &adoptee_path) + .map_err(|e| Error::launder(e.into(), Error::Prepare))?; + did_hardlink = true; + Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard() + } else { + did_hardlink = false; + Layer::for_evicted(conf, adoptee, file_name, metadata) + }; + + let layer = adoptee .remote_client .copy_timeline_layer(adopted, &owned, cancel) .await .map(move |()| owned) - .map_err(|e| Error::launder(e, Error::Prepare)) + .map_err(|e| Error::launder(e, Error::Prepare))?; + + Ok((layer, did_hardlink)) } pub(crate) enum DetachingAndReparenting { @@ -765,7 +784,7 @@ pub(super) async fn detach_and_reparent( // TODO: make sure there are no `?` before tenant_reset from after a questionmark from // here. 
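The renewed directory fsync after hardlinking is not optional bookkeeping: `hard_link` creates a new directory entry, which is metadata of the timeline *directory* rather than of the layer file, so crash safety requires syncing the directory itself. A minimal sketch with plain `std::fs` (the code above achieves the same effect via `VirtualFile`):

```rust
use std::fs::{self, File};
use std::io;
use std::path::Path;

/// Create `dst` as a hard link to `src` and make the new directory entry
/// durable. The file contents already exist (shared inode); what must be
/// persisted is the parent directory that now references them.
fn hard_link_durably(src: &Path, dst: &Path) -> io::Result<()> {
    fs::hard_link(src, dst)?;
    let dir = File::open(dst.parent().expect("dst must have a parent directory"))?;
    dir.sync_all()
}
```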
panic!( - "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" + "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" ); } }; @@ -1002,3 +1021,16 @@ fn check_no_archived_children_of_ancestor( } Ok(()) } + +async fn fsync_timeline_dir(timeline: &Timeline, ctx: &RequestContext) { + let path = &timeline + .conf + .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id); + let timeline_dir = VirtualFile::open(&path, ctx) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); +} diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 26c2861b93..187d9f248e 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -13,31 +13,27 @@ //! Items with parentheses are not (yet) touched by this task. //! //! See write-up on restart on-demand download spike: -use std::{ - collections::HashMap, - ops::ControlFlow, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::collections::HashMap; +use std::ops::ControlFlow; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, instrument, warn, Instrument}; - -use crate::{ - context::{DownloadBehavior, RequestContext}, - pgdatadir_mapping::CollectKeySpaceError, - task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, - tenant::{ - size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint, - tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, - }, -}; - -use utils::{completion, sync::gate::GateGuard}; +use tracing::{Instrument, debug, info, info_span, instrument, warn}; +use utils::completion; +use utils::sync::gate::GateGuard; use super::Timeline; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::pgdatadir_mapping::CollectKeySpaceError; +use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind}; +use crate::tenant::size::CalculateSyntheticSizeError; +use crate::tenant::storage_layer::LayerVisibilityHint; +use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random}; +use crate::tenant::timeline::EvictionError; +use crate::tenant::{LogicalSizeCalculationCause, Tenant}; #[derive(Default)] pub struct EvictionTaskTimelineState { @@ -80,8 +76,6 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, tenant: Arc) { - use crate::tenant::tasks::random_init_delay; - // acquire the gate guard only once within a useful span let Ok(guard) = self.gate.enter() else { return; @@ -94,7 +88,7 @@ impl Timeline { EvictionPolicy::OnlyImitiate(lat) => lat.period, EvictionPolicy::NoEviction => Duration::from_secs(10), }; - if random_init_delay(period, &self.cancel).await.is_err() { + if sleep_random(period, &self.cancel).await.is_err() { return; } } @@ -330,11 +324,9 @@ impl Timeline { &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> { - let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( - 
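The eviction loop keeps the same overall shape after the renames (`random_init_delay` to `sleep_random`, permit acquisition via `acquire_concurrency_permit`): a jittered initial delay spreads many timelines' periodic work over time, and a shared semaphore bounds how many loops run an iteration at once. A generic sketch of that shape using only tokio and rand, with a placeholder where the imitation/eviction work goes:

```rust
use std::time::Duration;

use rand::Rng;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;

async fn background_loop(period: Duration, limiter: &Semaphore, cancel: CancellationToken) {
    // Random initial delay so many timelines don't wake up in lockstep.
    let jitter =
        Duration::from_millis(rand::thread_rng().gen_range(0..=period.as_millis() as u64));
    tokio::select! {
        _ = tokio::time::sleep(jitter) => {}
        _ = cancel.cancelled() => return,
    }
    loop {
        // Take a concurrency permit before doing the heavy iteration.
        let permit = tokio::select! {
            permit = limiter.acquire() => permit.expect("semaphore is never closed"),
            _ = cancel.cancelled() => return,
        };
        // ... one iteration of eviction / access-imitation work goes here ...
        drop(permit);
        tokio::select! {
            _ = tokio::time::sleep(period) => {}
            _ = cancel.cancelled() => return,
        }
    }
}
```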
BackgroundLoopKind::Eviction, - ctx, - ); + ) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> { + let acquire_permit = + crate::tenant::tasks::acquire_concurrency_permit(BackgroundLoopKind::Eviction, ctx); tokio::select! { permit = acquire_permit => ControlFlow::Continue(permit), @@ -374,7 +366,7 @@ impl Timeline { p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, gate: &GateGuard, - permit: tokio::sync::SemaphorePermit<'static>, + permit: BackgroundLoopSemaphorePermit<'static>, ctx: &RequestContext, ) -> ControlFlow<()> { if !self.tenant_shard_id.is_shard_zero() { diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index e82559b8b3..67fb89c433 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -32,54 +32,151 @@ //! //! # Design //! +//! ## Data Structures +//! //! There are three user-facing data structures: //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. //! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. -//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request) +//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows +//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*. //! -//! The `Handle` is just a wrapper around an `Arc`. +//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`. +//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`. //! -//! There is one long-lived `Arc`, which is stored in the `PerTimelineState`. -//! The `Cache` stores a `Weak` for each cached Timeline. +//! The `HandleInner` is allocated as a `Arc>` and +//! referenced weakly and strongly from various places which we are now illustrating. +//! For brevity, we will omit the `Arc>` part in the following and instead +//! use `strong ref` and `weak ref` when referring to the `Arc>` +//! or `Weak>`, respectively. +//! +//! - The `Handle` is a strong ref. +//! - The `WeakHandle` is a weak ref. +//! - The `PerTimelineState` contains a `HashMap`. +//! - The `Cache` is a `HashMap`. +//! +//! Lifetimes: +//! - `WeakHandle` and `Handle`: single pagestream request. +//! - `Cache`: single page service connection. +//! - `PerTimelineState`: lifetime of the Timeline object (i.e., i.e., till `Timeline::shutdown`). +//! +//! ## Request Handling Flow (= filling and using the `Cache``) //! //! To dispatch a request, the page service connection calls `Cache::get`. //! //! A cache miss means we consult the tenant manager for shard routing, -//! resulting in an `Arc`. We enter its gate _once_ and construct an -//! `Arc`. We store a `Weak` in the cache -//! and the `Arc` in the `PerTimelineState`. +//! resulting in an `Arc`. We enter its gate _once_ and store it in the the +//! `Arc>>`. A weak ref is stored in the `Cache` +//! and a strong ref in the `PerTimelineState`. +//! A strong ref is returned wrapped in a `Handle`. //! //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing -//! and find the `Weak` in the cache. -//! We upgrade the `Weak` to an `Arc` and wrap it in the user-facing `Handle` type. +//! and find the weak ref in the cache. +//! We upgrade the weak ref to a strong ref and return it wrapped in a `Handle`. //! -//! 
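A toy model of the lifecycle described above, with plain `Arc`/`Weak` standing in for `Handle`/`WeakHandle` and a mutex-guarded slot standing in for `PerTimelineState`: a strong ref is held only while a request executes, a weak ref while it sits in a batch, and shutdown works by dropping the long-lived strong ref so that later upgrades fail:

```rust
use std::sync::{Arc, Mutex, Weak};

struct Timeline;

fn main() {
    // Long-lived strong ref owned by the timeline side (stand-in for PerTimelineState).
    let per_timeline_state: Mutex<Option<Arc<Timeline>>> = Mutex::new(Some(Arc::new(Timeline)));

    // Connection-private weak ref (stand-in for the Cache / WeakHandle).
    let weak: Weak<Timeline> =
        Arc::downgrade(per_timeline_state.lock().unwrap().as_ref().unwrap());

    // While the request is batching, only the weak ref is held.
    // When the batch executes, upgrade, dispatch, then drop again.
    if let Some(strong) = weak.upgrade() {
        // ... dispatch the getpage request against `strong` here ...
        drop(strong);
    }

    // Timeline shutdown breaks the cycle: drop the long-lived strong ref.
    *per_timeline_state.lock().unwrap() = None;
    assert!(weak.upgrade().is_none(), "upgrades fail after shutdown");
}
```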
The request handler dispatches the request to the right `>::$request_method`. +//! The pagestream processing is pipelined and involves a batching step. +//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`. +//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle` +//! and the request handler dispatches the request to the right `>::$request_method`. //! It then drops the `Handle`, which drops the `Arc`. //! -//! # Memory Management / How The Reference Cycle Is Broken +//! # Performance //! -//! The attentive reader may have noticed the strong reference cycle -//! from `Arc` to `PerTimelineState` to `Arc`. +//! Remember from the introductory section: //! -//! This cycle is intentional: while it exists, the `Cache` can upgrade its -//! `Weak` to an `Arc` in a single atomic operation. +//! > However, we want to avoid the overhead of entering the gate for every +//! > method invocation. +//! +//! Why do we want to avoid that? +//! Because the gate is a shared location in memory and entering it involves +//! bumping refcounts, which leads to cache contention if done frequently +//! from multiple cores in parallel. +//! +//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`. +//! That `Arc` is private to the `HandleInner` and hence to the connection. +//! (Review the "Data Structures" section if that is unclear to you.) +//! +//! A `WeakHandle` is a weak ref to the `HandleInner`. +//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and +//! further acquire an additional strong ref to the `Arc` inside it. +//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection. +//! +//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc`. +//! Again, this is cheap because the `Arc` is private to the connection. +//! +//! In addition to the GateGuard, we need to provide `Deref` impl. +//! For this, both `Handle` need infallible access to an `Arc`. +//! We could clone the `Arc` when upgrading a `WeakHandle`, but that would cause contention +//! on the shared memory location that trakcs the refcount of the `Arc`. +//! Instead, we wrap the `Arc` into another `Arc`. +//! so that we can clone it cheaply when upgrading a `WeakHandle`. +//! +//! # Shutdown +//! +//! The attentive reader may have noticed the following reference cycle around the `Arc`: +//! +//! ```text +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline +//! ``` +//! +//! Further, there is this cycle: +//! +//! ```text +//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline +//! ``` +//! +//! The former cycle is a memory leak if not broken. +//! The latter cycle further prevents the Timeline from shutting down +//! because we certainly won't drop the Timeline while the GateGuard is alive. +//! Preventing shutdown is the whole point of this handle/cache system, +//! but when the Timeline needs to shut down, we need to break the cycle. //! //! The cycle is broken by either -//! - `PerTimelineState::shutdown` or -//! - dropping the `Cache`. +//! - Timeline shutdown (=> `PerTimelineState::shutdown`) +//! - Connection shutdown (=> dropping the `Cache`). //! -//! Concurrently existing `Handle`s will extend the existence of the cycle. +//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to +//! 
[`HandleInner::ShutDown`], which drops the only long-lived strong ref to the +//! `Arc`. +//! +//! `PerTimelineState::shutdown` drops all the `HandleInners` it contains, +//! thereby breaking the cycle. +//! It also initiates draining of already existing `Handle`s by +//! poisoning things so that no new `HandleInner`'s can be added +//! to the `PerTimelineState`, which will make subsequent `Cache::get` fail. +//! +//! Concurrently existing / already upgraded `Handle`s will extend the +//! lifetime of the `Arc>` and hence cycles. //! However, since `Handle`s are short-lived and new `Handle`s are not -//! handed out after either `PerTimelineState::shutdown` or `Cache` drop, -//! that extension of the cycle is bounded. +//! handed out from `Cache::get` or `WeakHandle::upgrade` after +//! `PerTimelineState::shutdown`, that extension of the cycle is bounded. +//! +//! Concurrently existing `WeakHandle`s will fail to `upgrade()`: +//! while they will succeed in upgrading `Weak>`, +//! they will find the inner in state `HandleInner::ShutDown` state where the +//! `Arc` and Timeline has already been dropped. +//! +//! Dropping the `Cache` undoes the registration of this `Cache`'s +//! `HandleInner`s from all the `PerTimelineState`s, i.e., it +//! removes the strong ref to each of its `HandleInner`s +//! from all the `PerTimelineState`. +//! +//! # Locking Rules +//! +//! To prevent deadlocks we: +//! +//! 1. Only ever hold one of the locks at a time. +//! 2. Don't add more than one Drop impl that locks on the +//! cycles above. +//! +//! As per (2), that impl is in `Drop for Cache`. //! //! # Fast Path for Shard Routing //! //! The `Cache` has a fast path for shard routing to avoid calling into //! the tenant manager for every request. //! -//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak`. +//! The `Cache` maintains a hash map of `ShardTimelineId` to `WeakHandle`s. //! //! The current implementation uses the first entry in the hash map //! to determine the `ShardParameters` and derive the correct @@ -87,38 +184,31 @@ //! //! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`. //! -//! If the lookup is successful and the `Weak` can be upgraded, +//! If the lookup is successful and the `WeakHandle` can be upgraded, //! it's a hit. //! //! ## Cache invalidation //! -//! The insight is that cache invalidation is sufficient and most efficiently done lazily. +//! The insight is that cache invalidation is sufficient and most efficiently if done lazily. //! The only reasons why an entry in the cache can become stale are: //! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is //! being detached, timeline or shard deleted, or pageserver is shutting down. //! 2. We're doing a shard split and new traffic should be routed to the child shards. //! -//! Regarding (1), we will eventually fail to upgrade the `Weak` once the +//! Regarding (1), we will eventually fail to upgrade the `WeakHandle` once the //! timeline has shut down, and when that happens, we remove the entry from the cache. //! //! Regarding (2), the insight is that it is toally fine to keep dispatching requests //! to the parent shard during a shard split. Eventually, the shard split task will //! shut down the parent => case (1). 
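The lazy invalidation scheme is small enough to show in miniature with `std` types: the cache stores only weak references, and an entry is evicted exactly when an upgrade fails, i.e. when the owner has already shut down; the caller then falls back to the slow path (the tenant-manager lookup).

```rust
use std::collections::HashMap;
use std::hash::Hash;
use std::sync::{Arc, Weak};

/// Return a strong ref on a hit; on a stale entry, evict it and report a miss
/// so the caller takes the slow path.
fn cached_lookup<K: Eq + Hash + Copy, V>(
    cache: &mut HashMap<K, Weak<V>>,
    key: K,
) -> Option<Arc<V>> {
    match cache.get(&key)?.upgrade() {
        Some(strong) => Some(strong),
        None => {
            cache.remove(&key); // lazily evict on the first failed upgrade
            None
        }
    }
}
```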
-use std::collections::hash_map; -use std::collections::HashMap; -use std::sync::atomic::AtomicBool; -use std::sync::atomic::Ordering; -use std::sync::Arc; -use std::sync::Mutex; -use std::sync::Weak; +use std::collections::{HashMap, hash_map}; +use std::sync::{Arc, Mutex, Weak}; use pageserver_api::shard::ShardIdentity; -use tracing::instrument; -use tracing::trace; +use tracing::{instrument, trace}; use utils::id::TimelineId; -use utils::shard::ShardIndex; -use utils::shard::ShardNumber; +use utils::shard::{ShardIndex, ShardNumber}; use crate::tenant::mgr::ShardSelector; @@ -152,7 +242,7 @@ pub(crate) struct Cache { map: Map, } -type Map = HashMap>>; +type Map = HashMap>; impl Default for Cache { fn default() -> Self { @@ -170,12 +260,22 @@ pub(crate) struct ShardTimelineId { } /// See module-level comment. -pub(crate) struct Handle(Arc>); -struct HandleInner { - shut_down: AtomicBool, - timeline: T::Timeline, - // The timeline's gate held open. - _gate_guard: utils::sync::gate::GateGuard, +pub(crate) struct Handle { + timeline: Arc, + #[allow(dead_code)] // the field exists to keep the gate open + gate_guard: Arc, + inner: Arc>>, +} +pub(crate) struct WeakHandle { + inner: Weak>>, +} +enum HandleInner { + KeepingTimelineGateOpen { + #[allow(dead_code)] + gate_guard: Arc, + timeline: Arc, + }, + ShutDown, } /// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`. @@ -183,7 +283,8 @@ struct HandleInner { /// See module-level comment for details. pub struct PerTimelineState { // None = shutting down - handles: Mutex>>>>, + #[allow(clippy::type_complexity)] + handles: Mutex>>>>>, } impl Default for PerTimelineState { @@ -243,49 +344,24 @@ impl Cache { shard_selector: ShardSelector, tenant_manager: &T::TenantManager, ) -> Result, GetError> { - // terminates because each iteration removes an element from the map - loop { - let handle = self - .get_impl(timeline_id, shard_selector, tenant_manager) - .await?; - if handle.0.shut_down.load(Ordering::Relaxed) { - let removed = self - .map - .remove(&handle.0.timeline.shard_timeline_id()) - .expect("invariant of get_impl is that the returned handle is in the map"); - assert!( - Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)), - "shard_timeline_id() incorrect?" 
- ); - } else { - return Ok(handle); - } - } - } - - #[instrument(level = "trace", skip_all)] - async fn get_impl( - &mut self, - timeline_id: TimelineId, - shard_selector: ShardSelector, - tenant_manager: &T::TenantManager, - ) -> Result, GetError> { - let miss: ShardSelector = { + // terminates because when every iteration we remove an element from the map + let miss: ShardSelector = loop { let routing_state = self.shard_routing(timeline_id, shard_selector); match routing_state { RoutingResult::FastPath(handle) => return Ok(handle), RoutingResult::SlowPath(key) => match self.map.get(&key) { Some(cached) => match cached.upgrade() { - Some(upgraded) => return Ok(Handle(upgraded)), - None => { + Ok(upgraded) => return Ok(upgraded), + Err(HandleUpgradeError::ShutDown) => { + // TODO: dedup with shard_routing() trace!("handle cache stale"); self.map.remove(&key).unwrap(); - ShardSelector::Known(key.shard_index) + continue; } }, - None => ShardSelector::Known(key.shard_index), + None => break ShardSelector::Known(key.shard_index), }, - RoutingResult::NeedConsultTenantManager => shard_selector, + RoutingResult::NeedConsultTenantManager => break shard_selector, } }; self.get_miss(timeline_id, miss, tenant_manager).await @@ -302,7 +378,7 @@ impl Cache { let Some((first_key, first_handle)) = self.map.iter().next() else { return RoutingResult::NeedConsultTenantManager; }; - let Some(first_handle) = first_handle.upgrade() else { + let Ok(first_handle) = first_handle.upgrade() else { // TODO: dedup with get() trace!("handle cache stale"); let first_key_owned = *first_key; @@ -310,7 +386,7 @@ impl Cache { continue; }; - let first_handle_shard_identity = first_handle.timeline.get_shard_identity(); + let first_handle_shard_identity = first_handle.get_shard_identity(); let make_shard_index = |shard_num: ShardNumber| ShardIndex { shard_number: shard_num, shard_count: first_handle_shard_identity.count, @@ -329,11 +405,11 @@ impl Cache { }; let first_handle_shard_timeline_id = ShardTimelineId { shard_index: first_handle_shard_identity.shard_index(), - timeline_id: first_handle.timeline.shard_timeline_id().timeline_id, + timeline_id: first_handle.shard_timeline_id().timeline_id, }; if need_shard_timeline_id == first_handle_shard_timeline_id { - return RoutingResult::FastPath(Handle(first_handle)); + return RoutingResult::FastPath(first_handle); } else { return RoutingResult::SlowPath(need_shard_timeline_id); } @@ -357,23 +433,30 @@ impl Cache { ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index), } - let gate_guard = match timeline.gate().enter() { - Ok(guard) => guard, - Err(_) => { - return Err(GetError::TimelineGateClosed); - } - }; trace!("creating new HandleInner"); - let handle = Arc::new( - // TODO: global metric that keeps track of the number of live HandlerTimeline instances - // so we can identify reference cycle bugs. 
- HandleInner { - shut_down: AtomicBool::new(false), - _gate_guard: gate_guard, - timeline: timeline.clone(), - }, - ); - let handle = { + let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen { + gate_guard: Arc::new( + // this enter() is expensive in production code because + // it hits the global Arc::gate refcounts + match timeline.gate().enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetError::TimelineGateClosed); + } + }, + ), + // this clone is expensive in production code because + // it hits the global Arc::clone refcounts + timeline: Arc::new(timeline.clone()), + })); + let handle_weak = WeakHandle { + inner: Arc::downgrade(&handle_inner_arc), + }; + let handle = handle_weak + .upgrade() + .ok() + .expect("we just created it and it's not linked anywhere yet"); + { let mut lock_guard = timeline .per_timeline_state() .handles @@ -381,7 +464,8 @@ impl Cache { .expect("mutex poisoned"); match &mut *lock_guard { Some(per_timeline_state) => { - let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle)); + let replaced = + per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc)); assert!(replaced.is_none(), "some earlier code left a stale handle"); match self.map.entry(key) { hash_map::Entry::Occupied(_o) => { @@ -392,8 +476,7 @@ impl Cache { unreachable!() } hash_map::Entry::Vacant(v) => { - v.insert(Arc::downgrade(&handle)); - handle + v.insert(handle_weak); } } } @@ -401,14 +484,62 @@ impl Cache { return Err(GetError::PerTimelineStateShutDown); } } - }; - Ok(Handle(handle)) + } + Ok(handle) } Err(e) => Err(GetError::TenantManager(e)), } } } +pub(crate) enum HandleUpgradeError { + ShutDown, +} + +impl WeakHandle { + pub(crate) fn upgrade(&self) -> Result, HandleUpgradeError> { + let Some(inner) = Weak::upgrade(&self.inner) else { + return Err(HandleUpgradeError::ShutDown); + }; + let lock_guard = inner.lock().expect("poisoned"); + match &*lock_guard { + HandleInner::KeepingTimelineGateOpen { + timeline, + gate_guard, + } => { + let gate_guard = Arc::clone(gate_guard); + let timeline = Arc::clone(timeline); + drop(lock_guard); + Ok(Handle { + timeline, + gate_guard, + inner, + }) + } + HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown), + } + } + + pub(crate) fn is_same_handle_as(&self, other: &WeakHandle) -> bool { + Weak::ptr_eq(&self.inner, &other.inner) + } +} + +impl std::ops::Deref for Handle { + type Target = T::Timeline; + fn deref(&self) -> &Self::Target { + &self.timeline + } +} + +impl Handle { + pub(crate) fn downgrade(&self) -> WeakHandle { + WeakHandle { + inner: Arc::downgrade(&self.inner), + } + } +} + impl PerTimelineState { /// After this method returns, [`Cache::get`] will never again return a [`Handle`] /// to the [`Types::Timeline`] that embeds this per-timeline state. @@ -430,43 +561,62 @@ impl PerTimelineState { trace!("already shut down"); return; }; - for handle in handles.values() { + for handle_inner_arc in handles.values() { // Make hits fail. - handle.shut_down.store(true, Ordering::Relaxed); + let mut lock_guard = handle_inner_arc.lock().expect("poisoned"); + lock_guard.shutdown(); } drop(handles); } } -impl std::ops::Deref for Handle { - type Target = T::Timeline; - fn deref(&self) -> &Self::Target { - &self.0.timeline - } -} - -#[cfg(test)] -impl Drop for HandleInner { - fn drop(&mut self) { - trace!("HandleInner dropped"); - } -} - // When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle. 
impl Drop for Cache { fn drop(&mut self) { - for (_, weak) in self.map.drain() { - if let Some(strong) = weak.upgrade() { - // handle is still being kept alive in PerTimelineState - let timeline = strong.timeline.per_timeline_state(); - let mut handles = timeline.handles.lock().expect("mutex poisoned"); - if let Some(handles) = &mut *handles { - let Some(removed) = handles.remove(&self.id) else { - // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. - continue; - }; - assert!(Arc::ptr_eq(&removed, &strong)); - } + for ( + _, + WeakHandle { + inner: handle_inner_weak, + }, + ) in self.map.drain() + { + let Some(handle_inner_arc) = handle_inner_weak.upgrade() else { + continue; + }; + let Some(handle_timeline) = handle_inner_arc + // locking rules: drop lock before acquiring other lock below + .lock() + .expect("poisoned") + .shutdown() + else { + // Concurrent PerTimelineState::shutdown. + continue; + }; + // Clean up per_timeline_state so the HandleInner allocation can be dropped. + let per_timeline_state = handle_timeline.per_timeline_state(); + let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned"); + let Some(handles) = &mut *handles_lock_guard else { + continue; + }; + let Some(removed_handle_inner_arc) = handles.remove(&self.id) else { + // Concurrent PerTimelineState::shutdown. + continue; + }; + drop(handles_lock_guard); // locking rules! + assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc)); + } + } +} + +impl HandleInner { + fn shutdown(&mut self) -> Option> { + match std::mem::replace(self, HandleInner::ShutDown) { + HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline), + HandleInner::ShutDown => { + // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown + // may do it concurrently, but locking rules disallow holding per-timeline-state lock and + // the handle lock at the same time. 
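The "never hold both locks" rule referenced in the comment above is the entire deadlock-avoidance story for `Cache::drop` and `PerTimelineState::shutdown`. In isolation the pattern looks like this (a generic sketch, not code from this module): extract what you need under the first lock, drop that guard, and only then take the second lock.

```rust
use std::sync::Mutex;

fn move_between(first: &Mutex<Vec<u32>>, second: &Mutex<Vec<u32>>) {
    let drained: Vec<u32> = {
        let mut a = first.lock().unwrap();
        std::mem::take(&mut *a)
    }; // first lock released here, before the second is taken
    let mut b = second.lock().unwrap();
    b.extend(drained);
}
```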
+ None } } } @@ -474,12 +624,12 @@ impl Drop for Cache { #[cfg(test)] mod tests { - use pageserver_api::{ - key::{rel_block_to_key, Key, DBDIR_KEY}, - models::ShardParameters, - reltag::RelTag, - shard::ShardStripeSize, - }; + use std::sync::Weak; + + use pageserver_api::key::{DBDIR_KEY, Key, rel_block_to_key}; + use pageserver_api::models::ShardParameters; + use pageserver_api::reltag::RelTag; + use pageserver_api::shard::ShardStripeSize; use utils::shard::ShardCount; use super::*; @@ -583,39 +733,13 @@ mod tests { // // fill the cache // - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (2, 1), - "strong: shard0, mgr; weak: myself" - ); - let handle: Handle<_> = cache .get(timeline_id, ShardSelector::Page(key), &mgr) .await .expect("we have the timeline"); - let handle_inner_weak = Arc::downgrade(&handle.0); assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); - assert_eq!( - ( - Weak::strong_count(&handle_inner_weak), - Weak::weak_count(&handle_inner_weak) - ), - (2, 2), - "strong: handle, per_timeline_state, weak: handle_inner_weak, cache" - ); assert_eq!(cache.map.len(), 1); - - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" - ); drop(handle); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" - ); // // demonstrate that Handle holds up gate closure @@ -640,21 +764,11 @@ mod tests { // SHUTDOWN shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown - assert_eq!( - 1, - Weak::strong_count(&handle_inner_weak), - "through local var handle" - ); assert_eq!( cache.map.len(), 1, "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after" ); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(via handle), shard0, mgr; weak: myself" - ); // this handle is perfectly usable handle.getpage(); @@ -678,16 +792,6 @@ mod tests { } drop(handle); - assert_eq!( - 0, - Weak::strong_count(&handle_inner_weak), - "the HandleInner destructor already ran" - ); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (2, 1), - "strong: shard0, mgr; weak: myself" - ); // closing gate succeeds after dropping handle tokio::select! { @@ -706,10 +810,8 @@ mod tests { assert_eq!(cache.map.len(), 0); // ensure all refs to shard0 are gone and we're not leaking anything - let myself = Weak::clone(&shard0.myself); drop(shard0); drop(mgr); - assert_eq!(Weak::strong_count(&myself), 0); } #[tokio::test] @@ -948,15 +1050,11 @@ mod tests { handle }; handle.getpage(); - used_handles.push(Arc::downgrade(&handle.0)); + used_handles.push(Arc::downgrade(&handle.timeline)); } - // No handles exist, thus gates are closed and don't require shutdown - assert!(used_handles - .iter() - .all(|weak| Weak::strong_count(weak) == 0)); - - // ... thus the gate should close immediately, even without shutdown + // No handles exist, thus gates are closed and don't require shutdown. + // Thus the gate should close immediately, even without shutdown. tokio::select! 
{ _ = shard0.gate.close() => { } _ = tokio::time::sleep(FOREVER) => { @@ -964,4 +1062,172 @@ mod tests { } } } + + #[tokio::test(start_paused = true)] + async fn test_weak_handles() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + + let refcount_start = Arc::strong_count(&shard0); + + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + + let weak_handle = handle.downgrade(); + + drop(handle); + + let upgraded_handle = weak_handle.upgrade().ok().expect("we can upgrade it"); + + // Start shutdown + shard0.per_timeline_state.shutdown(); + + // Upgrades during shutdown don't work, even if upgraded_handle exists. + weak_handle + .upgrade() + .err() + .expect("can't upgrade weak handle as soon as shutdown started"); + + // But upgraded_handle is still alive, so the gate won't close. + tokio::select! { + _ = shard0.gate.close() => { + panic!("handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + + // Drop the last handle. + drop(upgraded_handle); + + // The gate should close now, despite there still being a weak_handle. + tokio::select! { + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("only strong handle is dropped and we shut down per-timeline-state") + } + } + + // The weak handle still can't be upgraded. + weak_handle + .upgrade() + .err() + .expect("still shouldn't be able to upgrade the weak handle"); + + // There should be no strong references to the timeline object except the one on "stack". + assert_eq!(Arc::strong_count(&shard0), refcount_start); + } + + #[tokio::test(start_paused = true)] + async fn test_reference_cycle_broken_when_cache_is_dropped() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + // helper to check if a handle is referenced by per_timeline_state + let per_timeline_state_refs_handle = |handle_weak: &Weak>>| { + let per_timeline_state = shard0.per_timeline_state.handles.lock().unwrap(); + let per_timeline_state = per_timeline_state.as_ref().unwrap(); + per_timeline_state + .values() + .any(|v| Weak::ptr_eq(&Arc::downgrade(v), handle_weak)) + }; + + // Fill the cache. + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + let handle_inner_weak = Arc::downgrade(&handle.inner); + assert!( + per_timeline_state_refs_handle(&handle_inner_weak), + "we still hold `handle` _and_ haven't dropped `cache` yet" + ); + + // Drop the cache. 
+ drop(cache); + + assert!( + !(per_timeline_state_refs_handle(&handle_inner_weak)), + "nothing should reference the handle allocation anymore" + ); + assert!( + Weak::upgrade(&handle_inner_weak).is_some(), + "the local `handle` still keeps the allocation alive" + ); + // but obviously the cache is gone so no new allocations can be handed out. + + // Drop handle. + drop(handle); + assert!( + Weak::upgrade(&handle_inner_weak).is_none(), + "the local `handle` is dropped, so the allocation should be dropped by now" + ); + } + + #[tokio::test(start_paused = true)] + async fn test_reference_cycle_broken_when_per_timeline_state_shutdown() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + // grab a weak reference to the inner so can later try to Weak::upgrade it and assert that fails + let handle_inner_weak = Arc::downgrade(&handle.inner); + + // drop the handle, obviously the lifetime of `inner` is at least as long as each strong reference to it + drop(handle); + assert!(Weak::upgrade(&handle_inner_weak).is_some(), "can still"); + + // Shutdown the per_timeline_state. + shard0.per_timeline_state.shutdown(); + assert!(Weak::upgrade(&handle_inner_weak).is_none(), "can no longer"); + + // cache only contains Weak's, so, it can outlive the per_timeline_state without + // Drop explicitly solely to make this point. + drop(cache); + } } diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs new file mode 100644 index 0000000000..27243ba378 --- /dev/null +++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs @@ -0,0 +1,163 @@ +//! Timeline utility module to hydrate everything from the current heatmap. +//! +//! Provides utilities to spawn and abort a background task where the downloads happen. +//! See /v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers. + +use std::sync::{Arc, Mutex}; + +use futures::StreamExt; +use http_utils::error::ApiError; +use tokio_util::sync::CancellationToken; +use utils::sync::gate::Gate; + +use super::Timeline; + +// This status is not strictly necessary now, but gives us a nice place +// to store progress information if we ever wish to expose it. +pub(super) enum HeatmapLayersDownloadStatus { + InProgress, + Complete, +} + +pub(super) struct HeatmapLayersDownloader { + handle: tokio::task::JoinHandle<()>, + status: Arc>, + cancel: CancellationToken, + downloads_guard: Arc, +} + +impl HeatmapLayersDownloader { + fn new( + timeline: Arc, + concurrency: usize, + ) -> Result { + let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?; + + let cancel = timeline.cancel.child_token(); + let downloads_guard = Arc::new(Gate::default()); + + let status = Arc::new(Mutex::new(HeatmapLayersDownloadStatus::InProgress)); + + let handle = tokio::task::spawn({ + let status = status.clone(); + let downloads_guard = downloads_guard.clone(); + let cancel = cancel.clone(); + + async move { + let _guard = tl_guard; + + scopeguard::defer! 
{ + *status.lock().unwrap() = HeatmapLayersDownloadStatus::Complete; + } + + let Some(heatmap) = timeline.generate_heatmap().await else { + tracing::info!("Heatmap layers download failed to generate heatmap"); + return; + }; + + tracing::info!( + resident_size=%timeline.resident_physical_size(), + heatmap_layers=%heatmap.layers.len(), + "Starting heatmap layers download" + ); + + let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map( + |layer| { + let tl = timeline.clone(); + let dl_guard = match downloads_guard.enter() { + Ok(g) => g, + Err(_) => { + // [`Self::shutdown`] was called. Don't spawn any more downloads. + return None; + } + }; + + Some(async move { + let _dl_guard = dl_guard; + + let res = tl.download_layer(&layer.name).await; + if let Err(err) = res { + if !err.is_cancelled() { + tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}") + } + } + }) + } + )).buffered(concurrency); + + tokio::select! { + _ = stream.collect::<()>() => { + tracing::info!( + resident_size=%timeline.resident_physical_size(), + "Heatmap layers download completed" + ); + }, + _ = cancel.cancelled() => { + tracing::info!("Heatmap layers download cancelled"); + } + } + } + }); + + Ok(Self { + status, + handle, + cancel, + downloads_guard, + }) + } + + fn is_complete(&self) -> bool { + matches!( + *self.status.lock().unwrap(), + HeatmapLayersDownloadStatus::Complete + ) + } + + /// Drive any in-progress downloads to completion and stop spawning any new ones. + /// + /// This has two callers and they behave differently + /// 1. [`Timeline::shutdown`]: the drain will be immediate since downloads themselves + /// are sensitive to timeline cancellation. + /// + /// 2. Endpoint handler in [`crate::http::routes`]: the drain will wait for any in-progress + /// downloads to complete. + async fn stop_and_drain(self) { + // Counterintuitive: close the guard before cancelling. + // Something needs to poll the already created download futures to completion. + // If we cancel first, then the underlying task exits and we lost + // the poller. + self.downloads_guard.close().await; + self.cancel.cancel(); + if let Err(err) = self.handle.await { + tracing::warn!("Failed to join heatmap layer downloader task: {err}"); + } + } +} + +impl Timeline { + pub(crate) async fn start_heatmap_layers_download( + self: &Arc, + concurrency: usize, + ) -> Result<(), ApiError> { + let mut locked = self.heatmap_layers_downloader.lock().unwrap(); + if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) { + let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?; + *locked = Some(dl); + Ok(()) + } else { + Err(ApiError::Conflict("Already running".to_string())) + } + } + + pub(crate) async fn stop_and_drain_heatmap_layers_download(&self) { + // This can race with the start of a new downloader and lead to a situation + // where one donloader is shutting down and another one is in-flight. + // The only impact is that we'd end up using more remote storage semaphore + // units than expected. 
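To make the download scheduling above easier to follow outside the surrounding struct: the heatmap layers become a stream of futures, at most `concurrency` of which are in flight at a time, and cancellation short-circuits the remainder. A standalone sketch with a placeholder `download` function standing in for `Timeline::download_layer`:

```rust
use futures::StreamExt;
use tokio_util::sync::CancellationToken;

async fn download(_layer_name: String) {
    // placeholder for Timeline::download_layer plus its error handling
}

/// Run at most `concurrency` downloads at a time; stop early when cancelled.
async fn download_all(layer_names: Vec<String>, concurrency: usize, cancel: CancellationToken) {
    let all = futures::stream::iter(layer_names)
        .map(download)
        .buffered(concurrency)
        .collect::<Vec<()>>();
    tokio::select! {
        _ = all => { /* every download finished */ }
        _ = cancel.cancelled() => { /* remaining work is drained by the caller */ }
    }
}
```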
+ let downloader = self.heatmap_layers_downloader.lock().unwrap().take(); + if let Some(dl) = downloader { + dl.stop_and_drain().await; + } + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index de56468580..8b94a114d6 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -1,14 +1,14 @@ use std::sync::Arc; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use remote_storage::RemotePath; use tokio_util::sync::CancellationToken; -use tracing::{info, info_span, Instrument}; +use tracing::{Instrument, info, info_span}; use utils::lsn::Lsn; -use crate::{context::RequestContext, tenant::metadata::TimelineMetadata}; - use super::Timeline; +use crate::context::RequestContext; +use crate::tenant::metadata::TimelineMetadata; mod flow; mod importbucket_client; @@ -113,7 +113,7 @@ pub async fn doit( match res { Ok(_) => break, Err(err) => { - info!(?err, "indefintely waiting for pgdata to finish"); + info!(?err, "indefinitely waiting for pgdata to finish"); if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) .await .is_ok() diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 6e224acf3e..c8f151b56b 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -28,52 +28,38 @@ //! An incomplete set of TODOs from the Hackathon: //! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest) +use std::collections::HashSet; +use std::ops::Range; use std::sync::Arc; use anyhow::{bail, ensure}; use bytes::Bytes; - use itertools::Itertools; -use pageserver_api::{ - key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY}, - reltag::RelTag, - shard::ShardIdentity, -}; -use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, BLCKSZ}; -use tokio::task::JoinSet; -use tracing::{debug, info_span, instrument, Instrument}; - -use crate::{ - assert_u64_eq_usize::UsizeIsU64, - pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory}, -}; -use crate::{ - context::{DownloadBehavior, RequestContext}, - pgdatadir_mapping::{DbDirectory, RelDirectory}, - task_mgr::TaskKind, - tenant::storage_layer::{ImageLayerWriter, Layer}, -}; - -use pageserver_api::key::Key; use pageserver_api::key::{ - slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, CHECKPOINT_KEY, CONTROLFILE_KEY, - TWOPHASEDIR_KEY, + CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, Key, TWOPHASEDIR_KEY, rel_block_to_key, + rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, + slru_segment_size_to_key, }; -use pageserver_api::keyspace::singleton_range; -use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range}; -use pageserver_api::reltag::SlruKind; +use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range}; +use pageserver_api::reltag::{RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use postgres_ffi::relfile_utils::parse_relfilename; +use postgres_ffi::{BLCKSZ, pg_constants}; +use remote_storage::RemotePath; +use tokio::task::JoinSet; +use tracing::{Instrument, debug, info_span, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; -use std::collections::HashSet; -use std::ops::Range; - -use super::{ - importbucket_client::{ControlFile, RemoteStorageWrapper}, - Timeline, +use 
super::Timeline; +use super::importbucket_client::{ControlFile, RemoteStorageWrapper}; +use crate::assert_u64_eq_usize::UsizeIsU64; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::pgdatadir_mapping::{ + DbDirectory, RelDirectory, SlruSegmentDirectory, TwoPhaseDirectory, }; - -use remote_storage::RemotePath; +use crate::task_mgr::TaskKind; +use crate::tenant::storage_layer::{ImageLayerWriter, Layer}; pub async fn run( timeline: Arc, diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index bc4d148a29..a17a10d56b 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -1,4 +1,5 @@ -use std::{ops::Bound, sync::Arc}; +use std::ops::Bound; +use std::sync::Arc; use anyhow::Context; use bytes::Bytes; @@ -12,9 +13,9 @@ use tokio_util::sync::CancellationToken; use tracing::{debug, info, instrument}; use utils::lsn::Lsn; -use crate::{assert_u64_eq_usize::U64IsUsize, config::PageServerConf}; - use super::{importbucket_format, index_part_format}; +use crate::assert_u64_eq_usize::U64IsUsize; +use crate::config::PageServerConf; pub async fn new( conf: &'static PageServerConf, @@ -308,7 +309,7 @@ impl ControlFile { 202107181 => 14, 202209061 => 15, 202307071 => 16, - /* XXX pg17 */ + 202406281 => 17, catversion => { anyhow::bail!("unrecognized catalog version {catversion}") } diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs index 310d97a6a9..ea7a41b25f 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs @@ -1,7 +1,6 @@ -use serde::{Deserialize, Serialize}; - #[cfg(feature = "testing")] use camino::Utf8PathBuf; +use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub enum Root { diff --git a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs index c5210f9a30..7c7a4de2fc 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs @@ -1,13 +1,12 @@ //! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate. 
use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt}; +use reqwest::Method; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::error; -use crate::config::PageServerConf; -use reqwest::Method; - use super::importbucket_format::Spec; +use crate::config::PageServerConf; pub struct Client { base_url: String, diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 6634d07a0d..e952df0845 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -1,22 +1,16 @@ -use crate::{ - is_temporary, - tenant::{ - ephemeral_file::is_ephemeral_file, - remote_timeline_client::{ - self, - index::{IndexPart, LayerFileMetadata}, - }, - storage_layer::LayerName, - }, -}; +use std::collections::{HashMap, hash_map}; +use std::str::FromStr; + use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use std::{ - collections::{hash_map, HashMap}, - str::FromStr, -}; use utils::lsn::Lsn; +use crate::is_temporary; +use crate::tenant::ephemeral_file::is_ephemeral_file; +use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata}; +use crate::tenant::remote_timeline_client::{self}; +use crate::tenant::storage_layer::LayerName; + /// Identified files in the timeline directory. pub(super) enum Discovered { /// The only one we care about diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 3888e7f86a..e552ea83de 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,27 +1,22 @@ -use anyhow::{bail, ensure, Context}; +use std::collections::HashMap; +use std::sync::Arc; + +use anyhow::{Context, bail, ensure}; use itertools::Itertools; use pageserver_api::shard::TenantShardId; -use std::{collections::HashMap, sync::Arc}; use tracing::trace; -use utils::{ - id::TimelineId, - lsn::{AtomicLsn, Lsn}, -}; - -use crate::{ - config::PageServerConf, - context::RequestContext, - metrics::TimelineMetrics, - tenant::{ - layer_map::{BatchedUpdates, LayerMap}, - storage_layer::{ - AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey, - ResidentLayer, - }, - }, -}; +use utils::id::TimelineId; +use utils::lsn::{AtomicLsn, Lsn}; use super::TimelineWriterState; +use crate::config::PageServerConf; +use crate::context::RequestContext; +use crate::metrics::TimelineMetrics; +use crate::tenant::layer_map::{BatchedUpdates, LayerMap}; +use crate::tenant::storage_layer::{ + AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, + PersistentLayerKey, ResidentLayer, +}; /// Provides semantic APIs to manipulate the layer map. pub(crate) enum LayerManager { @@ -91,6 +86,7 @@ impl LayerManager { layer_map, layer_fmgr: LayerFileManager(hashmap), }) => { + // NB: no need to decrement layer metrics; metrics are removed on timeline shutdown. 
let open = layer_map.open_layer.take(); let frozen = layer_map.frozen_layers.len(); let taken_writer_state = writer_state.take(); @@ -117,6 +113,12 @@ impl LayerManager { self.layers().values().filter(|l| l.is_likely_resident()) } + pub(crate) fn visible_layers(&self) -> impl Iterator + '_ { + self.layers() + .values() + .filter(|l| l.visibility() == LayerVisibilityHint::Visible) + } + pub(crate) fn contains(&self, layer: &Layer) -> bool { self.contains_key(&layer.layer_desc().key()) } @@ -207,9 +209,7 @@ impl OpenLayerManager { trace!( "creating in-memory layer at {}/{} for record at {}", - timeline_id, - start_lsn, - lsn + timeline_id, start_lsn, lsn ); let new_layer = @@ -234,6 +234,7 @@ impl OpenLayerManager { lsn: Lsn, last_freeze_at: &AtomicLsn, write_lock: &mut tokio::sync::MutexGuard<'_, Option>, + metrics: &TimelineMetrics, ) -> bool { let Lsn(last_record_lsn) = lsn; let end_lsn = Lsn(last_record_lsn + 1); @@ -242,6 +243,11 @@ impl OpenLayerManager { let open_layer_rc = Arc::clone(open_layer); open_layer.freeze(end_lsn).await; + // Increment the frozen layer metrics. This is decremented in `finish_flush_l0_layer()`. + // TODO: It would be nicer to do this via `InMemoryLayer::drop()`, but it requires a + // reference to the timeline metrics. Other methods use a metrics borrow as well. + metrics.inc_frozen_layer(open_layer); + // The layer is no longer open, update the layer map to reflect this. // We will replace it with on-disk historics below. self.layer_map.frozen_layers.push_back(open_layer_rc); @@ -298,6 +304,7 @@ impl OpenLayerManager { .frozen_layers .pop_front() .expect("there must be a inmem layer to flush"); + metrics.dec_frozen_layer(&inmem); // Only one task may call this function at a time (for this // timeline). If two tasks tried to flush the same frozen @@ -337,16 +344,45 @@ impl OpenLayerManager { compact_to: &[ResidentLayer], metrics: &TimelineMetrics, ) { - // We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification. - self.finish_compact_l0(compact_from, compact_to, metrics) + // gc-compaction could contain layer rewrites. We need to delete the old layers and insert the new ones. + + // Match the old layers with the new layers + let mut add_layers = HashMap::new(); + let mut rewrite_layers = HashMap::new(); + let mut drop_layers = HashMap::new(); + for layer in compact_from { + drop_layers.insert(layer.layer_desc().key(), layer.clone()); + } + for layer in compact_to { + if let Some(old_layer) = drop_layers.remove(&layer.layer_desc().key()) { + rewrite_layers.insert(layer.layer_desc().key(), (old_layer.clone(), layer.clone())); + } else { + add_layers.insert(layer.layer_desc().key(), layer.clone()); + } + } + let add_layers = add_layers.values().cloned().collect::>(); + let drop_layers = drop_layers.values().cloned().collect::>(); + let rewrite_layers = rewrite_layers.values().cloned().collect::>(); + + self.rewrite_layers_inner(&rewrite_layers, &drop_layers, &add_layers, metrics); } /// Called post-compaction when some previous generation image layers were trimmed. 
- pub(crate) fn rewrite_layers( + pub fn rewrite_layers( &mut self, rewrite_layers: &[(Layer, ResidentLayer)], drop_layers: &[Layer], metrics: &TimelineMetrics, + ) { + self.rewrite_layers_inner(rewrite_layers, drop_layers, &[], metrics); + } + + fn rewrite_layers_inner( + &mut self, + rewrite_layers: &[(Layer, ResidentLayer)], + drop_layers: &[Layer], + add_layers: &[ResidentLayer], + metrics: &TimelineMetrics, ) { let mut updates = self.layer_map.batch_update(); for (old_layer, new_layer) in rewrite_layers { @@ -382,6 +418,10 @@ impl OpenLayerManager { for l in drop_layers { Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr); } + for l in add_layers { + Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr); + metrics.record_new_file_metrics(l.layer_desc().file_size); + } updates.flush(); } diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index f4a4eea54a..397037ca9f 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -1,11 +1,10 @@ -use anyhow::Context; +use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; +use anyhow::Context; use once_cell::sync::OnceCell; use tokio_util::sync::CancellationToken; use utils::lsn::Lsn; -use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; - /// Internal structure to hold all data needed for logical size calculation. /// /// Calculation consists of two stages: diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 3bfbfb5061..43ffaa6aab 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -1,9 +1,15 @@ use std::sync::Arc; -use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; +use pageserver_api::models::{TenantState, TimelineState}; + use super::Timeline; +use super::delete::{DeletionGuard, delete_local_timeline_directory}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; +use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; +use crate::tenant::timeline::delete::{TimelineDeleteGuardKind, make_timeline_delete_guard}; +use crate::tenant::{ + DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded, +}; #[derive(thiserror::Error, Debug)] pub(crate) enum OffloadError { @@ -33,29 +39,40 @@ pub(crate) async fn offload_timeline( debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); - let allow_offloaded_children = true; + let delete_guard_res = make_timeline_delete_guard( + tenant, + timeline.timeline_id, + TimelineDeleteGuardKind::Offload, + ); + if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res { + let is_archived = timeline.is_archived(); + if is_archived == Some(true) { + tracing::error!("timeline is archived but has non-archived children: {children:?}"); + return Err(OffloadError::NotArchived); + } + tracing::info!( + ?is_archived, + "timeline is not archived and has unarchived children" + ); + return Err(OffloadError::NotArchived); + }; let (timeline, guard) = - DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children) - .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; + delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; let 
TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); return Ok(()); }; - let is_archived = timeline.is_archived(); - match is_archived { - Some(true) => (), - Some(false) => { - tracing::warn!("tried offloading a non-archived timeline"); - return Err(OffloadError::NotArchived); - } - None => { - // This is legal: calls to this function can race with the timeline shutting down - tracing::info!("tried offloading a timeline whose remote storage is not initialized"); - return Err(OffloadError::Cancelled); + match timeline.remote_client.shutdown_if_archived().await { + Ok(()) => {} + Err(ShutdownIfArchivedError::NotInitialized(_)) => { + // Either the timeline is being deleted, the operation is being retried, or we are shutting down. + // Don't return cancelled here to keep it idempotent. } + Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived), } + timeline.set_state(TimelineState::Stopping); // Now that the Timeline is in Stopping state, request all the related tasks to shut down. timeline.shutdown(super::ShutdownMode::Reload).await; @@ -70,6 +87,15 @@ pub(crate) async fn offload_timeline( { let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); + if matches!( + tenant.current_state(), + TenantState::Stopping { .. } | TenantState::Broken { .. } + ) { + // Cancel the operation if the tenant is shutting down. Do this while the + // timelines_offloaded lock is held to prevent a race with Tenant::shutdown + // for defusing the lock + return Err(OffloadError::Cancelled); + } offloaded_timelines.insert( timeline.timeline_id, Arc::new( @@ -93,7 +119,7 @@ pub(crate) async fn offload_timeline( } /// It is important that this gets called when DeletionGuard is being held. -/// For more context see comments in [`DeleteTimelineFlow::prepare`] +/// For more context see comments in [`make_timeline_delete_guard`] /// /// Returns the strong count of the timeline `Arc` fn remove_timeline_from_tenant( @@ -117,5 +143,12 @@ fn remove_timeline_from_tenant( .remove(&timeline.timeline_id) .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); + // Clear the compaction queue for this timeline + tenant + .scheduled_compaction_tasks + .lock() + .unwrap() + .remove(&timeline.timeline_id); + Arc::strong_count(&timeline) } diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 80a09b4840..f66c0ffa0f 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -1,17 +1,21 @@ -use std::{collections::hash_map::Entry, fs, sync::Arc}; +use std::collections::hash_map::Entry; +use std::fs; +use std::future::Future; +use std::sync::Arc; use anyhow::Context; use camino::Utf8PathBuf; use tracing::{error, info, info_span}; -use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard}; - -use crate::{ - context::RequestContext, - import_datadir, - tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, -}; +use utils::fs_ext; +use utils::id::TimelineId; +use utils::lsn::Lsn; +use utils::sync::gate::GateGuard; use super::Timeline; +use crate::context::RequestContext; +use crate::import_datadir; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}; /// A timeline with some of its files on disk, being initialized. 
/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or @@ -24,6 +28,9 @@ pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, raw_timeline: Option<(Arc, TimelineCreateGuard)>, + /// Whether we spawned the inner Timeline's tasks such that we must later shut it down + /// if aborting the timeline creation + needs_shutdown: bool, } impl<'t> UninitializedTimeline<'t> { @@ -36,6 +43,50 @@ impl<'t> UninitializedTimeline<'t> { owning_tenant, timeline_id, raw_timeline, + needs_shutdown: false, + } + } + + /// When writing data to this timeline during creation, use this wrapper: it will take care of + /// setup of Timeline tasks required for I/O (flush loop) and making sure they are torn down + /// later. + pub(crate) async fn write(&mut self, f: F) -> anyhow::Result<()> + where + F: FnOnce(Arc) -> Fut, + Fut: Future>, + { + debug_assert_current_span_has_tenant_and_timeline_id(); + + // Remember that we did I/O (spawned the flush loop), so that we can check we shut it down on drop + self.needs_shutdown = true; + + let timeline = self.raw_timeline()?; + + // Spawn flush loop so that the Timeline is ready to accept writes + timeline.maybe_spawn_flush_loop(); + + // Invoke the provided function, which will write some data into the new timeline + if let Err(e) = f(timeline.clone()).await { + self.abort().await; + return Err(e.into()); + } + + // Flush the underlying timeline's ephemeral layers to disk + if let Err(e) = timeline + .freeze_and_flush() + .await + .context("Failed to flush after timeline creation writes") + { + self.abort().await; + return Err(e); + } + + Ok(()) + } + + pub(crate) async fn abort(&self) { + if let Some((raw_timeline, _)) = self.raw_timeline.as_ref() { + raw_timeline.shutdown(super::ShutdownMode::Hard).await; } } @@ -44,11 +95,13 @@ impl<'t> UninitializedTimeline<'t> { /// This function launches the flush loop if not already done. /// /// The caller is responsible for activating the timeline (function `.activate()`). - pub(crate) fn finish_creation(mut self) -> anyhow::Result> { + pub(crate) async fn finish_creation(mut self) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_shard_id = self.owning_tenant.tenant_shard_id; if self.raw_timeline.is_none() { + self.abort().await; + return Err(anyhow::anyhow!( "No timeline for initialization found for {tenant_shard_id}/{timeline_id}" )); @@ -62,16 +115,25 @@ impl<'t> UninitializedTimeline<'t> { .0 .get_disk_consistent_lsn(); - anyhow::ensure!( - new_disk_consistent_lsn.is_valid(), - "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn" - ); + if !new_disk_consistent_lsn.is_valid() { + self.abort().await; + + return Err(anyhow::anyhow!( + "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn" + )); + } let mut timelines = self.owning_tenant.timelines.lock().unwrap(); match timelines.entry(timeline_id) { - Entry::Occupied(_) => anyhow::bail!( - "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" - ), + Entry::Occupied(_) => { + // Unexpected, bug in the caller. Tenant is responsible for preventing concurrent creation of the same timeline. + // + // We do not call Self::abort here. Because we don't cleanly shut down our Timeline, [`Self::drop`] should + // skip trying to delete the timeline directory too. 
+ anyhow::bail!( + "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map" + ) + } Entry::Vacant(v) => { // after taking here should be no fallible operations, because the drop guard will not // cleanup after and would block for example the tenant deletion @@ -93,36 +155,31 @@ impl<'t> UninitializedTimeline<'t> { /// Prepares timeline data by loading it from the basebackup archive. pub(crate) async fn import_basebackup_from_tar( - self, + mut self, tenant: Arc, copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, ) -> anyhow::Result> { - let raw_timeline = self.raw_timeline()?; + self.write(|raw_timeline| async move { + import_datadir::import_basebackup_from_tar(&raw_timeline, copyin_read, base_lsn, ctx) + .await + .context("Failed to import basebackup") + .map_err(CreateTimelineError::Other)?; - import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx) - .await - .context("Failed to import basebackup")?; + fail::fail_point!("before-checkpoint-new-timeline", |_| { + Err(CreateTimelineError::Other(anyhow::anyhow!( + "failpoint before-checkpoint-new-timeline" + ))) + }); - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - raw_timeline.maybe_spawn_flush_loop(); - - fail::fail_point!("before-checkpoint-new-timeline", |_| { - anyhow::bail!("failpoint before-checkpoint-new-timeline"); - }); - - raw_timeline - .freeze_and_flush() - .await - .context("Failed to flush after basebackup import")?; + Ok(()) + }) + .await?; // All the data has been imported. Insert the Timeline into the tenant's timelines map - let tl = self.finish_creation()?; + let tl = self.finish_creation().await?; tl.activate(tenant, broker_client, None, ctx); Ok(tl) } @@ -143,12 +200,19 @@ impl<'t> UninitializedTimeline<'t> { impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { - if let Some((_, create_guard)) = self.raw_timeline.take() { + if let Some((timeline, create_guard)) = self.raw_timeline.take() { let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); - // This is unusual, but can happen harmlessly if the pageserver is stopped while - // creating a timeline. - info!("Timeline got dropped without initializing, cleaning its files"); - cleanup_timeline_directory(create_guard); + if self.needs_shutdown && !timeline.gate.close_complete() { + // This should not happen: caller should call [`Self::abort`] on failures + tracing::warn!( + "Timeline not shut down after initialization failure, cannot clean up files" + ); + } else { + // This is unusual, but can happen harmlessly if the pageserver is stopped while + // creating a timeline. 
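// Reaching this branch is safe for file cleanup: either `needs_shutdown` is false (no write
// tasks such as the flush loop were ever spawned for this Timeline), or the gate has fully
// closed, so nothing can still be writing into the timeline directory when it is removed.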
+ info!("Timeline got dropped without initializing, cleaning its files"); + cleanup_timeline_directory(create_guard); + } } } } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index f831f5e48a..4f80073cc3 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -23,17 +23,11 @@ mod connection_manager; mod walreceiver_connection; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::timeline::walreceiver::connection_manager::{ - connection_manager_loop_step, ConnectionManagerState, -}; - use std::future::Future; use std::num::NonZeroU64; use std::sync::Arc; use std::time::Duration; + use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio_util::sync::CancellationToken; @@ -41,8 +35,13 @@ use tracing::*; use utils::postgres_client::PostgresClientProtocol; use self::connection_manager::ConnectionManagerStatus; - use super::Timeline; +use crate::context::{DownloadBehavior, RequestContext}; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::timeline::walreceiver::connection_manager::{ + ConnectionManagerState, connection_manager_loop_step, +}; #[derive(Clone)] pub struct WalReceiverConf { @@ -56,6 +55,7 @@ pub struct WalReceiverConf { pub auth_token: Option>, pub availability_zone: Option, pub ingest_batch_size: u64, + pub validate_wal_contiguity: bool, } pub struct WalReceiver { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 583d6309ab..df2663f6bb 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -9,45 +9,42 @@ //! then a (re)connection happens, if necessary. //! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel. 
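To make the new creation flow in pageserver/src/tenant/timeline/uninit.rs above concrete, here is a minimal sketch of the intended call pattern: write initial data through the write() wrapper (which spawns the flush loop and tears the half-created timeline down on error), then publish it with finish_creation(). Everything below uses simplified stand-in types rather than the pageserver's own, the error type is reduced to String, and a tokio runtime is assumed.

use std::sync::Arc;

// Toy stand-ins; the real types live in pageserver::tenant::timeline::uninit.
struct Timeline;
impl Timeline {
    async fn put_some_data(&self) -> Result<(), String> {
        Ok(())
    }
}

struct UninitializedTimeline {
    raw: Option<Arc<Timeline>>,
    needs_shutdown: bool,
}

impl UninitializedTimeline {
    /// Mirrors the shape of the new `write()` wrapper: mark that I/O tasks were spawned,
    /// run the caller's closure, and abort the half-created timeline if it fails.
    async fn write<F, Fut>(&mut self, f: F) -> Result<(), String>
    where
        F: FnOnce(Arc<Timeline>) -> Fut,
        Fut: std::future::Future<Output = Result<(), String>>,
    {
        self.needs_shutdown = true; // remember that tasks were spawned
        let tl = self.raw.clone().ok_or_else(|| "no raw timeline".to_string())?;
        if let Err(e) = f(tl).await {
            self.abort().await; // tear down the half-created timeline
            return Err(e);
        }
        Ok(())
    }

    async fn abort(&self) {
        // The real code shuts the inner Timeline down here (ShutdownMode::Hard).
    }

    async fn finish_creation(self) -> Result<Arc<Timeline>, String> {
        self.raw.ok_or_else(|| "nothing to finish".to_string())
    }
}

#[tokio::main]
async fn main() -> Result<(), String> {
    let mut uninit = UninitializedTimeline {
        raw: Some(Arc::new(Timeline)),
        needs_shutdown: false,
    };
    // Write initial data through the wrapper, then publish the timeline.
    uninit.write(|tl| async move { tl.put_some_data().await }).await?;
    let _timeline = uninit.finish_creation().await?;
    Ok(())
}

In the real code the wrapper also freezes and flushes the ephemeral layers before returning, so finish_creation() only publishes a timeline whose initial data has reached disk.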
-use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; +use std::collections::HashMap; +use std::num::NonZeroU64; +use std::ops::ControlFlow; +use std::sync::Arc; +use std::time::Duration; -use super::{TaskStateUpdate, WalReceiverConf}; +use anyhow::Context; +use chrono::{NaiveDateTime, Utc}; +use pageserver_api::models::TimelineState; +use postgres_connection::PgConnectionConfig; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, + TypedMessage, +}; +use storage_broker::{BrokerClientChannel, Code, Streaming}; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::backoff::{ + DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff, +}; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; +use utils::postgres_client::{ + ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config, +}; + +use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError}; +use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf}; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::{ WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED, WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, }; use crate::task_mgr::TaskKind; -use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline}; -use anyhow::Context; -use chrono::{NaiveDateTime, Utc}; -use pageserver_api::models::TimelineState; - -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; -use storage_broker::proto::{ - FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, - SubscribeByFilterRequest, TypeSubscription, TypedMessage, -}; -use storage_broker::{BrokerClientChannel, Code, Streaming}; -use tokio_util::sync::CancellationToken; -use tracing::*; - -use postgres_connection::PgConnectionConfig; -use utils::backoff::{ - exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, -}; -use utils::postgres_client::{ - wal_stream_connection_config, ConnectionConfigArgs, PostgresClientProtocol, -}; -use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, -}; - -use super::{ - walreceiver_connection::WalConnectionStatus, walreceiver_connection::WalReceiverError, - TaskEvent, TaskHandle, -}; +use crate::tenant::{Timeline, debug_assert_current_span_has_tenant_and_timeline_id}; pub(crate) struct Cancelled; @@ -164,9 +161,10 @@ pub(super) async fn connection_manager_loop_step( Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(status) => { match status.code() { - Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") => { + Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => { // tonic's error handling doesn't provide a clear code for disconnections: we get // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe" + // => https://github.com/neondatabase/neon/issues/9562 info!("broker disconnected: {status}"); }, _ => { @@ -273,7 +271,7 @@ pub(super) async fn connection_manager_loop_step( }; 
last_discovery_ts = Some(std::time::Instant::now()); - debug!("No active connection and no candidates, sending discovery request to the broker"); + info!("No active connection and no candidates, sending discovery request to the broker"); // Cancellation safety: we want to send a message to the broker, but publish_one() // function can get cancelled by the other select! arm. This is absolutely fine, because @@ -348,7 +346,9 @@ async fn subscribe_for_timeline_updates( Err(e) => { // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error. - info!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); + info!( + "Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}" + ); continue; } } @@ -511,11 +511,11 @@ impl ConnectionManagerState { fn spawn( &self, task: impl FnOnce( - tokio::sync::watch::Sender>, - CancellationToken, - ) -> Fut - + Send - + 'static, + tokio::sync::watch::Sender>, + CancellationToken, + ) -> Fut + + Send + + 'static, ) -> TaskHandle where Fut: std::future::Future> + Send, @@ -536,6 +536,7 @@ impl ConnectionManagerState { let connect_timeout = self.conf.wal_connect_timeout; let ingest_batch_size = self.conf.ingest_batch_size; let protocol = self.conf.protocol; + let validate_wal_contiguity = self.conf.validate_wal_contiguity; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, @@ -557,6 +558,7 @@ impl ConnectionManagerState { ctx, node_id, ingest_batch_size, + validate_wal_contiguity, ) .await; @@ -877,8 +879,7 @@ impl ConnectionManagerState { discovered_new_wal = if candidate_commit_lsn > current_commit_lsn { trace!( "New candidate has commit_lsn {}, higher than current_commit_lsn {}", - candidate_commit_lsn, - current_commit_lsn + candidate_commit_lsn, current_commit_lsn ); Some(NewCommittedWAL { lsn: candidate_commit_lsn, @@ -1045,7 +1046,9 @@ impl ConnectionManagerState { if !node_ids_to_remove.is_empty() { for node_id in node_ids_to_remove { - info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"); + info!( + "Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections" + ); self.wal_connection_retries.remove(&node_id); WALRECEIVER_CANDIDATES_REMOVED.inc(); } @@ -1116,11 +1119,12 @@ impl ReconnectReason { #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL; use url::Host; + use super::*; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + fn dummy_broker_sk_timeline( commit_lsn: u64, safekeeper_connstr: &str, @@ -1562,6 +1566,7 @@ mod tests { auth_token: None, availability_zone: None, ingest_batch_size: 1, + validate_wal_contiguity: false, }, wal_connection: None, wal_stream_candidates: HashMap::new(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3f10eeda60..f41a9cfe82 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -1,46 +1,48 @@ //! Actual Postgres connection handler to stream WAL to the server. 
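The validate_wal_contiguity flag threaded through the connection manager above enables a gap check in this file: each interpreted-records batch reports the start LSN of the raw WAL it was built from, and that start must line up with where the previous batch ended. A minimal sketch of that three-way comparison, using plain u64 LSNs and an illustrative Batch type in place of the pageserver's Lsn and InterpretedWalRecords (check_contiguity and first_next_record_lsn are made-up names):

use std::cmp::Ordering;

/// Simplified stand-in for a decoded batch of interpreted WAL records.
struct Batch {
    /// Start LSN of the raw WAL the records were interpreted from, if known.
    raw_wal_start_lsn: Option<u64>,
    /// `next_record_lsn` of the first record in the batch, if any.
    first_next_record_lsn: Option<u64>,
}

/// Returns Err on a WAL gap or on records that were already ingested; mirrors the
/// three-way comparison in handle_walreceiver_connection (sketch only).
fn check_contiguity(expected_wal_start: u64, last_rec_lsn: u64, batch: &Batch) -> Result<(), String> {
    let Some(start) = batch.raw_wal_start_lsn else {
        return Ok(()); // nothing to validate
    };
    match start.cmp(&expected_wal_start) {
        // The streamed WAL skipped ahead of us: data is missing, kill the connection.
        Ordering::Greater => Err(format!("Gap in streamed WAL: [{expected_wal_start}, {start})")),
        // Starting behind us is fine (other shards lag), as long as the batch still
        // contains records newer than what we have already ingested.
        Ordering::Less => match batch.first_next_record_lsn {
            Some(lsn) if lsn < last_rec_lsn => Err(format!(
                "Received record with next_record_lsn multiple times ({lsn} < {expected_wal_start})"
            )),
            _ => Ok(()),
        },
        Ordering::Equal => Ok(()),
    }
}

fn main() {
    // Contiguous batch: the previous batch ended at 100 and this one starts there.
    let ok = Batch { raw_wal_start_lsn: Some(100), first_next_record_lsn: Some(108) };
    assert!(check_contiguity(100, 96, &ok).is_ok());

    // Gap: the streamed WAL jumps from 100 to 200.
    let gap = Batch { raw_wal_start_lsn: Some(200), first_next_record_lsn: Some(208) };
    assert!(check_contiguity(100, 96, &gap).is_err());
}

Returning an error here (rather than silently resyncing) kills the walreceiver connection so that a fresh one restarts streaming from the last ingested LSN.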
-use std::{ - error::Error, - pin::pin, - str::FromStr, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::error::Error; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use bytes::BytesMut; use chrono::{NaiveDateTime, Utc}; use fail::fail_point; use futures::StreamExt; -use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow}; -use postgres_ffi::WAL_SEGMENT_SIZE; -use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError}; -use postgres_protocol::message::backend::ReplicationMessage; -use postgres_types::PgLsn; -use tokio::{select, sync::watch, time}; -use tokio_postgres::{replication::ReplicationStream, Client}; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, trace, warn, Instrument}; -use wal_decoder::{ - models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords}, - wire_format::FromWireFormat, -}; - -use super::TaskStateUpdate; -use crate::{ - context::RequestContext, - metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - pgdatadir_mapping::DatadirModification, - task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, - tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, - walingest::WalIngest, -}; use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; -use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::{id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol}; -use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; +use postgres_ffi::WAL_SEGMENT_SIZE; +use postgres_ffi::v14::xlog_utils::normalize_lsn; +use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; +use postgres_protocol::message::backend::ReplicationMessage; +use postgres_types::PgLsn; +use tokio::sync::watch; +use tokio::{select, time}; +use tokio_postgres::error::SqlState; +use tokio_postgres::replication::ReplicationStream; +use tokio_postgres::{Client, SimpleQueryMessage, SimpleQueryRow}; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, debug, error, info, trace, warn}; +use utils::critical; +use utils::id::NodeId; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; +use utils::postgres_client::PostgresClientProtocol; +use utils::sync::gate::GateError; +use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords}; +use wal_decoder::wire_format::FromWireFormat; + +use super::TaskStateUpdate; +use crate::context::RequestContext; +use crate::metrics::{LIVE_CONNECTIONS, WAL_INGEST, WALRECEIVER_STARTED_CONNECTIONS}; +use crate::pgdatadir_mapping::DatadirModification; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::{ + Timeline, WalReceiverInfo, debug_assert_current_span_has_tenant_and_timeline_id, +}; +use crate::walingest::WalIngest; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -64,7 +66,7 @@ pub(super) struct WalConnectionStatus { pub(super) enum WalReceiverError { /// An error of a type that does not indicate an issue, e.g. a connection closing - ExpectedSafekeeperError(postgres::Error), + ExpectedSafekeeperError(tokio_postgres::Error), /// An "error" message that carries a SUCCESSFUL_COMPLETION status code. 
Carries /// the message part of the original postgres error SuccessfulCompletion(String), @@ -118,8 +120,9 @@ pub(super) async fn handle_walreceiver_connection( cancellation: CancellationToken, connect_timeout: Duration, ctx: RequestContext, - node: NodeId, + safekeeper_node: NodeId, ingest_batch_size: u64, + validate_wal_contiguity: bool, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -140,15 +143,17 @@ pub(super) async fn handle_walreceiver_connection( let (replication_client, connection) = { let mut config = wal_source_connconf.to_tokio_postgres_config(); - config.application_name("pageserver"); + config.application_name(format!("pageserver-{}", timeline.conf.id.0).as_str()); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); - match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { + match time::timeout(connect_timeout, config.connect(tokio_postgres::NoTls)).await { Ok(client_and_conn) => client_and_conn?, Err(_elapsed) => { // Timing out to connect to a safekeeper node could happen long time, due to // many reasons that pageserver cannot control. // Do not produce an error, but make it visible, that timeouts happen by logging the `event. - info!("Timed out while waiting {connect_timeout:?} for walreceiver connection to open"); + info!( + "Timed out while waiting {connect_timeout:?} for walreceiver connection to open" + ); return Ok(()); } } @@ -162,10 +167,12 @@ pub(super) async fn handle_walreceiver_connection( latest_wal_update: Utc::now().naive_utc(), streaming_lsn: None, commit_lsn: None, - node, + node: safekeeper_node, }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { - warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); + warn!( + "Wal connection event listener dropped right after connection init, aborting the connection: {e}" + ); return Ok(()); } @@ -226,7 +233,9 @@ pub(super) async fn handle_walreceiver_connection( connection_status.latest_wal_update = Utc::now().naive_utc(); connection_status.commit_lsn = Some(end_of_wal); if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { - warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); + warn!( + "Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}" + ); return Ok(()); } @@ -253,7 +262,9 @@ pub(super) async fn handle_walreceiver_connection( // to the safekeepers. startpoint = normalize_lsn(startpoint, WAL_SEGMENT_SIZE); - info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..."); + info!( + "last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..." + ); let query = format!("START_REPLICATION PHYSICAL {startpoint}"); @@ -264,6 +275,8 @@ pub(super) async fn handle_walreceiver_connection( let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; + let shard = vec![*timeline.get_shard_identity()]; + let interpreted_proto_config = match protocol { PostgresClientProtocol::Vanilla => None, PostgresClientProtocol::Interpreted { @@ -272,6 +285,7 @@ pub(super) async fn handle_walreceiver_connection( } => Some((format, compression)), }; + let mut expected_wal_start = startpoint; while let Some(replication_message) = { select! 
{ _ = cancellation.cancelled() => { @@ -319,27 +333,11 @@ pub(super) async fn handle_walreceiver_connection( return Ok(()); } - async fn commit( - modification: &mut DatadirModification<'_>, - uncommitted: &mut u64, - filtered: &mut u64, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - WAL_INGEST - .records_committed - .inc_by(*uncommitted - *filtered); - modification.commit(ctx).await?; - *uncommitted = 0; - *filtered = 0; - Ok(()) - } - let status_update = match replication_message { ReplicationMessage::RawInterpretedWalRecords(raw) => { WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64); let mut uncommitted_records = 0; - let mut filtered_records = 0; // This is the end LSN of the raw WAL from which the records // were interpreted. @@ -354,13 +352,49 @@ pub(super) async fn handle_walreceiver_connection( ) })?; + // Guard against WAL gaps. If the start LSN of the PG WAL section + // from which the interpreted records were extracted, doesn't match + // the end of the previous batch (or the starting point for the first batch), + // then kill this WAL receiver connection and start a new one. + if validate_wal_contiguity { + if let Some(raw_wal_start_lsn) = batch.raw_wal_start_lsn { + match raw_wal_start_lsn.cmp(&expected_wal_start) { + std::cmp::Ordering::Greater => { + let msg = format!( + "Gap in streamed WAL: [{}, {})", + expected_wal_start, raw_wal_start_lsn + ); + critical!("{msg}"); + return Err(WalReceiverError::Other(anyhow!(msg))); + } + std::cmp::Ordering::Less => { + // Other shards are reading WAL behind us. + // This is valid, but check that we received records + // that we haven't seen before. + if let Some(first_rec) = batch.records.first() { + if first_rec.next_record_lsn < last_rec_lsn { + let msg = format!( + "Received record with next_record_lsn multiple times ({} < {})", + first_rec.next_record_lsn, expected_wal_start + ); + critical!("{msg}"); + return Err(WalReceiverError::Other(anyhow!(msg))); + } + } + } + std::cmp::Ordering::Equal => {} + } + } + } + let InterpretedWalRecords { records, next_record_lsn, + raw_wal_start_lsn: _, } = batch; tracing::debug!( - "Received WAL up to {} with next_record_lsn={:?}", + "Received WAL up to {} with next_record_lsn={}", streaming_lsn, next_record_lsn ); @@ -369,6 +403,19 @@ pub(super) async fn handle_walreceiver_connection( // advances it to its end LSN. 0 is just an initialization placeholder. 
let mut modification = timeline.begin_modification(Lsn(0)); + async fn commit( + modification: &mut DatadirModification<'_>, + ctx: &RequestContext, + uncommitted: &mut u64, + ) -> anyhow::Result<()> { + let stats = modification.stats(); + modification.commit(ctx).await?; + WAL_INGEST.records_committed.inc_by(*uncommitted); + WAL_INGEST.inc_values_committed(&stats); + *uncommitted = 0; + Ok(()) + } + if !records.is_empty() { timeline .metrics @@ -380,31 +427,29 @@ pub(super) async fn handle_walreceiver_connection( if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 { - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } let local_next_record_lsn = interpreted.next_record_lsn; - let ingested = walingest + + if interpreted.is_observed() { + WAL_INGEST.records_observed.inc(); + } + + walingest .ingest_record(interpreted, &mut modification, &ctx) .await .with_context(|| { format!("could not ingest record at {local_next_record_lsn}") + }) + .inspect_err(|err| { + // TODO: we can't differentiate cancellation errors with + // anyhow::Error, so just ignore it if we're cancelled. + if !cancellation.is_cancelled() { + critical!("{err:?}") + } })?; - if !ingested { - tracing::debug!( - "ingest: filtered out record @ LSN {local_next_record_lsn}" - ); - WAL_INGEST.records_filtered.inc(); - filtered_records += 1; - } - uncommitted_records += 1; // FIXME: this cannot be made pausable_failpoint without fixing the @@ -418,13 +463,7 @@ pub(super) async fn handle_walreceiver_connection( || modification.approx_pending_bytes() > DatadirModification::MAX_PENDING_BYTES { - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } } @@ -432,23 +471,16 @@ pub(super) async fn handle_walreceiver_connection( // need to advance last record LSN on all shards. If we've not ingested the latest // record, then set the LSN of the modification past it. This way all shards // advance their last record LSN at the same time. 
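// NB: with the interpreted-records protocol, `next_record_lsn` is no longer optional for a
// batch, so the advance check below is a plain comparison against the modification's LSN.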
- let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) { - Some(lsn) if lsn > modification.get_lsn() => { - modification.set_lsn(lsn).unwrap(); - true - } - _ => false, + let needs_last_record_lsn_advance = if next_record_lsn > modification.get_lsn() { + modification.set_lsn(next_record_lsn).unwrap(); + true + } else { + false }; if uncommitted_records > 0 || needs_last_record_lsn_advance { // Commit any uncommitted records - commit( - &mut modification, - &mut uncommitted_records, - &mut filtered_records, - &ctx, - ) - .await?; + commit(&mut modification, &ctx, &mut uncommitted_records).await?; } if !caught_up && streaming_lsn >= end_of_wal { @@ -461,14 +493,30 @@ pub(super) async fn handle_walreceiver_connection( timeline.get_last_record_lsn() ); - if let Some(lsn) = next_record_lsn { - last_rec_lsn = lsn; - } + last_rec_lsn = next_record_lsn; + expected_wal_start = streaming_lsn; Some(streaming_lsn) } ReplicationMessage::XLogData(xlog_data) => { + async fn commit( + modification: &mut DatadirModification<'_>, + uncommitted: &mut u64, + filtered: &mut u64, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let stats = modification.stats(); + modification.commit(ctx).await?; + WAL_INGEST + .records_committed + .inc_by(*uncommitted - *filtered); + WAL_INGEST.inc_values_committed(&stats); + *uncommitted = 0; + *filtered = 0; + Ok(()) + } + // Pass the WAL data to the decoder, and see if we can decode // more records as a result. let data = xlog_data.data(); @@ -496,10 +544,12 @@ pub(super) async fn handle_walreceiver_connection( // Deserialize and interpret WAL record let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - modification.tline.get_shard_identity(), + &shard, next_record_lsn, modification.tline.pg_version, - )?; + )? + .remove(timeline.get_shard_identity()) + .unwrap(); if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 @@ -523,6 +573,13 @@ pub(super) async fn handle_walreceiver_connection( .await .with_context(|| { format!("could not ingest record at {next_record_lsn}") + }) + .inspect_err(|err| { + // TODO: we can't differentiate cancellation errors with + // anyhow::Error, so just ignore it if we're cancelled. 
+ if !cancellation.is_cancelled() { + critical!("{err:?}") + } })?; if !ingested { tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}"); @@ -579,7 +636,9 @@ pub(super) async fn handle_walreceiver_connection( let timestamp = keepalive.timestamp(); let reply_requested = keepalive.reply() != 0; - trace!("received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})"); + trace!( + "received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})" + ); if reply_requested { Some(last_rec_lsn) diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index ef3aa759f3..d5dc9666ce 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,28 +1,35 @@ -use super::storage_layer::LayerName; -use super::storage_layer::ResidentLayer; -use crate::tenant::metadata::TimelineMetadata; -use crate::tenant::remote_timeline_client::index::IndexPart; -use crate::tenant::remote_timeline_client::index::LayerFileMetadata; -use std::collections::HashSet; -use std::collections::{HashMap, VecDeque}; +use std::collections::{HashMap, HashSet, VecDeque}; use std::fmt::Debug; +use std::sync::Arc; +use std::sync::atomic::AtomicU32; use chrono::NaiveDateTime; -use std::sync::Arc; +use once_cell::sync::Lazy; use tracing::info; -use utils::lsn::AtomicLsn; - -use std::sync::atomic::AtomicU32; -use utils::lsn::Lsn; - use utils::generation::Generation; +use utils::lsn::{AtomicLsn, Lsn}; + +use super::remote_timeline_client::is_same_remote_layer_path; +use super::storage_layer::{AsLayerDesc as _, LayerName, ResidentLayer}; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::index::{IndexPart, LayerFileMetadata}; + +/// Kill switch for upload queue reordering in case it causes problems. +/// TODO: remove this once we have confidence in it. +static DISABLE_UPLOAD_QUEUE_REORDERING: Lazy = + Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_REORDERING").as_deref() == Ok("true")); + +/// Kill switch for index upload coalescing in case it causes problems. +/// TODO: remove this once we have confidence in it. +static DISABLE_UPLOAD_QUEUE_INDEX_COALESCING: Lazy = + Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_INDEX_COALESCING").as_deref() == Ok("true")); // clippy warns that Uninitialized is much smaller than Initialized, which wastes // memory for Uninitialized variants. Doesn't matter in practice, there are not // that many upload queues in a running pageserver, and most of them are initialized // anyway. #[allow(clippy::large_enum_variant)] -pub(super) enum UploadQueue { +pub enum UploadQueue { Uninitialized, Initialized(UploadQueueInitialized), Stopped(UploadQueueStopped), @@ -39,13 +46,16 @@ impl UploadQueue { } #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] -pub(crate) enum OpType { +pub enum OpType { MayReorder, FlushDeletion, } /// This keeps track of queued and in-progress tasks. -pub(crate) struct UploadQueueInitialized { +pub struct UploadQueueInitialized { + /// Maximum number of inprogress tasks to schedule. 0 is no limit. 
+ pub(crate) inprogress_limit: usize, + /// Counter to assign task IDs pub(crate) task_counter: u64, @@ -70,21 +80,16 @@ pub(crate) struct UploadQueueInitialized { /// we skip validation) pub(crate) visible_remote_consistent_lsn: Arc, - // Breakdown of different kinds of tasks currently in-progress - pub(crate) num_inprogress_layer_uploads: usize, - pub(crate) num_inprogress_metadata_uploads: usize, - pub(crate) num_inprogress_deletions: usize, - /// Tasks that are currently in-progress. In-progress means that a tokio Task /// has been launched for it. An in-progress task can be busy uploading, but it can /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can /// be waiting for retry in `exponential_backoff`. - pub(crate) inprogress_tasks: HashMap>, + pub inprogress_tasks: HashMap>, /// Queued operations that have not been launched yet. They might depend on previous /// tasks to finish. For example, metadata upload cannot be performed before all /// preceding layer file uploads have completed. - pub(crate) queued_operations: VecDeque, + pub queued_operations: VecDeque, /// Files which have been unlinked but not yet had scheduled a deletion for. Only kept around /// for error logging. @@ -122,6 +127,167 @@ impl UploadQueueInitialized { let lsn = self.clean.0.metadata.disk_consistent_lsn(); self.clean.1.map(|_| lsn) } + + /// Returns and removes the next ready operation from the queue, if any. This isn't necessarily + /// the first operation in the queue, to avoid head-of-line blocking -- an operation can jump + /// the queue if it doesn't conflict with operations ahead of it. + /// + /// Also returns any operations that were coalesced into this one, e.g. multiple index uploads. + /// + /// None may be returned even if the queue isn't empty, if no operations are ready yet. + /// + /// NB: this is quadratic, but queues are expected to be small, and bounded by inprogress_limit. + pub fn next_ready(&mut self) -> Option<(UploadOp, Vec)> { + // If inprogress_tasks is already at limit, don't schedule anything more. + if self.inprogress_limit > 0 && self.inprogress_tasks.len() >= self.inprogress_limit { + return None; + } + + for (i, candidate) in self.queued_operations.iter().enumerate() { + // If this candidate is ready, go for it. Otherwise, try the next one. + if self.is_ready(i) { + // Shutdown operations are left at the head of the queue, to prevent further + // operations from starting. Signal that we're ready to shut down. + if matches!(candidate, UploadOp::Shutdown) { + assert!(self.inprogress_tasks.is_empty(), "shutdown with tasks"); + assert_eq!(i, 0, "shutdown not at head of queue"); + self.shutdown_ready.close(); + return None; + } + + let mut op = self.queued_operations.remove(i).expect("i can't disappear"); + + // Coalesce any back-to-back index uploads by only uploading the newest one that's + // ready. This typically happens with layer/index/layer/index/... sequences, where + // the layers bypass the indexes, leaving the indexes queued. + // + // If other operations are interleaved between index uploads we don't try to + // coalesce them, since we may as well update the index concurrently with them. + // This keeps the index fresh and avoids starvation. + // + // NB: we assume that all uploaded indexes have the same remote path. This + // is true at the time of writing: the path only depends on the tenant, + // timeline and generation, all of which are static for a timeline instance. + // Otherwise, we must be careful not to coalesce different paths. 
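// Concretely: with a queue of [layer L0, index I0 (refs L0), layer L1, index I1 (refs L0+L1)],
// L0 and L1 are scheduled immediately (a layer upload may bypass an index that does not
// reference it), leaving the two indexes queued; once the layers complete, I0 becomes ready,
// I1 is coalesced into it, and only the newest index is actually uploaded.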
+ let mut coalesced_ops = Vec::new(); + if matches!(op, UploadOp::UploadMetadata { .. }) { + while let Some(UploadOp::UploadMetadata { .. }) = self.queued_operations.get(i) + { + if *DISABLE_UPLOAD_QUEUE_INDEX_COALESCING { + break; + } + if !self.is_ready(i) { + break; + } + coalesced_ops.push(op); + op = self.queued_operations.remove(i).expect("i can't disappear"); + } + } + + return Some((op, coalesced_ops)); + } + + // Nothing can bypass a barrier or shutdown. If it wasn't scheduled above, give up. + if matches!(candidate, UploadOp::Barrier(_) | UploadOp::Shutdown) { + return None; + } + + // If upload queue reordering is disabled, bail out after the first operation. + if *DISABLE_UPLOAD_QUEUE_REORDERING { + return None; + } + } + None + } + + /// Returns true if the queued operation at the given position is ready to be uploaded, i.e. if + /// it doesn't conflict with any in-progress or queued operations ahead of it. Operations are + /// allowed to skip the queue when it's safe to do so, to increase parallelism. + /// + /// The position must be valid for the queue size. + fn is_ready(&self, pos: usize) -> bool { + let candidate = self.queued_operations.get(pos).expect("invalid position"); + self + // Look at in-progress operations, in random order. + .inprogress_tasks + .values() + .map(|task| &task.op) + // Then queued operations ahead of the candidate, front-to-back. + .chain(self.queued_operations.iter().take(pos)) + // Keep track of the active index ahead of each operation. This is used to ensure that + // an upload doesn't skip the queue too far, such that it modifies a layer that's + // referenced by an active index. + // + // It's okay that in-progress operations are emitted in random order above, since at + // most one of them can be an index upload (enforced by can_bypass). + .scan(&self.clean.0, |next_active_index, op| { + let active_index = *next_active_index; + if let UploadOp::UploadMetadata { uploaded } = op { + *next_active_index = uploaded; // stash index for next operation after this + } + Some((op, active_index)) + }) + // Check if the candidate can bypass all of them. + .all(|(op, active_index)| candidate.can_bypass(op, active_index)) + } + + /// Returns the number of in-progress deletion operations. + #[cfg(test)] + pub(crate) fn num_inprogress_deletions(&self) -> usize { + self.inprogress_tasks + .iter() + .filter(|(_, t)| matches!(t.op, UploadOp::Delete(_))) + .count() + } + + /// Returns the number of in-progress layer uploads. + #[cfg(test)] + pub(crate) fn num_inprogress_layer_uploads(&self) -> usize { + self.inprogress_tasks + .iter() + .filter(|(_, t)| matches!(t.op, UploadOp::UploadLayer(_, _, _))) + .count() + } + + /// Test helper that schedules all ready operations into inprogress_tasks, and returns + /// references to them. + /// + /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into + /// UploadQueue, so we can use the same code path. + #[cfg(test)] + fn schedule_ready(&mut self) -> Vec> { + let mut tasks = Vec::new(); + // NB: schedule operations one by one, to handle conflicts with inprogress_tasks. + while let Some((op, coalesced_ops)) = self.next_ready() { + self.task_counter += 1; + let task = Arc::new(UploadTask { + task_id: self.task_counter, + op, + coalesced_ops, + retries: 0.into(), + }); + self.inprogress_tasks.insert(task.task_id, task.clone()); + tasks.push(task); + } + tasks + } + + /// Test helper that marks an operation as completed, removing it from inprogress_tasks. 
+ /// + /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into + /// UploadQueue, so we can use the same code path. + #[cfg(test)] + fn complete(&mut self, task_id: u64) { + let Some(task) = self.inprogress_tasks.remove(&task_id) else { + return; + }; + // Update the clean index on uploads. + if let UploadOp::UploadMetadata { ref uploaded } = task.op { + if task.task_id > self.clean.1.unwrap_or_default() { + self.clean = (*uploaded.clone(), Some(task.task_id)); + } + } + } } #[derive(Clone, Copy)] @@ -131,12 +297,12 @@ pub(super) enum SetDeletedFlagProgress { Successful(NaiveDateTime), } -pub(super) struct UploadQueueStoppedDeletable { +pub struct UploadQueueStoppedDeletable { pub(super) upload_queue_for_deletion: UploadQueueInitialized, pub(super) deleted_at: SetDeletedFlagProgress, } -pub(super) enum UploadQueueStopped { +pub enum UploadQueueStopped { Deletable(UploadQueueStoppedDeletable), Uninitialized, } @@ -163,9 +329,10 @@ impl NotInitialized { } impl UploadQueue { - pub(crate) fn initialize_empty_remote( + pub fn initialize_empty_remote( &mut self, metadata: &TimelineMetadata, + inprogress_limit: usize, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { UploadQueue::Uninitialized => (), @@ -179,15 +346,13 @@ impl UploadQueue { let index_part = IndexPart::empty(metadata.clone()); let state = UploadQueueInitialized { + inprogress_limit, dirty: index_part.clone(), clean: (index_part, None), latest_files_changes_since_metadata_upload_scheduled: 0, visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::new(), queued_operations: VecDeque::new(), #[cfg(feature = "testing")] @@ -202,9 +367,10 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialize_with_current_remote_index_part( + pub fn initialize_with_current_remote_index_part( &mut self, index_part: &IndexPart, + inprogress_limit: usize, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { UploadQueue::Uninitialized => (), @@ -219,6 +385,7 @@ impl UploadQueue { ); let state = UploadQueueInitialized { + inprogress_limit, dirty: index_part.clone(), clean: (index_part.clone(), None), latest_files_changes_since_metadata_upload_scheduled: 0, @@ -227,9 +394,6 @@ impl UploadQueue { ), // what follows are boring default initializations task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::new(), queued_operations: VecDeque::new(), #[cfg(feature = "testing")] @@ -244,9 +408,7 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialized_mut( - &mut self, - ) -> Result<&mut UploadQueueInitialized, NotInitialized> { + pub fn initialized_mut(&mut self) -> Result<&mut UploadQueueInitialized, NotInitialized> { use UploadQueue::*; match self { Uninitialized => Err(NotInitialized::Uninitialized), @@ -276,23 +438,27 @@ impl UploadQueue { /// An in-progress upload or delete task. #[derive(Debug)] -pub(crate) struct UploadTask { +pub struct UploadTask { /// Unique ID of this task. Used as the key in `inprogress_tasks` above. - pub(crate) task_id: u64, - pub(crate) retries: AtomicU32, - - pub(crate) op: UploadOp, + pub task_id: u64, + /// Number of task retries. + pub retries: AtomicU32, + /// The upload operation. 
+ pub op: UploadOp, + /// Any upload operations that were coalesced into this operation. This typically happens with + /// back-to-back index uploads, see `UploadQueueInitialized::next_ready()`. + pub coalesced_ops: Vec, } /// A deletion of some layers within the lifetime of a timeline. This is not used /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug, Clone)] -pub(crate) struct Delete { - pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, +pub struct Delete { + pub layers: Vec<(LayerName, LayerFileMetadata)>, } -#[derive(Debug)] -pub(crate) enum UploadOp { +#[derive(Clone, Debug)] +pub enum UploadOp { /// Upload a layer file. The last field indicates the last operation for thie file. UploadLayer(ResidentLayer, LayerFileMetadata, Option), @@ -338,3 +504,900 @@ impl std::fmt::Display for UploadOp { } } } + +impl UploadOp { + /// Returns true if self can bypass other, i.e. if the operations don't conflict. index is the + /// active index when other would be uploaded -- if we allow self to bypass other, this would + /// be the active index when self is uploaded. + pub fn can_bypass(&self, other: &UploadOp, index: &IndexPart) -> bool { + match (self, other) { + // Nothing can bypass a barrier or shutdown, and it can't bypass anything. + (UploadOp::Barrier(_), _) | (_, UploadOp::Barrier(_)) => false, + (UploadOp::Shutdown, _) | (_, UploadOp::Shutdown) => false, + + // Uploads and deletes can bypass each other unless they're for the same file. + (UploadOp::UploadLayer(a, ameta, _), UploadOp::UploadLayer(b, bmeta, _)) => { + let aname = &a.layer_desc().layer_name(); + let bname = &b.layer_desc().layer_name(); + !is_same_remote_layer_path(aname, ameta, bname, bmeta) + } + (UploadOp::UploadLayer(u, umeta, _), UploadOp::Delete(d)) + | (UploadOp::Delete(d), UploadOp::UploadLayer(u, umeta, _)) => { + d.layers.iter().all(|(dname, dmeta)| { + !is_same_remote_layer_path(&u.layer_desc().layer_name(), umeta, dname, dmeta) + }) + } + + // Deletes are idempotent and can always bypass each other. + (UploadOp::Delete(_), UploadOp::Delete(_)) => true, + + // Uploads and deletes can bypass an index upload as long as neither the uploaded index + // nor the active index below it references the file. A layer can't be modified or + // deleted while referenced by an index. + // + // Similarly, index uploads can bypass uploads and deletes as long as neither the + // uploaded index nor the active index references the file (the latter would be + // incorrect use by the caller). + (UploadOp::UploadLayer(u, umeta, _), UploadOp::UploadMetadata { uploaded: i }) + | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::UploadLayer(u, umeta, _)) => { + let uname = u.layer_desc().layer_name(); + !i.references(&uname, umeta) && !index.references(&uname, umeta) + } + (UploadOp::Delete(d), UploadOp::UploadMetadata { uploaded: i }) + | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::Delete(d)) => { + d.layers.iter().all(|(dname, dmeta)| { + !i.references(dname, dmeta) && !index.references(dname, dmeta) + }) + } + + // Indexes can never bypass each other. They can coalesce though, and + // `UploadQueue::next_ready()` currently does this when possible. + (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. 
}) => false, + } + } +} + +#[cfg(test)] +mod tests { + use std::str::FromStr as _; + + use itertools::Itertools as _; + use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + + use super::*; + use crate::DEFAULT_PG_VERSION; + use crate::tenant::Timeline; + use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; + use crate::tenant::storage_layer::Layer; + use crate::tenant::storage_layer::layer::local_layer_path; + + /// Test helper which asserts that two operations are the same, in lieu of UploadOp PartialEq. + #[track_caller] + fn assert_same_op(a: &UploadOp, b: &UploadOp) { + use UploadOp::*; + match (a, b) { + (UploadLayer(a, ameta, atype), UploadLayer(b, bmeta, btype)) => { + assert_eq!(a.layer_desc().layer_name(), b.layer_desc().layer_name()); + assert_eq!(ameta, bmeta); + assert_eq!(atype, btype); + } + (Delete(a), Delete(b)) => assert_eq!(a.layers, b.layers), + (UploadMetadata { uploaded: a }, UploadMetadata { uploaded: b }) => assert_eq!(a, b), + (Barrier(_), Barrier(_)) => {} + (Shutdown, Shutdown) => {} + (a, b) => panic!("{a:?} != {b:?}"), + } + } + + /// Test helper which asserts that two sets of operations are the same. + #[track_caller] + fn assert_same_ops<'a>( + a: impl IntoIterator, + b: impl IntoIterator, + ) { + a.into_iter() + .zip_eq(b) + .for_each(|(a, b)| assert_same_op(a, b)) + } + + /// Test helper to construct a test timeline. + /// + /// TODO: it really shouldn't be necessary to construct an entire tenant and timeline just to + /// test the upload queue -- decouple ResidentLayer from Timeline. + /// + /// TODO: the upload queue uses TimelineMetadata::example() instead, because there's no way to + /// obtain a TimelineMetadata from a Timeline. + fn make_timeline() -> Arc { + // Grab the current test name from the current thread name. + // TODO: TenantHarness shouldn't take a &'static str, but just leak the test name for now. + let test_name = std::thread::current().name().unwrap().to_string(); + let test_name = Box::leak(test_name.into_boxed_str()); + + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create runtime"); + + runtime + .block_on(async { + let harness = TenantHarness::create(test_name).await?; + let (tenant, ctx) = harness.load().await; + tenant + .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) + .await + }) + .expect("failed to create timeline") + } + + /// Test helper to construct an (empty) resident layer. + fn make_layer(timeline: &Arc, name: &str) -> ResidentLayer { + make_layer_with_size(timeline, name, 0) + } + + /// Test helper to construct a resident layer with the given size. + fn make_layer_with_size(timeline: &Arc, name: &str, size: usize) -> ResidentLayer { + let metadata = LayerFileMetadata { + generation: timeline.generation, + shard: timeline.get_shard_index(), + file_size: size as u64, + }; + make_layer_with_metadata(timeline, name, metadata) + } + + /// Test helper to construct a layer with the given metadata. 
+ fn make_layer_with_metadata( + timeline: &Arc, + name: &str, + metadata: LayerFileMetadata, + ) -> ResidentLayer { + let name = LayerName::from_str(name).expect("invalid name"); + let local_path = local_layer_path( + timeline.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &name, + &metadata.generation, + ); + std::fs::write(&local_path, vec![0; metadata.file_size as usize]) + .expect("failed to write file"); + Layer::for_resident(timeline.conf, timeline, local_path, name, metadata) + } + + /// Test helper to add a layer to an index and return a new index. + fn index_with(index: &IndexPart, layer: &ResidentLayer) -> Box { + let mut index = index.clone(); + index + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + Box::new(index) + } + + /// Test helper to remove a layer from an index and return a new index. + fn index_without(index: &IndexPart, layer: &ResidentLayer) -> Box { + let mut index = index.clone(); + index + .layer_metadata + .remove(&layer.layer_desc().layer_name()); + Box::new(index) + } + + /// Nothing can bypass a barrier, and it can't bypass inprogress tasks. + #[test] + fn schedule_barrier() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; + let tli = make_timeline(); + + let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let (barrier, _) = tokio::sync::watch::channel(()); + + // Enqueue non-conflicting upload, delete, and index before and after a barrier. + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::Barrier(barrier), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule the initial operations ahead of the barrier. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]); + assert!(matches!( + queue.queued_operations.front(), + Some(&UploadOp::Barrier(_)) + )); + + // Complete the initial operations. The barrier isn't scheduled while they're pending. + for task in tasks { + assert!(queue.schedule_ready().is_empty()); + queue.complete(task.task_id); + } + + // Schedule the barrier. The later tasks won't schedule until it completes. + let tasks = queue.schedule_ready(); + + assert_eq!(tasks.len(), 1); + assert!(matches!(tasks[0].op, UploadOp::Barrier(_))); + assert_eq!(queue.queued_operations.len(), 3); + + // Complete the barrier. The rest of the tasks schedule immediately. 
+ queue.complete(tasks[0].task_id); + + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[4..]); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Deletes can be scheduled in parallel, even if they're for the same file. + #[test] + fn schedule_delete_parallel() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; + let tli = make_timeline(); + + // Enqueue a bunch of deletes, some with conflicting names. + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + + let ops = [ + UploadOp::Delete(Delete { + layers: vec![(layer0.layer_desc().layer_name(), layer0.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![ + (layer1.layer_desc().layer_name(), layer1.metadata()), + (layer2.layer_desc().layer_name(), layer2.metadata()), + ], + }), + UploadOp::Delete(Delete { + layers: vec![(layer2.layer_desc().layer_name(), layer2.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule all ready operations. Since deletes don't conflict, they're all scheduled. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Conflicting uploads are serialized. + #[test] + fn schedule_upload_conflicts() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue three versions of the same layer, with different file sizes. + let layer0a = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 1, + ); + let layer0b = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 2, + ); + let layer0c = make_layer_with_size( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + 3, + ); + + let ops = [ + UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata(), None), + UploadOp::UploadLayer(layer0b.clone(), layer0b.metadata(), None), + UploadOp::UploadLayer(layer0c.clone(), layer0c.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Only one version should be scheduled and uploaded at a time. 
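// (All three versions resolve to the same remote layer path, so each upload conflicts with
// the in-progress one before it and they are serialized in queue order.)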
+ for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.schedule_ready().is_empty()); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Conflicting uploads and deletes are serialized. + #[test] + fn schedule_upload_delete_conflicts() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue two layer uploads, with a delete of both layers in between them. These should be + // scheduled one at a time, since deletes can't bypass uploads and vice versa. + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![ + (layer0.layer_desc().layer_name(), layer0.metadata()), + (layer1.layer_desc().layer_name(), layer1.metadata()), + ], + }), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Only one version should be scheduled and uploaded at a time. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.schedule_ready().is_empty()); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Non-conflicting uploads and deletes can bypass the queue, avoiding the conflicting + /// delete/upload operations at the head of the queue. + #[test] + fn schedule_upload_delete_conflicts_bypass() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue two layer uploads, with a delete of both layers in between them. These should be + // scheduled one at a time, since deletes can't bypass uploads and vice versa. + // + // Also enqueue non-conflicting uploads and deletes at the end. These can bypass the queue + // and run immediately. 
+ let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![ + (layer0.layer_desc().layer_name(), layer0.metadata()), + (layer1.layer_desc().layer_name(), layer1.metadata()), + ], + }), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations 0, 3, and 4 are scheduled immediately. + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), [&ops[0], &ops[3], &ops[4]]); + assert_eq!(queue.queued_operations.len(), 2); + + Ok(()) + } + + /// Non-conflicting uploads are parallelized. + #[test] + fn schedule_upload_parallel() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue three different layer uploads. + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // All uploads should be scheduled concurrently. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Index uploads are coalesced. + #[test] + fn schedule_index_coalesce() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + + // Enqueue three uploads of the current empty index. + let index = Box::new(queue.clean.0.clone()); + + let ops = [ + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // The index uploads are coalesced into a single operation. 
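The assertion that follows checks that back-to-back index uploads collapse into a single task, with the superseded ones reported as `coalesced_ops`. A standalone sketch of that coalescing step, using a simplified op type rather than the real `UploadOp` (illustrative only):

```rust
use std::collections::VecDeque;

#[derive(Clone, Debug, PartialEq)]
enum Op {
    UploadLayer(&'static str),
    UploadMetadata { version: u64 },
}

/// Pops the next op; consecutive metadata uploads at the head of the queue are
/// collapsed into the newest one, with the superseded ops returned alongside it.
fn next_coalesced(queue: &mut VecDeque<Op>) -> Option<(Op, Vec<Op>)> {
    let mut op = queue.pop_front()?;
    let mut coalesced = Vec::new();
    while matches!(&op, Op::UploadMetadata { .. })
        && matches!(queue.front(), Some(Op::UploadMetadata { .. }))
    {
        // The older index is superseded by the newer one right behind it.
        coalesced.push(std::mem::replace(&mut op, queue.pop_front().unwrap()));
    }
    Some((op, coalesced))
}

fn main() {
    let mut queue: VecDeque<Op> = VecDeque::from(vec![
        Op::UploadMetadata { version: 1 },
        Op::UploadMetadata { version: 2 },
        Op::UploadMetadata { version: 3 },
        Op::UploadLayer("layer0"),
    ]);
    let (op, coalesced) = next_coalesced(&mut queue).unwrap();
    // Only the newest index upload survives; versions 1 and 2 are coalesced into it.
    assert_eq!(op, Op::UploadMetadata { version: 3 });
    assert_eq!(coalesced.len(), 2);
    assert_eq!(queue.front(), Some(&Op::UploadLayer("layer0")));
    println!("upload {op:?}, coalescing {coalesced:?}");
}
```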
+ let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &ops[2]); + assert_same_ops(&tasks[0].coalesced_ops, &ops[0..2]); + + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Chains of upload/index operations lead to parallel layer uploads and serial index uploads. + /// This is the common case with layer flushes. + #[test] + fn schedule_index_upload_chain() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue three uploads of the current empty index. + let index = Box::new(queue.clean.0.clone()); + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let index0 = index_with(&index, &layer0); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let index1 = index_with(&index0, &layer1); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let index2 = index_with(&index1, &layer2); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index0.clone(), + }, + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index1.clone(), + }, + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index2.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // The layer uploads should be scheduled immediately. The indexes must wait. + let upload_tasks = queue.schedule_ready(); + assert_same_ops( + upload_tasks.iter().map(|t| &t.op), + [&ops[0], &ops[2], &ops[4]], + ); + + // layer2 completes first. None of the indexes can upload yet. + queue.complete(upload_tasks[2].task_id); + assert!(queue.schedule_ready().is_empty()); + + // layer0 completes. index0 can upload. It completes. + queue.complete(upload_tasks[0].task_id); + let index_tasks = queue.schedule_ready(); + assert_eq!(index_tasks.len(), 1); + assert_same_op(&index_tasks[0].op, &ops[1]); + queue.complete(index_tasks[0].task_id); + + // layer 1 completes. This unblocks index 1 and 2, which coalesce into + // a single upload for index 2. + queue.complete(upload_tasks[1].task_id); + + let index_tasks = queue.schedule_ready(); + assert_eq!(index_tasks.len(), 1); + assert_same_op(&index_tasks[0].op, &ops[5]); + assert_same_ops(&index_tasks[0].coalesced_ops, &ops[3..4]); + + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// A delete can't bypass an index upload if an index ahead of it still references it. + #[test] + fn schedule_index_delete_dereferenced() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Create a layer to upload. + let layer = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let index_upload = index_with(&queue.clean.0, &layer); + + // Remove the layer reference in a new index, then delete the layer. 
+ let index_deref = index_without(&index_upload, &layer); + + let ops = [ + // Initial upload, with a barrier to prevent index coalescing. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_upload.clone(), + }, + UploadOp::Barrier(tokio::sync::watch::channel(()).0), + // Dereference the layer and delete it. + UploadOp::UploadMetadata { + uploaded: index_deref.clone(), + }, + UploadOp::Delete(Delete { + layers: vec![(layer.layer_desc().layer_name(), layer.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations are serialized. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// An upload with a reused layer name doesn't clobber the previous layer. Specifically, a + /// dereference/upload/reference cycle can't allow the upload to bypass the reference. + #[test] + fn schedule_index_upload_dereferenced() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Create a layer to upload. + let layer = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + + // Upload the layer. Then dereference the layer, and upload/reference it again. + let index_upload = index_with(&queue.clean.0, &layer); + let index_deref = index_without(&index_upload, &layer); + let index_ref = index_with(&index_deref, &layer); + + let ops = [ + // Initial upload, with a barrier to prevent index coalescing. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_upload.clone(), + }, + UploadOp::Barrier(tokio::sync::watch::channel(()).0), + // Dereference the layer. + UploadOp::UploadMetadata { + uploaded: index_deref.clone(), + }, + // Replace and reference the layer. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_ref.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations are serialized. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Nothing can bypass a shutdown, and it waits for inprogress tasks. It's never returned from + /// next_ready(), but is left at the head of the queue. 
+ #[test] + fn schedule_shutdown() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; + let tli = make_timeline(); + + let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter + let layer0 = make_layer( + &tli, + "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer1 = make_layer( + &tli, + "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer2 = make_layer( + &tli, + "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + let layer3 = make_layer( + &tli, + "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", + ); + + // Enqueue non-conflicting upload, delete, and index before and after a shutdown. + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::Shutdown, + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule the initial operations ahead of the shutdown. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]); + assert!(matches!( + queue.queued_operations.front(), + Some(&UploadOp::Shutdown) + )); + + // Complete the initial operations. The shutdown isn't triggered while they're pending. + for task in tasks { + assert!(queue.schedule_ready().is_empty()); + queue.complete(task.task_id); + } + + // The shutdown is triggered the next time we try to pull an operation. It isn't returned, + // but is left in the queue. + assert!(!queue.shutdown_ready.is_closed()); + assert!(queue.next_ready().is_none()); + assert!(queue.shutdown_ready.is_closed()); + + Ok(()) + } + + /// Scheduling respects inprogress_limit. + #[test] + fn schedule_inprogress_limit() -> anyhow::Result<()> { + // Create a queue with inprogress_limit=2. + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 2)?; + let tli = make_timeline(); + + // Enqueue a bunch of uploads. 
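The `schedule_inprogress_limit` test below builds the queue with `inprogress_limit = 2` and expects only two tasks in flight at a time. A standalone sketch of that throttle, omitting the conflict checks exercised by the earlier tests (simplified types, not the real scheduler):

```rust
use std::collections::VecDeque;

/// Schedules queued items while respecting a cap on concurrently running tasks.
fn schedule_up_to_limit<T>(queued: &mut VecDeque<T>, inflight: &mut usize, limit: usize) -> Vec<T> {
    let mut scheduled = Vec::new();
    while *inflight < limit {
        match queued.pop_front() {
            Some(item) => {
                *inflight += 1;
                scheduled.push(item);
            }
            None => break,
        }
    }
    scheduled
}

fn main() {
    let mut queued: VecDeque<u32> = (0..4).collect();
    let mut inflight = 0;
    // With a limit of 2, only the first two items start.
    assert_eq!(schedule_up_to_limit(&mut queued, &mut inflight, 2), vec![0, 1]);
    // Completing one task frees a slot for exactly one more.
    inflight -= 1;
    assert_eq!(schedule_up_to_limit(&mut queued, &mut inflight, 2), vec![2]);
    println!("remaining: {queued:?}");
}
```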
+        let layer0 = make_layer(
+            &tli,
+            "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
+        );
+        let layer1 = make_layer(
+            &tli,
+            "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
+        );
+        let layer2 = make_layer(
+            &tli,
+            "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
+        );
+        let layer3 = make_layer(
+            &tli,
+            "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51",
+        );
+
+        let ops = [
+            UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None),
+            UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None),
+            UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None),
+            UploadOp::UploadLayer(layer3.clone(), layer3.metadata(), None),
+        ];
+
+        queue.queued_operations.extend(ops.clone());
+
+        // Schedule all ready operations. Only 2 are scheduled.
+        let tasks = queue.schedule_ready();
+        assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..2]);
+        assert!(queue.next_ready().is_none());
+
+        // When one completes, another is scheduled.
+        queue.complete(tasks[0].task_id);
+        let tasks = queue.schedule_ready();
+        assert_same_ops(tasks.iter().map(|t| &t.op), &ops[2..3]);
+
+        Ok(())
+    }
+
+    /// Tests that can_bypass takes name, generation and shard index into account for all operations.
+    #[test]
+    fn can_bypass_path() -> anyhow::Result<()> {
+        let tli = make_timeline();
+
+        let name0 = &"000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51";
+        let name1 = &"100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51";
+
+        // Asserts that layers a and b either can or can't bypass each other, for all combinations
+        // of operations (except Delete and UploadMetadata which are special-cased).
+        #[track_caller]
+        fn assert_can_bypass(a: ResidentLayer, b: ResidentLayer, can_bypass: bool) {
+            let index = IndexPart::empty(TimelineMetadata::example());
+            for (a, b) in make_ops(a).into_iter().zip(make_ops(b)) {
+                match (&a, &b) {
+                    // Deletes can always bypass each other.
+                    (UploadOp::Delete(_), UploadOp::Delete(_)) => assert!(a.can_bypass(&b, &index)),
+                    // Indexes can never bypass each other.
+                    (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => {
+                        assert!(!a.can_bypass(&b, &index))
+                    }
+                    // For other operations, assert as requested.
+                    (a, b) => assert_eq!(a.can_bypass(b, &index), can_bypass),
+                }
+            }
+        }
+
+        fn make_ops(layer: ResidentLayer) -> Vec<UploadOp> {
+            let mut index = IndexPart::empty(TimelineMetadata::example());
+            index
+                .layer_metadata
+                .insert(layer.layer_desc().layer_name(), layer.metadata());
+            vec![
+                UploadOp::UploadLayer(layer.clone(), layer.metadata(), None),
+                UploadOp::Delete(Delete {
+                    layers: vec![(layer.layer_desc().layer_name(), layer.metadata())],
+                }),
+                UploadOp::UploadMetadata {
+                    uploaded: Box::new(index),
+                },
+            ]
+        }
+
+        // Makes a ResidentLayer.
+        let layer = |name: &'static str, shard: Option<u8>, generation: u32| -> ResidentLayer {
+            let shard = shard
+                .map(|n| ShardIndex::new(ShardNumber(n), ShardCount(8)))
+                .unwrap_or(ShardIndex::unsharded());
+            let metadata = LayerFileMetadata {
+                shard,
+                generation: Generation::Valid(generation),
+                file_size: 0,
+            };
+            make_layer_with_metadata(&tli, name, metadata)
+        };
+
+        // Same name and metadata can't bypass.
This goes both for unsharded and sharded, as well as + // 0 or >0 generation. + assert_can_bypass(layer(name0, None, 0), layer(name0, None, 0), false); + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(0), 0), false); + assert_can_bypass(layer(name0, None, 1), layer(name0, None, 1), false); + + // Different names can bypass. + assert_can_bypass(layer(name0, None, 0), layer(name1, None, 0), true); + + // Different shards can bypass. Shard 0 is different from unsharded. + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(1), 0), true); + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, None, 0), true); + + // Different generations can bypass, both sharded and unsharded. + assert_can_bypass(layer(name0, None, 0), layer(name0, None, 1), true); + assert_can_bypass(layer(name0, Some(1), 0), layer(name0, Some(1), 1), true); + + Ok(()) + } +} diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 7ac0dfd7e2..c017383121 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -27,14 +27,14 @@ use utils::vec_map::VecMap; use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; -use crate::virtual_file::IoBufferMut; -use crate::virtual_file::{self, VirtualFile}; +use crate::virtual_file::{self, IoBufferMut, VirtualFile}; /// Metadata bundled with the start and end offset of a blob. #[derive(Copy, Clone, Debug)] pub struct BlobMeta { pub key: Key, pub lsn: Lsn, + pub will_init: bool, } /// A view into the vectored blobs read buffer. @@ -138,7 +138,10 @@ impl VectoredBlob { bits => { let error = std::io::Error::new( std::io::ErrorKind::InvalidData, - format!("Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.start, self.end), + format!( + "Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", + self.meta.key, self.meta.lsn, self.start, self.end + ), ); Err(error) } @@ -310,7 +313,15 @@ pub enum BlobFlag { /// * Iterate over the collected blobs and coalesce them into reads at the end pub struct VectoredReadPlanner { // Track all the blob offsets. Start offsets must be ordered. 
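The field below gains a fourth tuple element, `will_init`, and the `handle` hunk further down defines how it is set: `BlobFlag::None` appends a blob with `will_init = false`, `BlobFlag::ReplaceAll` clears the key's pending blobs and appends one with `will_init = true`, and `BlobFlag::Ignore` drops the blob. A standalone sketch of that bookkeeping, with `u64` standing in for `Key` and `Lsn` (illustrative only, not the planner's actual API):

```rust
use std::collections::BTreeMap;

enum BlobFlag {
    None,
    ReplaceAll,
    Ignore,
}

/// (lsn, start offset, end offset, will_init) per key, mirroring the planner's map.
type Blobs = BTreeMap<u64, Vec<(u64, u64, u64, bool)>>;

fn handle(blobs: &mut Blobs, key: u64, lsn: u64, start: u64, end: u64, flag: BlobFlag) {
    match flag {
        // A delta record: appended behind whatever is already planned for the key.
        BlobFlag::None => blobs.entry(key).or_default().push((lsn, start, end, false)),
        // A blob that re-initializes the page: older blobs for the key are no longer needed.
        BlobFlag::ReplaceAll => {
            let for_key = blobs.entry(key).or_default();
            for_key.clear();
            for_key.push((lsn, start, end, true));
        }
        BlobFlag::Ignore => {}
    }
}

fn main() {
    let mut blobs = Blobs::new();
    handle(&mut blobs, 1, 10, 0, 100, BlobFlag::None);
    handle(&mut blobs, 1, 20, 100, 200, BlobFlag::ReplaceAll);
    handle(&mut blobs, 1, 30, 200, 300, BlobFlag::None);
    handle(&mut blobs, 1, 40, 300, 400, BlobFlag::Ignore);
    // The ReplaceAll at lsn 20 wiped the earlier delta; the Ignore left no trace.
    assert_eq!(blobs[&1], vec![(20, 100, 200, true), (30, 200, 300, false)]);
    println!("{blobs:?}");
}
```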
-    blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>,
+    // Values in the value tuples are:
+    // (
+    //   lsn of the blob,
+    //   start offset of the blob in the underlying file,
+    //   end offset of the blob in the underlying file,
+    //   whether the blob initializes the page image or not
+    //   see [`pageserver_api::record::NeonWalRecord::will_init`]
+    // )
+    blobs: BTreeMap<Key, Vec<(Lsn, u64, u64, bool)>>,
     // Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
     prev: Option<(Key, Lsn, u64, BlobFlag)>,
@@ -371,12 +382,12 @@ impl VectoredReadPlanner {
         match flag {
             BlobFlag::None => {
                 let blobs_for_key = self.blobs.entry(key).or_default();
-                blobs_for_key.push((lsn, start_offset, end_offset));
+                blobs_for_key.push((lsn, start_offset, end_offset, false));
             }
             BlobFlag::ReplaceAll => {
                 let blobs_for_key = self.blobs.entry(key).or_default();
                 blobs_for_key.clear();
-                blobs_for_key.push((lsn, start_offset, end_offset));
+                blobs_for_key.push((lsn, start_offset, end_offset, true));
             }
             BlobFlag::Ignore => {}
         }
@@ -387,11 +398,17 @@ impl VectoredReadPlanner {
         let mut reads = Vec::new();
         for (key, blobs_for_key) in self.blobs {
-            for (lsn, start_offset, end_offset) in blobs_for_key {
+            for (lsn, start_offset, end_offset, will_init) in blobs_for_key {
                 let extended = match &mut current_read_builder {
-                    Some(read_builder) => {
-                        read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn })
-                    }
+                    Some(read_builder) => read_builder.extend(
+                        start_offset,
+                        end_offset,
+                        BlobMeta {
+                            key,
+                            lsn,
+                            will_init,
+                        },
+                    ),
                     None => VectoredReadExtended::No,
                 };
@@ -399,7 +416,11 @@ impl VectoredReadPlanner {
                     let next_read_builder = ChunkedVectoredReadBuilder::new(
                         start_offset,
                         end_offset,
-                        BlobMeta { key, lsn },
+                        BlobMeta {
+                            key,
+                            lsn,
+                            will_init,
+                        },
                         self.max_read_size,
                     );
@@ -527,7 +548,7 @@ impl<'a> VectoredBlobReader<'a> {
 pub struct StreamingVectoredReadPlanner {
     read_builder: Option<ChunkedVectoredReadBuilder>,
     // Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
-    prev: Option<(Key, Lsn, u64)>,
+    prev: Option<(Key, Lsn, u64, bool)>,
     /// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150,
     /// we will produce a single batch instead of split them.
max_read_size: u64, @@ -550,27 +571,47 @@ impl StreamingVectoredReadPlanner { } } - pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64) -> Option { + pub fn handle( + &mut self, + key: Key, + lsn: Lsn, + offset: u64, + will_init: bool, + ) -> Option { // Implementation note: internally lag behind by one blob such that // we have a start and end offset when initialising [`VectoredRead`] - let (prev_key, prev_lsn, prev_offset) = match self.prev { + let (prev_key, prev_lsn, prev_offset, prev_will_init) = match self.prev { None => { - self.prev = Some((key, lsn, offset)); + self.prev = Some((key, lsn, offset, will_init)); return None; } Some(prev) => prev, }; - let res = self.add_blob(prev_key, prev_lsn, prev_offset, offset, false); + let res = self.add_blob( + prev_key, + prev_lsn, + prev_offset, + offset, + false, + prev_will_init, + ); - self.prev = Some((key, lsn, offset)); + self.prev = Some((key, lsn, offset, will_init)); res } pub fn handle_range_end(&mut self, offset: u64) -> Option { - let res = if let Some((prev_key, prev_lsn, prev_offset)) = self.prev { - self.add_blob(prev_key, prev_lsn, prev_offset, offset, true) + let res = if let Some((prev_key, prev_lsn, prev_offset, prev_will_init)) = self.prev { + self.add_blob( + prev_key, + prev_lsn, + prev_offset, + offset, + true, + prev_will_init, + ) } else { None }; @@ -587,10 +628,19 @@ impl StreamingVectoredReadPlanner { start_offset: u64, end_offset: u64, is_last_blob_in_read: bool, + will_init: bool, ) -> Option { match &mut self.read_builder { Some(read_builder) => { - let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }); + let extended = read_builder.extend( + start_offset, + end_offset, + BlobMeta { + key, + lsn, + will_init, + }, + ); assert_eq!(extended, VectoredReadExtended::Yes); } None => { @@ -598,7 +648,11 @@ impl StreamingVectoredReadPlanner { Some(ChunkedVectoredReadBuilder::new_streaming( start_offset, end_offset, - BlobMeta { key, lsn }, + BlobMeta { + key, + lsn, + will_init, + }, )) }; } @@ -625,13 +679,12 @@ impl StreamingVectoredReadPlanner { mod tests { use anyhow::Error; + use super::super::blob_io::tests::{random_array, write_maybe_compressed}; + use super::*; use crate::context::DownloadBehavior; use crate::page_cache::PAGE_SZ; use crate::task_mgr::TaskKind; - use super::super::blob_io::tests::{random_array, write_maybe_compressed}; - use super::*; - fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64; assert_eq!(read.start % ALIGN, 0); @@ -812,7 +865,7 @@ mod tests { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1000); let mut reads = Vec::new(); for (key, lsn, offset, _) in blob_descriptions.clone() { - reads.extend(planner.handle(key, lsn, offset)); + reads.extend(planner.handle(key, lsn, offset, false)); } reads.extend(planner.handle_range_end(652 * 1024)); @@ -850,7 +903,7 @@ mod tests { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); let mut reads = Vec::new(); for (key, lsn, offset, _) in blob_descriptions.clone() { - reads.extend(planner.handle(key, lsn, offset)); + reads.extend(planner.handle(key, lsn, offset, false)); } reads.extend(planner.handle_range_end(652 * 1024)); @@ -875,7 +928,7 @@ mod tests { { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); let mut reads = Vec::new(); - reads.extend(planner.handle(key, lsn, 0)); + reads.extend(planner.handle(key, lsn, 0, false)); 
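The calls around this point feed blob start offsets into the streaming planner one at a time; as the `handle`/`handle_range_end` hunk above notes, the planner deliberately lags one blob behind so it knows a blob's end offset before emitting it. A standalone sketch of that lag-by-one pattern, tracking offsets only (not the real planner API):

```rust
/// Emits (start, end) pairs for a stream of blob start offsets, where each blob
/// ends where the next one begins. The planner must lag one blob behind, because
/// a blob's end offset is only known once the following offset (or the range end)
/// arrives.
#[derive(Default)]
struct LagOnePlanner {
    prev_start: Option<u64>,
}

impl LagOnePlanner {
    fn handle(&mut self, offset: u64) -> Option<(u64, u64)> {
        // Remember the new start; emit the previous blob if there was one.
        let prev = self.prev_start.replace(offset)?;
        Some((prev, offset))
    }

    fn handle_range_end(&mut self, end: u64) -> Option<(u64, u64)> {
        // Flush the last pending blob, ending at the range end.
        let prev = self.prev_start.take()?;
        Some((prev, end))
    }
}

fn main() {
    let mut planner = LagOnePlanner::default();
    let mut spans = Vec::new();
    for offset in [0u64, 128, 512] {
        spans.extend(planner.handle(offset));
    }
    spans.extend(planner.handle_range_end(1024));
    // The first call returns nothing (no end offset yet); later calls emit the previous blob.
    assert_eq!(spans, vec![(0, 128), (128, 512), (512, 1024)]);
    println!("{spans:?}");
}
```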
reads.extend(planner.handle_range_end(652 * 1024)); assert_eq!(reads.len(), 1); validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]); @@ -883,8 +936,8 @@ mod tests { { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 1); let mut reads = Vec::new(); - reads.extend(planner.handle(key, lsn, 0)); - reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle(key, lsn, 0, false)); + reads.extend(planner.handle(key, lsn, 128 * 1024, false)); reads.extend(planner.handle_range_end(652 * 1024)); assert_eq!(reads.len(), 2); validate_read(&reads[0], &[(key, lsn, 0, BlobFlag::None)]); @@ -893,8 +946,8 @@ mod tests { { let mut planner = StreamingVectoredReadPlanner::new(max_read_size, 2); let mut reads = Vec::new(); - reads.extend(planner.handle(key, lsn, 0)); - reads.extend(planner.handle(key, lsn, 128 * 1024)); + reads.extend(planner.handle(key, lsn, 0, false)); + reads.extend(planner.handle(key, lsn, 128 * 1024, false)); reads.extend(planner.handle_range_end(652 * 1024)); assert_eq!(reads.len(), 1); validate_read( @@ -923,6 +976,7 @@ mod tests { let meta = BlobMeta { key: Key::MIN, lsn: Lsn(0), + will_init: false, }; for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index a0223f3bce..29d1a31aaf 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -3,13 +3,15 @@ //! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the //! truth. -use anyhow::Context; use std::path::Path; + +use anyhow::Context; +use pageserver_api::models::PageserverUtilization; use utils::serde_percent::Percent; -use pageserver_api::models::PageserverUtilization; - -use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager}; +use crate::config::PageServerConf; +use crate::metrics::NODE_UTILIZATION_SCORE; +use crate::tenant::mgr::TenantManager; pub(crate) fn regenerate( conf: &PageServerConf, @@ -49,7 +51,7 @@ pub(crate) fn regenerate( }; // Express a static value for how many shards we may schedule on one node - const MAX_SHARDS: u32 = 20000; + const MAX_SHARDS: u32 = 5000; let mut doc = PageserverUtilization { disk_usage_bytes: used, diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index a0dc493bf4..1f5a820ce7 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -11,11 +11,13 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! 
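The `virtual_file.rs` hunks below add `read_to_end`/`read_to_string` helpers built on the positional `read_at` API: read fixed-size chunks at the current offset, retry on `Interrupted`, stop at EOF, then validate UTF-8. A standalone sketch of the same loop over std's `FileExt::read_at` (the real code goes through `VirtualFileInner` and owned buffers):

```rust
use std::fs::File;
use std::io::{self, ErrorKind};
use std::os::unix::fs::FileExt;

/// Reads the whole file into `buf` using positional reads, mirroring the
/// chunked read_to_end loop added to VirtualFileInner (128-byte chunks there too).
fn read_to_end_positional(file: &File, buf: &mut Vec<u8>) -> io::Result<()> {
    let mut pos = 0u64;
    let mut chunk = [0u8; 128];
    loop {
        match file.read_at(&mut chunk, pos) {
            // EOF: nothing more to read.
            Ok(0) => return Ok(()),
            Ok(n) => {
                pos += n as u64;
                buf.extend_from_slice(&chunk[..n]);
            }
            // Transient signal interruption: retry the same read.
            Err(ref e) if e.kind() == ErrorKind::Interrupted => {}
            Err(e) => return Err(e),
        }
    }
}

fn main() -> io::Result<()> {
    let file = File::open("/etc/hostname")?; // any readable file works here
    let mut buf = Vec::new();
    read_to_end_positional(&file, &mut buf)?;
    println!("read {} bytes: {:?}", buf.len(), String::from_utf8_lossy(&buf));
    Ok(())
}
```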
-use crate::context::RequestContext; -use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; +use std::fs::File; +use std::io::{Error, ErrorKind, Seek, SeekFrom}; +use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; +#[cfg(target_os = "linux")] +use std::os::unix::fs::OpenOptionsExt; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; -use crate::page_cache::{PageWriteGuard, PAGE_SZ}; -use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; @@ -23,31 +25,30 @@ use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlig use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut}; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; +pub use pageserver_api::models::virtual_file as api; use pageserver_api::shard::TenantShardId; -use std::fs::File; -use std::io::{Error, ErrorKind, Seek, SeekFrom}; -#[cfg(target_os = "linux")] -use std::os::unix::fs::OpenOptionsExt; -use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; - -use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; +use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; -pub use pageserver_api::models::virtual_file as api; +use crate::context::RequestContext; +use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC, StorageIoOperation}; +use crate::page_cache::{PAGE_SZ, PageWriteGuard}; +use crate::tenant::TENANTS_SEGMENT_NAME; pub(crate) mod io_engine; -pub use io_engine::feature_test as io_engine_feature_test; -pub use io_engine::io_engine_for_bench; -pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; +pub use io_engine::{ + FeatureTestResult as IoEngineFeatureTestResult, feature_test as io_engine_feature_test, + io_engine_for_bench, +}; mod metadata; mod open_options; -use self::owned_buffers_io::write::OwnedAsyncWriter; pub(crate) use api::IoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; +use self::owned_buffers_io::write::OwnedAsyncWriter; + pub(crate) mod owned_buffers_io { //! Abstractions for IO with owned buffers. //! @@ -230,6 +231,19 @@ impl VirtualFile { ) -> (FullSlice, Result) { self.inner.write_all(buf, ctx).await } + + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + self.inner.read_to_end(buf, ctx).await + } + + pub(crate) async fn read_to_string( + &mut self, + ctx: &RequestContext, + ) -> Result { + let mut buf = Vec::new(); + self.read_to_end(&mut buf, ctx).await?; + Ok(String::from_utf8(buf)?) + } } /// Indicates whether to enable fsync, fdatasync, or O_SYNC/O_DSYNC when writing @@ -479,7 +493,8 @@ pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool { /// bad storage or bad configuration, and we can't fix that from inside /// a running process. pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! 
{ - tracing::error!("Fatal I/O error: {e}: {context})"); + let backtrace = std::backtrace::Backtrace::force_capture(); + tracing::error!("Fatal I/O error: {e}: {context})\n{backtrace}"); std::process::abort(); } @@ -924,13 +939,18 @@ impl VirtualFileInner { where Buf: tokio_epoll_uring::IoBufMut + Send, { - let file_guard = match self.lock_file().await { + let file_guard = match self + .lock_file() + .await + .maybe_fatal_err("lock_file inside VirtualFileInner::read_at") + { Ok(file_guard) => file_guard, Err(e) => return (buf, Err(e)), }; observe_duration!(StorageIoOperation::Read, { let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await; + let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at"); if let Ok(size) = res { STORAGE_IO_SIZE .with_label_values(&[ @@ -983,6 +1003,24 @@ impl VirtualFileInner { (buf, result) }) } + + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + let mut tmp = vec![0; 128]; + loop { + let slice = tmp.slice(..128); + let (slice, res) = self.read_at(slice, self.pos, ctx).await; + match res { + Ok(0) => return Ok(()), + Ok(n) => { + self.pos += n as u64; + buf.extend_from_slice(&slice[..n]); + } + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + tmp = slice.into_inner(); + } + } } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 @@ -1031,7 +1069,8 @@ where #[cfg(test)] mod test_read_exact_at_impl { - use std::{collections::VecDeque, sync::Arc}; + use std::collections::VecDeque; + use std::sync::Arc; use tokio_epoll_uring::{BoundedBuf, BoundedBufMut}; @@ -1227,10 +1266,6 @@ impl VirtualFile { ) -> Result, std::io::Error> { self.inner.read_blk(blknum, ctx).await } - - async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { - self.inner.read_to_end(buf, ctx).await - } } #[cfg(test)] @@ -1250,24 +1285,6 @@ impl VirtualFileInner { slice.into_inner(), )) } - - async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { - let mut tmp = vec![0; 128]; - loop { - let slice = tmp.slice(..128); - let (slice, res) = self.read_at(slice, self.pos, ctx).await; - match res { - Ok(0) => return Ok(()), - Ok(n) => { - self.pos += n as u64; - buf.extend_from_slice(&slice[..n]); - } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), - } - tmp = slice.into_inner(); - } - } } impl Drop for VirtualFileInner { @@ -1400,19 +1417,19 @@ static SYNC_MODE: AtomicU8 = AtomicU8::new(SyncMode::Sync as u8); #[cfg(test)] mod tests { - use crate::context::DownloadBehavior; - use crate::task_mgr::TaskKind; - - use super::*; - use owned_buffers_io::io_buf_ext::IoBufExt; - use owned_buffers_io::slice::SliceMutExt; - use rand::seq::SliceRandom; - use rand::thread_rng; - use rand::Rng; use std::io::Write; use std::os::unix::fs::FileExt; use std::sync::Arc; + use owned_buffers_io::io_buf_ext::IoBufExt; + use owned_buffers_io::slice::SliceMutExt; + use rand::seq::SliceRandom; + use rand::{Rng, thread_rng}; + + use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + enum MaybeVirtualFile { VirtualFile(VirtualFile), File(File), diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index ccde90ee1a..758dd6e377 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -80,7 +80,9 @@ pub(crate) fn 
get() -> IoEngine { Ok(v) => match v.parse::() { Ok(engine_kind) => engine_kind, Err(e) => { - panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}") + panic!( + "invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}" + ) } }, Err(std::env::VarError::NotPresent) => { @@ -107,15 +109,12 @@ pub(crate) fn get() -> IoEngine { } } -use std::{ - os::unix::prelude::FileExt, - sync::atomic::{AtomicU8, Ordering}, -}; +use std::os::unix::prelude::FileExt; +use std::sync::atomic::{AtomicU8, Ordering}; -use super::{ - owned_buffers_io::{io_buf_ext::FullSlice, slice::SliceMutExt}, - FileGuard, Metadata, -}; +use super::owned_buffers_io::io_buf_ext::FullSlice; +use super::owned_buffers_io::slice::SliceMutExt; +use super::{FileGuard, Metadata}; #[cfg(target_os = "linux")] fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs index c67215492f..ad17405b64 100644 --- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -5,18 +5,16 @@ //! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series. //! See for more details. -use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; - -use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; -use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use tokio_epoll_uring::{System, SystemHandle}; - -use crate::virtual_file::on_fatal_io_error; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, error, info, info_span, warn}; +use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use crate::metrics::tokio_epoll_uring::{self as metrics, THREAD_LOCAL_METRICS_STORAGE}; +use crate::virtual_file::on_fatal_io_error; #[derive(Clone)] struct ThreadLocalState(Arc); @@ -194,7 +192,7 @@ impl std::ops::Deref for Handle { fn deref(&self) -> &Self::Target { self.0 - .0 + .0 .cell .get() .expect("must be already initialized when using this") diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 7f951270d1..e188b8649b 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -1,7 +1,9 @@ //! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; +use std::os::fd::OwnedFd; +use std::path::Path; + use super::io_engine::IoEngine; -use std::{os::fd::OwnedFd, path::Path}; #[derive(Debug, Clone)] pub enum OpenOptions { diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index a5c26cd746..090d2ece85 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -1,9 +1,9 @@ -use std::{ - ops::{Deref, Range, RangeBounds}, - sync::Arc, -}; +use std::ops::{Deref, Range, RangeBounds}; +use std::sync::Arc; -use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign}; +use super::alignment::Alignment; +use super::raw::RawAlignedBuffer; +use super::{AlignedBufferMut, ConstAlign}; /// An shared, immutable aligned buffer type. 
#[derive(Clone, Debug)] diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index be07d5539f..c1b4015ae2 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -1,13 +1,9 @@ -use std::{ - mem::MaybeUninit, - ops::{Deref, DerefMut}, -}; +use std::mem::MaybeUninit; +use std::ops::{Deref, DerefMut}; -use super::{ - alignment::{Alignment, ConstAlign}, - buffer::AlignedBuffer, - raw::RawAlignedBuffer, -}; +use super::alignment::{Alignment, ConstAlign}; +use super::buffer::AlignedBuffer; +use super::raw::RawAlignedBuffer; /// A mutable aligned buffer type. #[derive(Debug)] @@ -75,7 +71,8 @@ impl AlignedBufferMut { /// Force the length of the buffer to `new_len`. #[inline] unsafe fn set_len(&mut self, new_len: usize) { - self.raw.set_len(new_len) + // SAFETY: the caller is unsafe + unsafe { self.raw.set_len(new_len) } } #[inline] @@ -222,8 +219,10 @@ unsafe impl bytes::BufMut for AlignedBufferMut { panic_advance(cnt, remaining); } - // Addition will not overflow since the sum is at most the capacity. - self.set_len(len + cnt); + // SAFETY: Addition will not overflow since the sum is at most the capacity. + unsafe { + self.set_len(len + cnt); + } } #[inline] @@ -275,7 +274,10 @@ unsafe impl tokio_epoll_uring::IoBufMut for AlignedBufferMut { unsafe fn set_init(&mut self, init_len: usize) { if self.len() < init_len { - self.set_len(init_len); + // SAFETY: caller function is unsafe + unsafe { + self.set_len(init_len); + } } } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs index 6c26dec0db..97a6c4049a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs @@ -1,9 +1,7 @@ use core::slice; -use std::{ - alloc::{self, Layout}, - cmp, - mem::ManuallyDrop, -}; +use std::alloc::{self, Layout}; +use std::cmp; +use std::mem::ManuallyDrop; use super::alignment::{Alignment, ConstAlign}; diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index 525f447b6d..4c671c2652 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -1,11 +1,12 @@ //! See [`FullSlice`]. -use crate::virtual_file::{IoBuffer, IoBufferMut}; -use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; + +use bytes::{Bytes, BytesMut}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; use super::write::CheapCloneForRead; +use crate::virtual_file::{IoBuffer, IoBufferMut}; /// The true owned equivalent for Rust [`slice`]. Use this for the write path. /// diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs index 6100593663..9f4a05dd57 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/slice.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -1,7 +1,4 @@ -use tokio_epoll_uring::BoundedBuf; -use tokio_epoll_uring::BoundedBufMut; -use tokio_epoll_uring::IoBufMut; -use tokio_epoll_uring::Slice; +use tokio_epoll_uring::{BoundedBuf, BoundedBufMut, IoBufMut, Slice}; pub(crate) trait SliceMutExt { /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO. 
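The `buffer_mut.rs` hunks above wrap each `set_len` call in an explicit `unsafe { ... }` block with its own SAFETY comment, even though the surrounding functions are already `unsafe fn`. That is the style enforced by the `unsafe_op_in_unsafe_fn` lint, which the Rust 2024 edition turns on by default: the body of an `unsafe fn` no longer grants an implicit unsafe context. A minimal self-contained illustration of the pattern:

```rust
#![deny(unsafe_op_in_unsafe_fn)]

/// Doubles the value behind a raw pointer.
///
/// # Safety
/// `ptr` must be non-null, aligned, and valid for reads and writes.
unsafe fn double_in_place(ptr: *mut u32) {
    // Each unsafe operation gets its own block and SAFETY note, instead of
    // relying on the enclosing `unsafe fn`.
    // SAFETY: the caller guarantees `ptr` is valid for reads and writes.
    unsafe {
        *ptr *= 2;
    }
}

fn main() {
    let mut x = 21u32;
    // SAFETY: `&mut x` is a valid, aligned, exclusive pointer for the duration of the call.
    unsafe { double_in_place(&mut x) };
    assert_eq!(x, 42);
    println!("{x}");
}
```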
@@ -35,10 +32,11 @@ where mod tests { use std::io::Read; - use super::*; use bytes::Buf; use tokio_epoll_uring::Slice; + use super::*; + #[test] fn test_slice_full_zeroed() { let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader(); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 2170845dd8..124d8fb75a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,22 +1,17 @@ mod flush; use std::sync::Arc; +use super::io_buf_aligned::{IoBufAligned, IoBufAlignedMut}; use bytes::BufMut; use flush::FlushHandle; use tokio_epoll_uring::IoBuf; -use crate::{ - context::RequestContext, - virtual_file::{IoBuffer, IoBufferMut}, -}; - -use super::{ - io_buf_aligned::{IoBufAligned, IoBufAlignedMut}, - io_buf_ext::{FullSlice, IoBufExt}, -}; - pub(crate) use flush::FlushControl; +use super::io_buf_ext::{FullSlice, IoBufExt}; +use crate::context::RequestContext; +use crate::virtual_file::{IoBuffer, IoBufferMut}; + pub(crate) trait CheapCloneForRead { /// Returns a cheap clone of the buffer. fn cheap_clone(&self) -> Self; diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index e0381e9b22..a90226b783 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -2,12 +2,10 @@ use std::{marker::PhantomData, sync::Arc}; use utils::sync::duplex; -use crate::{ - context::RequestContext, - virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice}, -}; - use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter}; +use crate::context::RequestContext; +use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAligned; +use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; /// A handle to the flush task. pub struct FlushHandle { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 30c8965d51..18df065f76 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -22,39 +22,34 @@ //! bespoken Rust code. 
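The first `walingest.rs` hunk below reconstructs a 64-bit transaction ID for a prepared transaction from its 32-bit XID and the current full `nextXid`, stepping the epoch back by one when the XID appears to predate a wraparound. A standalone sketch of that computation with plain integers (the real code reads `next_full_xid` from the checkpoint and reports the error with `bail!`):

```rust
/// Builds a 64-bit XID for `xid` given the current full nextXid (epoch << 32 | next_xid).
/// If `xid` is numerically ahead of next_xid, it must stem from the previous epoch.
fn full_xid_for(xid: u32, next_full_xid: u64) -> Result<u64, String> {
    let mut epoch = (next_full_xid >> 32) as u32;
    let next_xid = next_full_xid as u32;
    if xid > next_xid {
        // Wraparound occurred, so the XID belongs to the previous epoch.
        if epoch == 0 {
            return Err(format!(
                "apparent XID wraparound with XID {xid}, nextXid is {next_full_xid}"
            ));
        }
        epoch -= 1;
    }
    Ok(((epoch as u64) << 32) | xid as u64)
}

fn main() {
    // nextXid is epoch 2, xid 1000. An older-looking XID (500) stays in epoch 2 ...
    assert_eq!(full_xid_for(500, (2u64 << 32) | 1000).unwrap(), (2u64 << 32) | 500);
    // ... while a numerically larger XID must come from epoch 1 (pre-wraparound).
    assert_eq!(
        full_xid_for(4_000_000_000, (2u64 << 32) | 1000).unwrap(),
        (1u64 << 32) | 4_000_000_000
    );
    println!("ok");
}
```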
use std::collections::HashMap; -use std::sync::Arc; -use std::sync::OnceLock; -use std::time::Duration; -use std::time::Instant; -use std::time::SystemTime; +use std::sync::{Arc, OnceLock}; +use std::time::{Duration, Instant, SystemTime}; +use anyhow::{Result, bail}; +use bytes::{Buf, Bytes}; +use pageserver_api::key::rel_block_to_key; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::fsm_logical_to_physical; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::walrecord::*; -use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; +use postgres_ffi::{ + TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, + fsm_logical_to_physical, pg_constants, +}; +use tracing::*; +use utils::bin_ser::SerializeError; +use utils::lsn::Lsn; +use utils::rate_limit::RateLimit; +use utils::{critical, failpoint_support}; use wal_decoder::models::*; -use anyhow::{bail, Result}; -use bytes::{Buf, Bytes}; -use tracing::*; -use utils::failpoint_support; -use utils::rate_limit::RateLimit; - +use crate::ZERO_PAGE; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::{DatadirModification, Version}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::PageReconstructError; -use crate::tenant::Timeline; -use crate::ZERO_PAGE; -use pageserver_api::key::rel_block_to_key; -use pageserver_api::record::NeonWalRecord; -use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; -use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; -use postgres_ffi::TransactionId; -use utils::bin_ser::SerializeError; -use utils::lsn::Lsn; +use crate::tenant::{PageReconstructError, Timeline}; enum_pgversion! {CheckPoint, pgv::CheckPoint} @@ -303,12 +298,14 @@ impl WalIngest { if xid > next_xid { // Wraparound occurred, must be from a prev epoch. if epoch == 0 { - bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"); + bail!( + "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}" + ); } epoch -= 1; } - Ok((epoch as u64) << 32 | xid as u64) + Ok(((epoch as u64) << 32) | xid as u64) } async fn ingest_clear_vm_bits( @@ -327,93 +324,75 @@ impl WalIngest { let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); - // Sometimes, Postgres seems to create heap WAL records with the - // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is - // not set. In fact, it's possible that the VM page does not exist at all. - // In that case, we don't want to store a record to clear the VM bit; - // replaying it would fail to find the previous image of the page, because - // it doesn't exist. So check if the VM page(s) exist, and skip the WAL - // record if it doesn't. - // - // TODO: analyze the metrics and tighten this up accordingly. This logic - // implicitly assumes that VM pages see explicit WAL writes before - // implicit ClearVmBits, and will otherwise silently drop updates. + // VM bits can only be cleared on the shard(s) owning the VM relation, and must be within + // its view of the VM relation size. 
Out of caution, error instead of failing WAL ingestion, + // as there has historically been cases where PostgreSQL has cleared spurious VM pages. See: + // https://github.com/neondatabase/neon/pull/10634. let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["relation"]) - .inc(); + critical!("clear_vm_bits for unknown VM relation {vm_rel}"); return Ok(()); }; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["new_page"]) - .inc(); + critical!("new_vm_blk {blknum} not in {vm_rel} of size {vm_size}"); new_vm_blk = None; } } if let Some(blknum) = old_vm_blk { if blknum >= vm_size { - WAL_INGEST - .clear_vm_bits_unknown - .with_label_values(&["old_page"]) - .inc(); + critical!("old_vm_blk {blknum} not in {vm_rel} of size {vm_size}"); old_vm_blk = None; } } - if new_vm_blk.is_some() || old_vm_blk.is_some() { - if new_vm_blk == old_vm_blk { - // An UPDATE record that needs to clear the bits for both old and the - // new page, both of which reside on the same VM page. + if new_vm_blk.is_none() && old_vm_blk.is_none() { + return Ok(()); + } else if new_vm_blk == old_vm_blk { + // An UPDATE record that needs to clear the bits for both old and the new page, both of + // which reside on the same VM page. + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk.unwrap(), + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno, + flags, + }, + ctx, + ) + .await?; + } else { + // Clear VM bits for one heap page, or for two pages that reside on different VM pages. + if let Some(new_vm_blk) = new_vm_blk { self.put_rel_wal_record( modification, vm_rel, - new_vm_blk.unwrap(), + new_vm_blk, NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, + old_heap_blkno: None, + flags, + }, + ctx, + ) + .await?; + } + if let Some(old_vm_blk) = old_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + old_vm_blk, + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: None, old_heap_blkno, flags, }, ctx, ) .await?; - } else { - // Clear VM bits for one heap page, or for two pages that reside on - // different VM pages. - if let Some(new_vm_blk) = new_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - new_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno: None, - flags, - }, - ctx, - ) - .await?; - } - if let Some(old_vm_blk) = old_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - old_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno: None, - old_heap_blkno, - flags, - }, - ctx, - ) - .await?; - } } } - Ok(()) } @@ -499,7 +478,13 @@ impl WalIngest { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx) + .get_rel_page_at_lsn( + src_rel, + blknum, + Version::Modified(modification), + ctx, + crate::tenant::storage_layer::IoConcurrency::sequential(), + ) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -809,9 +794,7 @@ impl WalIngest { // Remove twophase file. see RemoveTwoPhaseFile() in postgres code trace!( "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", - xl_xid, - parsed.xid, - lsn, + xl_xid, parsed.xid, lsn, ); let xid: u64 = if modification.tline.pg_version >= 17 { @@ -877,22 +860,24 @@ impl WalIngest { // will block waiting for the last valid LSN to advance up to // it. 
So we use the previous record's LSN in the get calls // instead. - for segno in modification - .tline - .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx) - .await? - { - let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; + if modification.tline.get_shard_identity().is_shard_zero() { + for segno in modification + .tline + .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx) + .await? + { + let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; - let may_delete = dispatch_pgversion!(modification.tline.pg_version, { - pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno) - }); + let may_delete = dispatch_pgversion!(modification.tline.pg_version, { + pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno) + }); - if may_delete { - modification - .drop_slru_segment(SlruKind::Clog, segno, ctx) - .await?; - trace!("Drop CLOG segment {:>04X}", segno); + if may_delete { + modification + .drop_slru_segment(SlruKind::Clog, segno, ctx) + .await?; + trace!("Drop CLOG segment {:>04X}", segno); + } } } @@ -1047,16 +1032,18 @@ impl WalIngest { // Delete all the segments except the last one. The last segment can still // contain, possibly partially, valid data. - while segment != endsegment { - modification - .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx) - .await?; + if modification.tline.get_shard_identity().is_shard_zero() { + while segment != endsegment { + modification + .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx) + .await?; - /* move to next segment, handling wraparound correctly */ - if segment == maxsegment { - segment = 0; - } else { - segment += 1; + /* move to next segment, handling wraparound correctly */ + if segment == maxsegment { + segment = 0; + } else { + segment += 1; + } } } @@ -1139,16 +1126,14 @@ impl WalIngest { let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; trace!( "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", - xlog_checkpoint.oldestXid, - cp.oldestXid + xlog_checkpoint.oldestXid, cp.oldestXid ); if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 { cp.oldestXid = xlog_checkpoint.oldestXid; } trace!( "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", - xlog_checkpoint.oldestActiveXid, - cp.oldestActiveXid + xlog_checkpoint.oldestActiveXid, cp.oldestActiveXid ); // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, @@ -1189,6 +1174,50 @@ impl WalIngest { } else { cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; } + // NB: We abuse the Checkpoint.redo field: + // + // - In PostgreSQL, the Checkpoint struct doesn't store the information + // of whether this is an online checkpoint or a shutdown checkpoint. It's + // stored in the XLOG info field of the WAL record, shutdown checkpoints + // use record type XLOG_CHECKPOINT_SHUTDOWN and online checkpoints use + // XLOG_CHECKPOINT_ONLINE. We don't store the original WAL record headers + // in the pageserver, however. + // + // - In PostgreSQL, the Checkpoint.redo field stores the *start* of the + // checkpoint record, if it's a shutdown checkpoint. But when we are + // starting from a shutdown checkpoint, the basebackup LSN is the *end* + // of the shutdown checkpoint WAL record. That makes it difficult to + // correctly detect whether we're starting from a shutdown record or + // not. 
+ // + // To address both of those issues, we store 0 in the redo field if it's + // an online checkpoint record, and the record's *end* LSN if it's a + // shutdown checkpoint. We don't need the original redo pointer in neon, + // because we don't perform WAL replay at startup anyway, so we can get + // away with abusing the redo field like this. + // + // XXX: Ideally, we would persist the extra information in a more + // explicit format, rather than repurpose the fields of the Postgres + // struct like this. However, we already have persisted data like this, + // so we need to maintain backwards compatibility. + // + // NB: We didn't originally have this convention, so there are still old + // persisted records that didn't do this. Before, we didn't update the + // persisted redo field at all. That means that old records have a bogus + // redo pointer that points to some old value, from the checkpoint record + // that was originally imported from the data directory. If it was a + // project created in Neon, that means it points to the first checkpoint + // after initdb. That's OK for our purposes: all such old checkpoints are + // treated as old online checkpoints when the basebackup is created. + cp.redo = if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { + // Store the *end* LSN of the checkpoint record. Or to be precise, + // the start LSN of the *next* record, i.e. if the record ends + // exactly at page boundary, the redo LSN points to just after the + // page header on the next page. + lsn.into() + } else { + Lsn::INVALID.into() + }; // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed. Not strictly required, but it seems nice to @@ -1333,8 +1362,9 @@ impl WalIngest { // with zero pages. Logging is rate limited per pg version to // avoid skewing. if gap_blocks_filled > 0 { - use once_cell::sync::Lazy; use std::sync::Mutex; + + use once_cell::sync::Lazy; use utils::rate_limit::RateLimit; struct RateLimitPerPgVersion { @@ -1440,10 +1470,7 @@ impl WalIngest { if new_nblocks > old_nblocks { trace!( "extending SLRU {:?} seg {} from {} to {} blocks", - kind, - segno, - old_nblocks, - new_nblocks + kind, segno, old_nblocks, new_nblocks ); modification.put_slru_extend(kind, segno, new_nblocks)?; @@ -1482,12 +1509,13 @@ async fn get_relsize( #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { - use super::*; - use crate::tenant::harness::*; - use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; use postgres_ffi::RELSEG_SIZE; + use super::*; use crate::DEFAULT_PG_VERSION; + use crate::tenant::harness::*; + use crate::tenant::remote_timeline_client::{INITDB_PATH, remote_initdb_archive_path}; + use crate::tenant::storage_layer::IoConcurrency; /// Arbitrary relation tag, for testing. 
const TESTREL_A: RelTag = RelTag { @@ -1528,6 +1556,7 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { let (tenant, ctx) = TenantHarness::create("test_relsize").await?.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1569,10 +1598,12 @@ mod tests { .await?, false ); - assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) - .await - .is_err()); + assert!( + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .await + .is_err() + ); assert_eq!( tline .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) @@ -1595,7 +1626,13 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x20)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 2") @@ -1603,7 +1640,13 @@ mod tests { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x30)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") @@ -1611,14 +1654,26 @@ mod tests { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x40)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x40)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1 at 4") @@ -1626,21 +1681,39 @@ mod tests { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 2, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 2 at 5") @@ -1663,14 +1736,26 @@ mod tests { ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x60)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x60)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1 at 4") @@ -1685,7 +1770,13 @@ mod tests { ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 2, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 2 at 5") @@ -1718,14 
+1809,26 @@ mod tests { ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 0, + Version::Lsn(Lsn(0x70)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1, + Version::Lsn(Lsn(0x70)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1") @@ -1746,7 +1849,13 @@ mod tests { for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blk, + Version::Lsn(Lsn(0x80)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, ZERO_PAGE @@ -1754,7 +1863,13 @@ mod tests { } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + 1500, + Version::Lsn(Lsn(0x80)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img("foo blk 1500") @@ -1847,6 +1962,7 @@ mod tests { .await? .load() .await; + let io_concurrency = IoConcurrency::spawn_for_test(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; @@ -1875,10 +1991,12 @@ mod tests { .await?, false ); - assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) - .await - .is_err()); + assert!( + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .await + .is_err() + ); assert_eq!( tline @@ -1899,7 +2017,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(lsn), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) @@ -1927,7 +2051,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(Lsn(0x60)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) @@ -1946,7 +2076,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(Lsn(0x50)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) @@ -1983,7 +2119,13 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_page_at_lsn( + TESTREL_A, + blkno, + Version::Lsn(Lsn(0x80)), + &ctx, + io_concurrency.clone() + ) .instrument(test_span.clone()) .await?, test_img(&data) @@ -2084,9 +2226,10 @@ mod tests { /// without waiting for unrelated steps. #[tokio::test] async fn test_ingest_real_wal() { - use crate::tenant::harness::*; - use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::WAL_SEGMENT_SIZE; + use postgres_ffi::waldecoder::WalStreamDecoder; + + use crate::tenant::harness::*; // Define test data path and constants. 
// @@ -2159,10 +2302,12 @@ mod tests { while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - modification.tline.get_shard_identity(), + &[*modification.tline.get_shard_identity()], lsn, modification.tline.pg_version, ) + .unwrap() + .remove(modification.tline.get_shard_identity()) .unwrap(); walingest diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 027a6eb7d7..22d8d83811 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -24,26 +24,27 @@ mod process; /// Code to apply [`NeonWalRecord`]s. pub(crate) mod apply_neon; -use crate::config::PageServerConf; -use crate::metrics::{ - WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, - WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, -}; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; use bytes::{Bytes, BytesMut}; use pageserver_api::key::Key; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; -use std::future::Future; -use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; use tracing::*; use utils::lsn::Lsn; use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; +use crate::config::PageServerConf; +use crate::metrics::{ + WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, + WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, +}; + /// The real implementation that uses a Postgres process to /// perform WAL replay. /// @@ -547,15 +548,18 @@ impl PostgresRedoManager { #[cfg(test)] mod tests { - use super::PostgresRedoManager; - use crate::config::PageServerConf; + use std::str::FromStr; + use bytes::Bytes; use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; - use std::str::FromStr; use tracing::Instrument; - use utils::{id::TenantId, lsn::Lsn}; + use utils::id::TenantId; + use utils::lsn::Lsn; + + use super::PostgresRedoManager; + use crate::config::PageServerConf; #[tokio::test] async fn test_ping() { diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index d62e325310..61ae1eb970 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -4,13 +4,12 @@ use bytes::BytesMut; use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::SlruKind; -use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, }; -use postgres_ffi::BLCKSZ; +use postgres_ffi::{BLCKSZ, pg_constants}; use tracing::*; use utils::lsn::Lsn; diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 7e9477cfbc..5a9fc63e63 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -2,28 +2,28 @@ mod no_leak_child; /// The IPC protocol that pageserver and walredo process speak over their shared pipe. 
mod protocol; -use self::no_leak_child::NoLeakChild; -use crate::{ - config::PageServerConf, - metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, - page_cache::PAGE_SZ, - span::debug_assert_current_span_has_tenant_id, -}; +use std::collections::VecDeque; +use std::process::{Command, Stdio}; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::time::Duration; + use anyhow::Context; use bytes::Bytes; use pageserver_api::record::NeonWalRecord; -use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use pageserver_api::reltag::RelTag; +use pageserver_api::shard::TenantShardId; use postgres_ffi::BLCKSZ; -#[cfg(feature = "testing")] -use std::sync::atomic::AtomicUsize; -use std::{ - collections::VecDeque, - process::{Command, Stdio}, - time::Duration, -}; use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tracing::{debug, error, instrument, Instrument}; -use utils::{lsn::Lsn, poison::Poison}; +use tracing::{Instrument, debug, error, instrument}; +use utils::lsn::Lsn; +use utils::poison::Poison; + +use self::no_leak_child::NoLeakChild; +use crate::config::PageServerConf; +use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER, WalRedoKillCause}; +use crate::page_cache::PAGE_SZ; +use crate::span::debug_assert_current_span_has_tenant_id; pub struct WalRedoProcess { #[allow(dead_code)] @@ -79,6 +79,14 @@ impl WalRedoProcess { .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir_path) .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + .env( + "ASAN_OPTIONS", + std::env::var("ASAN_OPTIONS").unwrap_or_default(), + ) + .env( + "UBSAN_OPTIONS", + std::env::var("UBSAN_OPTIONS").unwrap_or_default(), + ) // NB: The redo process is not trusted after we sent it the first // walredo work. Before that, it is trusted. Specifically, we trust // it to diff --git a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs index 1a0d7039df..9939fc4b36 100644 --- a/pageserver/src/walredo/process/no_leak_child.rs +++ b/pageserver/src/walredo/process/no_leak_child.rs @@ -1,19 +1,11 @@ -use tracing::instrument; -use tracing::{error, info}; - -use crate::metrics::WalRedoKillCause; -use crate::metrics::WAL_REDO_PROCESS_COUNTERS; - use std::io; -use std::process::Command; - -use std::ops::DerefMut; - -use std::ops::Deref; - -use std::process::Child; +use std::ops::{Deref, DerefMut}; +use std::process::{Child, Command}; use pageserver_api::shard::TenantShardId; +use tracing::{error, info, instrument}; + +use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WalRedoKillCause}; /// Wrapper type around `std::process::Child` which guarantees that the child /// will be killed and waited-for by this process before being dropped. 
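For context on the `ASAN_OPTIONS`/`UBSAN_OPTIONS` lines added to the walredo process spawn above: `env_clear()` wipes the inherited environment, so any sanitizer settings must be re-injected explicitly or the child runs without them. A minimal, standalone sketch of that pattern, assuming nothing about the real walredo command line (the function name and `program` argument are illustrative):

```rust
use std::process::{Child, Command};

/// Spawn a child with a cleared environment, forwarding only the sanitizer
/// options from the parent (empty string if unset), mirroring the pattern in
/// the diff above.
fn spawn_with_sanitizer_env(program: &str) -> std::io::Result<Child> {
    Command::new(program)
        .env_clear()
        .env("ASAN_OPTIONS", std::env::var("ASAN_OPTIONS").unwrap_or_default())
        .env("UBSAN_OPTIONS", std::env::var("UBSAN_OPTIONS").unwrap_or_default())
        .spawn()
}
```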
diff --git a/pgxn/hnsw/Makefile b/pgxn/hnsw/Makefile deleted file mode 100644 index 66436b5920..0000000000 --- a/pgxn/hnsw/Makefile +++ /dev/null @@ -1,26 +0,0 @@ -EXTENSION = hnsw -EXTVERSION = 0.1.0 - -MODULE_big = hnsw -DATA = $(wildcard *--*.sql) -OBJS = hnsw.o hnswalg.o - -TESTS = $(wildcard test/sql/*.sql) -REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) -REGRESS_OPTS = --inputdir=test --load-extension=hnsw - -# For auto-vectorization: -# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html -PG_CFLAGS += -O3 -PG_CXXFLAGS += -O3 -std=c++11 -PG_LDFLAGS += -lstdc++ - -all: $(EXTENSION)--$(EXTVERSION).sql - -PG_CONFIG ?= pg_config -PGXS := $(shell $(PG_CONFIG) --pgxs) -include $(PGXS) - -dist: - mkdir -p dist - git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master diff --git a/pgxn/hnsw/README.md b/pgxn/hnsw/README.md deleted file mode 100644 index bc9c8d571c..0000000000 --- a/pgxn/hnsw/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors - -This ANN extension of Postgres is based -on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw), -the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper: - -[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html), -
-Dmitry Baranchuk, Artem Babenko, Yury Malkov - -# Postgres extension - -HNSW index is hold in memory (built on demand) and it's maxial size is limited -by `maxelements` index parameter. Another required parameter is nubmer of dimensions (if it is not specified in column type). -Optional parameter `ef` specifies number of neighbors which are considered during index construction and search (corresponds `efConstruction` and `efSearch` parameters -described in the article). - -# Example of usage: - -``` -create extension hnsw; -create table embeddings(id integer primary key, payload real[]); -create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32); -select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100; -``` \ No newline at end of file diff --git a/pgxn/hnsw/hnsw--0.1.0.sql b/pgxn/hnsw/hnsw--0.1.0.sql deleted file mode 100644 index ebf424326d..0000000000 --- a/pgxn/hnsw/hnsw--0.1.0.sql +++ /dev/null @@ -1,29 +0,0 @@ --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "CREATE EXTENSION hnsw" to load this file. \quit - --- functions - -CREATE FUNCTION l2_distance(real[], real[]) RETURNS real - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - --- operators - -CREATE OPERATOR <-> ( - LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance, - COMMUTATOR = '<->' -); - --- access method - -CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler - AS 'MODULE_PATHNAME' LANGUAGE C; - -CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler; - -COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method'; - --- opclasses - -CREATE OPERATOR CLASS knn_ops - DEFAULT FOR TYPE real[] USING hnsw AS - OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops; diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c deleted file mode 100644 index e624cb831f..0000000000 --- a/pgxn/hnsw/hnsw.c +++ /dev/null @@ -1,590 +0,0 @@ -#include "postgres.h" - -#include "access/amapi.h" -#include "access/generic_xlog.h" -#include "access/relation.h" -#include "access/reloptions.h" -#include "access/tableam.h" -#include "catalog/index.h" -#include "commands/vacuum.h" -#include "nodes/execnodes.h" -#include "storage/bufmgr.h" -#include "utils/guc.h" -#include "utils/selfuncs.h" - -#include -#include - -#include "hnsw.h" - -PG_MODULE_MAGIC; - -typedef struct { - int32 vl_len_; /* varlena header (do not touch directly!) 
*/ - int dims; - int maxelements; - int efConstruction; - int efSearch; - int M; -} HnswOptions; - -static relopt_kind hnsw_relopt_kind; - -typedef struct { - HierarchicalNSW* hnsw; - size_t curr; - size_t n_results; - ItemPointer results; -} HnswScanOpaqueData; - -typedef HnswScanOpaqueData* HnswScanOpaque; - -typedef struct { - Oid relid; - uint32 status; - HierarchicalNSW* hnsw; -} HnswHashEntry; - - -#define SH_PREFIX hnsw_index -#define SH_ELEMENT_TYPE HnswHashEntry -#define SH_KEY_TYPE Oid -#define SH_KEY relid -#define SH_STORE_HASH -#define SH_GET_HASH(tb, a) ((a)->relid) -#define SH_HASH_KEY(tb, key) (key) -#define SH_EQUAL(tb, a, b) ((a) == (b)) -#define SH_SCOPE static inline -#define SH_DEFINE -#define SH_DECLARE -#include "lib/simplehash.h" - -#define INDEX_HASH_SIZE 11 - -#define DEFAULT_EF_SEARCH 64 - -PGDLLEXPORT void _PG_init(void); - -static hnsw_index_hash *hnsw_indexes; - -/* - * Initialize index options and variables - */ -void -_PG_init(void) -{ - hnsw_relopt_kind = add_reloption_kind(); - add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions", - 0, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements", - 0, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex", - 100, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction", - 16, 1, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search", - 64, 1, INT_MAX, AccessExclusiveLock); - hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL); -} - - -static void -hnsw_build_callback(Relation index, ItemPointer tid, Datum *values, - bool *isnull, bool tupleIsAlive, void *state) -{ - HierarchicalNSW* hnsw = (HierarchicalNSW*) state; - ArrayType* array; - int n_items; - label_t label = 0; - - /* Skip nulls */ - if (isnull[0]) - return; - - array = DatumGetArrayTypeP(values[0]); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(hnsw)); - } - - memcpy(&label, tid, sizeof(*tid)); - hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label); -} - -static void -hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel) -{ - IndexInfo* indexInfo = BuildIndexInfo(indexRel); - Assert(indexInfo->ii_NumIndexAttrs == 1); - table_index_build_scan(heapRel, indexRel, indexInfo, - true, true, hnsw_build_callback, (void *) hnsw, NULL); -} - -#ifdef __APPLE__ - -#include -#include - -static void -hnsw_check_available_memory(Size requested) -{ - size_t total; - if (sysctlbyname("hw.memsize", NULL, &total, NULL, 0) < 0) - elog(ERROR, "Failed to get amount of RAM: %m"); - - if ((Size)NBuffers*BLCKSZ + requested >= total) - elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", - requested, total - (Size)NBuffers*BLCKSZ); -} - -#else - -#include - -static void -hnsw_check_available_memory(Size requested) -{ - struct sysinfo si; - Size total; - if (sysinfo(&si) < 0) - elog(ERROR, "Failed to get amount of RAM: %m"); - - total = si.totalram*si.mem_unit; - if ((Size)NBuffers*BLCKSZ + requested >= total) - elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", - requested, total - (Size)NBuffers*BLCKSZ); -} - -#endif - -static 
HierarchicalNSW* -hnsw_get_index(Relation indexRel, Relation heapRel) -{ - HierarchicalNSW* hnsw; - Oid indexoid = RelationGetRelid(indexRel); - HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid); - if (entry == NULL) - { - size_t dims, maxelements; - size_t M; - size_t maxM; - size_t size_links_level0; - size_t size_data_per_element; - size_t data_size; - dsm_handle handle = indexoid << 1; /* make it even */ - void* impl_private = NULL; - void* mapped_address = NULL; - Size mapped_size = 0; - Size shmem_size; - bool exists = true; - bool found; - HnswOptions *opts = (HnswOptions *) indexRel->rd_options; - if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) { - elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified"); - } - dims = opts->dims; - maxelements = opts->maxelements; - M = opts->M; - maxM = M * 2; - data_size = dims * sizeof(coord_t); - size_links_level0 = (maxM + 1) * sizeof(idx_t); - size_data_per_element = size_links_level0 + data_size + sizeof(label_t); - shmem_size = hnsw_sizeof() + maxelements * size_data_per_element; - - hnsw_check_available_memory(shmem_size); - - /* first try to attach to existed index */ - if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, - &mapped_address, &mapped_size, DEBUG1)) - { - /* index doesn't exists: try to create it */ - if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private, - &mapped_address, &mapped_size, DEBUG1)) - { - /* We can do it under shared lock, so some other backend may - * try to initialize index. If create is failed because index already - * created by somebody else, then try to attach to it once again - */ - if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, - &mapped_address, &mapped_size, ERROR)) - { - return NULL; - } - } - else - { - exists = false; - } - } - Assert(mapped_size == shmem_size); - hnsw = (HierarchicalNSW*)mapped_address; - - if (!exists) - { - hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction); - hnsw_populate(hnsw, indexRel, heapRel); - } - entry = hnsw_index_insert(hnsw_indexes, indexoid, &found); - Assert(!found); - entry->hnsw = hnsw; - } - else - { - hnsw = entry->hnsw; - } - return hnsw; -} - -/* - * Start or restart an index scan - */ -static IndexScanDesc -hnsw_beginscan(Relation index, int nkeys, int norderbys) -{ - IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys); - HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData)); - Relation heap = relation_open(index->rd_index->indrelid, NoLock); - so->hnsw = hnsw_get_index(index, heap); - relation_close(heap, NoLock); - so->curr = 0; - so->n_results = 0; - so->results = NULL; - scan->opaque = so; - return scan; -} - -/* - * Start or restart an index scan - */ -static void -hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - if (so->results) - { - pfree(so->results); - so->results = NULL; - } - so->curr = 0; - if (orderbys && scan->numberOfOrderBys > 0) - memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); -} - -/* - * Fetch the next tuple in the given scan - */ -static bool -hnsw_gettuple(IndexScanDesc scan, ScanDirection dir) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - - /* - * Index can be used to scan backward, but Postgres doesn't support - * backward scan on operators - */ - Assert(ScanDirectionIsForward(dir)); - - if (so->curr == 0) - { - Datum value; - ArrayType* array; - int n_items; - 
size_t n_results; - label_t* results; - HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options; - size_t efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH; - - /* Safety check */ - if (scan->orderByData == NULL) - elog(ERROR, "cannot scan HNSW index without order"); - - /* No items will match if null */ - if (scan->orderByData->sk_flags & SK_ISNULL) - return false; - - value = scan->orderByData->sk_argument; - array = DatumGetArrayTypeP(value); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(so->hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(so->hnsw)); - } - - if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results)) - elog(ERROR, "HNSW index search failed"); - so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData)); - so->n_results = n_results; - for (size_t i = 0; i < n_results; i++) - { - memcpy(&so->results[i], &results[i], sizeof(so->results[i])); - } - free(results); - } - if (so->curr >= so->n_results) - { - return false; - } - else - { - scan->xs_heaptid = so->results[so->curr++]; - scan->xs_recheckorderby = false; - return true; - } -} - -/* - * End a scan and release resources - */ -static void -hnsw_endscan(IndexScanDesc scan) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - if (so->results) - pfree(so->results); - pfree(so); - scan->opaque = NULL; -} - - -/* - * Estimate the cost of an index scan - */ -static void -hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count, - Cost *indexStartupCost, Cost *indexTotalCost, - Selectivity *indexSelectivity, double *indexCorrelation - ,double *indexPages -) -{ - GenericCosts costs; - - /* Never use index without order */ - if (path->indexorderbys == NULL) - { - *indexStartupCost = DBL_MAX; - *indexTotalCost = DBL_MAX; - *indexSelectivity = 0; - *indexCorrelation = 0; - *indexPages = 0; - return; - } - - MemSet(&costs, 0, sizeof(costs)); - - genericcostestimate(root, path, loop_count, &costs); - - /* Startup cost and total cost are same */ - *indexStartupCost = costs.indexTotalCost; - *indexTotalCost = costs.indexTotalCost; - *indexSelectivity = costs.indexSelectivity; - *indexCorrelation = costs.indexCorrelation; - *indexPages = costs.numIndexPages; -} - -/* - * Parse and validate the reloptions - */ -static bytea * -hnsw_options(Datum reloptions, bool validate) -{ - static const relopt_parse_elt tab[] = { - {"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)}, - {"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)}, - {"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)}, - {"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)}, - {"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)} - }; - - return (bytea *) build_reloptions(reloptions, validate, - hnsw_relopt_kind, - sizeof(HnswOptions), - tab, lengthof(tab)); -} - -/* - * Validate catalog entries for the specified operator class - */ -static bool -hnsw_validate(Oid opclassoid) -{ - return true; -} - -/* - * Build the index for a logged table - */ -static IndexBuildResult * -hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo) -{ - HierarchicalNSW* hnsw = hnsw_get_index(index, heap); - IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); - result->heap_tuples = result->index_tuples = hnsw_count(hnsw); - - return result; -} - -/* - * Insert a tuple into the index - */ -static bool -hnsw_insert(Relation index, 
Datum *values, bool *isnull, ItemPointer heap_tid, - Relation heap, IndexUniqueCheck checkUnique, - bool indexUnchanged, - IndexInfo *indexInfo) -{ - HierarchicalNSW* hnsw = hnsw_get_index(index, heap); - Datum value; - ArrayType* array; - int n_items; - label_t label = 0; - - /* Skip nulls */ - if (isnull[0]) - return false; - - /* Detoast value */ - value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); - array = DatumGetArrayTypeP(value); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(hnsw)); - } - memcpy(&label, heap_tid, sizeof(*heap_tid)); - if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label)) - elog(ERROR, "HNSW index insert failed"); - return true; -} - -/* - * Build the index for an unlogged table - */ -static void -hnsw_buildempty(Relation index) -{ - /* index will be constructed on dema nd when accessed */ -} - -/* - * Clean up after a VACUUM operation - */ -static IndexBulkDeleteResult * -hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) -{ - Relation rel = info->index; - - if (stats == NULL) - return NULL; - - stats->num_pages = RelationGetNumberOfBlocks(rel); - - return stats; -} - -/* - * Bulk delete tuples from the index - */ -static IndexBulkDeleteResult * -hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, - IndexBulkDeleteCallback callback, void *callback_state) -{ - if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); - return stats; -} - -/* - * Define index handler - * - * See https://www.postgresql.org/docs/current/index-api.html - */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler); -Datum -hnsw_handler(PG_FUNCTION_ARGS) -{ - IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); - - amroutine->amstrategies = 0; - amroutine->amsupport = 0; - amroutine->amoptsprocnum = 0; - amroutine->amcanorder = false; - amroutine->amcanorderbyop = true; - amroutine->amcanbackward = false; /* can change direction mid-scan */ - amroutine->amcanunique = false; - amroutine->amcanmulticol = false; - amroutine->amoptionalkey = true; - amroutine->amsearcharray = false; - amroutine->amsearchnulls = false; - amroutine->amstorage = false; - amroutine->amclusterable = false; - amroutine->ampredlocks = false; - amroutine->amcanparallel = false; - amroutine->amcaninclude = false; - amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */ - amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL; - amroutine->amkeytype = InvalidOid; - - /* Interface functions */ - amroutine->ambuild = hnsw_build; - amroutine->ambuildempty = hnsw_buildempty; - amroutine->aminsert = hnsw_insert; - amroutine->ambulkdelete = hnsw_bulkdelete; - amroutine->amvacuumcleanup = hnsw_vacuumcleanup; - amroutine->amcanreturn = NULL; /* tuple not included in heapsort */ - amroutine->amcostestimate = hnsw_costestimate; - amroutine->amoptions = hnsw_options; - amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */ - amroutine->ambuildphasename = NULL; - amroutine->amvalidate = hnsw_validate; - amroutine->amadjustmembers = NULL; - amroutine->ambeginscan = hnsw_beginscan; - amroutine->amrescan = hnsw_rescan; - amroutine->amgettuple = hnsw_gettuple; - amroutine->amgetbitmap = NULL; - amroutine->amendscan = hnsw_endscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; - - /* Interface functions to support parallel index scans */ - 
amroutine->amestimateparallelscan = NULL; - amroutine->aminitparallelscan = NULL; - amroutine->amparallelrescan = NULL; - - PG_RETURN_POINTER(amroutine); -} - -/* - * Get the L2 distance between vectors - */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance); -Datum -l2_distance(PG_FUNCTION_ARGS) -{ - ArrayType *a = PG_GETARG_ARRAYTYPE_P(0); - ArrayType *b = PG_GETARG_ARRAYTYPE_P(1); - int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a)); - int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b)); - dist_t distance = 0.0; - dist_t diff; - coord_t *ax = (coord_t*)ARR_DATA_PTR(a); - coord_t *bx = (coord_t*)ARR_DATA_PTR(b); - - if (a_dim != b_dim) - { - ereport(ERROR, - (errcode(ERRCODE_DATA_EXCEPTION), - errmsg("different array dimensions %d and %d", a_dim, b_dim))); - } - - for (int i = 0; i < a_dim; i++) - { - diff = ax[i] - bx[i]; - distance += diff * diff; - } - - PG_RETURN_FLOAT4((dist_t)sqrt(distance)); -} diff --git a/pgxn/hnsw/hnsw.control b/pgxn/hnsw/hnsw.control deleted file mode 100644 index fbfa1a5b47..0000000000 --- a/pgxn/hnsw/hnsw.control +++ /dev/null @@ -1,4 +0,0 @@ -comment = '** Deprecated ** Please use pg_embedding instead' -default_version = '0.1.0' -module_pathname = '$libdir/hnsw' -relocatable = true diff --git a/pgxn/hnsw/hnsw.h b/pgxn/hnsw/hnsw.h deleted file mode 100644 index d4065ab8fe..0000000000 --- a/pgxn/hnsw/hnsw.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -typedef float coord_t; -typedef float dist_t; -typedef uint32_t idx_t; -typedef uint64_t label_t; - -typedef struct HierarchicalNSW HierarchicalNSW; - -bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results); -bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label); -void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); -int hnsw_dimensions(HierarchicalNSW* hnsw); -size_t hnsw_count(HierarchicalNSW* hnsw); -size_t hnsw_sizeof(void); diff --git a/pgxn/hnsw/hnswalg.cpp b/pgxn/hnsw/hnswalg.cpp deleted file mode 100644 index f6de3b8314..0000000000 --- a/pgxn/hnsw/hnswalg.cpp +++ /dev/null @@ -1,379 +0,0 @@ -#include "hnswalg.h" - -#if defined(__GNUC__) -#define PORTABLE_ALIGN32 __attribute__((aligned(32))) -#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint) -#else -#define PORTABLE_ALIGN32 __declspec(align(32)) -#define PREFETCH(addr,hint) -#endif - -HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_) -{ - dim = dim_; - data_size = dim * sizeof(coord_t); - - efConstruction = efConstruction_; - - maxelements = maxelements_; - M = M_; - maxM = maxM_; - size_links_level0 = (maxM + 1) * sizeof(idx_t); - size_data_per_element = size_links_level0 + data_size + sizeof(label_t); - offset_data = size_links_level0; - offset_label = offset_data + data_size; - - enterpoint_node = 0; - cur_element_count = 0; -#ifdef __x86_64__ - use_avx2 = __builtin_cpu_supports("avx2"); -#endif -} - -std::priority_queue> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef) -{ - std::vector visited; - visited.resize((cur_element_count + 31) >> 5); - - std::priority_queue> topResults; - std::priority_queue> candidateSet; - - dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node)); - - topResults.emplace(dist, enterpoint_node); - candidateSet.emplace(-dist, enterpoint_node); - visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31); - dist_t lowerBound = dist; - - while (!candidateSet.empty()) - 
{ - std::pair curr_el_pair = candidateSet.top(); - if (-curr_el_pair.first > lowerBound) - break; - - candidateSet.pop(); - idx_t curNodeNum = curr_el_pair.second; - - idx_t* data = get_linklist0(curNodeNum); - size_t size = *data++; - - PREFETCH(getDataByInternalId(*data), 0); - - for (size_t j = 0; j < size; ++j) { - size_t tnum = *(data + j); - - PREFETCH(getDataByInternalId(*(data + j + 1)), 0); - - if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) { - visited[tnum >> 5] |= 1 << (tnum & 31); - - dist = fstdistfunc(point, getDataByInternalId(tnum)); - - if (topResults.top().first > dist || topResults.size() < ef) { - candidateSet.emplace(-dist, tnum); - - PREFETCH(get_linklist0(candidateSet.top().second), 0); - topResults.emplace(dist, tnum); - - if (topResults.size() > ef) - topResults.pop(); - - lowerBound = topResults.top().first; - } - } - } - } - return topResults; -} - - -void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN) -{ - if (topResults.size() < NN) - return; - - std::priority_queue> resultSet; - std::vector> returnlist; - - while (topResults.size() > 0) { - resultSet.emplace(-topResults.top().first, topResults.top().second); - topResults.pop(); - } - - while (resultSet.size()) { - if (returnlist.size() >= NN) - break; - std::pair curen = resultSet.top(); - dist_t dist_to_query = -curen.first; - resultSet.pop(); - bool good = true; - for (std::pair curen2 : returnlist) { - dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second), - getDataByInternalId(curen.second)); - if (curdist < dist_to_query) { - good = false; - break; - } - } - if (good) returnlist.push_back(curen); - } - for (std::pair elem : returnlist) - topResults.emplace(-elem.first, elem.second); -} - -void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c, - std::priority_queue> topResults) -{ - getNeighborsByHeuristic(topResults, M); - - std::vector res; - res.reserve(M); - while (topResults.size() > 0) { - res.push_back(topResults.top().second); - topResults.pop(); - } - { - idx_t* data = get_linklist0(cur_c); - if (*data) - throw std::runtime_error("Should be blank"); - - *data++ = res.size(); - - for (size_t idx = 0; idx < res.size(); idx++) { - if (data[idx]) - throw std::runtime_error("Should be blank"); - data[idx] = res[idx]; - } - } - for (size_t idx = 0; idx < res.size(); idx++) { - if (res[idx] == cur_c) - throw std::runtime_error("Connection to the same element"); - - size_t resMmax = maxM; - idx_t *ll_other = get_linklist0(res[idx]); - idx_t sz_link_list_other = *ll_other; - - if (sz_link_list_other > resMmax || sz_link_list_other < 0) - throw std::runtime_error("Bad sz_link_list_other"); - - if (sz_link_list_other < resMmax) { - idx_t *data = ll_other + 1; - data[sz_link_list_other] = cur_c; - *ll_other = sz_link_list_other + 1; - } else { - // finding the "weakest" element to replace it with the new one - idx_t *data = ll_other + 1; - dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx])); - // Heuristic: - std::priority_queue> candidates; - candidates.emplace(d_max, cur_c); - - for (size_t j = 0; j < sz_link_list_other; j++) - candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]); - - getNeighborsByHeuristic(candidates, resMmax); - - size_t indx = 0; - while (!candidates.empty()) { - data[indx] = candidates.top().second; - candidates.pop(); - indx++; - } - *ll_other = indx; - } - } -} - -void HierarchicalNSW::addPoint(const coord_t *point, label_t 
label) -{ - if (cur_element_count >= maxelements) { - throw std::runtime_error("The number of elements exceeds the specified limit"); - } - idx_t cur_c = cur_element_count++; - memset((char *) get_linklist0(cur_c), 0, size_data_per_element); - memcpy(getDataByInternalId(cur_c), point, data_size); - memcpy(getExternalLabel(cur_c), &label, sizeof label); - - // Do nothing for the first element - if (cur_c != 0) { - std::priority_queue > topResults = searchBaseLayer(point, efConstruction); - mutuallyConnectNewElement(point, cur_c, topResults); - } -}; - -std::priority_queue> HierarchicalNSW::searchKnn(const coord_t *query, size_t k) -{ - std::priority_queue> topResults; - auto topCandidates = searchBaseLayer(query, k); - while (topCandidates.size() > k) { - topCandidates.pop(); - } - while (!topCandidates.empty()) { - std::pair rez = topCandidates.top(); - label_t label; - memcpy(&label, getExternalLabel(rez.second), sizeof(label)); - topResults.push(std::pair(rez.first, label)); - topCandidates.pop(); - } - - return topResults; -}; - -dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n) -{ - dist_t distance = 0.0; - - for (size_t i = 0; i < n; i++) - { - dist_t diff = x[i] - y[i]; - distance += diff * diff; - } - return distance; - -} - -#ifdef __x86_64__ -#include - -__attribute__((target("avx2"))) -dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n) -{ - const size_t TmpResSz = sizeof(__m256) / sizeof(float); - float PORTABLE_ALIGN32 TmpRes[TmpResSz]; - size_t qty16 = n / 16; - const float *pEnd1 = x + (qty16 * 16); - __m256 diff, v1, v2; - __m256 sum = _mm256_set1_ps(0); - - while (x < pEnd1) { - v1 = _mm256_loadu_ps(x); - x += 8; - v2 = _mm256_loadu_ps(y); - y += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - - v1 = _mm256_loadu_ps(x); - x += 8; - v2 = _mm256_loadu_ps(y); - y += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - } - _mm256_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; - return (res); -} - -dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n) -{ - const size_t TmpResSz = sizeof(__m128) / sizeof(float); - float PORTABLE_ALIGN32 TmpRes[TmpResSz]; - size_t qty16 = n / 16; - const float *pEnd1 = x + (qty16 * 16); - - __m128 diff, v1, v2; - __m128 sum = _mm_set1_ps(0); - - while (x < pEnd1) { - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - } - _mm_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; - return res; -} -#endif - -dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y) -{ -#ifndef __x86_64__ - return fstdistfunc_scalar(x, y, dim); -#else - if(use_avx2) - return fstdistfunc_avx2(x, y, dim); - - return fstdistfunc_sse(x, y, dim); -#endif -} - -bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results) -{ - 
try - { - auto result = hnsw->searchKnn(point, efSearch); - size_t nResults = result.size(); - *results = (label_t*)malloc(nResults*sizeof(label_t)); - for (size_t i = nResults; i-- != 0;) - { - (*results)[i] = result.top().second; - result.pop(); - } - *n_results = nResults; - return true; - } - catch (std::exception& x) - { - return false; - } -} - -bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label) -{ - try - { - hnsw->addPoint(point, label); - return true; - } - catch (std::exception& x) - { - fprintf(stderr, "Catch %s\n", x.what()); - return false; - } -} - -void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction) -{ - new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction); -} - - -int hnsw_dimensions(HierarchicalNSW* hnsw) -{ - return (int)hnsw->dim; -} - -size_t hnsw_count(HierarchicalNSW* hnsw) -{ - return hnsw->cur_element_count; -} - -size_t hnsw_sizeof(void) -{ - return sizeof(HierarchicalNSW); -} diff --git a/pgxn/hnsw/hnswalg.h b/pgxn/hnsw/hnswalg.h deleted file mode 100644 index f38aeac362..0000000000 --- a/pgxn/hnsw/hnswalg.h +++ /dev/null @@ -1,69 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern "C" { -#include "hnsw.h" -} - -struct HierarchicalNSW -{ - size_t maxelements; - size_t cur_element_count; - - idx_t enterpoint_node; - - size_t dim; - size_t data_size; - size_t offset_data; - size_t offset_label; - size_t size_data_per_element; - size_t M; - size_t maxM; - size_t size_links_level0; - size_t efConstruction; - -#ifdef __x86_64__ - bool use_avx2; -#endif - - char data_level0_memory[0]; // varying size - - public: - HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); - ~HierarchicalNSW(); - - - inline coord_t *getDataByInternalId(idx_t internal_id) const { - return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data]; - } - - inline idx_t *get_linklist0(idx_t internal_id) const { - return (idx_t*)&data_level0_memory[internal_id * size_data_per_element]; - } - - inline label_t *getExternalLabel(idx_t internal_id) const { - return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label]; - } - - std::priority_queue> searchBaseLayer(const coord_t *x, size_t ef); - - void getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN); - - void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue> topResults); - - void addPoint(const coord_t *point, label_t label); - - std::priority_queue> searchKnn(const coord_t *query_data, size_t k); - - dist_t fstdistfunc(const coord_t *x, const coord_t *y); -}; diff --git a/pgxn/hnsw/test/expected/knn.out b/pgxn/hnsw/test/expected/knn.out deleted file mode 100644 index a1cee4525e..0000000000 --- a/pgxn/hnsw/test/expected/knn.out +++ /dev/null @@ -1,28 +0,0 @@ -SET enable_seqscan = off; -CREATE TABLE t (val real[]); -INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); -CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); -INSERT INTO t (val) VALUES (array[1,2,4]); -explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; - QUERY PLAN --------------------------------------------------------------------- - Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36) - Order By: (val <-> '{3,3,3}'::real[]) -(2 rows) - -SELECT * FROM t ORDER BY val <-> array[3,3,3]; - val 
---------- - {1,2,3} - {1,2,4} - {1,1,1} - {0,0,0} -(4 rows) - -SELECT COUNT(*) FROM t; - count -------- - 5 -(1 row) - -DROP TABLE t; diff --git a/pgxn/hnsw/test/sql/knn.sql b/pgxn/hnsw/test/sql/knn.sql deleted file mode 100644 index 0635bda4a2..0000000000 --- a/pgxn/hnsw/test/sql/knn.sql +++ /dev/null @@ -1,13 +0,0 @@ -SET enable_seqscan = off; - -CREATE TABLE t (val real[]); -INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); -CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); - -INSERT INTO t (val) VALUES (array[1,2,4]); - -explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; -SELECT * FROM t ORDER BY val <-> array[3,3,3]; -SELECT COUNT(*) FROM t; - -DROP TABLE t; diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index b47b22cd20..59096a1bc8 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -428,6 +428,8 @@ MergeTable() hash_seq_init(&status, old_table->role_table); while ((entry = hash_seq_search(&status)) != NULL) { + RoleEntry * old; + bool found_old = false; RoleEntry *to_write = hash_search( CurrentDdlTable->role_table, entry->name, @@ -435,30 +437,23 @@ MergeTable() NULL); to_write->type = entry->type; - if (entry->password) - to_write->password = entry->password; + to_write->password = entry->password; strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); - if (entry->old_name[0] != '\0') - { - bool found_old = false; - RoleEntry *old = hash_search( - CurrentDdlTable->role_table, - entry->old_name, - HASH_FIND, - &found_old); + if (entry->old_name[0] == '\0') + continue; - if (found_old) - { - if (old->old_name[0] != '\0') - strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); - else - strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); - hash_search(CurrentDdlTable->role_table, - entry->old_name, - HASH_REMOVE, - NULL); - } - } + old = hash_search( + CurrentDdlTable->role_table, + entry->old_name, + HASH_FIND, + &found_old); + if (!found_old) + continue; + strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); + hash_search(CurrentDdlTable->role_table, + entry->old_name, + HASH_REMOVE, + NULL); } hash_destroy(old_table->role_table); } diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index e38af08f89..0331f961b4 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -14,10 +14,12 @@ #include "utils/guc.h" -#include "extension_server.h" +#include "extension_server.h" #include "neon_utils.h" static int extension_server_port = 0; +static int extension_server_request_timeout = 60; +static int extension_server_connect_timeout = 60; static download_extension_file_hook_type prev_download_extension_file_hook = NULL; @@ -34,19 +36,18 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL static bool neon_download_extension_file_http(const char *filename, bool is_library) { - static CURL *handle = NULL; - CURLcode res; - char *compute_ctl_url; bool ret = false; + CURL *handle = NULL; + char *compute_ctl_url; - if (handle == NULL) - { - handle = alloc_curl_handle(); + handle = alloc_curl_handle(); - curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); - curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); - } + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); + if (extension_server_request_timeout > 0) + curl_easy_setopt(handle, CURLOPT_TIMEOUT, (long)extension_server_request_timeout /* seconds */ ); + if (extension_server_connect_timeout > 0) + 
curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, (long)extension_server_connect_timeout /* seconds */ ); compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", extension_server_port, filename, is_library ? "?is_library=true" : ""); @@ -57,6 +58,8 @@ neon_download_extension_file_http(const char *filename, bool is_library) /* Perform the request, res will get the return code */ res = curl_easy_perform(handle); + curl_easy_cleanup(handle); + /* Check for errors */ if (res == CURLE_OK) { @@ -88,6 +91,24 @@ pg_init_extension_server() 0, /* no flags required */ NULL, NULL, NULL); + DefineCustomIntVariable("neon.extension_server_request_timeout", + "timeout for fetching extensions in seconds", + NULL, + &extension_server_request_timeout, + 60, 0, INT_MAX, + PGC_SUSET, + GUC_UNIT_S, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.extension_server_connect_timeout", + "timeout for connecting to the extension server in seconds", + NULL, + &extension_server_connect_timeout, + 60, 0, INT_MAX, + PGC_SUSET, + GUC_UNIT_S, + NULL, NULL, NULL); + /* set download_extension_file_hook */ prev_download_extension_file_hook = download_extension_file_hook; download_extension_file_hook = neon_download_extension_file_http; diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 70b250d394..f6a577abfc 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -22,6 +22,7 @@ #include "neon_pgversioncompat.h" #include "access/parallel.h" +#include "access/xlog.h" #include "funcapi.h" #include "miscadmin.h" #include "pagestore_client.h" @@ -40,12 +41,16 @@ #include "utils/dynahash.h" #include "utils/guc.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif + #include "hll.h" #include "bitmap.h" #include "neon.h" #include "neon_perf_counters.h" -#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) +#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) /* * Local file cache is used to temporary store relations pages in local file system. @@ -93,7 +98,23 @@ #define MB ((uint64)1024*1024) #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) -#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32) + +/* + * Blocks are read from or written to the LFC file outside the LFC critical section. + * To synchronize access to such a block, the writer sets the block's state to PENDING. + * If some other backend (reader or writer) sees the PENDING status, it changes it to REQUESTED and starts + * waiting on a condition variable until the status changes. + * When the writer completes its operation, it checks whether the status is REQUESTED and, if so, broadcasts the condition variable, + * waking up all backends waiting for access to this block.
+ */ +typedef enum FileCacheBlockState +{ + UNAVAILABLE, /* block is not present in cache */ + AVAILABLE, /* block can be used */ + PENDING, /* block is loaded */ + REQUESTED /* some other backend is waiting for block to be loaded */ +} FileCacheBlockState; + typedef struct FileCacheEntry { @@ -101,10 +122,16 @@ typedef struct FileCacheEntry uint32 hash; uint32 offset; uint32 access_count; - uint32 bitmap[CHUNK_BITMAP_SIZE]; + uint32 state[(BLOCKS_PER_CHUNK + 31) / 32 * 2]; /* two bits per block */ dlist_node list_node; /* LRU/holes list node */ } FileCacheEntry; +#define GET_STATE(entry, i) (((entry)->state[(i) / 16] >> ((i) % 16 * 2)) & 3) +#define SET_STATE(entry, i, new_state) (entry)->state[(i) / 16] = ((entry)->state[(i) / 16] & ~(3 << ((i) % 16 * 2))) | ((new_state) << ((i) % 16 * 2)) + +#define N_COND_VARS 64 +#define CV_WAIT_TIMEOUT 10 + typedef struct FileCacheControl { uint64 generation; /* generation is needed to handle correct hash @@ -118,18 +145,24 @@ typedef struct FileCacheControl uint64 writes; /* number of writes issued */ uint64 time_read; /* time spent reading (us) */ uint64 time_write; /* time spent writing (us) */ + uint64 resizes; /* number of LFC resizes */ + uint64 evicted_pages; /* number of evicted pages */ dlist_head lru; /* double linked list for LRU replacement * algorithm */ dlist_head holes; /* double linked list of punched holes */ HyperLogLogState wss_estimation; /* estimation of working set size */ + ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */ } FileCacheControl; +bool lfc_store_prefetch_result; + static HTAB *lfc_hash; -static int lfc_desc = 0; +static int lfc_desc = -1; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; static char *lfc_path; +static uint64 lfc_generation; static FileCacheControl *lfc_ctl; static shmem_startup_hook_type prev_shmem_startup_hook; #if PG_VERSION_NUM>=150000 @@ -138,6 +171,20 @@ static shmem_request_hook_type prev_shmem_request_hook; #define LFC_ENABLED() (lfc_ctl->limit != 0) +/* + * Close LFC file if opened. + * All backends should close their LFC files once LFC is disabled. + */ +static void +lfc_close_file(void) +{ + if (lfc_desc >= 0) + { + close(lfc_desc); + lfc_desc = -1; + } +} + /* * Local file cache is optional and Neon can work without it. * In case of any any errors with this cache, we should disable it but to not throw error. 
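The GET_STATE/SET_STATE macros above pack one FileCacheBlockState into two bits of a uint32, sixteen block states per 32-bit word, which is what lets all per-block states of a chunk live in the small `state[]` array while the PENDING/REQUESTED handshake runs over the condition-variable turnstile. To make the bit arithmetic easy to check in isolation, here is a small self-contained Rust model of the same layout; `ChunkState`, `BlockState`, and `BLOCKS_PER_CHUNK = 128` are assumptions for the sketch, not the actual C definitions.

```rust
const BLOCKS_PER_CHUNK: usize = 128; // assumed value, for the sketch only

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum BlockState {
    Unavailable = 0, // block is not present in the cache
    Available = 1,   // block can be used
    Pending = 2,     // block is being loaded or written
    Requested = 3,   // another backend is waiting for the block
}

/// Two bits per block, sixteen block states per 32-bit word, like the C `state[]` array.
struct ChunkState {
    words: [u32; (BLOCKS_PER_CHUNK + 31) / 32 * 2],
}

impl ChunkState {
    fn new() -> Self {
        ChunkState { words: [0; (BLOCKS_PER_CHUNK + 31) / 32 * 2] }
    }

    /// Equivalent of GET_STATE: extract the two state bits for block `i`.
    fn get(&self, i: usize) -> u32 {
        (self.words[i / 16] >> (i % 16 * 2)) & 3
    }

    /// Equivalent of SET_STATE: clear the two bits for block `i`, then set the new state.
    fn set(&mut self, i: usize, s: BlockState) {
        let w = &mut self.words[i / 16];
        *w = (*w & !(3u32 << (i % 16 * 2))) | ((s as u32) << (i % 16 * 2));
    }
}

fn main() {
    let mut chunk = ChunkState::new();
    chunk.set(5, BlockState::Pending);
    assert_eq!(chunk.get(5), BlockState::Pending as u32);
    assert_eq!(chunk.get(6), BlockState::Unavailable as u32);
    chunk.set(5, BlockState::Available);
    assert_eq!(chunk.get(5), BlockState::Available as u32);
}
```

The `& 3` mask and the `i % 16 * 2` shift mirror the C macros one-for-one, so a change to the packing there can be sanity-checked against this model first.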
@@ -145,20 +192,16 @@ static shmem_request_hook_type prev_shmem_request_hook; * All cache content should be invalidated to avoid reading of stale or corrupted data */ static void -lfc_disable(char const *op) +lfc_switch_off(void) { int fd; - elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path); - - /* Invalidate hash */ - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - if (LFC_ENABLED()) { HASH_SEQ_STATUS status; FileCacheEntry *entry; + /* Invalidate hash */ hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { @@ -171,41 +214,33 @@ lfc_disable(char const *op) dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); - if (lfc_desc > 0) - { - int rc; + /* + * We need to use unlink to avoid races with LFC writes, because they are not + * protected by the lock + */ + unlink(lfc_path); - /* - * If the reason of error is ENOSPC, then truncation of file may - * help to reclaim some space - */ - pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_TRUNCATE); - rc = ftruncate(lfc_desc, 0); - pgstat_report_wait_end(); + fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); + if (fd < 0) + elog(WARNING, "LFC: failed to recreate local file cache %s: %m", lfc_path); + else + close(fd); - if (rc < 0) - elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path); - } + /* Wake up waiting backends */ + for (int i = 0; i < N_COND_VARS; i++) + ConditionVariableBroadcast(&lfc_ctl->cv[i]); } + lfc_close_file(); +} - /* - * We need to use unlink to to avoid races in LFC write, because it is not - * protectedby - */ - unlink(lfc_path); - - fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); - if (fd < 0) - elog(WARNING, "Failed to recreate local file cache %s: %m", lfc_path); - else - close(fd); +static void +lfc_disable(char const *op) +{ + elog(WARNING, "LFC: failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path); + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + lfc_switch_off(); LWLockRelease(lfc_lock); - - if (lfc_desc > 0) - close(lfc_desc); - - lfc_desc = -1; } /* @@ -217,13 +252,20 @@ lfc_maybe_disabled(void) return !lfc_ctl || !LFC_ENABLED(); } +/* + * Open the LFC file if it is not opened yet or the generation has changed. + * Should be called under the LFC lock.
+ */ static bool lfc_ensure_opened(void) { - bool enabled = !lfc_maybe_disabled(); - + if (lfc_generation != lfc_ctl->generation) + { + lfc_close_file(); + lfc_generation = lfc_ctl->generation; + } /* Open cache file if not done yet */ - if (lfc_desc <= 0 && enabled) + if (lfc_desc < 0) { lfc_desc = BasicOpenFile(lfc_path, O_RDWR); @@ -233,7 +275,7 @@ lfc_ensure_opened(void) return false; } } - return enabled; + return true; } static void @@ -267,14 +309,7 @@ lfc_shmem_startup(void) n_chunks + 1, n_chunks + 1, &info, HASH_ELEM | HASH_BLOBS); - lfc_ctl->generation = 0; - lfc_ctl->size = 0; - lfc_ctl->used = 0; - lfc_ctl->hits = 0; - lfc_ctl->misses = 0; - lfc_ctl->writes = 0; - lfc_ctl->time_read = 0; - lfc_ctl->time_write = 0; + memset(lfc_ctl, 0, sizeof(FileCacheControl)); dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); @@ -285,7 +320,7 @@ lfc_shmem_startup(void) fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); if (fd < 0) { - elog(WARNING, "Failed to create local file cache %s: %m", lfc_path); + elog(WARNING, "LFC: failed to create local file cache %s: %m", lfc_path); lfc_ctl->limit = 0; } else @@ -293,6 +328,11 @@ lfc_shmem_startup(void) close(fd); lfc_ctl->limit = SIZE_MB_TO_CHUNKS(lfc_size_limit); } + + /* Initialize turnstile of condition variables */ + for (int i = 0; i < N_COND_VARS; i++) + ConditionVariableInit(&lfc_ctl->cv[i]); + } LWLockRelease(AddinShmemInitLock); } @@ -327,7 +367,7 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source) { if (*newval > lfc_max_size) { - elog(ERROR, "neon.file_cache_size_limit can not be larger than neon.max_file_cache_size"); + elog(ERROR, "LFC: neon.file_cache_size_limit can not be larger than neon.max_file_cache_size"); return false; } return true; @@ -338,14 +378,31 @@ lfc_change_limit_hook(int newval, void *extra) { uint32 new_size = SIZE_MB_TO_CHUNKS(newval); - if (!is_normal_backend()) - return; - - if (!lfc_ensure_opened()) + if (!lfc_ctl || !is_normal_backend()) return; LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + /* Open LFC file only if LFC was enabled or we are going to reenable it */ + if (newval == 0 && !LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + /* File should be reopened if LFC is reenabled */ + lfc_close_file(); + return; + } + + if (!lfc_ensure_opened()) + { + LWLockRelease(lfc_lock); + return; + } + + if (lfc_ctl->limit != new_size) + { + lfc_ctl->resizes += 1; + } + while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru)) { /* @@ -365,6 +422,12 @@ lfc_change_limit_hook(int newval, void *extra) neon_log(LOG, "Failed to punch hole in file: %m"); #endif /* We remove the old entry, and re-enter a hole to the hash table */ + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; + lfc_ctl->used_pages -= is_page_cached; + lfc_ctl->evicted_pages += is_page_cached; + } hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); memset(&holetag, 0, sizeof(holetag)); @@ -379,10 +442,11 @@ lfc_change_limit_hook(int newval, void *extra) lfc_ctl->used -= 1; } - lfc_ctl->limit = new_size; - if (new_size == 0) { - lfc_ctl->generation += 1; - } + if (new_size == 0) + lfc_switch_off(); + else + lfc_ctl->limit = new_size; + neon_log(DEBUG1, "set local file cache limit to %d", new_size); LWLockRelease(lfc_lock); @@ -399,6 +463,17 @@ lfc_init(void) neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries"); + DefineCustomBoolVariable("neon.store_prefetch_result_in_lfc", + "Immediately store received prefetch 
result in LFC", + NULL, + &lfc_store_prefetch_result, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + DefineCustomIntVariable("neon.max_file_cache_size", "Maximal size of Neon local file cache", NULL, @@ -476,7 +551,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) if (LFC_ENABLED()) { entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); - found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) != 0; + found = entry != NULL && GET_STATE(entry, chunk_offs) != UNAVAILABLE; } LWLockRelease(lfc_lock); return found; @@ -505,46 +580,42 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); hash = get_hash_value(lfc_hash, &tag); - chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); LWLockAcquire(lfc_lock, LW_SHARED); + if (!LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + return 0; + } while (true) { - int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs); - if (LFC_ENABLED()) - { - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); - if (entry != NULL) + if (entry != NULL) + { + for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) { - for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) + if (GET_STATE(entry, chunk_offs) != UNAVAILABLE) { - if ((entry->bitmap[chunk_offs >> 5] & - (1 << (chunk_offs & 31))) != 0) - { - BITMAP_SET(bitmap, i); - found++; - } + BITMAP_SET(bitmap, i); + found++; } } - else - { - i += this_chunk; - } } else { - return found; + i += this_chunk; } - /* * Break out of the iteration before doing expensive stuff for * a next iteration */ - if (i + 1 >= nblocks) + if (i >= nblocks) break; /* @@ -558,8 +629,8 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, LWLockRelease(lfc_lock); -#if USE_ASSERT_CHECKING - do { +#ifdef USE_ASSERT_CHECKING + { int count = 0; for (int j = 0; j < nblocks; j++) @@ -569,93 +640,12 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } Assert(count == found); - } while (false); + } #endif return found; } -/* - * Evict a page (if present) from the local file cache - */ -void -lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) -{ - BufferTag tag; - FileCacheEntry *entry; - bool found; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); - uint32 hash; - - if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ - return; - - CopyNRelFileInfoToBufTag(tag, rinfo); - tag.forkNum = forkNum; - tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1)); - - CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - hash = get_hash_value(lfc_hash, &tag); - - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) - { - LWLockRelease(lfc_lock); - return; - } - - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, &found); - - if (!found) - { - /* nothing to do */ - LWLockRelease(lfc_lock); - return; - } - - /* remove the page from the cache */ - entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1))); - - if (entry->access_count == 0) - { - /* - * If the chunk has no live entries, we can position 
the chunk to be - * recycled first. - */ - if (entry->bitmap[chunk_offs >> 5] == 0) - { - bool has_remaining_pages = false; - - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) - { - if (entry->bitmap[i] != 0) - { - has_remaining_pages = true; - break; - } - } - - /* - * Put the entry at the position that is first to be reclaimed when we - * have no cached pages remaining in the chunk - */ - if (!has_remaining_pages) - { - dlist_delete(&entry->list_node); - dlist_push_head(&lfc_ctl->lru, &entry->list_node); - } - } - } - - /* - * Done: apart from empty chunks, we don't move chunks in the LRU when - * they're empty because eviction isn't usage. - */ - - LWLockRelease(lfc_lock); -} - /* * Try to read pages from local cache. * Returns the number of pages read from the local cache, and sets bits in @@ -683,17 +673,14 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ - return 0; - - if (!lfc_ensure_opened()) - return 0; + return -1; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - /* + /* * For every chunk that has blocks we're interested in, we * 1. get the chunk header * 2. Check if the chunk actually has the blocks we're interested in @@ -710,22 +697,35 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int iteration_hits = 0; int iteration_misses = 0; uint64 io_time_us = 0; + int n_blocks_to_read = 0; + ConditionVariable* cv; + Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) { + n_blocks_to_read += (BITMAP_ISSET(mask, buf_offset + i) != 0); iov[i].iov_base = buffers[buf_offset + i]; iov[i].iov_len = BLCKSZ; + BITMAP_CLR(mask, buf_offset + i); + } + if (n_blocks_to_read == 0) + { + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + continue; } tag.blockNum = blkno - chunk_offs; hash = get_hash_value(lfc_hash, &tag); + cv = &lfc_ctl->cv[hash % N_COND_VARS]; LWLockAcquire(lfc_lock, LW_EXCLUSIVE); /* We can return the blocks we've read before LFC got disabled; * assuming we read any. */ - if (!LFC_ENABLED()) + if (!LFC_ENABLED() || !lfc_ensure_opened()) { LWLockRelease(lfc_lock); return blocks_read; @@ -761,15 +761,32 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, generation = lfc_ctl->generation; entry_offset = entry->offset; - LWLockRelease(lfc_lock); - for (int i = 0; i < blocks_in_chunk; i++) { - /* - * If the page is valid, we consider it "read". 
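The hunks above replace the old per-chunk bitmap with a small per-block state machine accessed through GET_STATE/SET_STATE. Those macros and the FileCacheBlockState enum are defined elsewhere in file_cache.c and are not part of this excerpt; the sketch below shows one plausible two-bits-per-block encoding, purely as an assumption to make the states readable:

/* Assumed encoding, not the literal definitions from the patch. */
typedef enum FileCacheBlockState
{
    UNAVAILABLE,   /* block not present in this chunk */
    AVAILABLE,     /* block present and readable */
    PENDING,       /* a write of this block is in progress */
    REQUESTED      /* someone is waiting for the pending write to finish */
} FileCacheBlockState;

#define STATE_BITS  2
#define STATE_MASK  ((1 << STATE_BITS) - 1)

/* Two bits per block packed into a uint32 array in FileCacheEntry. */
#define GET_STATE(entry, i) \
    (((entry)->state[(i) / 16] >> (((i) % 16) * STATE_BITS)) & STATE_MASK)
#define SET_STATE(entry, i, s) \
    ((entry)->state[(i) / 16] = \
     ((entry)->state[(i) / 16] & ~((uint32) STATE_MASK << (((i) % 16) * STATE_BITS))) | \
     ((uint32) (s) << (((i) % 16) * STATE_BITS)))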
- * All other pages will be fetched separately by the next cache - */ - if (entry->bitmap[(chunk_offs + i) / 32] & (1 << ((chunk_offs + i) % 32))) + FileCacheBlockState state = UNAVAILABLE; + bool sleeping = false; + while (lfc_ctl->generation == generation) + { + state = GET_STATE(entry, chunk_offs + i); + if (state == PENDING) { + SET_STATE(entry, chunk_offs + i, REQUESTED); + } else if (state != REQUESTED) { + break; + } + if (!sleeping) + { + ConditionVariablePrepareToSleep(cv); + sleeping = true; + } + LWLockRelease(lfc_lock); + ConditionVariableTimedSleep(cv, CV_WAIT_TIMEOUT, WAIT_EVENT_NEON_LFC_CV_WAIT); + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + } + if (sleeping) + { + ConditionVariableCancelSleep(); + } + if (state == AVAILABLE) { BITMAP_SET(mask, buf_offset + i); iteration_hits++; @@ -777,6 +794,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, else iteration_misses++; } + LWLockRelease(lfc_lock); Assert(iteration_hits + iteration_misses > 0); @@ -818,6 +836,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, else { /* generation mismatch, assume error condition */ + lfc_close_file(); LWLockRelease(lfc_lock); return -1; } @@ -833,6 +852,249 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, return blocks_read; } +/* + * Initialize new LFC hash entry, perform eviction if needed. + * Returns false if there are no unpinned entries and chunk can not be added. + */ +static bool +lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) +{ + /*----------- + * If the chunk wasn't already in the LFC then we have these + * options, in order of preference: + * + * Unless there is no space available, we can: + * 1. Use an entry from the `holes` list, and + * 2. Create a new entry. + * We can always, regardless of space in the LFC: + * 3. evict an entry from LRU, and + * 4. ignore the write operation (the least favorite option) + */ + if (lfc_ctl->used < lfc_ctl->limit) + { + if (!dlist_is_empty(&lfc_ctl->holes)) + { + /* We can reuse a hole that was left behind when the LFC was shrunk previously */ + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool hole_found; + + hash_search_with_hash_value(lfc_hash, &hole->key, + hole->hash, HASH_REMOVE, &hole_found); + CriticalAssert(hole_found); + + lfc_ctl->used += 1; + entry->offset = offset; /* reuse the hole */ + } + else + { + lfc_ctl->used += 1; + entry->offset = lfc_ctl->size++;/* allocate new chunk at end + * of file */ + } + } + /* + * We've already used up all allocated LFC entries. + * + * If we can clear an entry from the LRU, do that. + * If we can't (e.g. because all other slots are being accessed) + * then we will remove this entry from the hash and continue + * on to the next chunk, as we may not exceed the limit. 
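Stepping back to the wait loop introduced above in lfc_readv_select (and mirrored later in lfc_writev): chunks share a small fixed pool of condition variables selected by hash, and a waiter first flips PENDING to REQUESTED so that the writer broadcasts only when somebody is actually sleeping. A condensed sketch of that hand-off, with the lfc_lock acquire/release and the timed-sleep details elided:

/* Waiter side (condensed): wait until the block leaves PENDING/REQUESTED. */
ConditionVariable *cv = &lfc_ctl->cv[hash % N_COND_VARS];

while (lfc_ctl->generation == generation)
{
    FileCacheBlockState state = GET_STATE(entry, chunk_offs);

    if (state == PENDING)
        SET_STATE(entry, chunk_offs, REQUESTED);  /* ask the writer to wake us */
    else if (state != REQUESTED)
        break;                                    /* AVAILABLE/UNAVAILABLE: settled */

    ConditionVariableSleep(cv, WAIT_EVENT_NEON_LFC_CV_WAIT);
}
ConditionVariableCancelSleep();

/* Writer side (condensed): after the pwrite succeeds. */
if (GET_STATE(entry, chunk_offs) == REQUESTED)
    ConditionVariableBroadcast(cv);
SET_STATE(entry, chunk_offs, AVAILABLE);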
+ */ + else if (!dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->lru)); + + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + bool is_page_cached = GET_STATE(victim, i) == AVAILABLE; + lfc_ctl->used_pages -= is_page_cached; + lfc_ctl->evicted_pages += is_page_cached; + } + + CriticalAssert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search_with_hash_value(lfc_hash, &victim->key, + victim->hash, HASH_REMOVE, NULL); + neon_log(DEBUG2, "Swap file cache page"); + } + else + { + /* Can't add this chunk - we don't have the space for it */ + hash_search_with_hash_value(lfc_hash, &entry->key, hash, + HASH_REMOVE, NULL); + + return false; + } + + entry->access_count = 1; + entry->hash = hash; + + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + SET_STATE(entry, i, UNAVAILABLE); + + return true; +} + +/* + * Store received prefetch result in LFC cache. + * Unlike lfc_read/lfc_write this call is is not protected by shared buffer lock. + * So we should be ready that other backends will try to concurrently read or write this block. + * We do not store prefetched block if it already exists in LFC or it's not_modified_since LSN is smaller + * than current last written LSN (LwLSN). + * + * We can enforce correctness of storing page in LFC by the following steps: + * 1. Check under LFC lock that page in not present in LFC. + * 2. Check under LFC lock that LwLSN is not changed since prefetch request time (not_modified_since). + * 3. Change page state to "Pending" under LFC lock to prevent all other backends to read or write this + * pages until this write is completed. + * 4. Assume that some other backend creates new image of the page without reading it + * (because reads will be blocked because of 2). This version of the page is stored in shared buffer. + * Any attempt to throw away this page from shared buffer will be blocked, because Postgres first + * needs to save dirty page and write will be blocked because of 2. + * So any backend trying to access this page, will take it from shared buffer without accessing + * SMGR and LFC. + * 5. After write completion we once again obtain LFC lock and wake-up all waiting backends. + * If there is some backend waiting to write new image of the page (4) then now it will be able to + * do it,overwriting old (prefetched) page image. As far as this write will be completed before + * shared buffer can be reassigned, not other backend can see old page image. 
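The implementation follows this comment; for orientation, this is roughly how the prefetch receive path calls it (the same shape appears in the pagestore_smgr.c hunks later in this patch):

/* Sketch of the caller, cf. prefetch_read()/prefetch_pump_state() below. */
if (response->tag == T_NeonGetPageResponse &&
    !(slot->flags & PRFSF_LFC) &&
    lfc_store_prefetch_result)
{
    if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag),
                     slot->buftag.forkNum,
                     slot->buftag.blockNum,
                     ((NeonGetPageResponse *) response)->page,
                     slot->request_lsns.not_modified_since))
        slot->flags |= PRFSF_LFC;  /* remember the page already went to LFC */
}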
+*/ +bool +lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + const void* buffer, XLogRecPtr lsn) +{ + BufferTag tag; + FileCacheEntry *entry; + ssize_t rc; + bool found; + uint32 hash; + uint64 generation; + uint32 entry_offset; + instr_time io_start, io_end; + ConditionVariable* cv; + FileCacheBlockState state; + XLogRecPtr lwlsn; + + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + return false; + + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forknum; + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + cv = &lfc_ctl->cv[hash % N_COND_VARS]; + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (!LFC_ENABLED() || !lfc_ensure_opened()) + { + LWLockRelease(lfc_lock); + return false; + } + lwlsn = GetLastWrittenLSN(rinfo, forknum, blkno); + if (lwlsn > lsn) + { + elog(DEBUG1, "Skip LFC write for %d because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X", + blkno, LSN_FORMAT_ARGS(lwlsn), LSN_FORMAT_ARGS(lsn)); + LWLockRelease(lfc_lock); + return false; + } + + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); + + if (found) + { + state = GET_STATE(entry, chunk_offs); + if (state != UNAVAILABLE) { + /* Do not rewrite existed LFC entry */ + LWLockRelease(lfc_lock); + return false; + } + /* + * Unlink entry from LRU list to pin it for the duration of IO + * operation + */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); + } + else + { + if (!lfc_init_new_entry(entry, hash)) + { + /* + * We can't process this chunk due to lack of space in LFC, + * so skip to the next one + */ + LWLockRelease(lfc_lock); + return false; + } + } + + generation = lfc_ctl->generation; + entry_offset = entry->offset; + + SET_STATE(entry, chunk_offs, PENDING); + + LWLockRelease(lfc_lock); + + pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); + INSTR_TIME_SET_CURRENT(io_start); + rc = pwrite(lfc_desc, buffer, BLCKSZ, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + INSTR_TIME_SET_CURRENT(io_end); + pgstat_report_wait_end(); + + if (rc != BLCKSZ) + { + lfc_disable("write"); + } + else + { + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (lfc_ctl->generation == generation) + { + uint64 time_spent_us; + CriticalAssert(LFC_ENABLED()); + /* Place entry to the head of LRU list */ + CriticalAssert(entry->access_count > 0); + + lfc_ctl->writes += 1; + INSTR_TIME_SUBTRACT(io_start, io_end); + time_spent_us = INSTR_TIME_GET_MICROSEC(io_start); + lfc_ctl->time_write += time_spent_us; + inc_page_cache_write_wait(time_spent_us); + + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + + state = GET_STATE(entry, chunk_offs); + if (state == REQUESTED) { + ConditionVariableBroadcast(cv); + } + if (state != AVAILABLE) + { + lfc_ctl->used_pages += 1; + SET_STATE(entry, chunk_offs, AVAILABLE); + } + } + else + { + lfc_close_file(); + } + LWLockRelease(lfc_lock); + } + return true; +} + /* * Put page in local file cache. * If cache is full then evict some other page. 
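As a quick worked example of the pwrite offset arithmetic above, assuming BLOCKS_PER_CHUNK is 128 and BLCKSZ is the usual 8192 (both values are assumptions for illustration):

/* Where block 5 of the chunk stored in slot 3 lands in the cache file. */
uint32 entry_offset = 3;                               /* chunk slot in file */
int    chunk_offs   = 5;                               /* block within chunk */
off_t  file_off     = ((off_t) entry_offset * 128 + chunk_offs) * 8192;
/* = (3 * 128 + 5) * 8192 = 389 * 8192 = 3186688 bytes from start of file */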
@@ -853,15 +1115,21 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return; - if (!lfc_ensure_opened()) - return; - CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - /* + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (!LFC_ENABLED() || !lfc_ensure_opened()) + { + LWLockRelease(lfc_lock); + return; + } + generation = lfc_ctl->generation; + + /* * For every chunk that has blocks we're interested in, we * 1. get the chunk header * 2. Check if the chunk actually has the blocks we're interested in @@ -876,6 +1144,8 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); instr_time io_start, io_end; + ConditionVariable* cv; + Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -886,14 +1156,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); hash = get_hash_value(lfc_hash, &tag); - - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) - { - LWLockRelease(lfc_lock); - return; - } + cv = &lfc_ctl->cv[hash % N_COND_VARS]; entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); @@ -908,62 +1171,48 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } else { - /* - * We have two choices if all cache pages are pinned (i.e. used in IO - * operations): - * - * 1) Wait until some of this operation is completed and pages is - * unpinned. - * - * 2) Allocate one more chunk, so that specified cache size is more - * recommendation than hard limit. - * - * As far as probability of such event (that all pages are pinned) is - * considered to be very very small: there are should be very large - * number of concurrent IO operations and them are limited by - * max_connections, we prefer not to complicate code and use second - * approach. 
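One pattern worth calling out, since it recurs in lfc_readv_select, lfc_prefetch and the rewritten lfc_writev: the generation counter lets a backend drop lfc_lock around the file I/O and later detect whether the cache was switched off (and the file recreated) in the meantime. In outline (a sketch, not a specific hunk):

LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
generation   = lfc_ctl->generation;    /* remember the epoch we observed */
entry_offset = entry->offset;
LWLockRelease(lfc_lock);

/* ... pread/pwrite against lfc_desc without holding the lock ... */

LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (lfc_ctl->generation == generation)
{
    /* cache still valid: safe to update block states and statistics */
}
else
{
    /* LFC was switched off or recreated: our descriptor is stale */
    lfc_close_file();
}
LWLockRelease(lfc_lock);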
- */ - if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) + if (!lfc_init_new_entry(entry, hash)) { - /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) - { - lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; - } - CriticalAssert(victim->access_count == 0); - entry->offset = victim->offset; /* grab victim's chunk */ - hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); - neon_log(DEBUG2, "Swap file cache page"); + /* + * We can't process this chunk due to lack of space in LFC, + * so skip to the next one + */ + blkno += blocks_in_chunk; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + continue; } - else if (!dlist_is_empty(&lfc_ctl->holes)) - { - /* We can reuse a hole that was left behind when the LFC was shrunk previously */ - FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); - uint32 offset = hole->offset; - bool hole_found; - - hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &hole_found); - CriticalAssert(hole_found); - - lfc_ctl->used += 1; - entry->offset = offset; /* reuse the hole */ - } - else - { - lfc_ctl->used += 1; - entry->offset = lfc_ctl->size++; /* allocate new chunk at end - * of file */ - } - entry->access_count = 1; - entry->hash = hash; - memset(entry->bitmap, 0, sizeof entry->bitmap); } - generation = lfc_ctl->generation; entry_offset = entry->offset; + + for (int i = 0; i < blocks_in_chunk; i++) + { + FileCacheBlockState state = UNAVAILABLE; + bool sleeping = false; + while (lfc_ctl->generation == generation) + { + state = GET_STATE(entry, chunk_offs + i); + if (state == PENDING) { + SET_STATE(entry, chunk_offs + i, REQUESTED); + } else if (state != REQUESTED) { + SET_STATE(entry, chunk_offs + i, PENDING); + break; + } + if (!sleeping) + { + ConditionVariablePrepareToSleep(cv); + sleeping = true; + } + LWLockRelease(lfc_lock); + ConditionVariableTimedSleep(cv, CV_WAIT_TIMEOUT, WAIT_EVENT_NEON_LFC_CV_WAIT); + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + } + if (sleeping) + { + ConditionVariableCancelSleep(); + } + } LWLockRelease(lfc_lock); pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); @@ -976,6 +1225,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (rc != BLCKSZ * blocks_in_chunk) { lfc_disable("write"); + return; } else { @@ -999,18 +1249,30 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, for (int i = 0; i < blocks_in_chunk; i++) { - lfc_ctl->used_pages += 1 - ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1); - entry->bitmap[(chunk_offs + i) >> 5] |= - (1 << ((chunk_offs + i) & 31)); + FileCacheBlockState state = GET_STATE(entry, chunk_offs + i); + if (state == REQUESTED) + { + ConditionVariableBroadcast(cv); + } + if (state != AVAILABLE) + { + lfc_ctl->used_pages += 1; + SET_STATE(entry, chunk_offs + i, AVAILABLE); + } } } - - LWLockRelease(lfc_lock); + else + { + /* stop iteration if LFC was disabled */ + lfc_close_file(); + break; + } } blkno += blocks_in_chunk; buf_offset += blocks_in_chunk; nblocks -= blocks_in_chunk; } + LWLockRelease(lfc_lock); } typedef struct @@ -1097,6 +1359,16 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) if (lfc_ctl) value = lfc_ctl->used_pages; break; + case 6: + key = "file_cache_evicted_pages"; + if (lfc_ctl) + value = lfc_ctl->evicted_pages; + 
break; + case 7: + key = "file_cache_limit"; + if (lfc_ctl) + value = lfc_ctl->limit; + break; default: SRF_RETURN_DONE(funcctx); } @@ -1220,8 +1492,8 @@ local_cache_pages(PG_FUNCTION_ARGS) hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) - n_pages += pg_popcount32(entry->bitmap[i]); + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + n_pages += GET_STATE(entry, i) == AVAILABLE; } } } @@ -1249,7 +1521,7 @@ local_cache_pages(PG_FUNCTION_ARGS) { for (int i = 0; i < BLOCKS_PER_CHUNK; i++) { - if (entry->bitmap[i >> 5] & (1 << (i & 31))) + if (GET_STATE(entry, i) == AVAILABLE) { fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i; fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c index 1f53c8fd36..bbaad09f5f 100644 --- a/pgxn/neon/hll.c +++ b/pgxn/neon/hll.c @@ -122,8 +122,8 @@ addSHLL(HyperLogLogState *cState, uint32 hash) index = hash >> HLL_C_BITS; /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ - count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); - + count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS) - 1; + Assert(count <= HLL_C_BITS); cState->regs[index][count] = now; } @@ -136,7 +136,7 @@ getMaximum(const TimestampTz* reg, TimestampTz since) { if (reg[i] >= since) { - max = i; + max = i + 1; } } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 6513ba4dd6..f71f11ff93 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -14,6 +14,8 @@ */ #include "postgres.h" +#include + #include "access/xlog.h" #include "common/hashfn.h" #include "fmgr.h" @@ -36,6 +38,11 @@ #include "pagestore_client.h" #include "walproposer.h" +#ifdef __linux__ +#include +#include +#endif + #define PageStoreTrace DEBUG5 #define MIN_RECONNECT_INTERVAL_USEC 1000 @@ -56,6 +63,9 @@ int neon_protocol_version = 2; static int max_reconnect_attempts = 60; static int stripe_size; +static int pageserver_response_log_timeout = 10000; +static int pageserver_response_disconnect_timeout = 120000; /* 2 minutes */ + typedef struct { char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE]; @@ -124,6 +134,11 @@ typedef struct uint64 nrequests_sent; uint64 nresponses_received; + /* State for the receive timeout mechanism in call_PQgetCopyData() */ + instr_time receive_start_time; /* when we started waiting */ + instr_time receive_last_log_time; /* when we last printed a log message for the wait */ + bool receive_logged; /* has the wait been logged */ + /*--- * WaitEventSet containing: * - WL_SOCKET_READABLE on 'conn' @@ -373,8 +388,9 @@ pageserver_connect(shardno_t shard_no, int elevel) { case PS_Disconnected: { - const char *keywords[3]; - const char *values[3]; + const char *keywords[4]; + const char *values[4]; + char pid_str[16]; int n_pgsql_params; TimestampTz now; int64 us_since_last_attempt; @@ -419,14 +435,30 @@ pageserver_connect(shardno_t shard_no, int elevel) * can override the password from the env variable. Seems useful, although * we don't currently use that capability anywhere. */ - keywords[0] = "dbname"; - values[0] = connstr; - n_pgsql_params = 1; + n_pgsql_params = 0; + + /* + * Pageserver logs include this in the connection's tracing span. + * This allows for reasier log correlation between compute and pageserver. 
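The assignments that follow build the connection parameters incrementally; as a standalone sketch of that construction (not the literal hunk), recalling that libpq's PQconnectStartParams() takes parallel keyword/value arrays terminated by a NULL keyword:

const char *keywords[4];
const char *values[4];
int         n = 0;
char        pid_str[16];

snprintf(pid_str, sizeof(pid_str), "%d", MyProcPid);
keywords[n] = "application_name"; values[n] = pid_str;         n++;
keywords[n] = "dbname";           values[n] = connstr;         n++;
if (neon_auth_token)
{
    keywords[n] = "password";     values[n] = neon_auth_token; n++;
}
keywords[n] = NULL;               values[n] = NULL;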
+ */ + keywords[n_pgsql_params] = "application_name"; + { + int ret = snprintf(pid_str, sizeof(pid_str), "%d", MyProcPid); + if (ret < 0 || ret >= (int)(sizeof(pid_str))) + elog(FATAL, "stack-allocated buffer too small to hold pid"); + } + /* lifetime: PQconnectStartParams strdups internally */ + values[n_pgsql_params] = (const char*) pid_str; + n_pgsql_params++; + + keywords[n_pgsql_params] = "dbname"; + values[n_pgsql_params] = connstr; + n_pgsql_params++; if (neon_auth_token) { - keywords[1] = "password"; - values[1] = neon_auth_token; + keywords[n_pgsql_params] = "password"; + values[n_pgsql_params] = neon_auth_token; n_pgsql_params++; } @@ -556,6 +588,9 @@ pageserver_connect(shardno_t shard_no, int elevel) switch (neon_protocol_version) { + case 3: + pagestream_query = psprintf("pagestream_v3 %s %s", neon_tenant, neon_timeline); + break; case 2: pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); break; @@ -636,6 +671,9 @@ pageserver_connect(shardno_t shard_no, int elevel) shard->state = PS_Connected; shard->nrequests_sent = 0; shard->nresponses_received = 0; + INSTR_TIME_SET_ZERO(shard->receive_start_time); + INSTR_TIME_SET_ZERO(shard->receive_last_log_time); + shard->receive_logged = false; } /* FALLTHROUGH */ case PS_Connected: @@ -655,6 +693,33 @@ pageserver_connect(shardno_t shard_no, int elevel) Assert(false); } +static void +get_socket_stats(int socketfd, int *sndbuf, int *recvbuf) +{ + *sndbuf = -1; + *recvbuf = -1; + +#ifdef __linux__ + /* + * get kernel's send and recv queue size via ioctl + * https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27 + */ + if (socketfd != -1) + { + int ioctl_err; + + ioctl_err = ioctl(socketfd, SIOCOUTQ, sndbuf); + if (ioctl_err!= 0) { + *sndbuf = -errno; + } + ioctl_err = ioctl(socketfd, FIONREAD, recvbuf); + if (ioctl_err != 0) { + *recvbuf = -errno; + } + } +#endif +} + /* * A wrapper around PQgetCopyData that checks for interrupts while sleeping. */ @@ -665,46 +730,54 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer) PageServer *shard = &page_servers[shard_no]; PGconn *pageserver_conn = shard->conn; instr_time now, - start_ts, since_start, - last_log_ts, since_last_log; - bool logged = false; - - /* - * As a debugging aid, if we don't get a response for a long time, print a - * log message. - * - * 10 s is a very generous threshold, normally we expect a response in a - * few milliseconds. We have metrics to track latencies in normal ranges, - * but in the cases that take exceptionally long, it's useful to log the - * exact timestamps. - */ -#define LOG_INTERVAL_US UINT64CONST(10 * 1000000) - - INSTR_TIME_SET_CURRENT(now); - start_ts = last_log_ts = now; - INSTR_TIME_SET_ZERO(since_last_log); retry: ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ ); if (ret == 0) { - WaitEvent event; + WaitEvent occurred_event; + int noccurred; + double log_timeout, + disconnect_timeout; long timeout; - timeout = Min(0, LOG_INTERVAL_US - INSTR_TIME_GET_MICROSEC(since_last_log)); + /* + * Calculate time elapsed since the start, and since the last progress + * log message. On first call, remember the start time. 
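For reference, the get_socket_stats() helper defined above is consumed a little further down when the slow-response warning is printed; a hedged usage sketch and the meaning of its outputs:

int sndbuf;
int recvbuf;

get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf);
/*
 * On Linux: sndbuf  = unsent bytes sitting in the kernel send queue (SIOCOUTQ),
 *           recvbuf = bytes already readable from the socket (FIONREAD).
 * On other platforms the values stay -1; on an ioctl failure they hold -errno,
 * so a negative number in the log identifies the failing errno.
 */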
+ */ + INSTR_TIME_SET_CURRENT(now); + if (INSTR_TIME_IS_ZERO(shard->receive_start_time)) + { + shard->receive_start_time = now; + INSTR_TIME_SET_ZERO(since_start); + shard->receive_last_log_time = now; + INSTR_TIME_SET_ZERO(since_last_log); + shard->receive_logged = false; + } + else + { + since_start = now; + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + since_last_log = now; + INSTR_TIME_SUBTRACT(since_last_log, shard->receive_last_log_time); + } - /* Sleep until there's something to do */ - (void) WaitEventSetWait(shard->wes_read, timeout, &event, 1, - WAIT_EVENT_NEON_PS_READ); + /* Sleep until the log or disconnect timeout is reached. */ + log_timeout = Max(0, (double) pageserver_response_log_timeout - INSTR_TIME_GET_MILLISEC(since_last_log)); + disconnect_timeout = Max(0, (double) pageserver_response_disconnect_timeout - INSTR_TIME_GET_MILLISEC(since_start)); + timeout = (long) ceil(Min(log_timeout, disconnect_timeout)); + + noccurred = WaitEventSetWait(shard->wes_read, timeout, &occurred_event, 1, + WAIT_EVENT_NEON_PS_READ); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); /* Data available in socket? */ - if (event.events & WL_SOCKET_READABLE) + if (noccurred > 0 && (occurred_event.events & WL_SOCKET_READABLE) != 0) { if (!PQconsumeInput(pageserver_conn)) { @@ -714,24 +787,61 @@ retry: pfree(msg); return -1; } + goto retry; + } + + /* Timeout was reached, or we were interrupted for some other reason */ + INSTR_TIME_SET_CURRENT(now); + since_last_log = now; + INSTR_TIME_SUBTRACT(since_last_log, shard->receive_last_log_time); + since_start = now; + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + + /* + * As a debugging aid, if we don't get a response to a pageserver request + * for a long time, print a log message. + * + * The default neon.pageserver_response_log_timeout value, 10 s, is + * very generous. Normally we expect a response in a few + * milliseconds. We have metrics to track latencies in normal ranges, + * but in the cases that take exceptionally long, it's useful to log + * the exact timestamps. + */ + if (INSTR_TIME_GET_MILLISEC(since_last_log) >= pageserver_response_log_timeout) + { + int sndbuf; + int recvbuf; + + get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf); + + neon_shard_log(shard_no, LOG, + "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)", + INSTR_TIME_GET_DOUBLE(since_start), + shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf); + shard->receive_last_log_time = now; + shard->receive_logged = true; } /* - * Print a message to the log if a long time has passed with no - * response. + * If an even longer time has passed without receiving a response from + * the pageserver, disconnect. That triggers a reconnection attempt + * in the caller. + * + * If this happens, the pageserver is likely dead and isn't coming + * back, or there's some kind of a network glitch and the connection + * is permanently gone. Without this, if the pageserver or the network + * connection is dead, it could take a very long time (15 minutes or + * more) until the TCP keepalive timeout notices that. Even if we + * would in fact get a response if we just waited a little longer, + * there's a good chance that we'll get the response sooner by + * reconnecting. 
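To make the two timeouts concrete (defaults: neon.pageserver_response_log_timeout = 10000 ms, neon.pageserver_response_disconnect_timeout = 120000 ms), here is the wait-budget computation from above with illustrative numbers plugged in; Max/Min are the usual PostgreSQL macros:

/* 37.2 s after the request was sent, 7.2 s after the last progress log: */
double since_last_log_ms = 7200.0;     /* illustrative */
double since_start_ms    = 37200.0;    /* illustrative */

double log_timeout        = Max(0, 10000.0  - since_last_log_ms);   /*  2800 ms */
double disconnect_timeout = Max(0, 120000.0 - since_start_ms);      /* 82800 ms */
long   timeout            = (long) ceil(Min(log_timeout, disconnect_timeout));  /* 2800 */

/* WaitEventSetWait() therefore sleeps at most 2.8 s before either logging
 * again (10 s after the previous log line) or, eventually, disconnecting
 * 120 s after the request was sent. */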
*/ - INSTR_TIME_SET_CURRENT(now); - since_last_log = now; - INSTR_TIME_SUBTRACT(since_last_log, last_log_ts); - if (INSTR_TIME_GET_MICROSEC(since_last_log) >= LOG_INTERVAL_US) + if (INSTR_TIME_GET_MILLISEC(since_start) >= pageserver_response_disconnect_timeout) { - since_start = now; - INSTR_TIME_SUBTRACT(since_start, start_ts); - neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)", - INSTR_TIME_GET_DOUBLE(since_start), - shard->nrequests_sent, shard->nresponses_received); - last_log_ts = now; - logged = true; + neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting", + INSTR_TIME_GET_DOUBLE(since_start)); + pageserver_disconnect(shard_no); + return -1; } goto retry; @@ -741,14 +851,18 @@ retry: * If we logged earlier that the response is taking a long time, log * another message when the response is finally received. */ - if (logged) + if (shard->receive_logged) { INSTR_TIME_SET_CURRENT(now); since_start = now; - INSTR_TIME_SUBTRACT(since_start, start_ts); - neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s", + INSTR_TIME_SUBTRACT(since_start, shard->receive_start_time); + neon_shard_log(shard_no, LOG, + "received response from pageserver after %0.3f s", INSTR_TIME_GET_DOUBLE(since_start)); } + INSTR_TIME_SET_ZERO(shard->receive_start_time); + INSTR_TIME_SET_ZERO(shard->receive_last_log_time); + shard->receive_logged = false; return ret; } @@ -827,7 +941,6 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) { while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) { - HandleMainLoopInterrupts(); shard->n_reconnect_attempts += 1; } shard->n_reconnect_attempts = 0; @@ -909,7 +1022,82 @@ pageserver_receive(shardno_t shard_no) } PG_CATCH(); { - neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response"); + neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response"); + pageserver_disconnect(shard_no); + PG_RE_THROW(); + } + PG_END_TRY(); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = nm_to_string((NeonMessage *) resp); + + neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); + pfree(msg); + } + } + else if (rc == -1 && shard->state == PS_Disconnected) + { + /* If the state is 'Disconnected', the disconnection message was already logged */ + resp = NULL; + } + else if (rc == -1) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", msg); + pfree(msg); + pageserver_disconnect(shard_no); + resp = NULL; + } + else if (rc == -2) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg); + } + else + { + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc); + } + + shard->nresponses_received++; + return (NeonResponse *) resp; +} + +static NeonResponse * +pageserver_try_receive(shardno_t shard_no) +{ + StringInfoData resp_buff; + NeonResponse *resp; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn = shard->conn; + /* read response */ + int rc; + + if (shard->state != PS_Connected) + return NULL; + + 
Assert(pageserver_conn); + + rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async = true */); + + if (rc == 0) + return NULL; + else if (rc > 0) + { + PG_TRY(); + { + resp_buff.len = rc; + resp_buff.cursor = 0; + resp = nm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); + } + PG_CATCH(); + { + neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response"); pageserver_disconnect(shard_no); PG_RE_THROW(); } @@ -978,6 +1166,7 @@ page_server_api api = .send = pageserver_send, .flush = pageserver_flush, .receive = pageserver_receive, + .try_receive = pageserver_try_receive, .disconnect = pageserver_disconnect_shard }; @@ -1136,13 +1325,33 @@ pg_init_libpagestore(void) "Version of compute<->page server protocol", NULL, &neon_protocol_version, - 2, /* use protocol version 2 */ - 2, /* min */ - 2, /* max */ + 2, /* use protocol version 2 */ + 2, /* min */ + 3, /* max */ PGC_SU_BACKEND, 0, /* no flags required */ NULL, NULL, NULL); + DefineCustomIntVariable("neon.pageserver_response_log_timeout", + "pageserver response log timeout", + "If the pageserver doesn't respond to a request within this timeout," + "a message is printed to the log.", + &pageserver_response_log_timeout, + 10000, 100, INT_MAX, + PGC_SUSET, + GUC_UNIT_MS, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.pageserver_response_disconnect_timeout", + "pageserver response diconnect timeout", + "If the pageserver doesn't respond to a request within this timeout," + "disconnect and reconnect.", + &pageserver_response_disconnect_timeout, + 120000, 100, INT_MAX, + PGC_SUSET, + GUC_UNIT_MS, + NULL, NULL, NULL); + relsize_hash_init(); if (page_server != NULL) diff --git a/pgxn/neon/logical_replication_monitor.c b/pgxn/neon/logical_replication_monitor.c index 5eee5a1679..b94faafdfa 100644 --- a/pgxn/neon/logical_replication_monitor.c +++ b/pgxn/neon/logical_replication_monitor.c @@ -131,8 +131,8 @@ get_snapshots_cutoff_lsn(void) { cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn; elog(LOG, - "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d", - LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files); + "ls_monitor: number of snapshot files, %zu, is larger than limit of %d", + snapshot_index, logical_replication_max_snap_files); } /* Is the size of the logical snapshots directory larger than specified? @@ -162,8 +162,8 @@ get_snapshots_cutoff_lsn(void) } if (cutoff != original) - elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB", - LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size); + elog(LOG, "ls_monitor: " SNAPDIR " is larger than %d KB", + logical_replication_max_logicalsnapdir_size); } pfree(snapshot_descriptors); @@ -214,9 +214,13 @@ InitLogicalReplicationMonitor(void) } /* - * Unused logical replication slots pins WAL and prevents deletion of snapshots. + * Unused logical replication slots pins WAL and prevent deletion of snapshots. * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which - * need too many .snap files. + * need too many .snap files. These files are stored as AUX files, which are a + * pageserver mechanism for storing non-relation data. AUX files are shipped in + * in the basebackup which is requested by compute_ctl before Postgres starts. 
+ * The larger the time to retrieve the basebackup, the more likely it is the + * compute will be killed by the control plane due to a timeout. */ void LogicalSlotsMonitorMain(Datum main_arg) @@ -239,10 +243,7 @@ LogicalSlotsMonitorMain(Datum main_arg) ProcessConfigFile(PGC_SIGHUP); } - /* - * If there are too many .snap files, just drop all logical slots to - * prevent aux files bloat. - */ + /* Get the cutoff LSN */ cutoff_lsn = get_snapshots_cutoff_lsn(); if (cutoff_lsn > 0) { @@ -252,31 +253,37 @@ LogicalSlotsMonitorMain(Datum main_arg) ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; XLogRecPtr restart_lsn; - /* find the name */ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); - /* Consider only logical repliction slots */ + + /* Consider only active logical repliction slots */ if (!s->in_use || !SlotIsLogical(s)) { LWLockRelease(ReplicationSlotControlLock); continue; } - /* do we need to drop it? */ + /* + * Retrieve the restart LSN to determine if we need to drop the + * slot + */ SpinLockAcquire(&s->mutex); restart_lsn = s->data.restart_lsn; SpinLockRelease(&s->mutex); + + strlcpy(slot_name, s->data.name.data, sizeof(slot_name)); + LWLockRelease(ReplicationSlotControlLock); + if (restart_lsn >= cutoff_lsn) { - LWLockRelease(ReplicationSlotControlLock); + elog(LOG, "ls_monitor: not dropping replication slot %s because restart LSN %X/%X is greater than cutoff LSN %X/%X", + slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn)); continue; } - strlcpy(slot_name, s->data.name.data, NAMEDATALEN); - elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X", + elog(LOG, "ls_monitor: dropping replication slot %s because restart LSN %X/%X lower than cutoff LSN %X/%X", slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn)); - LWLockRelease(ReplicationSlotControlLock); - /* now try to drop it, killing owner before if any */ + /* now try to drop it, killing owner before, if any */ for (;;) { pid_t active_pid; @@ -288,9 +295,9 @@ LogicalSlotsMonitorMain(Datum main_arg) if (active_pid == 0) { /* - * Slot is releasted, try to drop it. Though of course + * Slot is released, try to drop it. Though of course, * it could have been reacquired, so drop can ERROR - * out. Similarly it could have been dropped in the + * out. Similarly, it could have been dropped in the * meanwhile. * * In principle we could remove pg_try/pg_catch, that @@ -300,14 +307,14 @@ LogicalSlotsMonitorMain(Datum main_arg) PG_TRY(); { ReplicationSlotDrop(slot_name, true); - elog(LOG, "ls_monitor: slot %s dropped", slot_name); + elog(LOG, "ls_monitor: replication slot %s dropped", slot_name); } PG_CATCH(); { /* log ERROR and reset elog stack */ EmitErrorReport(); FlushErrorState(); - elog(LOG, "ls_monitor: failed to drop slot %s", slot_name); + elog(LOG, "ls_monitor: failed to drop replication slot %s", slot_name); } PG_END_TRY(); break; @@ -315,7 +322,7 @@ LogicalSlotsMonitorMain(Datum main_arg) else { /* kill the owner and wait for release */ - elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid); + elog(LOG, "ls_monitor: killing replication slot %s owner %d", slot_name, active_pid); (void) kill(active_pid, SIGTERM); /* We shouldn't get stuck, but to be safe add timeout. 
*/ ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index ff08f9164d..768d7ae9e8 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -12,6 +12,7 @@ #include "fmgr.h" #include "miscadmin.h" +#include "pgstat.h" #include "access/subtrans.h" #include "access/twophase.h" #include "access/xlog.h" @@ -19,6 +20,7 @@ #include "access/xlogrecovery.h" #endif #include "replication/logical.h" +#include "replication/logicallauncher.h" #include "replication/slot.h" #include "replication/walsender.h" #include "storage/proc.h" @@ -55,6 +57,7 @@ uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; uint32 WAIT_EVENT_NEON_LFC_READ; uint32 WAIT_EVENT_NEON_LFC_TRUNCATE; uint32 WAIT_EVENT_NEON_LFC_WRITE; +uint32 WAIT_EVENT_NEON_LFC_CV_WAIT; uint32 WAIT_EVENT_NEON_PS_STARTING; uint32 WAIT_EVENT_NEON_PS_CONFIGURING; uint32 WAIT_EVENT_NEON_PS_SEND; @@ -408,6 +411,16 @@ ReportSearchPath(void) } } +#if PG_VERSION_NUM < 150000 +/* + * PG14 uses separate backend for stats collector having no access to shared memory. + * As far as AUX mechanism requires access to shared memory, persisting pgstat.stat file + * is not supported in PG14. And so there is no definition of neon_pgstat_file_size_limit + * variable, so we have to declare it here. + */ +static int neon_pgstat_file_size_limit; +#endif + void _PG_init(void) { @@ -434,6 +447,15 @@ _PG_init(void) restore_running_xacts_callback = RestoreRunningXactsFromClog; + DefineCustomBoolVariable( + "neon.disable_logical_replication_subscribers", + "Disables incomming logical replication", + NULL, + &disable_logical_replication_subscribers, + false, + PGC_SIGHUP, + 0, + NULL, NULL, NULL); DefineCustomBoolVariable( "neon.allow_replica_misconfig", @@ -456,6 +478,15 @@ _PG_init(void) 0, NULL, NULL, NULL); + DefineCustomIntVariable("neon.pgstat_file_size_limit", + "Maximal size of pgstat.stat file saved in Neon storage", + "Zero value disables persisting pgstat.stat file", + &neon_pgstat_file_size_limit, + 0, 0, 1000000, /* disabled by default */ + PGC_SIGHUP, + GUC_UNIT_KB, + NULL, NULL, NULL); + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the @@ -528,6 +559,7 @@ neon_shmem_startup_hook(void) WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read"); WAIT_EVENT_NEON_LFC_TRUNCATE = WaitEventExtensionNew("Neon/FileCache_Truncate"); WAIT_EVENT_NEON_LFC_WRITE = WaitEventExtensionNew("Neon/FileCache_Write"); + WAIT_EVENT_NEON_LFC_CV_WAIT = WaitEventExtensionNew("Neon/FileCache_CvWait"); WAIT_EVENT_NEON_PS_STARTING = WaitEventExtensionNew("Neon/PS_Starting"); WAIT_EVENT_NEON_PS_CONFIGURING = WaitEventExtensionNew("Neon/PS_Configuring"); WAIT_EVENT_NEON_PS_SEND = WaitEventExtensionNew("Neon/PS_SendIO"); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 79aa88b8d3..912e09c3d3 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -28,6 +28,7 @@ extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; extern uint32 WAIT_EVENT_NEON_LFC_READ; extern uint32 WAIT_EVENT_NEON_LFC_TRUNCATE; extern uint32 WAIT_EVENT_NEON_LFC_WRITE; +extern uint32 WAIT_EVENT_NEON_LFC_CV_WAIT; extern uint32 WAIT_EVENT_NEON_PS_STARTING; extern uint32 WAIT_EVENT_NEON_PS_CONFIGURING; extern uint32 WAIT_EVENT_NEON_PS_SEND; @@ -38,6 +39,7 @@ extern uint32 WAIT_EVENT_NEON_WAL_DL; #define WAIT_EVENT_NEON_LFC_READ WAIT_EVENT_BUFFILE_READ #define WAIT_EVENT_NEON_LFC_TRUNCATE WAIT_EVENT_BUFFILE_TRUNCATE #define WAIT_EVENT_NEON_LFC_WRITE 
WAIT_EVENT_BUFFILE_WRITE +#define WAIT_EVENT_NEON_LFC_CV_WAIT WAIT_EVENT_BUFFILE_READ #define WAIT_EVENT_NEON_PS_STARTING PG_WAIT_EXTENSION #define WAIT_EVENT_NEON_PS_CONFIGURING PG_WAIT_EXTENSION #define WAIT_EVENT_NEON_PS_SEND PG_WAIT_EXTENSION diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 1fb4ed9522..1fad44bd58 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -51,6 +51,26 @@ HexDecodeString(uint8 *result, char *input, int nbytes) return true; } +/* -------------------------------- + * pq_getmsgint16 - get a binary 2-byte int from a message buffer + * -------------------------------- + */ +uint16 +pq_getmsgint16(StringInfo msg) +{ + return pq_getmsgint(msg, 2); +} + +/* -------------------------------- + * pq_getmsgint32 - get a binary 4-byte int from a message buffer + * -------------------------------- + */ +uint32 +pq_getmsgint32(StringInfo msg) +{ + return pq_getmsgint(msg, 4); +} + /* -------------------------------- * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order * -------------------------------- diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index 89683714f1..7480ac28cc 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -8,6 +8,8 @@ #endif bool HexDecodeString(uint8 *result, char *input, int nbytes); +uint16 pq_getmsgint16(StringInfo msg); +uint32 pq_getmsgint32(StringInfo msg); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index f905e3b0fa..9faab1e4f0 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -34,6 +34,8 @@ typedef enum T_NeonGetPageRequest, T_NeonDbSizeRequest, T_NeonGetSlruSegmentRequest, + /* future tags above this line */ + T_NeonTestRequest = 99, /* only in cfg(feature = "testing") */ /* pagestore -> pagestore_client */ T_NeonExistsResponse = 100, @@ -42,12 +44,19 @@ typedef enum T_NeonErrorResponse, T_NeonDbSizeResponse, T_NeonGetSlruSegmentResponse, + /* future tags above this line */ + T_NeonTestResponse = 199, /* only in cfg(feature = "testing") */ } NeonMessageTag; +typedef uint64 NeonRequestId; + /* base struct for c-style inheritance */ typedef struct { NeonMessageTag tag; + NeonRequestId reqid; + XLogRecPtr lsn; + XLogRecPtr not_modified_since; } NeonMessage; #define messageTag(m) (((const NeonMessage *)(m))->tag) @@ -67,6 +76,7 @@ typedef enum { SLRU_MULTIXACT_OFFSETS } SlruKind; + /*-- * supertype of all the Neon*Request structs below. * @@ -87,37 +97,37 @@ typedef enum { * * These structs describe the V2 of these requests. (The old now-defunct V1 * protocol contained just one LSN and a boolean 'latest' flag.) + * + * V3 version of protocol adds request ID to all requests. This request ID is also included in response + * as well as other fields from requests, which allows to verify that we receive response for our request. + * We copy fields from request to response to make checking more reliable: request ID is formed from process ID + * and local counter, so in principle there can be duplicated requests IDs if process PID is reused. 
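The request-ID scheme described in the comment above is produced by GENERATE_REQUEST_ID() in the pagestore_smgr.c hunks later in this patch; a worked example of the packing and of how the V3 message header echoes it back:

/* Same definition as in pagestore_smgr.c below. */
#define GENERATE_REQUEST_ID() (((NeonRequestId) MyProcPid << 32) | ++local_request_counter)

/*
 * Example: MyProcPid = 12345 (0x3039), counter incremented to 7:
 *   reqid = (12345 << 32) | 7 = 0x0000303900000007
 *
 * With neon_protocol_version >= 3, nm_pack_request() sends
 *   tag, reqid, lsn, not_modified_since
 * and nm_unpack_response() reads the same header back (plus the echoed
 * request-specific fields), so the client can verify that a response
 * matches the request it actually sent.
 */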
*/ -typedef struct -{ - NeonMessageTag tag; - XLogRecPtr lsn; - XLogRecPtr not_modified_since; -} NeonRequest; +typedef NeonMessage NeonRequest; typedef struct { - NeonRequest req; + NeonRequest hdr; NRelFileInfo rinfo; ForkNumber forknum; } NeonExistsRequest; typedef struct { - NeonRequest req; + NeonRequest hdr; NRelFileInfo rinfo; ForkNumber forknum; } NeonNblocksRequest; typedef struct { - NeonRequest req; + NeonRequest hdr; Oid dbNode; } NeonDbSizeRequest; typedef struct { - NeonRequest req; + NeonRequest hdr; NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blkno; @@ -125,32 +135,29 @@ typedef struct typedef struct { - NeonRequest req; - SlruKind kind; - int segno; + NeonRequest hdr; + SlruKind kind; + int segno; } NeonGetSlruSegmentRequest; /* supertype of all the Neon*Response structs below */ -typedef struct -{ - NeonMessageTag tag; -} NeonResponse; +typedef NeonMessage NeonResponse; typedef struct { - NeonMessageTag tag; + NeonExistsRequest req; bool exists; } NeonExistsResponse; typedef struct { - NeonMessageTag tag; + NeonNblocksRequest req; uint32 n_blocks; } NeonNblocksResponse; typedef struct { - NeonMessageTag tag; + NeonGetPageRequest req; char page[FLEXIBLE_ARRAY_MEMBER]; } NeonGetPageResponse; @@ -158,21 +165,21 @@ typedef struct typedef struct { - NeonMessageTag tag; + NeonDbSizeRequest req; int64 db_size; } NeonDbSizeResponse; typedef struct { - NeonMessageTag tag; + NeonResponse req; char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error * message */ } NeonErrorResponse; typedef struct { - NeonMessageTag tag; - int n_blocks; + NeonGetSlruSegmentRequest req; + int n_blocks; char data[BLCKSZ * SLRU_PAGES_PER_SEGMENT]; } NeonGetSlruSegmentResponse; @@ -189,9 +196,29 @@ typedef uint16 shardno_t; typedef struct { + /* + * Send this request to the PageServer associated with this shard. + */ bool (*send) (shardno_t shard_no, NeonRequest * request); + /* + * Blocking read for the next response of this shard. + * + * When a CANCEL signal is handled, the connection state will be + * unmodified. + */ NeonResponse *(*receive) (shardno_t shard_no); + /* + * Try get the next response from the TCP buffers, if any. + * Returns NULL when the data is not yet available. + */ + NeonResponse *(*try_receive) (shardno_t shard_no); + /* + * Make sure all requests are sent to PageServer. + */ bool (*flush) (shardno_t shard_no); + /* + * Disconnect from this pageserver shard. 
+ */ void (*disconnect) (shardno_t shard_no); } page_server_api; @@ -206,6 +233,7 @@ extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; extern int neon_protocol_version; +extern bool lfc_store_prefetch_result; extern shardno_t get_shard_number(BufferTag* tag); @@ -274,14 +302,16 @@ extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int nblocks, bits8 *bitmap); -extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); +extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + const void* buffer, XLogRecPtr lsn); + static inline bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, void *buffer) { - bits8 rv = 0; + bits8 rv = 1; return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 385905d9ce..091ad555e0 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -120,6 +120,9 @@ static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); +static uint32 local_request_counter; +#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) + /* * Prefetch implementation: * @@ -159,7 +162,7 @@ static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); * UNUSED ------> REQUESTED --> RECEIVED * ^ : | | * | : v | - * | : TAG_UNUSED | + * | : TAG_REMAINS | * | : | | * +----------------+------------+ * : @@ -178,7 +181,7 @@ typedef enum PrefetchStatus /* must fit in uint8; bits 0x1 are used */ typedef enum { PRFSF_NONE = 0x0, - PRFSF_SEQ = 0x1, + PRFSF_LFC = 0x1 /* received prefetch result is stored in LFC */ } PrefetchRequestFlags; typedef struct PrefetchRequest @@ -188,15 +191,11 @@ typedef struct PrefetchRequest uint8 status; /* see PrefetchStatus for valid values */ uint8 flags; /* see PrefetchRequestFlags */ neon_request_lsns request_lsns; + NeonRequestId reqid; NeonResponse *response; /* may be null */ uint64 my_ring_index; } PrefetchRequest; -StaticAssertDecl(sizeof(PrefetchRequest) == 64, - "We prefer to have a power-of-2 size for this struct. 
Please" - " try to find an alternative solution before reaching to" - " increase the expected size here"); - /* prefetch buffer lookup hash table */ typedef struct PrfHashEntry @@ -306,7 +305,7 @@ GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, static void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *output, - BlockNumber nblocks, const bits8 *mask); + BlockNumber nblocks); static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot); @@ -364,7 +363,9 @@ compact_prefetch_buffers(void) target_slot->buftag = source_slot->buftag; target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; + target_slot->flags = source_slot->flags; target_slot->response = source_slot->response; + target_slot->reqid = source_slot->reqid; target_slot->request_lsns = source_slot->request_lsns; target_slot->my_ring_index = empty_ring_index; @@ -405,6 +406,68 @@ compact_prefetch_buffers(void) return false; } +/* + * If there might be responses still in the TCP buffer, then + * we should try to use those, so as to reduce any TCP backpressure + * on the OS/PS side. + * + * This procedure handles that. + * + * Note that this is only valid as long as the only pipelined + * operations in the TCP buffer are getPage@Lsn requests. + */ +static void +prefetch_pump_state(void) +{ + while (MyPState->ring_receive != MyPState->ring_flush) + { + NeonResponse *response; + PrefetchRequest *slot; + MemoryContext old; + + slot = GetPrfSlot(MyPState->ring_receive); + + old = MemoryContextSwitchTo(MyPState->errctx); + response = page_server->try_receive(slot->shard_no); + MemoryContextSwitchTo(old); + + if (response == NULL) + break; + + /* The slot should still be valid */ + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long) slot->my_ring_index, (long) MyPState->ring_receive); + + /* update prefetch state */ + MyPState->n_responses_buffered += 1; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + + /* update slot state */ + slot->status = PRFS_RECEIVED; + slot->response = response; + + if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) + { + /* + * Store prefetched result in LFC (please read comments to lfc_prefetch + * explaining why it can be done without holding shared buffer lock + */ + if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) + { + slot->flags |= PRFSF_LFC; + } + } + } +} + void readahead_buffer_resize(int newsize, void *extra) { @@ -424,8 +487,7 @@ readahead_buffer_resize(int newsize, void *extra) */ if (MyPState->n_requests_inflight > newsize) { - Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize); - prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize)); + prefetch_wait_for(MyPState->ring_unused - newsize - 1); Assert(MyPState->n_requests_inflight <= newsize); } @@ -664,6 +726,18 @@ prefetch_read(PrefetchRequest *slot) /* update slot state */ slot->status = PRFS_RECEIVED; slot->response = response; + + if (response->tag 
== T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) + { + /* + * Store prefetched result in LFC (please read comments to lfc_prefetch + * explaining why it can be done without holding shared buffer lock + */ + if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) + { + slot->flags |= PRFSF_LFC; + } + } return true; } else @@ -716,6 +790,8 @@ prefetch_on_ps_disconnect(void) MyPState->ring_receive += 1; prefetch_set_unused(ring_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; } /* @@ -796,7 +872,8 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns uint64 mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index; NeonGetPageRequest request = { - .req.tag = T_NeonGetPageRequest, + .hdr.tag = T_NeonGetPageRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), /* lsn and not_modified_since are filled in below */ .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, @@ -805,14 +882,16 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(mySlotNo == MyPState->ring_unused); + slot->reqid = request.hdr.reqid; + if (force_request_lsns) slot->request_lsns = *force_request_lsns; else neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, - &slot->request_lsns, 1, NULL); - request.req.lsn = slot->request_lsns.request_lsn; - request.req.not_modified_since = slot->request_lsns.not_modified_since; + &slot->request_lsns, 1); + request.hdr.lsn = slot->request_lsns.request_lsn; + request.hdr.not_modified_since = slot->request_lsns.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); @@ -836,6 +915,74 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns Assert(!found); } +/* + * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted. + * Present pages are marked in "mask" bitmap and total number of such pages is returned. + */ +static int +prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, neon_request_lsns *lsns, + BlockNumber nblocks, void **buffers, bits8 *mask) +{ + int hits = 0; + PrefetchRequest hashkey; + + /* + * Use an intermediate PrefetchRequest struct as the hash key to ensure + * correct alignment and that the padding bytes are cleared. + */ + memset(&hashkey.buftag, 0, sizeof(BufferTag)); + CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); + hashkey.buftag.forkNum = forknum; + + for (int i = 0; i < nblocks; i++) + { + PrfHashEntry *entry; + + hashkey.buftag.blockNum = blocknum + i; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + + if (entry != NULL) + { + PrefetchRequest *slot = entry->slot; + uint64 ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); + + if (slot->status != PRFS_RECEIVED) + continue; + + /* + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. 
+ */ + if (!neon_prefetch_response_usable(&lsns[i], slot)) + continue; + + memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); + prefetch_set_unused(ring_index); + BITMAP_SET(mask, i); + + hits += 1; + inc_getpage_wait(0); + } + } + pgBufferUsage.prefetch.hits += hits; + return hits; +} + +#if PG_MAJORVERSION_NUM < 17 +static bool +prefetch_lookup(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkn, neon_request_lsns *lsns, void *buffer) +{ + bits8 present = 0; + return prefetch_lookupv(rinfo, forkNum, blkn, lsns, 1, &buffer, &present) != 0; +} +#endif + /* * prefetch_register_bufferv() - register and prefetch buffers * @@ -861,7 +1008,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, { uint64 min_ring_index; PrefetchRequest hashkey; -#if USE_ASSERT_CHECKING +#ifdef USE_ASSERT_CHECKING bool any_hits = false; #endif /* We will never read further ahead than our buffer can store. */ @@ -900,7 +1047,7 @@ Retry: else lsns = NULL; -#if USE_ASSERT_CHECKING +#ifdef USE_ASSERT_CHECKING any_hits = true; #endif @@ -935,7 +1082,8 @@ Retry: prefetch_set_unused(ring_index); entry = NULL; slot = NULL; - MyNeonCounters->getpage_prefetch_discards_total++; + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; } } @@ -958,8 +1106,6 @@ Retry: /* The buffered request is good enough, return that index */ if (is_prefetch) pgBufferUsage.prefetch.duplicates++; - else - pgBufferUsage.prefetch.hits++; continue; } } @@ -1026,10 +1172,14 @@ Retry: if (!prefetch_wait_for(cleanup_index)) goto Retry; prefetch_set_unused(cleanup_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; break; case PRFS_RECEIVED: case PRFS_TAG_REMAINS: prefetch_set_unused(cleanup_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; break; default: pg_unreachable(); @@ -1057,6 +1207,7 @@ Retry: slot->buftag = hashkey.buftag; slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; + slot->flags = 0; min_ring_index = Min(min_ring_index, ring_index); @@ -1095,6 +1246,12 @@ Retry: return min_ring_index; } +static bool +equal_requests(NeonRequest* a, NeonRequest* b) +{ + return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since; +} + /* * Note: this function can get canceled and use a long jump to the next catch @@ -1177,6 +1334,10 @@ nm_pack_request(NeonRequest *msg) initStringInfo(&s); pq_sendbyte(&s, msg->tag); + if (neon_protocol_version >= 3) + { + pq_sendint64(&s, msg->reqid); + } pq_sendint64(&s, msg->lsn); pq_sendint64(&s, msg->not_modified_since); @@ -1254,8 +1415,16 @@ NeonResponse * nm_unpack_response(StringInfo s) { NeonMessageTag tag = pq_getmsgbyte(s); + NeonResponse resp_hdr = {0}; /* make valgrind happy */ NeonResponse *resp = NULL; + resp_hdr.tag = tag; + if (neon_protocol_version >= 3) + { + resp_hdr.reqid = pq_getmsgint64(s); + resp_hdr.lsn = pq_getmsgint64(s); + resp_hdr.not_modified_since = pq_getmsgint64(s); + } switch (tag) { /* pagestore -> pagestore_client */ @@ -1263,7 +1432,14 @@ nm_unpack_response(StringInfo s) { NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); - msg_resp->tag = tag; + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + } + 
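For orientation: under protocol v3 every response starts with the common header just read into resp_hdr above (tag, reqid, lsn, not_modified_since), then echoes the key fields of the originating request, and only then carries the type-specific payload. A minimal sketch of that header framing, reusing the same pq_getmsg* readers; the helper name is hypothetical and does not exist in this patch:

static void
nm_unpack_v3_response_header(StringInfo s, NeonMessageTag tag, NeonResponse *hdr)
{
	/* The tag byte has already been consumed by the caller. */
	hdr->tag = tag;
	hdr->reqid = pq_getmsgint64(s);				/* request id echoed back */
	hdr->lsn = pq_getmsgint64(s);				/* request LSN */
	hdr->not_modified_since = pq_getmsgint64(s);	/* not_modified_since hint */
}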
msg_resp->req.hdr = resp_hdr; msg_resp->exists = pq_getmsgbyte(s); pq_getmsgend(s); @@ -1275,7 +1451,14 @@ nm_unpack_response(StringInfo s) { NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); - msg_resp->tag = tag; + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + } + msg_resp->req.hdr = resp_hdr; msg_resp->n_blocks = pq_getmsgint(s, 4); pq_getmsgend(s); @@ -1288,12 +1471,20 @@ nm_unpack_response(StringInfo s) NeonGetPageResponse *msg_resp; msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE); - msg_resp->tag = tag; + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + msg_resp->req.blkno = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; /* XXX: should be varlena */ memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); pq_getmsgend(s); - Assert(msg_resp->tag == T_NeonGetPageResponse); + Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse); resp = (NeonResponse *) msg_resp; break; @@ -1303,7 +1494,11 @@ nm_unpack_response(StringInfo s) { NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); - msg_resp->tag = tag; + if (neon_protocol_version >= 3) + { + msg_resp->req.dbNode = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; msg_resp->db_size = pq_getmsgint64(s); pq_getmsgend(s); @@ -1321,7 +1516,7 @@ nm_unpack_response(StringInfo s) msglen = strlen(msgtext); msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); - msg_resp->tag = tag; + msg_resp->req = resp_hdr; memcpy(msg_resp->message, msgtext, msglen + 1); pq_getmsgend(s); @@ -1332,9 +1527,17 @@ nm_unpack_response(StringInfo s) case T_NeonGetSlruSegmentResponse: { NeonGetSlruSegmentResponse *msg_resp; - int n_blocks = pq_getmsgint(s, 4); - msg_resp = palloc(sizeof(NeonGetSlruSegmentResponse)); - msg_resp->tag = tag; + int n_blocks; + msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse)); + + if (neon_protocol_version >= 3) + { + msg_resp->req.kind = pq_getmsgbyte(s); + msg_resp->req.segno = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; + + n_blocks = pq_getmsgint(s, 4); msg_resp->n_blocks = n_blocks; memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); pq_getmsgend(s); @@ -1379,8 +1582,8 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1392,8 +1595,8 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", 
msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1406,8 +1609,8 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1417,8 +1620,8 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1429,8 +1632,8 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1945,8 +2148,7 @@ GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, */ static void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, - neon_request_lsns *output, BlockNumber nblocks, - const bits8 *mask) + neon_request_lsns *output, BlockNumber nblocks) { XLogRecPtr last_written_lsns[PG_IOV_MAX]; @@ -2034,9 +2236,6 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *result = &output[i]; XLogRecPtr last_written_lsn = last_written_lsns[i]; - if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) - continue; - if (last_written_lsn > replay_lsn) { /* GetCurrentReplayRecPtr was introduced in v15 */ @@ -2079,8 +2278,6 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *result = &output[i]; XLogRecPtr last_written_lsn = last_written_lsns[i]; - if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) - continue; /* * Use the latest LSN that was evicted from the buffer cache as the * 'not_modified_since' hint. 
Any pages modified by later WAL records @@ -2302,42 +2499,67 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) } neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); { NeonExistsRequest request = { - .req.tag = T_NeonExistsRequest, - .req.lsn = request_lsns.request_lsn, - .req.not_modified_since = request_lsns.not_modified_since, + .hdr.tag = T_NeonExistsRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns.request_lsn, + .hdr.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forkNum }; resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonExistsResponse: + { + NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + !RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) || + exists_resp->req.forknum != request.forknum) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum); + } + } + exists = exists_resp->exists; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + resp->reqid, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", + T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); + } + pfree(resp); } - - switch (resp->tag) - { - case T_NeonExistsResponse: - exists = ((NeonExistsResponse *) resp)->exists; - break; - - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", - T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); - } - pfree(resp); return exists; } @@ -2696,8 +2918,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, while (nblocks > 0) { int iterblocks = Min(nblocks, PG_IOV_MAX); 
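In the hunk below, lfc_present starts out as the LFC presence bitmap and is then inverted, so that a set bit means "fetch this block from the pageserver". A minimal sketch of that mask convention, assuming the same bits8 and PG_IOV_MAX definitions; the helper name is hypothetical:

static inline void
invert_lfc_mask(const bits8 *lfc_present, bits8 *need_fetch)
{
	/* A set bit in need_fetch marks a block that was NOT found in the LFC. */
	for (int i = 0; i < PG_IOV_MAX / 8; i++)
		need_fetch[i] = ~lfc_present[i];
}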
- bits8 lfc_present[PG_IOV_MAX / 8]; - memset(lfc_present, 0, sizeof(lfc_present)); + bits8 lfc_present[PG_IOV_MAX / 8] = {0}; if (lfc_cache_containsv(InfoFromSMgrRel(reln), forknum, blocknum, iterblocks, lfc_present) == iterblocks) @@ -2708,12 +2929,13 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } tag.blockNum = blocknum; - + for (int i = 0; i < PG_IOV_MAX / 8; i++) lfc_present[i] = ~(lfc_present[i]); ring_index = prefetch_register_bufferv(tag, NULL, iterblocks, lfc_present, true); + nblocks -= iterblocks; blocknum += iterblocks; @@ -2721,6 +2943,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, MyPState->ring_last <= ring_index); } + prefetch_pump_state(); + return false; } @@ -2762,6 +2986,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) Assert(ring_index < MyPState->ring_unused && MyPState->ring_last <= ring_index); + prefetch_pump_state(); + return false; } #endif /* PG_MAJORVERSION_NUM < 17 */ @@ -2804,6 +3030,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, */ neon_log(SmgrTrace, "writeback noop"); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdwriteback(reln, forknum, blocknum, nblocks); @@ -2868,7 +3096,7 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block start_ts = GetCurrentTimestamp(); if (RecoveryInProgress() && MyBackendType != B_STARTUP) - XLogWaitForReplayOf(reqlsns[0].request_lsn); + XLogWaitForReplayOf(reqlsns->request_lsn); /* * Try to find prefetched page in the list of received pages. @@ -2945,15 +3173,44 @@ Retry: switch (resp->tag) { case T_NeonGetPageResponse: - memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); - lfc_write(rinfo, forkNum, blockno, buffer); + { + NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp; + if (neon_protocol_version >= 3) + { + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since || + !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || + getpage_resp->req.forknum != forkNum || + getpage_resp->req.blkno != base_blockno + i) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i); + } + } + memcpy(buffer, getpage_resp->page, BLCKSZ); + if (!lfc_store_prefetch_result) + lfc_write(rinfo, forkNum, blockno, buffer); break; - + } case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); + } + } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - 
errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - slot->shard_no, blockno, RelFileInfoFmt(rinfo), + errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo), forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); @@ -3020,6 +3277,17 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + /* Try to read PS results if they are available */ + prefetch_pump_state(); + + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); + + if (prefetch_lookup(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, buffer)) + { + /* Prefetch hit */ + return; + } + /* Try to read from local file cache */ if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) { @@ -3027,9 +3295,13 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL); neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. + */ + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { @@ -3108,11 +3380,14 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer #if PG_MAJORVERSION_NUM >= 17 static void neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - void **buffers, BlockNumber nblocks) + void **buffers, BlockNumber nblocks) { + bits8 prefetch_hits[PG_IOV_MAX / 8] = {0}; + bits8 lfc_hits[PG_IOV_MAX / 8]; bits8 read[PG_IOV_MAX / 8]; neon_request_lsns request_lsns[PG_IOV_MAX]; int lfc_result; + int prefetch_result; switch (reln->smgr_relpersistence) { @@ -3135,38 +3410,54 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, neon_log(ERROR, "Read request too large: %d is larger than max %d", nblocks, PG_IOV_MAX); - memset(read, 0, sizeof(read)); + /* Try to read PS results if they are available */ + prefetch_pump_state(); + + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, + request_lsns, nblocks); + + + prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks, buffers, prefetch_hits); + + if (prefetch_result == nblocks) + return; + + /* invert the result: exclude prefetched blocks */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + lfc_hits[i] = ~prefetch_hits[i]; /* Try to read from local file cache */ lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, - nblocks, read); + nblocks, lfc_hits); if (lfc_result > 0) MyNeonCounters->file_cache_hits_total += lfc_result; /* Read all blocks from LFC, so we're done */ - if (lfc_result == nblocks) + if (prefetch_result + lfc_result == nblocks) return; - if (lfc_result == -1) + if (lfc_result <= 0) { /* can't use the LFC result, so read all blocks from PS */ for (int i = 0; i < PG_IOV_MAX / 8; i++) - read[i] = 0xFF; + read[i] = ~prefetch_hits[i]; } else { /* invert the result: exclude blocks read from lfc */ for (int i = 0; i < PG_IOV_MAX / 8; i++) - read[i] = ~(read[i]); + read[i] = ~(prefetch_hits[i] | 
lfc_hits[i]); } - neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, - request_lsns, nblocks, read); - neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, buffers, nblocks, read); + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. + */ + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { @@ -3335,6 +3626,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) #if PG_MAJORVERSION_NUM >= 17 @@ -3388,6 +3681,8 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); @@ -3432,51 +3727,76 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) } neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); { NeonNblocksRequest request = { - .req.tag = T_NeonNblocksRequest, - .req.lsn = request_lsns.request_lsn, - .req.not_modified_since = request_lsns.not_modified_since, + .hdr.tag = T_NeonNblocksRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns.request_lsn, + .hdr.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonNblocksResponse: + { + NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + !RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) || + relsize_resp->req.forknum != forknum) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum); + } + } + n_blocks = relsize_resp->n_blocks; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + resp->reqid, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Nblocks (0x%02x) or Error (0x%02x) response to 
NblocksRequest, but got 0x%02x", + T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); + } + update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); + + neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + n_blocks); + + pfree(resp); } - - switch (resp->tag) - { - case T_NeonNblocksResponse: - n_blocks = ((NeonNblocksResponse *) resp)->n_blocks; - break; - - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", - T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); - } - update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); - - neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), - n_blocks); - - pfree(resp); return n_blocks; } @@ -3492,44 +3812,68 @@ neon_dbsize(Oid dbNode) NRelFileInfo dummy_node = {0}; neon_get_request_lsns(dummy_node, MAIN_FORKNUM, - REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); { NeonDbSizeRequest request = { - .req.tag = T_NeonDbSizeRequest, - .req.lsn = request_lsns.request_lsn, - .req.not_modified_since = request_lsns.not_modified_since, + .hdr.tag = T_NeonDbSizeRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns.request_lsn, + .hdr.not_modified_since = request_lsns.not_modified_since, .dbNode = dbNode, }; resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + dbsize_resp->req.dbNode != dbNode) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); + } + } + db_size = dbsize_resp->db_size; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X", + resp->reqid, + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + 
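The same validation pattern recurs for the Exists, Nblocks, DbSize and SLRU request/response pairs: under protocol v3 the echoed header is compared via equal_requests() and the per-request key fields are cross-checked before the payload is used; a mismatched success response is treated as a broken connection (PANIC), while a mismatched error response only logs a WARNING before the usual ereport(ERROR). A condensed sketch of the DbSize variant with an invented helper name; the patch itself inlines these checks as shown above:

static void
validate_dbsize_response(NeonDbSizeRequest *request, NeonResponse *resp)
{
	NeonDbSizeResponse *dbsize_resp = (NeonDbSizeResponse *) resp;

	/* Only meaningful for neon_protocol_version >= 3. */
	if (!equal_requests(resp, &request->hdr) ||
		dbsize_resp->req.dbNode != request->dbNode)
		elog(PANIC, "DbSize response does not match the outstanding request");
}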
break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", + T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); + } + + neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); + + pfree(resp); } - - switch (resp->tag) - { - case T_NeonDbSizeResponse: - db_size = ((NeonDbSizeResponse *) resp)->db_size; - break; - - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X", - dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", - T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); - } - - neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); - - pfree(resp); return db_size; } @@ -3537,7 +3881,7 @@ neon_dbsize(Oid dbNode) * neon_truncate() -- Truncate relation to specified number of blocks. */ static void -neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks) { XLogRecPtr lsn; @@ -3552,7 +3896,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: - mdtruncate(reln, forknum, nblocks); + mdtruncate(reln, forknum, old_blocks, nblocks); return; default: @@ -3590,7 +3934,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) - mdtruncate(reln, forknum, nblocks); + mdtruncate(reln, forknum, old_blocks, nblocks); #endif } @@ -3628,6 +3972,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdimmedsync(reln, forknum); @@ -3861,16 +4207,17 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf return -1; request = (NeonGetSlruSegmentRequest) { - .req.tag = T_NeonGetSlruSegmentRequest, - .req.lsn = request_lsn, - .req.not_modified_since = not_modified_since, + .hdr.tag = T_NeonGetSlruSegmentRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsn, + .hdr.not_modified_since = not_modified_since, .kind = kind, .segno = segno }; do { - while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no)); + while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); consume_prefetch_responses(); @@ -3880,14 +4227,38 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf switch (resp->tag) { case T_NeonGetSlruSegmentResponse: - n_blocks = ((NeonGetSlruSegmentResponse *) resp)->n_blocks; - memcpy(buffer, ((NeonGetSlruSegmentResponse *) resp)->data, n_blocks*BLCKSZ); + { + NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + slru_resp->req.kind != kind || + slru_resp->req.segno != segno) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response 
{reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, segno); + } + } + n_blocks = slru_resp->n_blocks; + memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ); break; - + } case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "could not read SLRU %d segment %d at lsn %X/%08X", + errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %d at lsn %X/%08X", + resp->reqid, kind, segno, LSN_FORMAT_ARGS(request_lsn)), @@ -4026,8 +4397,9 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, NeonResponse *response; NeonNblocksResponse *nbresponse; NeonNblocksRequest request = { - .req = (NeonRequest) { + .hdr = (NeonRequest) { .tag = T_NeonNblocksRequest, + .reqid = GENERATE_REQUEST_ID(), .lsn = end_recptr, .not_modified_since = end_recptr, }, @@ -4175,7 +4547,12 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) if (no_redo_needed) { SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); - lfc_evict(rinfo, forknum, blkno); + /* + * Redo changes if page exists in LFC. + * We should perform this check after assigning LwLSN to prevent + * prefetching of some older version of the page by some other backend. 
+ */ + no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno); } LWLockRelease(partitionLock); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index e89ffdb628..d7604e30d7 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -70,6 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp); +static void PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version); static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); @@ -81,6 +82,8 @@ static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); static void UpdateDonorShmem(WalProposer *wp); +static char *MembershipConfigurationToString(MembershipConfiguration *mconf); +static void MembershipConfigurationFree(MembershipConfiguration *mconf); WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) @@ -137,25 +140,21 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } wp->quorum = wp->n_safekeepers / 2 + 1; + if (wp->config->proto_version != 2 && wp->config->proto_version != 3) + wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version); + wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version); + /* Fill the greeting package */ - wp->greetRequest.tag = 'g'; - wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION; - wp->greetRequest.pgVersion = PG_VERSION_NUM; - wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId)); - wp->greetRequest.systemId = wp->config->systemId; - if (!wp->config->neon_timeline) - wp_log(FATAL, "neon.timeline_id is not provided"); - if (*wp->config->neon_timeline != '\0' && - !HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16)) - wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline); + wp->greetRequest.pam.tag = 'g'; if (!wp->config->neon_tenant) wp_log(FATAL, "neon.tenant_id is not provided"); - if (*wp->config->neon_tenant != '\0' && - !HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16)) - wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant); - - wp->greetRequest.timeline = wp->config->pgTimeline; - wp->greetRequest.walSegSize = wp->config->wal_segment_size; + wp->greetRequest.tenant_id = wp->config->neon_tenant; + if (!wp->config->neon_timeline) + wp_log(FATAL, "neon.timeline_id is not provided"); + wp->greetRequest.timeline_id = wp->config->neon_timeline; + wp->greetRequest.pg_version = PG_VERSION_NUM; + wp->greetRequest.system_id = wp->config->systemId; + wp->greetRequest.wal_seg_size = wp->config->wal_segment_size; wp->api.init_event_set(wp); @@ -165,12 +164,14 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) void WalProposerFree(WalProposer *wp) { + MembershipConfigurationFree(&wp->mconf); for (int i = 0; i < wp->n_safekeepers; i++) { Safekeeper *sk = &wp->safekeeper[i]; Assert(sk->outbuf.data != NULL); pfree(sk->outbuf.data); + MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) 
pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; @@ -308,6 +309,7 @@ ShutdownConnection(Safekeeper *sk) sk->state = SS_OFFLINE; sk->streamingAt = InvalidXLogRecPtr; + MembershipConfigurationFree(&sk->greetResponse.mconf); if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; @@ -598,11 +600,14 @@ static void SendStartWALPush(Safekeeper *sk) { WalProposer *wp = sk->wp; +#define CMD_LEN 512 + char cmd[CMD_LEN]; - if (!wp->api.conn_send_query(sk, "START_WAL_PUSH")) + snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version); + if (!wp->api.conn_send_query(sk, cmd)) { - wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send '%s' query to safekeeper %s:%s: %s", + cmd, sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; } @@ -658,23 +663,33 @@ RecvStartWALPushResult(Safekeeper *sk) /* * Start handshake: first of all send information about the - * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for + * walproposer. After sending, we wait on SS_HANDSHAKE_RECV for * a response to finish the handshake. */ static void SendProposerGreeting(Safekeeper *sk) { + WalProposer *wp = sk->wp; + char *mconf_toml = MembershipConfigurationToString(&wp->greetRequest.mconf); + + wp_log(LOG, "sending ProposerGreeting to safekeeper %s:%s with mconf = %s", sk->host, sk->port, mconf_toml); + pfree(mconf_toml); + + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->greetRequest, + &sk->outbuf, wp->config->proto_version); + /* * On failure, logging & resetting the connection is handled. We just need * to handle the control flow. */ - BlockingWrite(sk, &sk->wp->greetRequest, sizeof(sk->wp->greetRequest), SS_HANDSHAKE_RECV); + BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV); } static void RecvAcceptorGreeting(Safekeeper *sk) { WalProposer *wp = sk->wp; + char *mconf_toml; /* * If our reading doesn't immediately succeed, any necessary error @@ -685,7 +700,10 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term); + mconf_toml = MembershipConfigurationToString(&sk->greetResponse.mconf); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, node_id = %lu, mconf = %s, term=" UINT64_FORMAT, + sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term); + pfree(mconf_toml); /* Protocol is all good, move to voting. 
*/ sk->state = SS_VOTING; @@ -707,12 +725,9 @@ RecvAcceptorGreeting(Safekeeper *sk) wp->propTerm++; wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); - wp->voteRequest = (VoteRequest) - { - .tag = 'v', - .term = wp->propTerm - }; - memcpy(wp->voteRequest.proposerId.data, wp->greetRequest.proposerId.data, UUID_LEN); + wp->voteRequest.pam.tag = 'v'; + wp->voteRequest.generation = wp->mconf.generation; + wp->voteRequest.term = wp->propTerm; } } else if (sk->greetResponse.term > wp->propTerm) @@ -759,12 +774,14 @@ SendVoteRequest(Safekeeper *sk) { WalProposer *wp = sk->wp; - /* We have quorum for voting, send our vote request */ - wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); - /* On failure, logging & resetting is handled */ - if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT)) - return; + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->voteRequest, + &sk->outbuf, wp->config->proto_version); + /* We have quorum for voting, send our vote request */ + wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port, + wp->voteRequest.generation, wp->voteRequest.term); + /* On failure, logging & resetting is handled */ + BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT); /* If successful, wait for read-ready with SS_WAIT_VERDICT */ } @@ -778,11 +795,12 @@ RecvVoteResponse(Safekeeper *sk) return; wp_log(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term, + sk->voteResponse.voteGiven, + GetHighestTerm(&sk->voteResponse.termHistory), LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn)); /* * In case of acceptor rejecting our vote, bail out, but only if either it @@ -847,9 +865,9 @@ HandleElectedProposer(WalProposer *wp) * otherwise we must be sync-safekeepers and we have nothing to do then. * * Proceeding is not only pointless but harmful, because we'd give - * safekeepers term history starting with 0/0. These hacks will go away once - * we disable implicit timeline creation on safekeepers and create it with - * non zero LSN from the start. + * safekeepers term history starting with 0/0. These hacks will go away + * once we disable implicit timeline creation on safekeepers and create it + * with non zero LSN from the start. 
*/ if (wp->propEpochStartLsn == InvalidXLogRecPtr) { @@ -942,7 +960,6 @@ DetermineEpochStartLsn(WalProposer *wp) wp->propEpochStartLsn = InvalidXLogRecPtr; wp->donorEpoch = 0; wp->truncateLsn = InvalidXLogRecPtr; - wp->timelineStartLsn = InvalidXLogRecPtr; for (int i = 0; i < wp->n_safekeepers; i++) { @@ -959,20 +976,6 @@ DetermineEpochStartLsn(WalProposer *wp) wp->donor = i; } wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); - - if (wp->safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) - { - /* timelineStartLsn should be the same everywhere or unknown */ - if (wp->timelineStartLsn != InvalidXLogRecPtr && - wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn) - { - wp_log(WARNING, - "inconsistent timelineStartLsn: current %X/%X, received %X/%X", - LSN_FORMAT_ARGS(wp->timelineStartLsn), - LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); - } - wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn; - } } } @@ -995,22 +998,11 @@ DetermineEpochStartLsn(WalProposer *wp) if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers) { wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp); - if (wp->timelineStartLsn == InvalidXLogRecPtr) - { - wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp); - } wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn); - /* - * Safekeepers are setting truncateLsn after timelineStartLsn is known, so - * it should never be zero at this point, if we know timelineStartLsn. - * - * timelineStartLsn can be zero only on the first syncSafekeepers run. - */ - Assert((wp->truncateLsn != InvalidXLogRecPtr) || - (wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn)); + Assert(wp->truncateLsn != InvalidXLogRecPtr || wp->config->syncSafekeepers); /* * We will be generating WAL since propEpochStartLsn, so we should set @@ -1024,7 +1016,8 @@ DetermineEpochStartLsn(WalProposer *wp) dth = &wp->safekeeper[wp->donor].voteResponse.termHistory; wp->propTermHistory.n_entries = dth->n_entries + 1; wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries); - memcpy(wp->propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); + if (dth->n_entries > 0) + memcpy(wp->propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn; @@ -1052,10 +1045,11 @@ DetermineEpochStartLsn(WalProposer *wp) if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp)) { /* - * However, allow to proceed if last_log_term on the node which gave - * the highest vote (i.e. point where we are going to start writing) - * actually had been won by me; plain restart of walproposer not - * intervened by concurrent compute which wrote WAL is ok. + * However, allow to proceed if last_log_term on the node which + * gave the highest vote (i.e. point where we are going to start + * writing) actually had been won by me; plain restart of + * walproposer not intervened by concurrent compute which wrote + * WAL is ok. * * This avoids compute crash after manual term_bump. 
*/ @@ -1125,14 +1119,8 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u", - sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); - - /* - * wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline - * is created manually (test_s3_wal_replay) - */ - Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, termHistory.n_entries=%u", + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), wp->propTermHistory.n_entries); } else { @@ -1157,29 +1145,19 @@ SendProposerElected(Safekeeper *sk) Assert(sk->startStreamingAt <= wp->availableLsn); - msg.tag = 'e'; + msg.apm.tag = 'e'; + msg.generation = wp->mconf.generation; msg.term = wp->propTerm; msg.startStreamingAt = sk->startStreamingAt; msg.termHistory = &wp->propTermHistory; - msg.timelineStartLsn = wp->timelineStartLsn; lastCommonTerm = idx >= 0 ? wp->propTermHistory.entries[idx].term : 0; wp_log(LOG, - "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", - sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); - - resetStringInfo(&sk->outbuf); - pq_sendint64_le(&sk->outbuf, msg.tag); - pq_sendint64_le(&sk->outbuf, msg.term); - pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); - pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); - for (int i = 0; i < msg.termHistory->n_entries; i++) - { - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); - } - pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); + "sending elected msg to node " UINT64_FORMAT " generation=%u term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", + sk->greetResponse.nodeId, msg.generation, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), + lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port); + PAMessageSerialize(wp, (ProposerAcceptorMessage *) &msg, &sk->outbuf, wp->config->proto_version); if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) return; @@ -1245,14 +1223,13 @@ static void PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) { Assert(endLsn >= beginLsn); - req->tag = 'a'; + req->apm.tag = 'a'; + req->generation = wp->mconf.generation; req->term = wp->propTerm; - req->epochStartLsn = wp->propEpochStartLsn; req->beginLsn = beginLsn; req->endLsn = endLsn; req->commitLsn = wp->commitLsn; req->truncateLsn = wp->truncateLsn; - req->proposerId = wp->greetRequest.proposerId; } /* @@ -1353,7 +1330,8 @@ SendAppendRequests(Safekeeper *sk) resetStringInfo(&sk->outbuf); /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); + PAMessageSerialize(wp, (ProposerAcceptorMessage *) req, &sk->outbuf, wp->config->proto_version); + /* prepare for reading WAL into the outbuf */ 
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); sk->active_state = SS_ACTIVE_READ_WAL; } @@ -1366,14 +1344,17 @@ SendAppendRequests(Safekeeper *sk) req = &sk->appendRequest; req_len = req->endLsn - req->beginLsn; - /* We send zero sized AppenRequests as heartbeats; don't wal_read for these. */ + /* + * We send zero sized AppenRequests as heartbeats; don't wal_read + * for these. + */ if (req_len > 0) { switch (wp->api.wal_read(sk, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req_len, - &errmsg)) + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req_len, + &errmsg)) { case NEON_WALREAD_SUCCESS: break; @@ -1381,7 +1362,7 @@ SendAppendRequests(Safekeeper *sk) return true; case NEON_WALREAD_ERROR: wp_log(WARNING, "WAL reading for node %s:%s failed: %s", - sk->host, sk->port, errmsg); + sk->host, sk->port, errmsg); ShutdownConnection(sk); return false; default: @@ -1469,11 +1450,11 @@ RecvAppendResponses(Safekeeper *sk) * Term has changed to higher one, probably another compute is * running. If this is the case we could PANIC as well because * likely it inserted some data and our basebackup is unsuitable - * anymore. However, we also bump term manually (term_bump endpoint) - * on safekeepers for migration purposes, in this case we do want - * compute to stay alive. So restart walproposer with FATAL instead - * of panicking; if basebackup is spoiled next election will notice - * this. + * anymore. However, we also bump term manually (term_bump + * endpoint) on safekeepers for migration purposes, in this case + * we do want compute to stay alive. So restart walproposer with + * FATAL instead of panicking; if basebackup is spoiled next + * election will notice this. */ wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us", sk->host, sk->port, @@ -1508,7 +1489,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese for (i = 0; i < nkeys; i++) { - const char *key = pq_getmsgstring(reply_message); + const char *key = pq_getmsgrawstring(reply_message); unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32)); if (strcmp(key, "current_timeline_size") == 0) @@ -1749,6 +1730,208 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) } } +/* Serialize MembershipConfiguration into buf. */ +static void +MembershipConfigurationSerialize(MembershipConfiguration *mconf, StringInfo buf) +{ + uint32 i; + + pq_sendint32(buf, mconf->generation); + + pq_sendint32(buf, mconf->members.len); + for (i = 0; i < mconf->members.len; i++) + { + pq_sendint64(buf, mconf->members.m[i].node_id); + pq_send_ascii_string(buf, mconf->members.m[i].host); + pq_sendint16(buf, mconf->members.m[i].port); + } + + /* + * There is no special mark for absent new_members; zero members in + * invalid, so zero len means absent. 
+ */ + pq_sendint32(buf, mconf->new_members.len); + for (i = 0; i < mconf->new_members.len; i++) + { + pq_sendint64(buf, mconf->new_members.m[i].node_id); + pq_send_ascii_string(buf, mconf->new_members.m[i].host); + pq_sendint16(buf, mconf->new_members.m[i].port); + } +} + +/* Serialize proposer -> acceptor message into buf using specified version */ +static void +PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version) +{ + /* both version are supported currently until we fully migrate to 3 */ + Assert(proto_version == 3 || proto_version == 2); + + resetStringInfo(buf); + + if (proto_version == 3) + { + /* + * v2 sends structs for some messages as is, so commonly send tag only + * for v3 + */ + pq_sendint8(buf, msg->tag); + + switch (msg->tag) + { + case 'g': + { + ProposerGreeting *m = (ProposerGreeting *) msg; + + pq_send_ascii_string(buf, m->tenant_id); + pq_send_ascii_string(buf, m->timeline_id); + MembershipConfigurationSerialize(&m->mconf, buf); + pq_sendint32(buf, m->pg_version); + pq_sendint64(buf, m->system_id); + pq_sendint32(buf, m->wal_seg_size); + break; + } + case 'v': + { + VoteRequest *m = (VoteRequest *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + break; + + } + case 'e': + { + ProposerElected *m = (ProposerElected *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + pq_sendint64(buf, m->startStreamingAt); + pq_sendint32(buf, m->termHistory->n_entries); + for (uint32 i = 0; i < m->termHistory->n_entries; i++) + { + pq_sendint64(buf, m->termHistory->entries[i].term); + pq_sendint64(buf, m->termHistory->entries[i].lsn); + } + break; + } + case 'a': + { + /* + * Note: this serializes only AppendRequestHeader, caller + * is expected to append WAL data later. + */ + AppendRequestHeader *m = (AppendRequestHeader *) msg; + + pq_sendint32(buf, m->generation); + pq_sendint64(buf, m->term); + pq_sendint64(buf, m->beginLsn); + pq_sendint64(buf, m->endLsn); + pq_sendint64(buf, m->commitLsn); + pq_sendint64(buf, m->truncateLsn); + break; + } + default: + wp_log(FATAL, "unexpected message type %c to serialize", msg->tag); + } + return; + } + + if (proto_version == 2) + { + switch (msg->tag) + { + case 'g': + { + /* v2 sent struct as is */ + ProposerGreeting *m = (ProposerGreeting *) msg; + ProposerGreetingV2 greetRequestV2; + + /* Fill also v2 struct. 
*/ + greetRequestV2.tag = 'g'; + greetRequestV2.protocolVersion = proto_version; + greetRequestV2.pgVersion = m->pg_version; + + /* + * v3 removed this field because it's easier to pass as + * libq or START_WAL_PUSH options + */ + memset(&greetRequestV2.proposerId, 0, sizeof(greetRequestV2.proposerId)); + greetRequestV2.systemId = wp->config->systemId; + if (*m->timeline_id != '\0' && + !HexDecodeString(greetRequestV2.timeline_id, m->timeline_id, 16)) + wp_log(FATAL, "could not parse neon.timeline_id, %s", m->timeline_id); + if (*m->tenant_id != '\0' && + !HexDecodeString(greetRequestV2.tenant_id, m->tenant_id, 16)) + wp_log(FATAL, "could not parse neon.tenant_id, %s", m->tenant_id); + + greetRequestV2.timeline = wp->config->pgTimeline; + greetRequestV2.walSegSize = wp->config->wal_segment_size; + + pq_sendbytes(buf, (char *) &greetRequestV2, sizeof(greetRequestV2)); + break; + } + case 'v': + { + /* v2 sent struct as is */ + VoteRequest *m = (VoteRequest *) msg; + VoteRequestV2 voteRequestV2; + + voteRequestV2.tag = m->pam.tag; + voteRequestV2.term = m->term; + /* removed field */ + memset(&voteRequestV2.proposerId, 0, sizeof(voteRequestV2.proposerId)); + pq_sendbytes(buf, (char *) &voteRequestV2, sizeof(voteRequestV2)); + break; + } + case 'e': + { + ProposerElected *m = (ProposerElected *) msg; + + pq_sendint64_le(buf, m->apm.tag); + pq_sendint64_le(buf, m->term); + pq_sendint64_le(buf, m->startStreamingAt); + pq_sendint32_le(buf, m->termHistory->n_entries); + for (int i = 0; i < m->termHistory->n_entries; i++) + { + pq_sendint64_le(buf, m->termHistory->entries[i].term); + pq_sendint64_le(buf, m->termHistory->entries[i].lsn); + } + pq_sendint64_le(buf, 0); /* removed timeline_start_lsn */ + break; + } + case 'a': + + /* + * Note: this serializes only AppendRequestHeader, caller is + * expected to append WAL data later. + */ + { + /* v2 sent struct as is */ + AppendRequestHeader *m = (AppendRequestHeader *) msg; + AppendRequestHeaderV2 appendRequestHeaderV2; + + appendRequestHeaderV2.tag = m->apm.tag; + appendRequestHeaderV2.term = m->term; + appendRequestHeaderV2.epochStartLsn = 0; /* removed field */ + appendRequestHeaderV2.beginLsn = m->beginLsn; + appendRequestHeaderV2.endLsn = m->endLsn; + appendRequestHeaderV2.commitLsn = m->commitLsn; + appendRequestHeaderV2.truncateLsn = m->truncateLsn; + /* removed field */ + memset(&appendRequestHeaderV2.proposerId, 0, sizeof(appendRequestHeaderV2.proposerId)); + + pq_sendbytes(buf, (char *) &appendRequestHeaderV2, sizeof(appendRequestHeaderV2)); + break; + } + + default: + wp_log(FATAL, "unexpected message type %c to serialize", msg->tag); + } + return; + } + wp_log(FATAL, "unexpected proto_version %d", proto_version); +} + /* * Try to read CopyData message from i'th safekeeper, resetting connection on * failure. @@ -1778,6 +1961,37 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) return false; } +/* Deserialize membership configuration from buf to mconf. 
*/ +static void +MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo buf) +{ + uint32 i; + + mconf->generation = pq_getmsgint32(buf); + mconf->members.len = pq_getmsgint32(buf); + mconf->members.m = palloc0(sizeof(SafekeeperId) * mconf->members.len); + for (i = 0; i < mconf->members.len; i++) + { + const char *buf_host; + + mconf->members.m[i].node_id = pq_getmsgint64(buf); + buf_host = pq_getmsgrawstring(buf); + strlcpy(mconf->members.m[i].host, buf_host, sizeof(mconf->members.m[i].host)); + mconf->members.m[i].port = pq_getmsgint16(buf); + } + mconf->new_members.len = pq_getmsgint32(buf); + mconf->new_members.m = palloc0(sizeof(SafekeeperId) * mconf->new_members.len); + for (i = 0; i < mconf->new_members.len; i++) + { + const char *buf_host; + + mconf->new_members.m[i].node_id = pq_getmsgint64(buf); + buf_host = pq_getmsgrawstring(buf); + strlcpy(mconf->new_members.m[i].host, buf_host, sizeof(mconf->new_members.m[i].host)); + mconf->new_members.m[i].port = pq_getmsgint16(buf); + } +} + /* * Read next message with known type into provided struct, by reading a CopyData * block from the safekeeper's postgres connection, returning whether the read @@ -1786,6 +2000,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) * If the read needs more polling, we return 'false' and keep the state * unmodified, waiting until it becomes read-ready to try again. If it fully * failed, a warning is emitted and the connection is reset. + * + * Note: it pallocs if needed, i.e. for AcceptorGreeting and VoteResponse fields. */ static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) @@ -1794,82 +2010,154 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) char *buf; int buf_size; - uint64 tag; + uint8 tag; StringInfoData s; if (!(AsyncRead(sk, &buf, &buf_size))) return false; + sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); /* parse it */ s.data = buf; s.len = buf_size; + s.maxlen = buf_size; s.cursor = 0; - tag = pq_getmsgint64_le(&s); - if (tag != anymsg->tag) + if (wp->config->proto_version == 3) { - wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk)); - ResetConnection(sk); - return false; - } - sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); - switch (tag) - { - case 'g': - { - AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->nodeId = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } - - case 'v': - { - VoteResponse *msg = (VoteResponse *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->voteGiven = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->truncateLsn = pq_getmsgint64_le(&s); - msg->termHistory.n_entries = pq_getmsgint32_le(&s); - msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); - for (int i = 0; i < msg->termHistory.n_entries; i++) + tag = pq_getmsgbyte(&s); + if (tag != anymsg->tag) + { + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); + ResetConnection(sk); + return false; + } + switch (tag) + { + case 'g': { - msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); - msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->nodeId = pq_getmsgint64(&s); + MembershipConfigurationDeserialize(&msg->mconf, &s); + msg->term = pq_getmsgint64(&s); + 
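					/*
					 * For reference, the v3 AcceptorGreeting is laid out as
					 * read here: a one-byte tag 'g', node_id (uint64), the
					 * membership configuration (generation uint32, then each
					 * member set as a uint32 length followed by {node_id
					 * uint64, host as a NUL-terminated string, port uint16}),
					 * and finally term (uint64). All integers are in network
					 * byte order, as expected by the pq_getmsg* helpers.
					 */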
pq_getmsgend(&s); + return true; } - msg->timelineStartLsn = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; - case 'a': - { - AppendResponse *msg = (AppendResponse *) anymsg; + msg->generation = pq_getmsgint32(&s); + msg->term = pq_getmsgint64(&s); + msg->voteGiven = pq_getmsgbyte(&s); + msg->flushLsn = pq_getmsgint64(&s); + msg->truncateLsn = pq_getmsgint64(&s); + msg->termHistory.n_entries = pq_getmsgint32(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (uint32 i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64(&s); + } + pq_getmsgend(&s); + return true; + } + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->commitLsn = pq_getmsgint64_le(&s); - msg->hs.ts = pq_getmsgint64_le(&s); - msg->hs.xmin.value = pq_getmsgint64_le(&s); - msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (s.len > s.cursor) - ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); - else - msg->ps_feedback.present = false; - pq_getmsgend(&s); - return true; - } - - default: - { - Assert(false); - return false; - } + msg->generation = pq_getmsgint32(&s); + msg->term = pq_getmsgint64(&s); + msg->flushLsn = pq_getmsgint64(&s); + msg->commitLsn = pq_getmsgint64(&s); + msg->hs.ts = pq_getmsgint64(&s); + msg->hs.xmin.value = pq_getmsgint64(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64(&s); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; + pq_getmsgend(&s); + return true; + } + default: + { + wp_log(FATAL, "unexpected message tag %c to read", (char) tag); + return false; + } + } } + else if (wp->config->proto_version == 2) + { + tag = pq_getmsgint64_le(&s); + if (tag != anymsg->tag) + { + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); + ResetConnection(sk); + return false; + } + switch (tag) + { + case 'g': + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + pq_getmsgint64_le(&s); /* timelineStartLsn */ + pq_getmsgend(&s); + return true; + } + + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; + pq_getmsgend(&s); + return true; + } + + default: + 
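				/*
				 * The expected tag was already checked against anymsg->tag
				 * above, so reaching here means the caller passed a message
				 * type this function does not handle.
				 */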
{ + wp_log(FATAL, "unexpected message tag %c to read", (char) tag); + return false; + } + } + } + wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version); + return false; /* keep the compiler quiet */ } /* @@ -2245,3 +2533,45 @@ FormatEvents(WalProposer *wp, uint32 events) return (char *) &return_str; } + +/* Dump mconf as toml for observability / debugging. Result is palloc'ed. */ +static char * +MembershipConfigurationToString(MembershipConfiguration *mconf) +{ + StringInfoData s; + uint32 i; + + initStringInfo(&s); + appendStringInfo(&s, "{gen = %u", mconf->generation); + appendStringInfoString(&s, ", members = ["); + for (i = 0; i < mconf->members.len; i++) + { + if (i > 0) + appendStringInfoString(&s, ", "); + appendStringInfo(&s, "{node_id = %lu", mconf->members.m[i].node_id); + appendStringInfo(&s, ", host = %s", mconf->members.m[i].host); + appendStringInfo(&s, ", port = %u }", mconf->members.m[i].port); + } + appendStringInfo(&s, "], new_members = ["); + for (i = 0; i < mconf->new_members.len; i++) + { + if (i > 0) + appendStringInfoString(&s, ", "); + appendStringInfo(&s, "{node_id = %lu", mconf->new_members.m[i].node_id); + appendStringInfo(&s, ", host = %s", mconf->new_members.m[i].host); + appendStringInfo(&s, ", port = %u }", mconf->new_members.m[i].port); + } + appendStringInfoString(&s, "]}"); + return s.data; +} + +static void +MembershipConfigurationFree(MembershipConfiguration *mconf) +{ + if (mconf->members.m) + pfree(mconf->members.m); + mconf->members.m = NULL; + if (mconf->new_members.m) + pfree(mconf->new_members.m); + mconf->new_members.m = NULL; +} diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index d8c44f8182..eee55f924f 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -12,9 +12,6 @@ #include "neon_walreader.h" #include "pagestore_client.h" -#define SK_MAGIC 0xCafeCeefu -#define SK_PROTOCOL_VERSION 2 - #define MAX_SAFEKEEPERS 32 #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL * message */ @@ -143,12 +140,71 @@ typedef uint64 term_t; /* neon storage node id */ typedef uint64 NNodeId; +/* + * Number uniquely identifying safekeeper membership configuration. + * This and following structs pair ones in membership.rs. + */ +typedef uint32 Generation; + +typedef struct SafekeeperId +{ + NNodeId node_id; + char host[MAXCONNINFO]; + uint16 port; +} SafekeeperId; + +/* Set of safekeepers. */ +typedef struct MemberSet +{ + uint32 len; /* number of members */ + SafekeeperId *m; /* ids themselves */ +} MemberSet; + +/* Timeline safekeeper membership configuration. */ +typedef struct MembershipConfiguration +{ + Generation generation; + MemberSet members; + /* Has 0 n_members in non joint conf. */ + MemberSet new_members; +} MembershipConfiguration; + /* * Proposer <-> Acceptor messaging. */ +typedef struct ProposerAcceptorMessage +{ + uint8 tag; +} ProposerAcceptorMessage; + /* Initial Proposer -> Acceptor message */ typedef struct ProposerGreeting +{ + ProposerAcceptorMessage pam; /* message tag */ + + /* + * tenant/timeline ids as C strings with standard hex notation for ease of + * printing. In principle they are not strictly needed as ttid is also + * passed as libpq options. + */ + char *tenant_id; + char *timeline_id; + /* Full conf is carried to allow safekeeper switch */ + MembershipConfiguration mconf; + + /* + * pg_version and wal_seg_size are used for timeline creation until we + * fully migrate to doing externally. systemId is only used as a sanity + * cross check. 
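	 * In the v3 greeting these fields are serialized after mconf; see
	 * PAMessageSerialize() in walproposer.c.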
+ */ + uint32 pg_version; /* in PG_VERSION_NUM format */ + uint64 system_id; /* Postgres system identifier. */ + uint32 wal_seg_size; +} ProposerGreeting; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct ProposerGreetingV2 { uint64 tag; /* message tag */ uint32 protocolVersion; /* proposer-safekeeper protocol version */ @@ -159,32 +215,42 @@ typedef struct ProposerGreeting uint8 tenant_id[16]; TimeLineID timeline; uint32 walSegSize; -} ProposerGreeting; +} ProposerGreetingV2; typedef struct AcceptorProposerMessage { - uint64 tag; + uint8 tag; } AcceptorProposerMessage; /* - * Acceptor -> Proposer initial response: the highest term acceptor voted for. + * Acceptor -> Proposer initial response: the highest term acceptor voted for, + * its node id and configuration. */ typedef struct AcceptorGreeting { AcceptorProposerMessage apm; - term_t term; NNodeId nodeId; + MembershipConfiguration mconf; + term_t term; } AcceptorGreeting; /* * Proposer -> Acceptor vote request. */ typedef struct VoteRequest +{ + ProposerAcceptorMessage pam; /* message tag */ + Generation generation; /* membership conf generation */ + term_t term; +} VoteRequest; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct VoteRequestV2 { uint64 tag; term_t term; pg_uuid_t proposerId; /* for monitoring/debugging */ -} VoteRequest; +} VoteRequestV2; /* Element of term switching chain. */ typedef struct TermSwitchEntry @@ -203,8 +269,15 @@ typedef struct TermHistory typedef struct VoteResponse { AcceptorProposerMessage apm; + + /* + * Membership conf generation. It's not strictly required because on + * mismatch safekeeper is expected to ERROR the connection, but let's + * sanity check it. + */ + Generation generation; term_t term; - uint64 voteGiven; + uint8 voteGiven; /* * Safekeeper flush_lsn (end of WAL) + history of term switches allow @@ -214,7 +287,6 @@ typedef struct VoteResponse XLogRecPtr truncateLsn; /* minimal LSN which may be needed for* * recovery of some safekeeper */ TermHistory termHistory; - XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ } VoteResponse; /* @@ -223,20 +295,37 @@ typedef struct VoteResponse */ typedef struct ProposerElected { - uint64 tag; + AcceptorProposerMessage apm; + Generation generation; /* membership conf generation */ term_t term; /* proposer will send since this point */ XLogRecPtr startStreamingAt; /* history of term switches up to this proposer */ TermHistory *termHistory; - /* timeline globally starts at this LSN */ - XLogRecPtr timelineStartLsn; } ProposerElected; /* * Header of request with WAL message sent from proposer to safekeeper. 
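 * In protocol v3 the header fields are written out individually (generation,
 * term, beginLsn, endLsn, commitLsn, truncateLsn) followed by the WAL
 * payload; protocol v2 instead sends AppendRequestHeaderV2 as a raw struct.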
*/ typedef struct AppendRequestHeader +{ + AcceptorProposerMessage apm; + Generation generation; /* membership conf generation */ + term_t term; /* term of the proposer */ + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + + /* + * minimal LSN which may be needed for recovery of some safekeeper (end + * lsn + 1 of last chunk streamed to everyone) + */ + XLogRecPtr truncateLsn; + /* in the AppendRequest message, WAL data follows */ +} AppendRequestHeader; + +/* protocol v2 variant, kept while wp supports it */ +typedef struct AppendRequestHeaderV2 { uint64 tag; term_t term; /* term of the proposer */ @@ -256,7 +345,8 @@ typedef struct AppendRequestHeader */ XLogRecPtr truncateLsn; pg_uuid_t proposerId; /* for monitoring/debugging */ -} AppendRequestHeader; + /* in the AppendRequest message, WAL data follows */ +} AppendRequestHeaderV2; /* * Hot standby feedback received from replica @@ -309,6 +399,13 @@ typedef struct AppendResponse { AcceptorProposerMessage apm; + /* + * Membership conf generation. It's not strictly required because on + * mismatch safekeeper is expected to ERROR the connection, but let's + * sanity check it. + */ + Generation generation; + /* * Current term of the safekeeper; if it is higher than proposer's, the * compute is out of date. @@ -644,6 +741,8 @@ typedef struct WalProposerConfig /* Will be passed to safekeepers in greet request. */ TimeLineID pgTimeline; + int proto_version; + #ifdef WALPROPOSER_LIB void *callback_data; #endif @@ -656,11 +755,14 @@ typedef struct WalProposerConfig typedef struct WalProposer { WalProposerConfig *config; - int n_safekeepers; + /* Current walproposer membership configuration */ + MembershipConfiguration mconf; /* (n_safekeepers / 2) + 1 */ int quorum; + /* Number of occupied slots in safekeepers[] */ + int n_safekeepers; Safekeeper safekeeper[MAX_SAFEKEEPERS]; /* WAL has been generated up to this point */ @@ -670,6 +772,7 @@ typedef struct WalProposer XLogRecPtr commitLsn; ProposerGreeting greetRequest; + ProposerGreetingV2 greetRequestV2; /* Vote request for safekeeper */ VoteRequest voteRequest; diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c index 35d984c52e..a986160224 100644 --- a/pgxn/neon/walproposer_compat.c +++ b/pgxn/neon/walproposer_compat.c @@ -117,14 +117,13 @@ pq_getmsgbytes(StringInfo msg, int datalen) } /* -------------------------------- - * pq_getmsgstring - get a null-terminated text string (with conversion) + * pq_getmsgrawstring - get a null-terminated text string - NO conversion * - * May return a pointer directly into the message buffer, or a pointer - * to a palloc'd conversion result. + * Returns a pointer directly into the message buffer. 
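 * No encoding conversion is performed; the strings carried by this protocol
 * (host names and hex tenant/timeline ids) are expected to be plain ASCII.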
* -------------------------------- */ const char * -pq_getmsgstring(StringInfo msg) +pq_getmsgrawstring(StringInfo msg) { char *str; int slen; @@ -155,6 +154,45 @@ pq_getmsgend(StringInfo msg) ExceptionalCondition("invalid msg format", __FILE__, __LINE__); } +/* -------------------------------- + * pq_sendbytes - append raw data to a StringInfo buffer + * -------------------------------- + */ +void +pq_sendbytes(StringInfo buf, const void *data, int datalen) +{ + /* use variant that maintains a trailing null-byte, out of caution */ + appendBinaryStringInfo(buf, data, datalen); +} + +/* -------------------------------- + * pq_send_ascii_string - append a null-terminated text string (without conversion) + * + * This function intentionally bypasses encoding conversion, instead just + * silently replacing any non-7-bit-ASCII characters with question marks. + * It is used only when we are having trouble sending an error message to + * the client with normal localization and encoding conversion. The caller + * should already have taken measures to ensure the string is just ASCII; + * the extra work here is just to make certain we don't send a badly encoded + * string to the client (which might or might not be robust about that). + * + * NB: passed text string must be null-terminated, and so is the data + * sent to the frontend. + * -------------------------------- + */ +void +pq_send_ascii_string(StringInfo buf, const char *str) +{ + while (*str) + { + char ch = *str++; + + if (IS_HIGHBIT_SET(ch)) + ch = '?'; + appendStringInfoCharMacro(buf, ch); + } + appendStringInfoChar(buf, '\0'); +} /* * Produce a C-string representation of a TimestampTz. diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 86444084ff..b21184de57 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -59,9 +59,11 @@ #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" +/* GUCs */ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; +int safekeeper_proto_version = 2; /* Set to true in the walproposer bgw. 
*/ static bool am_walproposer; @@ -126,6 +128,7 @@ init_walprop_config(bool syncSafekeepers) else walprop_config.systemId = 0; walprop_config.pgTimeline = walprop_pg_get_timeline_id(); + walprop_config.proto_version = safekeeper_proto_version; } /* @@ -219,25 +222,37 @@ nwp_register_gucs(void) PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); + + DefineCustomIntVariable( + "neon.safekeeper_proto_version", + "Version of compute <-> safekeeper protocol.", + "Used while migrating from 2 to 3.", + &safekeeper_proto_version, + 2, 0, INT_MAX, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); } static int split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) { - int n_safekeepers = 0; - char *curr_sk = safekeepers_list; + int n_safekeepers = 0; + char *curr_sk = safekeepers_list; for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma) { - if (++n_safekeepers >= MAX_SAFEKEEPERS) { + if (++n_safekeepers >= MAX_SAFEKEEPERS) + { wpg_log(FATAL, "too many safekeepers"); } coma = strchr(coma, ','); - safekeepers[n_safekeepers-1] = curr_sk; + safekeepers[n_safekeepers - 1] = curr_sk; - if (coma != NULL) { + if (coma != NULL) + { *coma++ = '\0'; } } @@ -252,10 +267,10 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[]) static bool safekeepers_cmp(char *old, char *new) { - char *safekeepers_old[MAX_SAFEKEEPERS]; - char *safekeepers_new[MAX_SAFEKEEPERS]; - int len_old = 0; - int len_new = 0; + char *safekeepers_old[MAX_SAFEKEEPERS]; + char *safekeepers_new[MAX_SAFEKEEPERS]; + int len_old = 0; + int len_new = 0; len_old = split_safekeepers_list(old, safekeepers_old); len_new = split_safekeepers_list(new, safekeepers_new); @@ -292,7 +307,8 @@ assign_neon_safekeepers(const char *newval, void *extra) if (!am_walproposer) return; - if (!newval) { + if (!newval) + { /* should never happen */ wpg_log(FATAL, "neon.safekeepers is empty"); } @@ -301,11 +317,11 @@ assign_neon_safekeepers(const char *newval, void *extra) newval_copy = pstrdup(newval); oldval = pstrdup(wal_acceptors_list); - /* + /* * TODO: restarting through FATAL is stupid and introduces 1s delay before - * next bgw start. We should refactor walproposer to allow graceful exit and - * thus remove this delay. - * XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder. + * next bgw start. We should refactor walproposer to allow graceful exit + * and thus remove this delay. XXX: If you change anything here, sync with + * test_safekeepers_reconfigure_reorder. */ if (!safekeepers_cmp(oldval, newval_copy)) { @@ -454,7 +470,8 @@ backpressure_throttling_impl(void) memcpy(new_status, old_status, len); snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag); set_ps_display(new_status); - new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */ + new_status[len] = '\0'; /* truncate off " backpressure ..." 
to later + * reset the ps */ elog(DEBUG2, "backpressure throttling: lag %lu", lag); start = GetCurrentTimestamp(); @@ -621,7 +638,7 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) wpg_log(LOG, "WAL proposer starts streaming at %X/%X", LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = wp->greetRequest.timeline; + cmd.timeline = wp->config->pgTimeline; cmd.startpoint = startpos; StartProposerReplication(wp, &cmd); } @@ -1963,10 +1980,11 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) FullTransactionId xmin = hsFeedback.xmin; FullTransactionId catalog_xmin = hsFeedback.catalog_xmin; FullTransactionId next_xid = ReadNextFullTransactionId(); + /* - * Page server is updating nextXid in checkpoint each 1024 transactions, - * so feedback xmin can be actually larger then nextXid and - * function TransactionIdInRecentPast return false in this case, + * Page server is updating nextXid in checkpoint each 1024 + * transactions, so feedback xmin can be actually larger then nextXid + * and function TransactionIdInRecentPast return false in this case, * preventing update of slot's xmin. */ if (FullTransactionIdPrecedes(next_xid, xmin)) diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index a45e8f5c4a..74cd5ac601 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -96,7 +96,7 @@ static void inmem_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); static void inmem_truncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); + BlockNumber old_blocks, BlockNumber nblocks); static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); #if PG_MAJORVERSION_NUM >= 17 static void inmem_registersync(SMgrRelation reln, ForkNumber forknum); @@ -345,7 +345,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum) * inmem_truncate() -- Truncate relation to specified number of blocks. */ static void -inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks) { } diff --git a/poetry.lock b/poetry.lock index 59ae5cf1ca..ba3b0535e4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -6,6 +6,7 @@ version = "2.3.5" description = "Happy Eyeballs for asyncio" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"}, {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"}, @@ -17,6 +18,7 @@ version = "3.10.11" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5077b1a5f40ffa3ba1f40d537d3bec4383988ee51fbba6b74aa8fb1bc466599e"}, {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8d6a14a4d93b5b3c2891fca94fa9d41b2322a68194422bef0dd5ec1e57d7d298"}, @@ -120,7 +122,7 @@ multidict = ">=4.5,<7.0" yarl = ">=1.12.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] +speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.2.0) ; sys_platform == \"linux\" or sys_platform == \"darwin\"", "brotlicffi ; platform_python_implementation != \"CPython\""] [[package]] name = "aiopg" @@ -128,6 +130,7 @@ version = "1.4.0" description = "Postgres integration with asyncio." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "aiopg-1.4.0-py3-none-any.whl", hash = "sha256:aea46e8aff30b039cfa818e6db4752c97656e893fc75e5a5dc57355a9e9dedbd"}, {file = "aiopg-1.4.0.tar.gz", hash = "sha256:116253bef86b4d954116716d181e9a0294037f266718b2e1c9766af995639d71"}, @@ -146,6 +149,7 @@ version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, @@ -156,28 +160,30 @@ frozenlist = ">=1.1.0" [[package]] name = "allure-pytest" -version = "2.13.2" +version = "2.13.5" description = "Allure pytest integration" optional = false python-versions = "*" +groups = ["main"] files = [ - {file = "allure-pytest-2.13.2.tar.gz", hash = "sha256:22243159e8ec81ce2b5254b4013802198821b1b42f118f69d4a289396607c7b3"}, - {file = "allure_pytest-2.13.2-py3-none-any.whl", hash = "sha256:17de9dbee7f61c8e66a5b5e818b00e419dbcea44cb55c24319401ba813220690"}, + {file = "allure-pytest-2.13.5.tar.gz", hash = "sha256:0ef8e1790c44a988db6b83c4d4f5e91451e2c4c8ea10601dfa88528d23afcf6e"}, + {file = "allure_pytest-2.13.5-py3-none-any.whl", hash = "sha256:94130bac32964b78058e62cf4b815ad97a5ac82a065e6dd2d43abac2be7640fc"}, ] [package.dependencies] -allure-python-commons = "2.13.2" +allure-python-commons = "2.13.5" pytest = ">=4.5.0" [[package]] name = "allure-python-commons" -version = "2.13.2" -description = "Common module for integrate allure with python-based frameworks" +version = "2.13.5" +description = "('Contains the API for end users as well as helper functions and classes to build Allure adapters for Python test frameworks',)" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ - {file = "allure-python-commons-2.13.2.tar.gz", hash = "sha256:8a03681330231b1deadd86b97ff68841c6591320114ae638570f1ed60d7a2033"}, - {file = "allure_python_commons-2.13.2-py3-none-any.whl", hash = 
"sha256:2bb3646ec3fbf5b36d178a5e735002bc130ae9f9ba80f080af97d368ba375051"}, + {file = "allure-python-commons-2.13.5.tar.gz", hash = "sha256:a232e7955811f988e49a4c1dd6c16cce7e9b81d0ea0422b1e5654d3254e2caf3"}, + {file = "allure_python_commons-2.13.5-py3-none-any.whl", hash = "sha256:8b0e837b6e32d810adec563f49e1d04127a5b6770e0232065b7cb09b9953980d"}, ] [package.dependencies] @@ -190,6 +196,7 @@ version = "0.6.0" description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, @@ -201,6 +208,7 @@ version = "4.13.1" description = "ANTLR 4.13.1 runtime for Python 3" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "antlr4-python3-runtime-4.13.1.tar.gz", hash = "sha256:3cd282f5ea7cfb841537fe01f143350fdb1c0b1ce7981443a2fa8513fddb6d1a"}, {file = "antlr4_python3_runtime-4.13.1-py3-none-any.whl", hash = "sha256:78ec57aad12c97ac039ca27403ad61cb98aaec8a3f9bb8144f889aa0fa28b943"}, @@ -212,6 +220,7 @@ version = "4.3.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"}, {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"}, @@ -223,7 +232,7 @@ sniffio = ">=1.1" [package.extras] doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17) ; platform_python_implementation == \"CPython\" and platform_system != \"Windows\""] trio = ["trio (>=0.23)"] [[package]] @@ -232,6 +241,7 @@ version = "4.0.3" description = "Timeout context manager for asyncio programs" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, @@ -239,60 +249,67 @@ files = [ [[package]] name = "asyncpg" -version = "0.29.0" +version = "0.30.0" description = "An asyncio PostgreSQL driver" optional = false python-versions = ">=3.8.0" +groups = ["main"] files = [ - {file = "asyncpg-0.29.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72fd0ef9f00aeed37179c62282a3d14262dbbafb74ec0ba16e1b1864d8a12169"}, - {file = "asyncpg-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52e8f8f9ff6e21f9b39ca9f8e3e33a5fcdceaf5667a8c5c32bee158e313be385"}, - {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e6823a7012be8b68301342ba33b4740e5a166f6bbda0aee32bc01638491a22"}, - {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:746e80d83ad5d5464cfbf94315eb6744222ab00aa4e522b704322fb182b83610"}, - {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ff8e8109cd6a46ff852a5e6bab8b0a047d7ea42fcb7ca5ae6eaae97d8eacf397"}, - {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:97eb024685b1d7e72b1972863de527c11ff87960837919dac6e34754768098eb"}, - {file = "asyncpg-0.29.0-cp310-cp310-win32.whl", hash = "sha256:5bbb7f2cafd8d1fa3e65431833de2642f4b2124be61a449fa064e1a08d27e449"}, - {file = "asyncpg-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:76c3ac6530904838a4b650b2880f8e7af938ee049e769ec2fba7cd66469d7772"}, - {file = "asyncpg-0.29.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4900ee08e85af01adb207519bb4e14b1cae8fd21e0ccf80fac6aa60b6da37b4"}, - {file = "asyncpg-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a65c1dcd820d5aea7c7d82a3fdcb70e096f8f70d1a8bf93eb458e49bfad036ac"}, - {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b52e46f165585fd6af4863f268566668407c76b2c72d366bb8b522fa66f1870"}, - {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc600ee8ef3dd38b8d67421359779f8ccec30b463e7aec7ed481c8346decf99f"}, - {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:039a261af4f38f949095e1e780bae84a25ffe3e370175193174eb08d3cecab23"}, - {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6feaf2d8f9138d190e5ec4390c1715c3e87b37715cd69b2c3dfca616134efd2b"}, - {file = "asyncpg-0.29.0-cp311-cp311-win32.whl", hash = "sha256:1e186427c88225ef730555f5fdda6c1812daa884064bfe6bc462fd3a71c4b675"}, - {file = "asyncpg-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfe73ffae35f518cfd6e4e5f5abb2618ceb5ef02a2365ce64f132601000587d3"}, - {file = "asyncpg-0.29.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6011b0dc29886ab424dc042bf9eeb507670a3b40aece3439944006aafe023178"}, - {file = "asyncpg-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b544ffc66b039d5ec5a7454667f855f7fec08e0dfaf5a5490dfafbb7abbd2cfb"}, - {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d84156d5fb530b06c493f9e7635aa18f518fa1d1395ef240d211cb563c4e2364"}, - {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54858bc25b49d1114178d65a88e48ad50cb2b6f3e475caa0f0c092d5f527c106"}, - {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bde17a1861cf10d5afce80a36fca736a86769ab3579532c03e45f83ba8a09c59"}, - {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:37a2ec1b9ff88d8773d3eb6d3784dc7e3fee7756a5317b67f923172a4748a175"}, - {file = "asyncpg-0.29.0-cp312-cp312-win32.whl", hash = "sha256:bb1292d9fad43112a85e98ecdc2e051602bce97c199920586be83254d9dafc02"}, - {file = "asyncpg-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:2245be8ec5047a605e0b454c894e54bf2ec787ac04b1cb7e0d3c67aa1e32f0fe"}, - {file = "asyncpg-0.29.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0009a300cae37b8c525e5b449233d59cd9868fd35431abc470a3e364d2b85cb9"}, - {file = "asyncpg-0.29.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cad1324dbb33f3ca0cd2074d5114354ed3be2b94d48ddfd88af75ebda7c43cc"}, - {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:012d01df61e009015944ac7543d6ee30c2dc1eb2f6b10b62a3f598beb6531548"}, - {file = 
"asyncpg-0.29.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000c996c53c04770798053e1730d34e30cb645ad95a63265aec82da9093d88e7"}, - {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e0bfe9c4d3429706cf70d3249089de14d6a01192d617e9093a8e941fea8ee775"}, - {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:642a36eb41b6313ffa328e8a5c5c2b5bea6ee138546c9c3cf1bffaad8ee36dd9"}, - {file = "asyncpg-0.29.0-cp38-cp38-win32.whl", hash = "sha256:a921372bbd0aa3a5822dd0409da61b4cd50df89ae85150149f8c119f23e8c408"}, - {file = "asyncpg-0.29.0-cp38-cp38-win_amd64.whl", hash = "sha256:103aad2b92d1506700cbf51cd8bb5441e7e72e87a7b3a2ca4e32c840f051a6a3"}, - {file = "asyncpg-0.29.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5340dd515d7e52f4c11ada32171d87c05570479dc01dc66d03ee3e150fb695da"}, - {file = "asyncpg-0.29.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e17b52c6cf83e170d3d865571ba574577ab8e533e7361a2b8ce6157d02c665d3"}, - {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f100d23f273555f4b19b74a96840aa27b85e99ba4b1f18d4ebff0734e78dc090"}, - {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48e7c58b516057126b363cec8ca02b804644fd012ef8e6c7e23386b7d5e6ce83"}, - {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f9ea3f24eb4c49a615573724d88a48bd1b7821c890c2effe04f05382ed9e8810"}, - {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d36c7f14a22ec9e928f15f92a48207546ffe68bc412f3be718eedccdf10dc5c"}, - {file = "asyncpg-0.29.0-cp39-cp39-win32.whl", hash = "sha256:797ab8123ebaed304a1fad4d7576d5376c3a006a4100380fb9d517f0b59c1ab2"}, - {file = "asyncpg-0.29.0-cp39-cp39-win_amd64.whl", hash = "sha256:cce08a178858b426ae1aa8409b5cc171def45d4293626e7aa6510696d46decd8"}, - {file = "asyncpg-0.29.0.tar.gz", hash = "sha256:d1c49e1f44fffafd9a55e1a9b101590859d881d639ea2922516f5d9c512d354e"}, + {file = "asyncpg-0.30.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bfb4dd5ae0699bad2b233672c8fc5ccbd9ad24b89afded02341786887e37927e"}, + {file = "asyncpg-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc1f62c792752a49f88b7e6f774c26077091b44caceb1983509edc18a2222ec0"}, + {file = "asyncpg-0.30.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3152fef2e265c9c24eec4ee3d22b4f4d2703d30614b0b6753e9ed4115c8a146f"}, + {file = "asyncpg-0.30.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c7255812ac85099a0e1ffb81b10dc477b9973345793776b128a23e60148dd1af"}, + {file = "asyncpg-0.30.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:578445f09f45d1ad7abddbff2a3c7f7c291738fdae0abffbeb737d3fc3ab8b75"}, + {file = "asyncpg-0.30.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c42f6bb65a277ce4d93f3fba46b91a265631c8df7250592dd4f11f8b0152150f"}, + {file = "asyncpg-0.30.0-cp310-cp310-win32.whl", hash = "sha256:aa403147d3e07a267ada2ae34dfc9324e67ccc4cdca35261c8c22792ba2b10cf"}, + {file = "asyncpg-0.30.0-cp310-cp310-win_amd64.whl", hash = "sha256:fb622c94db4e13137c4c7f98834185049cc50ee01d8f657ef898b6407c7b9c50"}, + {file = "asyncpg-0.30.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5e0511ad3dec5f6b4f7a9e063591d407eee66b88c14e2ea636f187da1dcfff6a"}, + {file = "asyncpg-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:915aeb9f79316b43c3207363af12d0e6fd10776641a7de8a01212afd95bdf0ed"}, + {file = 
"asyncpg-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c198a00cce9506fcd0bf219a799f38ac7a237745e1d27f0e1f66d3707c84a5a"}, + {file = "asyncpg-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3326e6d7381799e9735ca2ec9fd7be4d5fef5dcbc3cb555d8a463d8460607956"}, + {file = "asyncpg-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:51da377487e249e35bd0859661f6ee2b81db11ad1f4fc036194bc9cb2ead5056"}, + {file = "asyncpg-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bc6d84136f9c4d24d358f3b02be4b6ba358abd09f80737d1ac7c444f36108454"}, + {file = "asyncpg-0.30.0-cp311-cp311-win32.whl", hash = "sha256:574156480df14f64c2d76450a3f3aaaf26105869cad3865041156b38459e935d"}, + {file = "asyncpg-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:3356637f0bd830407b5597317b3cb3571387ae52ddc3bca6233682be88bbbc1f"}, + {file = "asyncpg-0.30.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c902a60b52e506d38d7e80e0dd5399f657220f24635fee368117b8b5fce1142e"}, + {file = "asyncpg-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aca1548e43bbb9f0f627a04666fedaca23db0a31a84136ad1f868cb15deb6e3a"}, + {file = "asyncpg-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c2a2ef565400234a633da0eafdce27e843836256d40705d83ab7ec42074efb3"}, + {file = "asyncpg-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1292b84ee06ac8a2ad8e51c7475aa309245874b61333d97411aab835c4a2f737"}, + {file = "asyncpg-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0f5712350388d0cd0615caec629ad53c81e506b1abaaf8d14c93f54b35e3595a"}, + {file = "asyncpg-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:db9891e2d76e6f425746c5d2da01921e9a16b5a71a1c905b13f30e12a257c4af"}, + {file = "asyncpg-0.30.0-cp312-cp312-win32.whl", hash = "sha256:68d71a1be3d83d0570049cd1654a9bdfe506e794ecc98ad0873304a9f35e411e"}, + {file = "asyncpg-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:9a0292c6af5c500523949155ec17b7fe01a00ace33b68a476d6b5059f9630305"}, + {file = "asyncpg-0.30.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:05b185ebb8083c8568ea8a40e896d5f7af4b8554b64d7719c0eaa1eb5a5c3a70"}, + {file = "asyncpg-0.30.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c47806b1a8cbb0a0db896f4cd34d89942effe353a5035c62734ab13b9f938da3"}, + {file = "asyncpg-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b6fde867a74e8c76c71e2f64f80c64c0f3163e687f1763cfaf21633ec24ec33"}, + {file = "asyncpg-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46973045b567972128a27d40001124fbc821c87a6cade040cfcd4fa8a30bcdc4"}, + {file = "asyncpg-0.30.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9110df111cabc2ed81aad2f35394a00cadf4f2e0635603db6ebbd0fc896f46a4"}, + {file = "asyncpg-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04ff0785ae7eed6cc138e73fc67b8e51d54ee7a3ce9b63666ce55a0bf095f7ba"}, + {file = "asyncpg-0.30.0-cp313-cp313-win32.whl", hash = "sha256:ae374585f51c2b444510cdf3595b97ece4f233fde739aa14b50e0d64e8a7a590"}, + {file = "asyncpg-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:f59b430b8e27557c3fb9869222559f7417ced18688375825f8f12302c34e915e"}, + {file = "asyncpg-0.30.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:29ff1fc8b5bf724273782ff8b4f57b0f8220a1b2324184846b39d1ab4122031d"}, + {file = "asyncpg-0.30.0-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:64e899bce0600871b55368b8483e5e3e7f1860c9482e7f12e0a771e747988168"}, + {file = "asyncpg-0.30.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b290f4726a887f75dcd1b3006f484252db37602313f806e9ffc4e5996cfe5cb"}, + {file = "asyncpg-0.30.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f86b0e2cd3f1249d6fe6fd6cfe0cd4538ba994e2d8249c0491925629b9104d0f"}, + {file = "asyncpg-0.30.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:393af4e3214c8fa4c7b86da6364384c0d1b3298d45803375572f415b6f673f38"}, + {file = "asyncpg-0.30.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:fd4406d09208d5b4a14db9a9dbb311b6d7aeeab57bded7ed2f8ea41aeef39b34"}, + {file = "asyncpg-0.30.0-cp38-cp38-win32.whl", hash = "sha256:0b448f0150e1c3b96cb0438a0d0aa4871f1472e58de14a3ec320dbb2798fb0d4"}, + {file = "asyncpg-0.30.0-cp38-cp38-win_amd64.whl", hash = "sha256:f23b836dd90bea21104f69547923a02b167d999ce053f3d502081acea2fba15b"}, + {file = "asyncpg-0.30.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6f4e83f067b35ab5e6371f8a4c93296e0439857b4569850b178a01385e82e9ad"}, + {file = "asyncpg-0.30.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5df69d55add4efcd25ea2a3b02025b669a285b767bfbf06e356d68dbce4234ff"}, + {file = "asyncpg-0.30.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3479a0d9a852c7c84e822c073622baca862d1217b10a02dd57ee4a7a081f708"}, + {file = "asyncpg-0.30.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26683d3b9a62836fad771a18ecf4659a30f348a561279d6227dab96182f46144"}, + {file = "asyncpg-0.30.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1b982daf2441a0ed314bd10817f1606f1c28b1136abd9e4f11335358c2c631cb"}, + {file = "asyncpg-0.30.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1c06a3a50d014b303e5f6fc1e5f95eb28d2cee89cf58384b700da621e5d5e547"}, + {file = "asyncpg-0.30.0-cp39-cp39-win32.whl", hash = "sha256:1b11a555a198b08f5c4baa8f8231c74a366d190755aa4f99aacec5970afe929a"}, + {file = "asyncpg-0.30.0-cp39-cp39-win_amd64.whl", hash = "sha256:8b684a3c858a83cd876f05958823b68e8d14ec01bb0c0d14a6704c5bf9711773"}, + {file = "asyncpg-0.30.0.tar.gz", hash = "sha256:c551e9928ab6707602f44811817f82ba3c446e018bfe1d3abecc8ba5f3eac851"}, ] -[package.dependencies] -async-timeout = {version = ">=4.0.3", markers = "python_version < \"3.12.0\""} - [package.extras] -docs = ["Sphinx (>=5.3.0,<5.4.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] -test = ["flake8 (>=6.1,<7.0)", "uvloop (>=0.15.3)"] +docs = ["Sphinx (>=8.1.3,<8.2.0)", "sphinx-rtd-theme (>=1.2.2)"] +gssauth = ["gssapi ; platform_system != \"Windows\"", "sspilib ; platform_system == \"Windows\""] +test = ["distro (>=1.9.0,<1.10.0)", "flake8 (>=6.1,<7.0)", "flake8-pyi (>=24.1.0,<24.2.0)", "gssapi ; platform_system == \"Linux\"", "k5test ; platform_system == \"Linux\"", "mypy (>=1.8.0,<1.9.0)", "sspilib ; platform_system == \"Windows\"", "uvloop (>=0.15.3) ; platform_system != \"Windows\" and python_version < \"3.14.0\""] [[package]] name = "attrs" @@ -300,16 +317,17 @@ version = "21.4.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, ] [package.extras] -dev = 
["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] +dev = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] -tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] -tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] +tests = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] +tests-no-zope = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "aws-sam-translator" @@ -317,6 +335,7 @@ version = "1.88.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" optional = false python-versions = "!=4.0,<=4.0,>=3.8" +groups = ["main"] files = [ {file = "aws_sam_translator-1.88.0-py3-none-any.whl", hash = "sha256:aa93d498d8de3fb3d485c316155b1628144b823bbc176099a20de06df666fcac"}, {file = "aws_sam_translator-1.88.0.tar.gz", hash = "sha256:e77c65f3488566122277accd44a0f1ec018e37403e0d5fe25120d96e537e91a7"}, @@ -337,6 +356,7 @@ version = "2.10.0" description = "The AWS X-Ray SDK for Python (the SDK) enables Python developers to record and emit information from within their applications to the AWS X-Ray service." 
optional = false python-versions = "*" +groups = ["main"] files = [ {file = "aws-xray-sdk-2.10.0.tar.gz", hash = "sha256:9b14924fd0628cf92936055864655354003f0b1acc3e1c3ffde6403d0799dd7a"}, {file = "aws_xray_sdk-2.10.0-py2.py3-none-any.whl", hash = "sha256:7551e81a796e1a5471ebe84844c40e8edf7c218db33506d046fec61f7495eda4"}, @@ -352,6 +372,7 @@ version = "2.2.1" description = "Function decoration for backoff and retry" optional = false python-versions = ">=3.7,<4.0" +groups = ["main"] files = [ {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, @@ -363,6 +384,7 @@ version = "1.34.11" description = "The AWS SDK for Python" optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"}, {file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"}, @@ -382,6 +404,7 @@ version = "1.26.16" description = "Type annotations for boto3 1.26.16 generated with mypy-boto3-builder 7.11.11" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "boto3-stubs-1.26.16.tar.gz", hash = "sha256:618253ae19f1480785759bcaee8c8b10ed3fc037027247c26a3461a50f58406d"}, {file = "boto3_stubs-1.26.16-py3-none-any.whl", hash = "sha256:8cf2925bc3e1349c93eb0f49c1061affc5ca314d69eeb335349037969d0787ed"}, @@ -389,6 +412,7 @@ files = [ [package.dependencies] botocore-stubs = "*" +mypy-boto3-kms = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"kms\""} mypy-boto3-s3 = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"s3\""} types-s3transfer = "*" typing-extensions = ">=4.1.0" @@ -726,6 +750,7 @@ version = "1.34.11" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"}, {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"}, @@ -745,6 +770,7 @@ version = "1.27.38" description = "Type annotations for botocore 1.27.38 generated with mypy-boto3-builder 7.10.1" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "botocore-stubs-1.27.38.tar.gz", hash = "sha256:408e8b86b5d171b58f81c74ca9d3b5317a5a8e2d3bc2073aa841ac13b8939e56"}, {file = "botocore_stubs-1.27.38-py3-none-any.whl", hash = "sha256:7add7641e9a479a9c8366893bb522fd9ca3d58714201e43662a200a148a1bc38"}, @@ -759,6 +785,7 @@ version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, @@ -766,75 +793,79 @@ files = [ [[package]] name = "cffi" -version = "1.15.1" +version = "1.17.1" description = "Foreign Function Interface for Python calling C code." 
optional = false -python-versions = "*" +python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, - {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, - {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, - {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, - {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, - {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, - {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, - {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, - {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, - {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, - {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, - {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, - {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, - {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, - {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, - {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, - {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, - {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, - {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, + {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, + {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be"}, + {file = "cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c"}, + {file = "cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b"}, + {file = "cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655"}, + {file = "cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8"}, + {file = "cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65"}, + {file = "cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"}, + {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"}, + {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"}, + {file = "cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc"}, + {file = 
"cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1"}, + {file = "cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8"}, + {file = "cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"}, + {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"}, + {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, + {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, ] [package.dependencies] @@ -846,6 +877,7 @@ version = "0.87.1" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" optional = false python-versions = "!=4.0,<=4.0,>=3.8" +groups = ["main"] files = [ {file = "cfn_lint-0.87.1-py3-none-any.whl", hash = "sha256:d450f450635fc223b6f66880ccac52a5fd1a52966fa1705f1ba52b88dfed3071"}, {file = "cfn_lint-0.87.1.tar.gz", hash = "sha256:b3ce9d3e5e0eadcea5d584c8ccaa00bf2a990a36a64d7ffd8683bc60b7e4f06f"}, @@ -869,6 +901,7 @@ version = "2.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
optional = false python-versions = ">=3.6.0" +groups = ["main"] files = [ {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"}, {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"}, @@ -883,6 +916,7 @@ version = "8.1.3" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, @@ -897,6 +931,7 @@ version = "0.7.17" description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" optional = false python-versions = "~=3.8" +groups = ["main"] files = [ {file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"}, {file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"}, @@ -987,6 +1022,8 @@ version = "0.4.5" description = "Cross-platform colored terminal text." optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "sys_platform == \"win32\" or platform_system == \"Windows\"" files = [ {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, @@ -994,51 +1031,56 @@ files = [ [[package]] name = "cryptography" -version = "43.0.1" +version = "44.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
optional = false -python-versions = ">=3.7" +python-versions = "!=3.9.0,!=3.9.1,>=3.7" +groups = ["main"] files = [ - {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"}, - {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"}, - {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"}, - {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"}, - {file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"}, - {file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"}, - {file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"}, - {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"}, - {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"}, - {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"}, - {file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"}, - {file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"}, - {file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"}, - {file = 
"cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"}, - {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"}, - {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"}, - {file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"}, - {file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"}, + {file = "cryptography-44.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf688f615c29bfe9dfc44312ca470989279f0e94bb9f631f85e3459af8efc009"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd7c7e2d71d908dc0f8d2027e1604102140d84b155e658c20e8ad1304317691f"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887143b9ff6bad2b7570da75a7fe8bbf5f65276365ac259a5d2d5147a73775f2"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:322eb03ecc62784536bc173f1483e76747aafeb69c8728df48537eb431cd1911"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:21377472ca4ada2906bc313168c9dc7b1d7ca417b63c1c3011d0c74b7de9ae69"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:df978682c1504fc93b3209de21aeabf2375cb1571d4e61907b3e7a2540e83026"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:eb3889330f2a4a148abead555399ec9a32b13b7c8ba969b72d8e500eb7ef84cd"}, + {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:8e6a85a93d0642bd774460a86513c5d9d80b5c002ca9693e63f6e540f1815ed0"}, + {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6f76fdd6fd048576a04c5210d53aa04ca34d2ed63336d4abd306d0cbe298fddf"}, + {file = "cryptography-44.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6c8acf6f3d1f47acb2248ec3ea261171a671f3d9428e34ad0357148d492c7864"}, + {file = "cryptography-44.0.1-cp37-abi3-win32.whl", hash = "sha256:24979e9f2040c953a94bf3c6782e67795a4c260734e5264dceea65c8f4bae64a"}, + {file = "cryptography-44.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:fd0ee90072861e276b0ff08bd627abec29e32a53b2be44e41dbcdf87cbee2b00"}, + {file = "cryptography-44.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:a2d8a7045e1ab9b9f803f0d9531ead85f90c5f2859e653b61497228b18452008"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8272f257cf1cbd3f2e120f14c68bff2b6bdfcc157fafdee84a1b795efd72862"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e8d181e90a777b63f3f0caa836844a1182f1f265687fac2115fcf245f5fbec3"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:436df4f203482f41aad60ed1813811ac4ab102765ecae7a2bbb1dbb66dcff5a7"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4f422e8c6a28cf8b7f883eb790695d6d45b0c385a2583073f3cec434cc705e1a"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = 
"sha256:72198e2b5925155497a5a3e8c216c7fb3e64c16ccee11f0e7da272fa93b35c4c"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:2a46a89ad3e6176223b632056f321bc7de36b9f9b93b2cc1cccf935a3849dc62"}, + {file = "cryptography-44.0.1-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:53f23339864b617a3dfc2b0ac8d5c432625c80014c25caac9082314e9de56f41"}, + {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:888fcc3fce0c888785a4876ca55f9f43787f4c5c1cc1e2e0da71ad481ff82c5b"}, + {file = "cryptography-44.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00918d859aa4e57db8299607086f793fa7813ae2ff5a4637e318a25ef82730f7"}, + {file = "cryptography-44.0.1-cp39-abi3-win32.whl", hash = "sha256:9b336599e2cb77b1008cb2ac264b290803ec5e8e89d618a5e978ff5eb6f715d9"}, + {file = "cryptography-44.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:e403f7f766ded778ecdb790da786b418a9f2394f36e8cc8b796cc056ab05f44f"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1f9a92144fa0c877117e9748c74501bea842f93d21ee00b0cf922846d9d0b183"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:610a83540765a8d8ce0f351ce42e26e53e1f774a6efb71eb1b41eb01d01c3d12"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5fed5cd6102bb4eb843e3315d2bf25fede494509bddadb81e03a859c1bc17b83"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:f4daefc971c2d1f82f03097dc6f216744a6cd2ac0f04c68fb935ea2ba2a0d420"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:94f99f2b943b354a5b6307d7e8d19f5c423a794462bde2bf310c770ba052b1c4"}, + {file = "cryptography-44.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d9c5b9f698a83c8bd71e0f4d3f9f839ef244798e5ffe96febfa9714717db7af7"}, + {file = "cryptography-44.0.1.tar.gz", hash = "sha256:f51f5705ab27898afda1aaa430f34ad90dc117421057782022edf0600bec5f14"}, ] [package.dependencies] cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} [package.extras] -docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] -docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"] -nox = ["nox"] -pep8test = ["check-sdist", "click", "mypy", "ruff"] -sdist = ["build"] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=3.0.0) ; python_version >= \"3.8\""] +docstest = ["pyenchant (>=3)", "readme-renderer (>=30.0)", "sphinxcontrib-spelling (>=7.3.1)"] +nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2) ; python_version >= \"3.8\""] +pep8test = ["check-sdist ; python_version >= \"3.8\"", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] +sdist = ["build (>=1.0.0)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test = ["certifi (>=2024)", "cryptography-vectors (==44.0.1)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] test-randomorder = ["pytest-randomly"] [[package]] @@ -1047,6 +1089,7 @@ version = "7.1.0" description = "A Python library for the Docker Engine API." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"}, {file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"}, @@ -1069,6 +1112,7 @@ version = "1.9.0" description = "execnet: rapid multi-Python deployment" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "execnet-1.9.0-py2.py3-none-any.whl", hash = "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"}, {file = "execnet-1.9.0.tar.gz", hash = "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5"}, @@ -1083,6 +1127,7 @@ version = "2.2.5" description = "A simple framework for building complex web applications." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "Flask-2.2.5-py3-none-any.whl", hash = "sha256:58107ed83443e86067e41eff4631b058178191a355886f8e479e347fa1285fdf"}, {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"}, @@ -1104,6 +1149,7 @@ version = "5.0.0" description = "A Flask extension adding a decorator for CORS support" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"}, {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"}, @@ -1114,72 +1160,104 @@ Flask = ">=0.9" [[package]] name = "frozenlist" -version = "1.4.0" +version = "1.5.0" description = "A list-like structure which implements collections.abc.MutableSequence" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab"}, - {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559"}, - {file = "frozenlist-1.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9ac08e601308e41eb533f232dbf6b7e4cea762f9f84f6357136eed926c15d12c"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d081f13b095d74b67d550de04df1c756831f3b83dc9881c38985834387487f1b"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71932b597f9895f011f47f17d6428252fc728ba2ae6024e13c3398a087c2cdea"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:981b9ab5a0a3178ff413bca62526bb784249421c24ad7381e39d67981be2c326"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e41f3de4df3e80de75845d3e743b3f1c4c8613c3997a912dbf0229fc61a8b963"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6918d49b1f90821e93069682c06ffde41829c346c66b721e65a5c62b4bab0300"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e5c8764c7829343d919cc2dfc587a8db01c4f70a4ebbc49abde5d4b158b007b"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:8d0edd6b1c7fb94922bf569c9b092ee187a83f03fb1a63076e7774b60f9481a8"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e29cda763f752553fa14c68fb2195150bfab22b352572cb36c43c47bedba70eb"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0c7c1b47859ee2cac3846fde1c1dc0f15da6cec5a0e5c72d101e0f83dcb67ff9"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:901289d524fdd571be1c7be054f48b1f88ce8dddcbdf1ec698b27d4b8b9e5d62"}, - {file = "frozenlist-1.4.0-cp310-cp310-win32.whl", hash = "sha256:1a0848b52815006ea6596c395f87449f693dc419061cc21e970f139d466dc0a0"}, - {file = "frozenlist-1.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:b206646d176a007466358aa21d85cd8600a415c67c9bd15403336c331a10d956"}, - {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:de343e75f40e972bae1ef6090267f8260c1446a1695e77096db6cfa25e759a95"}, - {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad2a9eb6d9839ae241701d0918f54c51365a51407fd80f6b8289e2dfca977cc3"}, - {file = "frozenlist-1.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bd7bd3b3830247580de99c99ea2a01416dfc3c34471ca1298bccabf86d0ff4dc"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdf1847068c362f16b353163391210269e4f0569a3c166bc6a9f74ccbfc7e839"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38461d02d66de17455072c9ba981d35f1d2a73024bee7790ac2f9e361ef1cd0c"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5a32087d720c608f42caed0ef36d2b3ea61a9d09ee59a5142d6070da9041b8f"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dd65632acaf0d47608190a71bfe46b209719bf2beb59507db08ccdbe712f969b"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261b9f5d17cac914531331ff1b1d452125bf5daa05faf73b71d935485b0c510b"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b89ac9768b82205936771f8d2eb3ce88503b1556324c9f903e7156669f521472"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:008eb8b31b3ea6896da16c38c1b136cb9fec9e249e77f6211d479db79a4eaf01"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e74b0506fa5aa5598ac6a975a12aa8928cbb58e1f5ac8360792ef15de1aa848f"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:490132667476f6781b4c9458298b0c1cddf237488abd228b0b3650e5ecba7467"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:76d4711f6f6d08551a7e9ef28c722f4a50dd0fc204c56b4bcd95c6cc05ce6fbb"}, - {file = "frozenlist-1.4.0-cp311-cp311-win32.whl", hash = "sha256:a02eb8ab2b8f200179b5f62b59757685ae9987996ae549ccf30f983f40602431"}, - {file = "frozenlist-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:515e1abc578dd3b275d6a5114030b1330ba044ffba03f94091842852f806f1c1"}, - {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f0ed05f5079c708fe74bf9027e95125334b6978bf07fd5ab923e9e55e5fbb9d3"}, - {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ca265542ca427bf97aed183c1676e2a9c66942e822b14dc6e5f42e038f92a503"}, - {file = 
"frozenlist-1.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:491e014f5c43656da08958808588cc6c016847b4360e327a62cb308c791bd2d9"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17ae5cd0f333f94f2e03aaf140bb762c64783935cc764ff9c82dff626089bebf"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e78fb68cf9c1a6aa4a9a12e960a5c9dfbdb89b3695197aa7064705662515de2"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5655a942f5f5d2c9ed93d72148226d75369b4f6952680211972a33e59b1dfdc"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c11b0746f5d946fecf750428a95f3e9ebe792c1ee3b1e96eeba145dc631a9672"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e66d2a64d44d50d2543405fb183a21f76b3b5fd16f130f5c99187c3fb4e64919"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:88f7bc0fcca81f985f78dd0fa68d2c75abf8272b1f5c323ea4a01a4d7a614efc"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5833593c25ac59ede40ed4de6d67eb42928cca97f26feea219f21d0ed0959b79"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:fec520865f42e5c7f050c2a79038897b1c7d1595e907a9e08e3353293ffc948e"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:b826d97e4276750beca7c8f0f1a4938892697a6bcd8ec8217b3312dad6982781"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ceb6ec0a10c65540421e20ebd29083c50e6d1143278746a4ef6bcf6153171eb8"}, - {file = "frozenlist-1.4.0-cp38-cp38-win32.whl", hash = "sha256:2b8bcf994563466db019fab287ff390fffbfdb4f905fc77bc1c1d604b1c689cc"}, - {file = "frozenlist-1.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:a6c8097e01886188e5be3e6b14e94ab365f384736aa1fca6a0b9e35bd4a30bc7"}, - {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6c38721585f285203e4b4132a352eb3daa19121a035f3182e08e437cface44bf"}, - {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a0c6da9aee33ff0b1a451e867da0c1f47408112b3391dd43133838339e410963"}, - {file = "frozenlist-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:93ea75c050c5bb3d98016b4ba2497851eadf0ac154d88a67d7a6816206f6fa7f"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f61e2dc5ad442c52b4887f1fdc112f97caeff4d9e6ebe78879364ac59f1663e1"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa384489fefeb62321b238e64c07ef48398fe80f9e1e6afeff22e140e0850eef"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10ff5faaa22786315ef57097a279b833ecab1a0bfb07d604c9cbb1c4cdc2ed87"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:007df07a6e3eb3e33e9a1fe6a9db7af152bbd8a185f9aaa6ece10a3529e3e1c6"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4f399d28478d1f604c2ff9119907af9726aed73680e5ed1ca634d377abb087"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = 
"sha256:c5374b80521d3d3f2ec5572e05adc94601985cc526fb276d0c8574a6d749f1b3"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ce31ae3e19f3c902de379cf1323d90c649425b86de7bbdf82871b8a2a0615f3d"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7211ef110a9194b6042449431e08c4d80c0481e5891e58d429df5899690511c2"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:556de4430ce324c836789fa4560ca62d1591d2538b8ceb0b4f68fb7b2384a27a"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7645a8e814a3ee34a89c4a372011dcd817964ce8cb273c8ed6119d706e9613e3"}, - {file = "frozenlist-1.4.0-cp39-cp39-win32.whl", hash = "sha256:19488c57c12d4e8095a922f328df3f179c820c212940a498623ed39160bc3c2f"}, - {file = "frozenlist-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:6221d84d463fb110bdd7619b69cb43878a11d51cbb9394ae3105d082d5199167"}, - {file = "frozenlist-1.4.0.tar.gz", hash = "sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251"}, + {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a"}, + {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb"}, + {file = "frozenlist-1.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15538c0cbf0e4fa11d1e3a71f823524b0c46299aed6e10ebb4c2089abd8c3bec"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e79225373c317ff1e35f210dd5f1344ff31066ba8067c307ab60254cd3a78ad5"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9272fa73ca71266702c4c3e2d4a28553ea03418e591e377a03b8e3659d94fa76"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:498524025a5b8ba81695761d78c8dd7382ac0b052f34e66939c42df860b8ff17"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92b5278ed9d50fe610185ecd23c55d8b307d75ca18e94c0e7de328089ac5dcba"}, + {file = "frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f3c8c1dacd037df16e85227bac13cca58c30da836c6f936ba1df0c05d046d8d"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2ac49a9bedb996086057b75bf93538240538c6d9b38e57c82d51f75a73409d2"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e66cc454f97053b79c2ab09c17fbe3c825ea6b4de20baf1be28919460dd7877f"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5a3ba5f9a0dfed20337d3e966dc359784c9f96503674c2faf015f7fe8e96798c"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6321899477db90bdeb9299ac3627a6a53c7399c8cd58d25da094007402b039ab"}, + {file = "frozenlist-1.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:76e4753701248476e6286f2ef492af900ea67d9706a0155335a40ea21bf3b2f5"}, + {file = "frozenlist-1.5.0-cp310-cp310-win32.whl", hash = "sha256:977701c081c0241d0955c9586ffdd9ce44f7a7795df39b9151cd9a6fd0ce4cfb"}, + {file = "frozenlist-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:189f03b53e64144f90990d29a27ec4f7997d91ed3d01b51fa39d2dbe77540fd4"}, + {file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:fd74520371c3c4175142d02a976aee0b4cb4a7cc912a60586ffd8d5929979b30"}, + {file = "frozenlist-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f3f7a0fbc219fb4455264cae4d9f01ad41ae6ee8524500f381de64ffaa077d5"}, + {file = "frozenlist-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f47c9c9028f55a04ac254346e92977bf0f166c483c74b4232bee19a6697e4778"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0996c66760924da6e88922756d99b47512a71cfd45215f3570bf1e0b694c206a"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2fe128eb4edeabe11896cb6af88fca5346059f6c8d807e3b910069f39157869"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a8ea951bbb6cacd492e3948b8da8c502a3f814f5d20935aae74b5df2b19cf3d"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de537c11e4aa01d37db0d403b57bd6f0546e71a82347a97c6a9f0dcc532b3a45"}, + {file = "frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c2623347b933fcb9095841f1cc5d4ff0b278addd743e0e966cb3d460278840d"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cee6798eaf8b1416ef6909b06f7dc04b60755206bddc599f52232606e18179d3"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f5f9da7f5dbc00a604fe74aa02ae7c98bcede8a3b8b9666f9f86fc13993bc71a"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:90646abbc7a5d5c7c19461d2e3eeb76eb0b204919e6ece342feb6032c9325ae9"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:bdac3c7d9b705d253b2ce370fde941836a5f8b3c5c2b8fd70940a3ea3af7f4f2"}, + {file = "frozenlist-1.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03d33c2ddbc1816237a67f66336616416e2bbb6beb306e5f890f2eb22b959cdf"}, + {file = "frozenlist-1.5.0-cp311-cp311-win32.whl", hash = "sha256:237f6b23ee0f44066219dae14c70ae38a63f0440ce6750f868ee08775073f942"}, + {file = "frozenlist-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:0cc974cc93d32c42e7b0f6cf242a6bd941c57c61b618e78b6c0a96cb72788c1d"}, + {file = "frozenlist-1.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:31115ba75889723431aa9a4e77d5f398f5cf976eea3bdf61749731f62d4a4a21"}, + {file = "frozenlist-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7437601c4d89d070eac8323f121fcf25f88674627505334654fd027b091db09d"}, + {file = "frozenlist-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7948140d9f8ece1745be806f2bfdf390127cf1a763b925c4a805c603df5e697e"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feeb64bc9bcc6b45c6311c9e9b99406660a9c05ca8a5b30d14a78555088b0b3a"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:683173d371daad49cffb8309779e886e59c2f369430ad28fe715f66d08d4ab1a"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7d57d8f702221405a9d9b40f9da8ac2e4a1a8b5285aac6100f3393675f0a85ee"}, + {file = "frozenlist-1.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c72000fbcc35b129cb09956836c7d7abf78ab5416595e4857d1cae8d6251a6"}, + {file = 
"frozenlist-1.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000a77d6034fbad9b6bb880f7ec073027908f1b40254b5d6f26210d2dab1240e"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d7f5a50342475962eb18b740f3beecc685a15b52c91f7d975257e13e029eca9"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:87f724d055eb4785d9be84e9ebf0f24e392ddfad00b3fe036e43f489fafc9039"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:6e9080bb2fb195a046e5177f10d9d82b8a204c0736a97a153c2466127de87784"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b93d7aaa36c966fa42efcaf716e6b3900438632a626fb09c049f6a2f09fc631"}, + {file = "frozenlist-1.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:52ef692a4bc60a6dd57f507429636c2af8b6046db8b31b18dac02cbc8f507f7f"}, + {file = "frozenlist-1.5.0-cp312-cp312-win32.whl", hash = "sha256:29d94c256679247b33a3dc96cce0f93cbc69c23bf75ff715919332fdbb6a32b8"}, + {file = "frozenlist-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:8969190d709e7c48ea386db202d708eb94bdb29207a1f269bab1196ce0dcca1f"}, + {file = "frozenlist-1.5.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7a1a048f9215c90973402e26c01d1cff8a209e1f1b53f72b95c13db61b00f953"}, + {file = "frozenlist-1.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dd47a5181ce5fcb463b5d9e17ecfdb02b678cca31280639255ce9d0e5aa67af0"}, + {file = "frozenlist-1.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1431d60b36d15cda188ea222033eec8e0eab488f39a272461f2e6d9e1a8e63c2"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6482a5851f5d72767fbd0e507e80737f9c8646ae7fd303def99bfe813f76cf7f"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44c49271a937625619e862baacbd037a7ef86dd1ee215afc298a417ff3270608"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:12f78f98c2f1c2429d42e6a485f433722b0061d5c0b0139efa64f396efb5886b"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce3aa154c452d2467487765e3adc730a8c153af77ad84096bc19ce19a2400840"}, + {file = "frozenlist-1.5.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b7dc0c4338e6b8b091e8faf0db3168a37101943e687f373dce00959583f7439"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:45e0896250900b5aa25180f9aec243e84e92ac84bd4a74d9ad4138ef3f5c97de"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:561eb1c9579d495fddb6da8959fd2a1fca2c6d060d4113f5844b433fc02f2641"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:df6e2f325bfee1f49f81aaac97d2aa757c7646534a06f8f577ce184afe2f0a9e"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:140228863501b44b809fb39ec56b5d4071f4d0aa6d216c19cbb08b8c5a7eadb9"}, + {file = "frozenlist-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03"}, + {file = "frozenlist-1.5.0-cp313-cp313-win32.whl", hash = "sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c"}, + {file = "frozenlist-1.5.0-cp313-cp313-win_amd64.whl", hash = 
"sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28"}, + {file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:dd94994fc91a6177bfaafd7d9fd951bc8689b0a98168aa26b5f543868548d3ca"}, + {file = "frozenlist-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2d0da8bbec082bf6bf18345b180958775363588678f64998c2b7609e34719b10"}, + {file = "frozenlist-1.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:73f2e31ea8dd7df61a359b731716018c2be196e5bb3b74ddba107f694fbd7604"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:828afae9f17e6de596825cf4228ff28fbdf6065974e5ac1410cecc22f699d2b3"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1577515d35ed5649d52ab4319db757bb881ce3b2b796d7283e6634d99ace307"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2150cc6305a2c2ab33299453e2968611dacb970d2283a14955923062c8d00b10"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a72b7a6e3cd2725eff67cd64c8f13335ee18fc3c7befc05aed043d24c7b9ccb9"}, + {file = "frozenlist-1.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c16d2fa63e0800723139137d667e1056bee1a1cf7965153d2d104b62855e9b99"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:17dcc32fc7bda7ce5875435003220a457bcfa34ab7924a49a1c19f55b6ee185c"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:97160e245ea33d8609cd2b8fd997c850b56db147a304a262abc2b3be021a9171"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:f1e6540b7fa044eee0bb5111ada694cf3dc15f2b0347ca125ee9ca984d5e9e6e"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:91d6c171862df0a6c61479d9724f22efb6109111017c87567cfeb7b5d1449fdf"}, + {file = "frozenlist-1.5.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c1fac3e2ace2eb1052e9f7c7db480818371134410e1f5c55d65e8f3ac6d1407e"}, + {file = "frozenlist-1.5.0-cp38-cp38-win32.whl", hash = "sha256:b97f7b575ab4a8af9b7bc1d2ef7f29d3afee2226bd03ca3875c16451ad5a7723"}, + {file = "frozenlist-1.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:374ca2dabdccad8e2a76d40b1d037f5bd16824933bf7bcea3e59c891fd4a0923"}, + {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9bbcdfaf4af7ce002694a4e10a0159d5a8d20056a12b05b45cea944a4953f972"}, + {file = "frozenlist-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1893f948bf6681733aaccf36c5232c231e3b5166d607c5fa77773611df6dc336"}, + {file = "frozenlist-1.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2b5e23253bb709ef57a8e95e6ae48daa9ac5f265637529e4ce6b003a37b2621f"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f253985bb515ecd89629db13cb58d702035ecd8cfbca7d7a7e29a0e6d39af5f"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04a5c6babd5e8fb7d3c871dc8b321166b80e41b637c31a995ed844a6139942b6"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a9fe0f1c29ba24ba6ff6abf688cb0b7cf1efab6b6aa6adc55441773c252f7411"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:226d72559fa19babe2ccd920273e767c96a49b9d3d38badd7c91a0fdeda8ea08"}, + {file = "frozenlist-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15b731db116ab3aedec558573c1a5eec78822b32292fe4f2f0345b7f697745c2"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:366d8f93e3edfe5a918c874702f78faac300209a4d5bf38352b2c1bdc07a766d"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1b96af8c582b94d381a1c1f51ffaedeb77c821c690ea5f01da3d70a487dd0a9b"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c03eff4a41bd4e38415cbed054bbaff4a075b093e2394b6915dca34a40d1e38b"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:50cf5e7ee9b98f22bdecbabf3800ae78ddcc26e4a435515fc72d97903e8488e0"}, + {file = "frozenlist-1.5.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1e76bfbc72353269c44e0bc2cfe171900fbf7f722ad74c9a7b638052afe6a00c"}, + {file = "frozenlist-1.5.0-cp39-cp39-win32.whl", hash = "sha256:666534d15ba8f0fda3f53969117383d5dc021266b3c1a42c9ec4855e4b58b9d3"}, + {file = "frozenlist-1.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:5c28f4b5dbef8a0d8aad0d4de24d1e9e981728628afaf4ea0792f5d0939372f0"}, + {file = "frozenlist-1.5.0-py3-none-any.whl", hash = "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3"}, + {file = "frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817"}, ] [[package]] @@ -1188,6 +1266,7 @@ version = "3.2.1" description = "GraphQL implementation for Python, a port of GraphQL.js, the JavaScript reference implementation for GraphQL." optional = false python-versions = ">=3.6,<4" +groups = ["main"] files = [ {file = "graphql-core-3.2.1.tar.gz", hash = "sha256:9d1bf141427b7d54be944587c8349df791ce60ade2e3cccaf9c56368c133c201"}, {file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"}, @@ -1199,6 +1278,7 @@ version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, @@ -1207,27 +1287,33 @@ files = [ [[package]] name = "h2" version = "4.1.0" -description = "HTTP/2 State-Machine based protocol implementation" +description = "Pure-Python HTTP/2 protocol implementation" optional = false -python-versions = ">=3.6.1" -files = [ - {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"}, - {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"}, -] +python-versions = ">=3.9" +groups = ["main"] +files = [] +develop = false [package.dependencies] -hpack = ">=4.0,<5" -hyperframe = ">=6.0,<7" +hpack = ">=4.1,<5" +hyperframe = ">=6.1,<7" + +[package.source] +type = "git" +url = "https://github.com/python-hyper/h2" +reference = "HEAD" +resolved_reference = "0b98b244b5fd1fe96100ac14905417a3b70a4286" [[package]] name = "hpack" -version = "4.0.0" -description = "Pure-Python HPACK header compression" +version = "4.1.0" +description = "Pure-Python HPACK header encoding" optional = false -python-versions = ">=3.6.1" +python-versions 
= ">=3.9" +groups = ["main"] files = [ - {file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"}, - {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"}, + {file = "hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496"}, + {file = "hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca"}, ] [[package]] @@ -1236,6 +1322,7 @@ version = "1.0.3" description = "A minimal low-level HTTP client." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "httpcore-1.0.3-py3-none-any.whl", hash = "sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"}, {file = "httpcore-1.0.3.tar.gz", hash = "sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544"}, @@ -1257,6 +1344,7 @@ version = "0.26.0" description = "The next generation HTTP client." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"}, {file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"}, @@ -1271,20 +1359,21 @@ idna = "*" sniffio = "*" [package.extras] -brotli = ["brotli", "brotlicffi"] +brotli = ["brotli ; platform_python_implementation == \"CPython\"", "brotlicffi ; platform_python_implementation != \"CPython\""] cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] [[package]] name = "hyperframe" -version = "6.0.1" -description = "HTTP/2 framing layer for Python" +version = "6.1.0" +description = "Pure-Python HTTP/2 framing" optional = false -python-versions = ">=3.6.1" +python-versions = ">=3.9" +groups = ["main"] files = [ - {file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"}, - {file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"}, + {file = "hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5"}, + {file = "hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08"}, ] [[package]] @@ -1293,6 +1382,7 @@ version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" +groups = ["main"] files = [ {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, @@ -1304,6 +1394,7 @@ version = "1.1.1" description = "iniconfig: brain-dead simple config-ini parsing" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, @@ -1315,6 +1406,7 @@ version = "2.1.2" description = "Safely pass data to untrusted environments and back." 
optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "itsdangerous-2.1.2-py3-none-any.whl", hash = "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44"}, {file = "itsdangerous-2.1.2.tar.gz", hash = "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a"}, @@ -1322,13 +1414,14 @@ files = [ [[package]] name = "jinja2" -version = "3.1.4" +version = "3.1.5" description = "A very fast and expressive template engine." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, - {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, + {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, + {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, ] [package.dependencies] @@ -1343,6 +1436,7 @@ version = "1.0.1" description = "JSON Matching Expressions" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, @@ -1354,6 +1448,7 @@ version = "0.9.0" description = "The ultimate Python library for JOSE RFCs, including JWS, JWE, JWK, JWA, JWT" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "joserfc-0.9.0-py3-none-any.whl", hash = "sha256:4026bdbe2c196cd40574e916fa1e28874d99649412edaab0e373dec3077153fb"}, {file = "joserfc-0.9.0.tar.gz", hash = "sha256:eebca7f587b1761ce43a98ffd5327f2b600b9aa5bb0a77b947687f503ad43bc0"}, @@ -1371,6 +1466,7 @@ version = "1.2.3" description = "Generate source code for Python classes from a JSON schema." 
optional = false python-versions = ">= 2.7" +groups = ["main"] files = [ {file = "jschema_to_python-1.2.3-py3-none-any.whl", hash = "sha256:8a703ca7604d42d74b2815eecf99a33359a8dccbb80806cce386d5e2dd992b05"}, {file = "jschema_to_python-1.2.3.tar.gz", hash = "sha256:76ff14fe5d304708ccad1284e4b11f96a658949a31ee7faed9e0995279549b91"}, @@ -1387,6 +1483,7 @@ version = "2.0.0" description = "Diff JSON and JSON-like structures in Python" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "jsondiff-2.0.0-py3-none-any.whl", hash = "sha256:689841d66273fc88fc79f7d33f4c074774f4f214b6466e3aff0e5adaf889d1e0"}, {file = "jsondiff-2.0.0.tar.gz", hash = "sha256:2795844ef075ec8a2b8d385c4d59f5ea48b08e7180fce3cb2787be0db00b1fb4"}, @@ -1398,6 +1495,8 @@ version = "0.20.0" description = "Python bindings for Jsonnet - The data templating language" optional = false python-versions = "*" +groups = ["main"] +markers = "python_version < \"3.13\"" files = [ {file = "jsonnet-0.20.0.tar.gz", hash = "sha256:7e770c7bf3a366b97b650a39430450f77612e74406731eb75c5bd59f3f104d4f"}, ] @@ -1408,6 +1507,7 @@ version = "1.32" description = "Apply JSON-Patches (RFC 6902)" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "jsonpatch-1.32-py2.py3-none-any.whl", hash = "sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397"}, {file = "jsonpatch-1.32.tar.gz", hash = "sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2"}, @@ -1422,6 +1522,7 @@ version = "1.6.1" description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." optional = false python-versions = "*" +groups = ["main"] files = [ {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"}, {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"}, @@ -1436,6 +1537,7 @@ version = "2.2.0" description = "Python library for serializing any arbitrary object graph into JSON" optional = false python-versions = ">=2.7" +groups = ["main"] files = [ {file = "jsonpickle-2.2.0-py2.py3-none-any.whl", hash = "sha256:de7f2613818aa4f234138ca11243d6359ff83ae528b2185efdd474f62bcf9ae1"}, {file = "jsonpickle-2.2.0.tar.gz", hash = "sha256:7b272918b0554182e53dc340ddd62d9b7f902fec7e7b05620c04f3ccef479a0e"}, @@ -1443,8 +1545,8 @@ files = [ [package.extras] docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] -testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] -testing-libs = ["simplejson", "ujson", "yajl"] +testing = ["ecdsa", "enum34 ; python_version == \"2.7\"", "feedparser", "jsonlib ; python_version == \"2.7\"", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0) ; python_version <= \"3.6\"", "pytest-flake8 (>=1.1.1) ; python_version >= \"3.7\"", "scikit-learn", "sqlalchemy"] +testing-libs = ["simplejson", "ujson", "yajl ; python_version == \"2.7\""] [[package]] name = "jsonpointer" @@ -1452,6 +1554,7 @@ version = "2.3" description = "Identify specific nodes in a JSON document (RFC 6901)" 
optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] files = [ {file = "jsonpointer-2.3-py2.py3-none-any.whl", hash = "sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9"}, {file = "jsonpointer-2.3.tar.gz", hash = "sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"}, @@ -1463,6 +1566,7 @@ version = "4.17.3" description = "An implementation of JSON Schema validation for Python" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "jsonschema-4.17.3-py3-none-any.whl", hash = "sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6"}, {file = "jsonschema-4.17.3.tar.gz", hash = "sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d"}, @@ -1482,6 +1586,7 @@ version = "0.1.6" description = "JSONSchema Spec with object-oriented paths" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "jsonschema_spec-0.1.6-py3-none-any.whl", hash = "sha256:f2206d18c89d1824c1f775ba14ed039743b41a9167bd2c5bdb774b66b3ca0bbf"}, {file = "jsonschema_spec-0.1.6.tar.gz", hash = "sha256:90215863b56e212086641956b20127ccbf6d8a3a38343dad01d6a74d19482f76"}, @@ -1499,6 +1604,7 @@ version = "1.9" description = "Creates JUnit XML test result documents that can be read by tools such as Jenkins" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"}, {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, @@ -1513,6 +1619,7 @@ version = "1.5.6" description = "Implementation of JOSE Web standards" optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "jwcrypto-1.5.6-py3-none-any.whl", hash = "sha256:150d2b0ebbdb8f40b77f543fb44ffd2baeff48788be71f67f03566692fd55789"}, {file = "jwcrypto-1.5.6.tar.gz", hash = "sha256:771a87762a0c081ae6166958a954f80848820b2ab066937dc8b8379d65b1b039"}, @@ -1528,6 +1635,7 @@ version = "2.0.2" description = "Pure Python client for Apache Kafka" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"}, {file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"}, @@ -1542,6 +1650,7 @@ version = "1.10.0" description = "A fast and thorough lazy object proxy." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"}, {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"}, @@ -1588,6 +1697,7 @@ version = "4.3.3" description = "LZ4 Bindings for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, {file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"}, @@ -1638,6 +1748,7 @@ version = "2.1.1" description = "Safely add untrusted strings to HTML/XML markup." 
optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"}, {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"}, @@ -1687,6 +1798,7 @@ version = "5.0.6" description = "" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, @@ -1746,6 +1858,7 @@ version = "1.3.0" description = "Python library for arbitrary-precision floating-point arithmetic" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, @@ -1754,7 +1867,7 @@ files = [ [package.extras] develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] docs = ["sphinx"] -gmpy = ["gmpy2 (>=2.1.0a4)"] +gmpy = ["gmpy2 (>=2.1.0a4) ; platform_python_implementation != \"PyPy\""] tests = ["pytest (>=4.6)"] [[package]] @@ -1763,6 +1876,7 @@ version = "6.0.5" description = "multidict implementation" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9"}, {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604"}, @@ -1862,6 +1976,7 @@ version = "1.13.0" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "mypy-1.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a"}, {file = "mypy-1.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80"}, @@ -1908,12 +2023,25 @@ install-types = ["pip"] mypyc = ["setuptools (>=50)"] reports = ["lxml"] +[[package]] +name = "mypy-boto3-kms" +version = "1.26.147" +description = "Type annotations for boto3.KMS 1.26.147 service generated with mypy-boto3-builder 7.14.5" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "mypy-boto3-kms-1.26.147.tar.gz", hash = "sha256:816a4d1bb0585e1b9620a3f96c1d69a06f53b7b5621858579dd77c60dbb5fa5c"}, + {file = "mypy_boto3_kms-1.26.147-py3-none-any.whl", hash = "sha256:493f0db674a25c88769f5cb8ab8ac00d3dda5dfc903d5cda34c990ee64689f79"}, +] + [[package]] name = "mypy-boto3-s3" version = "1.26.0.post1" description = "Type annotations for boto3.S3 1.26.0 service generated with mypy-boto3-builder 7.11.10" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "mypy-boto3-s3-1.26.0.post1.tar.gz", hash = "sha256:6d7079f8c739dc993cbedad0736299c413b297814b73795a3855a79169ecc938"}, {file = "mypy_boto3_s3-1.26.0.post1-py3-none-any.whl", hash = "sha256:7de2792ff0cc541b84cd46ff3a6aa2b6e5f267217f2203f27f6e4016bddc644d"}, @@ -1928,6 +2056,7 @@ version = "1.0.0" description = "Type system extensions for programs checked with 
the mypy type checker." optional = false python-versions = ">=3.5" +groups = ["dev"] files = [ {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, @@ -1939,6 +2068,7 @@ version = "2.8.5" description = "Python package for creating and manipulating graphs and networks" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "networkx-2.8.5-py3-none-any.whl", hash = "sha256:a762f4b385692d9c3a6f2912d058d76d29a827deaedf9e63ed14d397b8030687"}, {file = "networkx-2.8.5.tar.gz", hash = "sha256:15a7b81a360791c458c55a417418ea136c13378cfdc06a2dcdc12bd2f9cf09c1"}, @@ -1957,6 +2087,7 @@ version = "0.4.4" description = "OpenAPI schema validation for Python" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "openapi_schema_validator-0.4.4-py3-none-any.whl", hash = "sha256:79f37f38ef9fd5206b924ed7a6f382cea7b649b3b56383c47f1906082b7b9015"}, {file = "openapi_schema_validator-0.4.4.tar.gz", hash = "sha256:c573e2be2c783abae56c5a1486ab716ca96e09d1c3eab56020d1dc680aa57bf8"}, @@ -1975,6 +2106,7 @@ version = "0.5.7" description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3 spec validator" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "openapi_spec_validator-0.5.7-py3-none-any.whl", hash = "sha256:8712d2879db7692974ef89c47a3ebfc79436442921ec3a826ac0ce80cde8c549"}, {file = "openapi_spec_validator-0.5.7.tar.gz", hash = "sha256:6c2d42180045a80fd6314de848b94310bdb0fa4949f4b099578b69f79d9fa5ac"}, @@ -1988,13 +2120,14 @@ openapi-schema-validator = ">=0.4.2,<0.5.0" [[package]] name = "packaging" -version = "23.0" +version = "24.2" description = "Core utilities for Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, - {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] [[package]] @@ -2003,6 +2136,7 @@ version = "0.4.3" description = "Object-oriented paths" optional = false python-versions = ">=3.7.0,<4.0.0" +groups = ["main"] files = [ {file = "pathable-0.4.3-py3-none-any.whl", hash = "sha256:cdd7b1f9d7d5c8b8d3315dbf5a86b2596053ae845f056f57d97c0eefff84da14"}, {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"}, @@ -2014,6 +2148,7 @@ version = "5.9.0" description = "Python Build Reasonableness" optional = false python-versions = ">=2.6" +groups = ["main"] files = [ {file = "pbr-5.9.0-py2.py3-none-any.whl", hash = "sha256:e547125940bcc052856ded43be8e101f63828c2d94239ffbe2b327ba3d5ccf0a"}, {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"}, @@ -2025,6 +2160,7 @@ version = "1.0.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = 
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, @@ -2040,6 +2176,7 @@ version = "3.11" description = "Python Lex & Yacc" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, @@ -2051,6 +2188,7 @@ version = "0.14.1" description = "Python client for the Prometheus monitoring system." optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "prometheus_client-0.14.1-py3-none-any.whl", hash = "sha256:522fded625282822a89e2773452f42df14b5a8e84a86433e3f8a189c1d54dc01"}, {file = "prometheus_client-0.14.1.tar.gz", hash = "sha256:5459c427624961076277fdc6dc50540e2bacb98eebde99886e59ec55ed92093a"}, @@ -2065,6 +2203,7 @@ version = "0.2.0" description = "Accelerated property cache" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"}, {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"}, @@ -2172,6 +2311,7 @@ version = "5.9.4" description = "Cross-platform lib for process and system monitoring in Python." optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] files = [ {file = "psutil-5.9.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8"}, {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:68908971daf802203f3d37e78d3f8831b6d1014864d7a85937941bb35f09aefe"}, @@ -2190,7 +2330,7 @@ files = [ ] [package.extras] -test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +test = ["enum34 ; python_version <= \"3.4\"", "ipaddress ; python_version < \"3.0\"", "mock ; python_version < \"3.0\"", "pywin32 ; sys_platform == \"win32\"", "wmi ; sys_platform == \"win32\""] [[package]] name = "psycopg2-binary" @@ -2198,6 +2338,7 @@ version = "2.9.10" description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "psycopg2-binary-2.9.10.tar.gz", hash = "sha256:4b3df0e6990aa98acda57d983942eff13d824135fe2250e6522edaa782a06de2"}, {file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:0ea8e3d0ae83564f2fc554955d327fa081d065c8ca5cc6d2abb643e2c9c1200f"}, @@ -2246,6 +2387,7 @@ files = [ {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, {file 
= "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, @@ -2274,6 +2416,7 @@ version = "0.5.4" description = "Pure Python PartiQL Parser" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"}, {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"}, @@ -2288,6 +2431,7 @@ version = "2.21" description = "C parser in Python" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] files = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, @@ -2295,109 +2439,133 @@ files = [ [[package]] name = "pydantic" -version = "2.7.1" +version = "2.10.4" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "pydantic-2.7.1-py3-none-any.whl", hash = "sha256:e029badca45266732a9a79898a15ae2e8b14840b1eabbb25844be28f0b33f3d5"}, - {file = "pydantic-2.7.1.tar.gz", hash = "sha256:e9dbb5eada8abe4d9ae5f46b9939aead650cd2b68f249bb3a8139dbe125803cc"}, + {file = "pydantic-2.10.4-py3-none-any.whl", hash = "sha256:597e135ea68be3a37552fb524bc7d0d66dcf93d395acd93a00682f1efcb8ee3d"}, + {file = "pydantic-2.10.4.tar.gz", hash = "sha256:82f12e9723da6de4fe2ba888b5971157b3be7ad914267dea8f05f82b28254f06"}, ] [package.dependencies] -annotated-types = ">=0.4.0" -pydantic-core = "2.18.2" -typing-extensions = ">=4.6.1" +annotated-types = ">=0.6.0" +pydantic-core = "2.27.2" +typing-extensions = ">=4.12.2" [package.extras] email = ["email-validator (>=2.0.0)"] +timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] [[package]] name = "pydantic-core" -version = "2.18.2" +version = "2.27.2" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "pydantic_core-2.18.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9e08e867b306f525802df7cd16c44ff5ebbe747ff0ca6cf3fde7f36c05a59a81"}, - {file = "pydantic_core-2.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f0a21cbaa69900cbe1a2e7cad2aa74ac3cf21b10c3efb0fa0b80305274c0e8a2"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0680b1f1f11fda801397de52c36ce38ef1c1dc841a0927a94f226dea29c3ae3d"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:95b9d5e72481d3780ba3442eac863eae92ae43a5f3adb5b4d0a1de89d42bb250"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fcf5cd9c4b655ad666ca332b9a081112cd7a58a8b5a6ca7a3104bc950f2038"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b5155ff768083cb1d62f3e143b49a8a3432e6789a3abee8acd005c3c7af1c74"}, - {file = 
"pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553ef617b6836fc7e4df130bb851e32fe357ce36336d897fd6646d6058d980af"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89ed9eb7d616ef5714e5590e6cf7f23b02d0d539767d33561e3675d6f9e3857"}, - {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75f7e9488238e920ab6204399ded280dc4c307d034f3924cd7f90a38b1829563"}, - {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ef26c9e94a8c04a1b2924149a9cb081836913818e55681722d7f29af88fe7b38"}, - {file = "pydantic_core-2.18.2-cp310-none-win32.whl", hash = "sha256:182245ff6b0039e82b6bb585ed55a64d7c81c560715d1bad0cbad6dfa07b4027"}, - {file = "pydantic_core-2.18.2-cp310-none-win_amd64.whl", hash = "sha256:e23ec367a948b6d812301afc1b13f8094ab7b2c280af66ef450efc357d2ae543"}, - {file = "pydantic_core-2.18.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:219da3f096d50a157f33645a1cf31c0ad1fe829a92181dd1311022f986e5fbe3"}, - {file = "pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc1cfd88a64e012b74e94cd00bbe0f9c6df57049c97f02bb07d39e9c852e19a4"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b7133a6e6aeb8df37d6f413f7705a37ab4031597f64ab56384c94d98fa0e90"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:224c421235f6102e8737032483f43c1a8cfb1d2f45740c44166219599358c2cd"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b14d82cdb934e99dda6d9d60dc84a24379820176cc4a0d123f88df319ae9c150"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2728b01246a3bba6de144f9e3115b532ee44bd6cf39795194fb75491824a1413"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470b94480bb5ee929f5acba6995251ada5e059a5ef3e0dfc63cca287283ebfa6"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:997abc4df705d1295a42f95b4eec4950a37ad8ae46d913caeee117b6b198811c"}, - {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75250dbc5290e3f1a0f4618db35e51a165186f9034eff158f3d490b3fed9f8a0"}, - {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4456f2dca97c425231d7315737d45239b2b51a50dc2b6f0c2bb181fce6207664"}, - {file = "pydantic_core-2.18.2-cp311-none-win32.whl", hash = "sha256:269322dcc3d8bdb69f054681edff86276b2ff972447863cf34c8b860f5188e2e"}, - {file = "pydantic_core-2.18.2-cp311-none-win_amd64.whl", hash = "sha256:800d60565aec896f25bc3cfa56d2277d52d5182af08162f7954f938c06dc4ee3"}, - {file = "pydantic_core-2.18.2-cp311-none-win_arm64.whl", hash = "sha256:1404c69d6a676245199767ba4f633cce5f4ad4181f9d0ccb0577e1f66cf4c46d"}, - {file = "pydantic_core-2.18.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:fb2bd7be70c0fe4dfd32c951bc813d9fe6ebcbfdd15a07527796c8204bd36242"}, - {file = "pydantic_core-2.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6132dd3bd52838acddca05a72aafb6eab6536aa145e923bb50f45e78b7251043"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d904828195733c183d20a54230c0df0eb46ec746ea1a666730787353e87182"}, - {file = 
"pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9bd70772c720142be1020eac55f8143a34ec9f82d75a8e7a07852023e46617f"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8ed04b3582771764538f7ee7001b02e1170223cf9b75dff0bc698fadb00cf3"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6dac87ddb34aaec85f873d737e9d06a3555a1cc1a8e0c44b7f8d5daeb89d86f"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca4ae5a27ad7a4ee5170aebce1574b375de390bc01284f87b18d43a3984df72"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:886eec03591b7cf058467a70a87733b35f44707bd86cf64a615584fd72488b7c"}, - {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca7b0c1f1c983e064caa85f3792dd2fe3526b3505378874afa84baf662e12241"}, - {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b4356d3538c3649337df4074e81b85f0616b79731fe22dd11b99499b2ebbdf3"}, - {file = "pydantic_core-2.18.2-cp312-none-win32.whl", hash = "sha256:8b172601454f2d7701121bbec3425dd71efcb787a027edf49724c9cefc14c038"}, - {file = "pydantic_core-2.18.2-cp312-none-win_amd64.whl", hash = "sha256:b1bd7e47b1558ea872bd16c8502c414f9e90dcf12f1395129d7bb42a09a95438"}, - {file = "pydantic_core-2.18.2-cp312-none-win_arm64.whl", hash = "sha256:98758d627ff397e752bc339272c14c98199c613f922d4a384ddc07526c86a2ec"}, - {file = "pydantic_core-2.18.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9fdad8e35f278b2c3eb77cbdc5c0a49dada440657bf738d6905ce106dc1de439"}, - {file = "pydantic_core-2.18.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1d90c3265ae107f91a4f279f4d6f6f1d4907ac76c6868b27dc7fb33688cfb347"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390193c770399861d8df9670fb0d1874f330c79caaca4642332df7c682bf6b91"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:82d5d4d78e4448683cb467897fe24e2b74bb7b973a541ea1dcfec1d3cbce39fb"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4774f3184d2ef3e14e8693194f661dea5a4d6ca4e3dc8e39786d33a94865cefd"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4d938ec0adf5167cb335acb25a4ee69a8107e4984f8fbd2e897021d9e4ca21b"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0e8b1be28239fc64a88a8189d1df7fad8be8c1ae47fcc33e43d4be15f99cc70"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:868649da93e5a3d5eacc2b5b3b9235c98ccdbfd443832f31e075f54419e1b96b"}, - {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:78363590ef93d5d226ba21a90a03ea89a20738ee5b7da83d771d283fd8a56761"}, - {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:852e966fbd035a6468fc0a3496589b45e2208ec7ca95c26470a54daed82a0788"}, - {file = "pydantic_core-2.18.2-cp38-none-win32.whl", hash = "sha256:6a46e22a707e7ad4484ac9ee9f290f9d501df45954184e23fc29408dfad61350"}, - {file = "pydantic_core-2.18.2-cp38-none-win_amd64.whl", hash = "sha256:d91cb5ea8b11607cc757675051f61b3d93f15eca3cefb3e6c704a5d6e8440f4e"}, - {file = 
"pydantic_core-2.18.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ae0a8a797a5e56c053610fa7be147993fe50960fa43609ff2a9552b0e07013e8"}, - {file = "pydantic_core-2.18.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:042473b6280246b1dbf530559246f6842b56119c2926d1e52b631bdc46075f2a"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a388a77e629b9ec814c1b1e6b3b595fe521d2cdc625fcca26fbc2d44c816804"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25add29b8f3b233ae90ccef2d902d0ae0432eb0d45370fe315d1a5cf231004b"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f459a5ce8434614dfd39bbebf1041952ae01da6bed9855008cb33b875cb024c0"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eff2de745698eb46eeb51193a9f41d67d834d50e424aef27df2fcdee1b153845"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8309f67285bdfe65c372ea3722b7a5642680f3dba538566340a9d36e920b5f0"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f93a8a2e3938ff656a7c1bc57193b1319960ac015b6e87d76c76bf14fe0244b4"}, - {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:22057013c8c1e272eb8d0eebc796701167d8377441ec894a8fed1af64a0bf399"}, - {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfeecd1ac6cc1fb2692c3d5110781c965aabd4ec5d32799773ca7b1456ac636b"}, - {file = "pydantic_core-2.18.2-cp39-none-win32.whl", hash = "sha256:0d69b4c2f6bb3e130dba60d34c0845ba31b69babdd3f78f7c0c8fae5021a253e"}, - {file = "pydantic_core-2.18.2-cp39-none-win_amd64.whl", hash = "sha256:d9319e499827271b09b4e411905b24a426b8fb69464dfa1696258f53a3334641"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a1874c6dd4113308bd0eb568418e6114b252afe44319ead2b4081e9b9521fe75"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ccdd111c03bfd3666bd2472b674c6899550e09e9f298954cfc896ab92b5b0e6d"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e18609ceaa6eed63753037fc06ebb16041d17d28199ae5aba0052c51449650a9"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5c584d357c4e2baf0ff7baf44f4994be121e16a2c88918a5817331fc7599d7"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43f0f463cf89ace478de71a318b1b4f05ebc456a9b9300d027b4b57c1a2064fb"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e1b395e58b10b73b07b7cf740d728dd4ff9365ac46c18751bf8b3d8cca8f625a"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0098300eebb1c837271d3d1a2cd2911e7c11b396eac9661655ee524a7f10587b"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:36789b70d613fbac0a25bb07ab3d9dba4d2e38af609c020cf4d888d165ee0bf3"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f9a801e7c8f1ef8718da265bba008fa121243dfe37c1cea17840b0944dfd72c"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = 
"sha256:3a6515ebc6e69d85502b4951d89131ca4e036078ea35533bb76327f8424531ce"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20aca1e2298c56ececfd8ed159ae4dde2df0781988c97ef77d5c16ff4bd5b400"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:223ee893d77a310a0391dca6df00f70bbc2f36a71a895cecd9a0e762dc37b349"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2334ce8c673ee93a1d6a65bd90327588387ba073c17e61bf19b4fd97d688d63c"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cbca948f2d14b09d20268cda7b0367723d79063f26c4ffc523af9042cad95592"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b3ef08e20ec49e02d5c6717a91bb5af9b20f1805583cb0adfe9ba2c6b505b5ae"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6fdc8627910eed0c01aed6a390a252fe3ea6d472ee70fdde56273f198938374"}, - {file = "pydantic_core-2.18.2.tar.gz", hash = "sha256:2e29d20810dfc3043ee13ac7d9e25105799817683348823f305ab3f349b9386e"}, + {file = "pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa"}, + {file = "pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7969e133a6f183be60e9f6f56bfae753585680f3b7307a8e555a948d443cc05a"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3de9961f2a346257caf0aa508a4da705467f53778e9ef6fe744c038119737ef5"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2bb4d3e5873c37bb3dd58714d4cd0b0e6238cebc4177ac8fe878f8b3aa8e74c"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:280d219beebb0752699480fe8f1dc61ab6615c2046d76b7ab7ee38858de0a4e7"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47956ae78b6422cbd46f772f1746799cbb862de838fd8d1fbd34a82e05b0983a"}, + {file = "pydantic_core-2.27.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:14d4a5c49d2f009d62a2a7140d3064f686d17a5d1a268bc641954ba181880236"}, + {file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:337b443af21d488716f8d0b6164de833e788aa6bd7e3a39c005febc1284f4962"}, + {file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:03d0f86ea3184a12f41a2d23f7ccb79cdb5a18e06993f8a45baa8dfec746f0e9"}, + {file = "pydantic_core-2.27.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7041c36f5680c6e0f08d922aed302e98b3745d97fe1589db0a3eebf6624523af"}, + {file = "pydantic_core-2.27.2-cp310-cp310-win32.whl", hash = "sha256:50a68f3e3819077be2c98110c1f9dcb3817e93f267ba80a2c05bb4f8799e2ff4"}, + {file = "pydantic_core-2.27.2-cp310-cp310-win_amd64.whl", hash = "sha256:e0fd26b16394ead34a424eecf8a31a1f5137094cabe84a1bcb10fa6ba39d3d31"}, + {file = "pydantic_core-2.27.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:8e10c99ef58cfdf2a66fc15d66b16c4a04f62bca39db589ae8cba08bc55331bc"}, + {file = "pydantic_core-2.27.2-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:26f32e0adf166a84d0cb63be85c562ca8a6fa8de28e5f0d92250c6b7e9e2aff7"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c19d1ea0673cd13cc2f872f6c9ab42acc4e4f492a7ca9d3795ce2b112dd7e15"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e68c4446fe0810e959cdff46ab0a41ce2f2c86d227d96dc3847af0ba7def306"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9640b0059ff4f14d1f37321b94061c6db164fbe49b334b31643e0528d100d99"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:40d02e7d45c9f8af700f3452f329ead92da4c5f4317ca9b896de7ce7199ea459"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c1fd185014191700554795c99b347d64f2bb637966c4cfc16998a0ca700d048"}, + {file = "pydantic_core-2.27.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d81d2068e1c1228a565af076598f9e7451712700b673de8f502f0334f281387d"}, + {file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1a4207639fb02ec2dbb76227d7c751a20b1a6b4bc52850568e52260cae64ca3b"}, + {file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:3de3ce3c9ddc8bbd88f6e0e304dea0e66d843ec9de1b0042b0911c1663ffd474"}, + {file = "pydantic_core-2.27.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:30c5f68ded0c36466acede341551106821043e9afaad516adfb6e8fa80a4e6a6"}, + {file = "pydantic_core-2.27.2-cp311-cp311-win32.whl", hash = "sha256:c70c26d2c99f78b125a3459f8afe1aed4d9687c24fd677c6a4436bc042e50d6c"}, + {file = "pydantic_core-2.27.2-cp311-cp311-win_amd64.whl", hash = "sha256:08e125dbdc505fa69ca7d9c499639ab6407cfa909214d500897d02afb816e7cc"}, + {file = "pydantic_core-2.27.2-cp311-cp311-win_arm64.whl", hash = "sha256:26f0d68d4b235a2bae0c3fc585c585b4ecc51382db0e3ba402a22cbc440915e4"}, + {file = "pydantic_core-2.27.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9e0c8cfefa0ef83b4da9588448b6d8d2a2bf1a53c3f1ae5fca39eb3061e2f0b0"}, + {file = "pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83097677b8e3bd7eaa6775720ec8e0405f1575015a463285a92bfdfe254529ef"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:172fce187655fece0c90d90a678424b013f8fbb0ca8b036ac266749c09438cb7"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:519f29f5213271eeeeb3093f662ba2fd512b91c5f188f3bb7b27bc5973816934"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05e3a55d124407fffba0dd6b0c0cd056d10e983ceb4e5dbd10dda135c31071d6"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c3ed807c7b91de05e63930188f19e921d1fe90de6b4f5cd43ee7fcc3525cb8c"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb4aadc0b9a0c063206846d603b92030eb6f03069151a625667f982887153e2"}, + {file = "pydantic_core-2.27.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28ccb213807e037460326424ceb8b5245acb88f32f3d2777427476e1b32c48c4"}, + {file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:de3cd1899e2c279b140adde9357c4495ed9d47131b4a4eaff9052f23398076b3"}, + {file = 
"pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:220f892729375e2d736b97d0e51466252ad84c51857d4d15f5e9692f9ef12be4"}, + {file = "pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a0fcd29cd6b4e74fe8ddd2c90330fd8edf2e30cb52acda47f06dd615ae72da57"}, + {file = "pydantic_core-2.27.2-cp312-cp312-win32.whl", hash = "sha256:1e2cb691ed9834cd6a8be61228471d0a503731abfb42f82458ff27be7b2186fc"}, + {file = "pydantic_core-2.27.2-cp312-cp312-win_amd64.whl", hash = "sha256:cc3f1a99a4f4f9dd1de4fe0312c114e740b5ddead65bb4102884b384c15d8bc9"}, + {file = "pydantic_core-2.27.2-cp312-cp312-win_arm64.whl", hash = "sha256:3911ac9284cd8a1792d3cb26a2da18f3ca26c6908cc434a18f730dc0db7bfa3b"}, + {file = "pydantic_core-2.27.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7d14bd329640e63852364c306f4d23eb744e0f8193148d4044dd3dacdaacbd8b"}, + {file = "pydantic_core-2.27.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:82f91663004eb8ed30ff478d77c4d1179b3563df6cdb15c0817cd1cdaf34d154"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71b24c7d61131bb83df10cc7e687433609963a944ccf45190cfc21e0887b08c9"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa8e459d4954f608fa26116118bb67f56b93b209c39b008277ace29937453dc9"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce8918cbebc8da707ba805b7fd0b382816858728ae7fe19a942080c24e5b7cd1"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3f5c2a021bbc5d976107bb302e0131351c2ba54343f8a496dc8783d3d3a6a"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8086fa684c4775c27f03f062cbb9eaa6e17f064307e86b21b9e0abc9c0f02e"}, + {file = "pydantic_core-2.27.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8d9b3388db186ba0c099a6d20f0604a44eabdeef1777ddd94786cdae158729e4"}, + {file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7a66efda2387de898c8f38c0cf7f14fca0b51a8ef0b24bfea5849f1b3c95af27"}, + {file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:18a101c168e4e092ab40dbc2503bdc0f62010e95d292b27827871dc85450d7ee"}, + {file = "pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ba5dd002f88b78a4215ed2f8ddbdf85e8513382820ba15ad5ad8955ce0ca19a1"}, + {file = "pydantic_core-2.27.2-cp313-cp313-win32.whl", hash = "sha256:1ebaf1d0481914d004a573394f4be3a7616334be70261007e47c2a6fe7e50130"}, + {file = "pydantic_core-2.27.2-cp313-cp313-win_amd64.whl", hash = "sha256:953101387ecf2f5652883208769a79e48db18c6df442568a0b5ccd8c2723abee"}, + {file = "pydantic_core-2.27.2-cp313-cp313-win_arm64.whl", hash = "sha256:ac4dbfd1691affb8f48c2c13241a2e3b60ff23247cbcf981759c768b6633cf8b"}, + {file = "pydantic_core-2.27.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:d3e8d504bdd3f10835468f29008d72fc8359d95c9c415ce6e767203db6127506"}, + {file = "pydantic_core-2.27.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:521eb9b7f036c9b6187f0b47318ab0d7ca14bd87f776240b90b21c1f4f149320"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85210c4d99a0114f5a9481b44560d7d1e35e32cc5634c656bc48e590b669b145"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:d716e2e30c6f140d7560ef1538953a5cd1a87264c737643d481f2779fc247fe1"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f66d89ba397d92f840f8654756196d93804278457b5fbede59598a1f9f90b228"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:669e193c1c576a58f132e3158f9dfa9662969edb1a250c54d8fa52590045f046"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdbe7629b996647b99c01b37f11170a57ae675375b14b8c13b8518b8320ced5"}, + {file = "pydantic_core-2.27.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d262606bf386a5ba0b0af3b97f37c83d7011439e3dc1a9298f21efb292e42f1a"}, + {file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:cabb9bcb7e0d97f74df8646f34fc76fbf793b7f6dc2438517d7a9e50eee4f14d"}, + {file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_armv7l.whl", hash = "sha256:d2d63f1215638d28221f664596b1ccb3944f6e25dd18cd3b86b0a4c408d5ebb9"}, + {file = "pydantic_core-2.27.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bca101c00bff0adb45a833f8451b9105d9df18accb8743b08107d7ada14bd7da"}, + {file = "pydantic_core-2.27.2-cp38-cp38-win32.whl", hash = "sha256:f6f8e111843bbb0dee4cb6594cdc73e79b3329b526037ec242a3e49012495b3b"}, + {file = "pydantic_core-2.27.2-cp38-cp38-win_amd64.whl", hash = "sha256:fd1aea04935a508f62e0d0ef1f5ae968774a32afc306fb8545e06f5ff5cdf3ad"}, + {file = "pydantic_core-2.27.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:c10eb4f1659290b523af58fa7cffb452a61ad6ae5613404519aee4bfbf1df993"}, + {file = "pydantic_core-2.27.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef592d4bad47296fb11f96cd7dc898b92e795032b4894dfb4076cfccd43a9308"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c61709a844acc6bf0b7dce7daae75195a10aac96a596ea1b776996414791ede4"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c5f762659e47fdb7b16956c71598292f60a03aa92f8b6351504359dbdba6cf"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4c9775e339e42e79ec99c441d9730fccf07414af63eac2f0e48e08fd38a64d76"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57762139821c31847cfb2df63c12f725788bd9f04bc2fb392790959b8f70f118"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d1e85068e818c73e048fe28cfc769040bb1f475524f4745a5dc621f75ac7630"}, + {file = "pydantic_core-2.27.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:097830ed52fd9e427942ff3b9bc17fab52913b2f50f2880dc4a5611446606a54"}, + {file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:044a50963a614ecfae59bb1eaf7ea7efc4bc62f49ed594e18fa1e5d953c40e9f"}, + {file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:4e0b4220ba5b40d727c7f879eac379b822eee5d8fff418e9d3381ee45b3b0362"}, + {file = "pydantic_core-2.27.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e4f4bb20d75e9325cc9696c6802657b58bc1dbbe3022f32cc2b2b632c3fbb96"}, + {file = "pydantic_core-2.27.2-cp39-cp39-win32.whl", hash = "sha256:cca63613e90d001b9f2f9a9ceb276c308bfa2a43fafb75c8031c4f66039e8c6e"}, + {file = "pydantic_core-2.27.2-cp39-cp39-win_amd64.whl", hash = 
"sha256:77d1bca19b0f7021b3a982e6f903dcd5b2b06076def36a652e3907f596e29f67"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:2bf14caea37e91198329b828eae1618c068dfb8ef17bb33287a7ad4b61ac314e"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b0cb791f5b45307caae8810c2023a184c74605ec3bcbb67d13846c28ff731ff8"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:688d3fd9fcb71f41c4c015c023d12a79d1c4c0732ec9eb35d96e3388a120dcf3"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d591580c34f4d731592f0e9fe40f9cc1b430d297eecc70b962e93c5c668f15f"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:82f986faf4e644ffc189a7f1aafc86e46ef70372bb153e7001e8afccc6e54133"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:bec317a27290e2537f922639cafd54990551725fc844249e64c523301d0822fc"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:0296abcb83a797db256b773f45773da397da75a08f5fcaef41f2044adec05f50"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0d75070718e369e452075a6017fbf187f788e17ed67a3abd47fa934d001863d9"}, + {file = "pydantic_core-2.27.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7e17b560be3c98a8e3aa66ce828bdebb9e9ac6ad5466fba92eb74c4c95cb1151"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c33939a82924da9ed65dab5a65d427205a73181d8098e79b6b426bdf8ad4e656"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:00bad2484fa6bda1e216e7345a798bd37c68fb2d97558edd584942aa41b7d278"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c817e2b40aba42bac6f457498dacabc568c3b7a986fc9ba7c8d9d260b71485fb"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:251136cdad0cb722e93732cb45ca5299fb56e1344a833640bf93b2803f8d1bfd"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d2088237af596f0a524d3afc39ab3b036e8adb054ee57cbb1dcf8e09da5b29cc"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d4041c0b966a84b4ae7a09832eb691a35aec90910cd2dbe7a208de59be77965b"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:8083d4e875ebe0b864ffef72a4304827015cff328a1be6e22cc850753bfb122b"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f141ee28a0ad2123b6611b6ceff018039df17f32ada8b534e6aa039545a3efb2"}, + {file = "pydantic_core-2.27.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7d0c8399fcc1848491f00e0314bd59fb34a9c008761bcb422a057670c3f65e35"}, + {file = "pydantic_core-2.27.2.tar.gz", hash = "sha256:eb026e5a4c1fee05726072337ff51d1efb6f59090b7da90d30ea58625b1ffb39"}, ] [package.dependencies] @@ -2409,6 +2577,7 @@ version = "2.4.0" description = "JSON Web Token implementation in Python" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, {file = "PyJWT-2.4.0.tar.gz", hash = 
"sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, @@ -2429,6 +2598,7 @@ version = "3.0.9" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.6.8" +groups = ["main"] files = [ {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, @@ -2443,6 +2613,7 @@ version = "0.18.1" description = "Persistent/Functional/Immutable data structures" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"}, {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d45866ececf4a5fff8742c25722da6d4c9e180daa7b405dc0a2a2790d668c26"}, @@ -2473,6 +2644,7 @@ version = "7.4.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, @@ -2493,6 +2665,7 @@ version = "0.21.0" description = "Pytest support for asyncio" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest-asyncio-0.21.0.tar.gz", hash = "sha256:2b38a496aef56f56b0e87557ec313e11e1ab9276fc3863f6a7be0f1d0e415e1b"}, {file = "pytest_asyncio-0.21.0-py3-none-any.whl", hash = "sha256:f2b3366b7cd501a4056858bd39349d5af19742aed2d81660b7998b6341c7eb9c"}, @@ -2511,6 +2684,7 @@ version = "1.0.8" description = "pytest-httpserver is a httpserver for pytest" optional = false python-versions = ">=3.8,<4.0" +groups = ["main"] files = [ {file = "pytest_httpserver-1.0.8-py3-none-any.whl", hash = "sha256:24cd3d9f6a0b927c7bfc400d0b3fda7442721b8267ce29942bf307b190f0bb09"}, {file = "pytest_httpserver-1.0.8.tar.gz", hash = "sha256:e052f69bc8a9073db02484681e8e47004dd1fb3763b0ae833bd899e5895c559a"}, @@ -2525,6 +2699,7 @@ version = "0.6.3" description = "It helps to use fixtures in pytest.mark.parametrize" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, @@ -2539,6 +2714,7 @@ version = "1.1.0" description = "pytest plugin to run your tests in a specific order" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, @@ -2553,6 +2729,7 @@ version = "0.9.3" description = "pytest plugin for repeating tests" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest_repeat-0.9.3-py3-none-any.whl", hash = "sha256:26ab2df18226af9d5ce441c858f273121e92ff55f5bb311d25755b8d7abdd8ed"}, {file = "pytest_repeat-0.9.3.tar.gz", hash = 
"sha256:ffd3836dfcd67bb270bec648b330e20be37d2966448c4148c4092d1e8aba8185"}, @@ -2567,6 +2744,7 @@ version = "15.0" description = "pytest plugin to re-run tests to eliminate flaky failures" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "pytest-rerunfailures-15.0.tar.gz", hash = "sha256:2d9ac7baf59f4c13ac730b47f6fa80e755d1ba0581da45ce30b72fb3542b4474"}, {file = "pytest_rerunfailures-15.0-py3-none-any.whl", hash = "sha256:dd150c4795c229ef44320adc9a0c0532c51b78bb7a6843a8c53556b9a611df1a"}, @@ -2582,6 +2760,7 @@ version = "0.8.1" description = "Pytest plugin which splits the test suite to equally sized sub suites based on test execution time." optional = false python-versions = ">=3.7.1,<4.0" +groups = ["main"] files = [ {file = "pytest_split-0.8.1-py3-none-any.whl", hash = "sha256:74b110ea091bd147cc1c5f9665a59506e5cedfa66f96a89fb03e4ab447c2c168"}, {file = "pytest_split-0.8.1.tar.gz", hash = "sha256:2d88bd3dc528689a7a3f58fc12ea165c3aa62e90795e420dfad920afe5612d6d"}, @@ -2592,17 +2771,18 @@ pytest = ">=5,<8" [[package]] name = "pytest-timeout" -version = "2.1.0" +version = "2.3.1" description = "pytest plugin to abort hanging tests" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, - {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, + {file = "pytest-timeout-2.3.1.tar.gz", hash = "sha256:12397729125c6ecbdaca01035b9e5239d4db97352320af155b3f5de1ba5165d9"}, + {file = "pytest_timeout-2.3.1-py3-none-any.whl", hash = "sha256:68188cb703edfc6a18fad98dc25a3c61e9f24d644b0b70f33af545219fc7813e"}, ] [package.dependencies] -pytest = ">=5.0.0" +pytest = ">=7.0.0" [[package]] name = "pytest-xdist" @@ -2610,6 +2790,7 @@ version = "3.3.1" description = "pytest xdist plugin for distributed testing, most importantly across multiple CPUs" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "pytest-xdist-3.3.1.tar.gz", hash = "sha256:d5ee0520eb1b7bcca50a60a518ab7a7707992812c578198f8b44fdfac78e8c93"}, {file = "pytest_xdist-3.3.1-py3-none-any.whl", hash = "sha256:ff9daa7793569e6a68544850fd3927cd257cc03a7ef76c95e86915355e82b5f2"}, @@ -2630,6 +2811,7 @@ version = "2.8.2" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main"] files = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -2638,12 +2820,28 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "python-dotenv" +version = "1.0.1" +description = "Read key-value pairs from a .env file and set them as environment variables" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"}, + {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "pytz" version = "2024.1" description = "World timezone definitions, modern and historical" optional = false 
python-versions = "*" +groups = ["main"] files = [ {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, @@ -2655,6 +2853,8 @@ version = "308" description = "Python for Window Extensions" optional = false python-versions = "*" +groups = ["main"] +markers = "sys_platform == \"win32\"" files = [ {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"}, {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"}, @@ -2682,6 +2882,7 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -2744,6 +2945,7 @@ version = "2024.4.28" description = "Alternative regular expression module, to replace re." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"}, {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"}, @@ -2832,6 +3034,7 @@ version = "2.32.3" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, @@ -2853,6 +3056,7 @@ version = "0.25.3" description = "A utility library for mocking out the `requests` Python library." 
optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "responses-0.25.3-py3-none-any.whl", hash = "sha256:521efcbc82081ab8daa588e08f7e8a64ce79b91c39f6e62199b19159bea7dbcb"}, {file = "responses-0.25.3.tar.gz", hash = "sha256:617b9247abd9ae28313d57a75880422d55ec63c29d33d629697590a034358dba"}, @@ -2864,7 +3068,7 @@ requests = ">=2.30.0,<3.0" urllib3 = ">=1.25.10,<3.0" [package.extras] -tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "tomli", "tomli-w", "types-PyYAML", "types-requests"] +tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "tomli ; python_version < \"3.11\"", "tomli-w", "types-PyYAML", "types-requests"] [[package]] name = "rfc3339-validator" @@ -2872,6 +3076,7 @@ version = "0.1.4" description = "A pure python RFC3339 validator" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] files = [ {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, @@ -2886,6 +3091,7 @@ version = "0.7.0" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "ruff-0.7.0-py3-none-linux_armv6l.whl", hash = "sha256:0cdf20c2b6ff98e37df47b2b0bd3a34aaa155f59a11182c1303cce79be715628"}, {file = "ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737"}, @@ -2913,6 +3119,7 @@ version = "0.10.0" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">= 3.8" +groups = ["main"] files = [ {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"}, {file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"}, @@ -2930,6 +3137,7 @@ version = "1.0.4" description = "Classes implementing the SARIF 2.1.0 object model." 
optional = false python-versions = ">= 2.7" +groups = ["main"] files = [ {file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"}, {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, @@ -2945,6 +3153,7 @@ version = "70.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, @@ -2952,7 +3161,7 @@ files = [ [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov ; platform_python_implementation != \"PyPy\"", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" @@ -2960,6 +3169,7 @@ version = "1.16.0" description = "Python 2 and 3 compatibility utilities" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +groups = ["main"] files = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -2971,6 +3181,7 @@ version = "1.3.0" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, @@ -2982,6 +3193,7 @@ version = "1.12" description = "Computer algebra system (CAS) in Python" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "sympy-1.12-py3-none-any.whl", hash = 
"sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, @@ -2992,17 +3204,19 @@ mpmath = ">=0.19" [[package]] name = "testcontainers" -version = "4.8.1" +version = "4.9.0" description = "Python library for throwaway instances of anything that can run in a Docker container" optional = false python-versions = "<4.0,>=3.9" +groups = ["main"] files = [ - {file = "testcontainers-4.8.1-py3-none-any.whl", hash = "sha256:d8ae43e8fe34060fcd5c3f494e0b7652b7774beabe94568a2283d0881e94d489"}, - {file = "testcontainers-4.8.1.tar.gz", hash = "sha256:5ded4820b7227ad526857eb3caaafcabce1bbac05d22ad194849b136ffae3cb0"}, + {file = "testcontainers-4.9.0-py3-none-any.whl", hash = "sha256:c6fee929990972c40bf6b91b7072c94064ff3649b405a14fde0274c8b2479d32"}, + {file = "testcontainers-4.9.0.tar.gz", hash = "sha256:2cd6af070109ff68c1ab5389dc89c86c2dc3ab30a21ca734b2cb8f0f80ad479e"}, ] [package.dependencies] docker = "*" +python-dotenv = "*" typing-extensions = "*" urllib3 = "*" wrapt = "*" @@ -3048,6 +3262,7 @@ version = "0.10.2" description = "Python Library for Tom's Obvious, Minimal Language" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +groups = ["main"] files = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, @@ -3059,6 +3274,7 @@ version = "1.5.0.20240925" description = "Typing stubs for jwcrypto" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "types-jwcrypto-1.5.0.20240925.tar.gz", hash = "sha256:50e17b790378c96239344476c7bd13b52d0c7eeb6d16c2d53723e48cc6bbf4fe"}, {file = "types_jwcrypto-1.5.0.20240925-py3-none-any.whl", hash = "sha256:2d12a2d528240d326075e896aafec7056b9136bf3207fa6ccf3fcb8fbf9e11a1"}, @@ -3073,6 +3289,7 @@ version = "5.9.5.12" description = "Typing stubs for psutil" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-psutil-5.9.5.12.tar.gz", hash = "sha256:61a91679d3fe737250013b624dca09375e7cc3ad77dcc734553746c429c02aca"}, {file = "types_psutil-5.9.5.12-py3-none-any.whl", hash = "sha256:e9a147b8561235c6afcce5aa1adb973fad9ab2c50cf89820697687f53510358f"}, @@ -3084,6 +3301,7 @@ version = "2.9.21.20241019" description = "Typing stubs for psycopg2" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "types-psycopg2-2.9.21.20241019.tar.gz", hash = "sha256:bca89b988d2ebd19bcd08b177d22a877ea8b841decb10ed130afcf39404612fa"}, {file = "types_psycopg2-2.9.21.20241019-py3-none-any.whl", hash = "sha256:44d091e67732d16a941baae48cd7b53bf91911bc36888652447cf1ef0c1fb3f6"}, @@ -3095,6 +3313,7 @@ version = "0.6.3.3" description = "Typing stubs for pytest-lazy-fixture" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-pytest-lazy-fixture-0.6.3.3.tar.gz", hash = "sha256:2ef79d66bcde0e50acdac8dc55074b9ae0d4cfaeabdd638f5522f4cac7c8a2c7"}, {file = "types_pytest_lazy_fixture-0.6.3.3-py3-none-any.whl", hash = "sha256:a56a55649147ff960ff79d4b2c781a4f769351abc1876873f3116d0bd0c96353"}, @@ -3106,6 +3325,7 @@ version = "6.0.12.20240917" description = "Typing stubs for PyYAML" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "types-PyYAML-6.0.12.20240917.tar.gz", hash = 
"sha256:d1405a86f9576682234ef83bcb4e6fff7c9305c8b1fbad5e0bcd4f7dbdc9c587"}, {file = "types_PyYAML-6.0.12.20240917-py3-none-any.whl", hash = "sha256:392b267f1c0fe6022952462bf5d6523f31e37f6cea49b14cee7ad634b6301570"}, @@ -3117,6 +3337,7 @@ version = "2.31.0.0" description = "Typing stubs for requests" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-requests-2.31.0.0.tar.gz", hash = "sha256:c1c29d20ab8d84dff468d7febfe8e0cb0b4664543221b386605e14672b44ea25"}, {file = "types_requests-2.31.0.0-py3-none-any.whl", hash = "sha256:7c5cea7940f8e92ec560bbc468f65bf684aa3dcf0554a6f8c4710f5f708dc598"}, @@ -3131,6 +3352,7 @@ version = "0.6.0.post3" description = "Type annotations and code completion for s3transfer" optional = false python-versions = ">=3.7,<4.0" +groups = ["main"] files = [ {file = "types-s3transfer-0.6.0.post3.tar.gz", hash = "sha256:92c3704e5d041202bfb5ddb79d083fd1a02de2c5dfec6a91576823e6b5c93993"}, {file = "types_s3transfer-0.6.0.post3-py3-none-any.whl", hash = "sha256:eedc5117275565b3c83662c0ccc81662a34da5dda8bd502b89d296b6d5cb091d"}, @@ -3142,6 +3364,7 @@ version = "0.10.8.6" description = "Typing stubs for toml" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-toml-0.10.8.6.tar.gz", hash = "sha256:6d3ac79e36c9ee593c5d4fb33a50cca0e3adceb6ef5cff8b8e5aef67b4c4aaf2"}, {file = "types_toml-0.10.8.6-py3-none-any.whl", hash = "sha256:de7b2bb1831d6f7a4b554671ffe5875e729753496961b3e9b202745e4955dafa"}, @@ -3153,6 +3376,7 @@ version = "1.26.17" description = "Typing stubs for urllib3" optional = false python-versions = "*" +groups = ["main"] files = [ {file = "types-urllib3-1.26.17.tar.gz", hash = "sha256:73fd274524c3fc7cd8cd9ceb0cb67ed99b45f9cb2831013e46d50c1451044800"}, {file = "types_urllib3-1.26.17-py3-none-any.whl", hash = "sha256:0d027fcd27dbb3cb532453b4d977e05bc1e13aefd70519866af211b3003d895d"}, @@ -3160,13 +3384,14 @@ files = [ [[package]] name = "typing-extensions" -version = "4.6.1" -description = "Backported and Experimental Type Hints for Python 3.7+" +version = "4.12.2" +description = "Backported and Experimental Type Hints for Python 3.8+" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["main", "dev"] files = [ - {file = "typing_extensions-4.6.1-py3-none-any.whl", hash = "sha256:6bac751f4789b135c43228e72de18637e9a6c29d12777023a703fd1a6858469f"}, - {file = "typing_extensions-4.6.1.tar.gz", hash = "sha256:558bc0c4145f01e6405f4a5fdbd82050bd221b119f4bf72a961a1cfd471349d6"}, + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] [[package]] @@ -3175,14 +3400,15 @@ version = "1.26.19" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["main"] files = [ {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, ] [package.extras] -brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +brotli = ["brotli (==1.0.9) ; os_name != \"nt\" and python_version < \"3\" and platform_python_implementation == \"CPython\"", "brotli (>=1.0.9) ; python_version >= \"3\" and platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; (os_name != \"nt\" or python_version >= \"3\") and platform_python_implementation != \"CPython\"", "brotlipy (>=0.6.0) ; os_name == \"nt\" and python_version < \"3\""] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress ; python_version == \"2.7\"", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -3191,6 +3417,7 @@ version = "12.0" description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"}, {file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"}, @@ -3272,6 +3499,7 @@ version = "3.0.6" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "werkzeug-3.0.6-py3-none-any.whl", hash = "sha256:1bc0c2310d2fbb07b1dd1105eba2f7af72f322e1e455f2f93c993bee8c8a5f17"}, {file = "werkzeug-3.0.6.tar.gz", hash = "sha256:a8dd59d4de28ca70471a34cba79bed5f7ef2e036a76b3ab0835474246eb41f8d"}, @@ -3289,6 +3517,7 @@ version = "1.14.1" description = "Module for decorators, wrappers and monkey patching." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +groups = ["main"] files = [ {file = "wrapt-1.14.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3"}, {file = "wrapt-1.14.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef"}, @@ -3309,6 +3538,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3362,6 +3601,7 @@ version = "0.13.0" description = "Makes working with XML feel like you are working with JSON" optional = false python-versions = ">=3.4" +groups = ["main"] files = [ {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, @@ -3373,6 +3613,7 @@ version = "1.17.2" description = "Yet another URL library" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "yarl-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:93771146ef048b34201bfa382c2bf74c524980870bb278e6df515efaf93699ff"}, 
{file = "yarl-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8281db240a1616af2f9c5f71d355057e73a1409c4648c8949901396dc0a3c151"}, @@ -3465,54 +3706,109 @@ propcache = ">=0.2.0" [[package]] name = "zstandard" -version = "0.21.0" +version = "0.23.0" description = "Zstandard bindings for Python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +groups = ["main"] files = [ - {file = "zstandard-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:649a67643257e3b2cff1c0a73130609679a5673bf389564bc6d4b164d822a7ce"}, - {file = "zstandard-0.21.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:144a4fe4be2e747bf9c646deab212666e39048faa4372abb6a250dab0f347a29"}, - {file = "zstandard-0.21.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b72060402524ab91e075881f6b6b3f37ab715663313030d0ce983da44960a86f"}, - {file = "zstandard-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8257752b97134477fb4e413529edaa04fc0457361d304c1319573de00ba796b1"}, - {file = "zstandard-0.21.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c053b7c4cbf71cc26808ed67ae955836232f7638444d709bfc302d3e499364fa"}, - {file = "zstandard-0.21.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2769730c13638e08b7a983b32cb67775650024632cd0476bf1ba0e6360f5ac7d"}, - {file = "zstandard-0.21.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:7d3bc4de588b987f3934ca79140e226785d7b5e47e31756761e48644a45a6766"}, - {file = "zstandard-0.21.0-cp310-cp310-win32.whl", hash = "sha256:67829fdb82e7393ca68e543894cd0581a79243cc4ec74a836c305c70a5943f07"}, - {file = "zstandard-0.21.0-cp310-cp310-win_amd64.whl", hash = "sha256:e6048a287f8d2d6e8bc67f6b42a766c61923641dd4022b7fd3f7439e17ba5a4d"}, - {file = "zstandard-0.21.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7f2afab2c727b6a3d466faee6974a7dad0d9991241c498e7317e5ccf53dbc766"}, - {file = "zstandard-0.21.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff0852da2abe86326b20abae912d0367878dd0854b8931897d44cfeb18985472"}, - {file = "zstandard-0.21.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d12fa383e315b62630bd407477d750ec96a0f438447d0e6e496ab67b8b451d39"}, - {file = "zstandard-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1b9703fe2e6b6811886c44052647df7c37478af1b4a1a9078585806f42e5b15"}, - {file = "zstandard-0.21.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:df28aa5c241f59a7ab524f8ad8bb75d9a23f7ed9d501b0fed6d40ec3064784e8"}, - {file = "zstandard-0.21.0-cp311-cp311-win32.whl", hash = "sha256:0aad6090ac164a9d237d096c8af241b8dcd015524ac6dbec1330092dba151657"}, - {file = "zstandard-0.21.0-cp311-cp311-win_amd64.whl", hash = "sha256:48b6233b5c4cacb7afb0ee6b4f91820afbb6c0e3ae0fa10abbc20000acdf4f11"}, - {file = "zstandard-0.21.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e7d560ce14fd209db6adacce8908244503a009c6c39eee0c10f138996cd66d3e"}, - {file = "zstandard-0.21.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e6e131a4df2eb6f64961cea6f979cdff22d6e0d5516feb0d09492c8fd36f3bc"}, - {file = "zstandard-0.21.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1e0c62a67ff425927898cf43da2cf6b852289ebcc2054514ea9bf121bec10a5"}, - {file 
= "zstandard-0.21.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:1545fb9cb93e043351d0cb2ee73fa0ab32e61298968667bb924aac166278c3fc"}, - {file = "zstandard-0.21.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe6c821eb6870f81d73bf10e5deed80edcac1e63fbc40610e61f340723fd5f7c"}, - {file = "zstandard-0.21.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ddb086ea3b915e50f6604be93f4f64f168d3fc3cef3585bb9a375d5834392d4f"}, - {file = "zstandard-0.21.0-cp37-cp37m-win32.whl", hash = "sha256:57ac078ad7333c9db7a74804684099c4c77f98971c151cee18d17a12649bc25c"}, - {file = "zstandard-0.21.0-cp37-cp37m-win_amd64.whl", hash = "sha256:1243b01fb7926a5a0417120c57d4c28b25a0200284af0525fddba812d575f605"}, - {file = "zstandard-0.21.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ea68b1ba4f9678ac3d3e370d96442a6332d431e5050223626bdce748692226ea"}, - {file = "zstandard-0.21.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8070c1cdb4587a8aa038638acda3bd97c43c59e1e31705f2766d5576b329e97c"}, - {file = "zstandard-0.21.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4af612c96599b17e4930fe58bffd6514e6c25509d120f4eae6031b7595912f85"}, - {file = "zstandard-0.21.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cff891e37b167bc477f35562cda1248acc115dbafbea4f3af54ec70821090965"}, - {file = "zstandard-0.21.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a9fec02ce2b38e8b2e86079ff0b912445495e8ab0b137f9c0505f88ad0d61296"}, - {file = "zstandard-0.21.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0bdbe350691dec3078b187b8304e6a9c4d9db3eb2d50ab5b1d748533e746d099"}, - {file = "zstandard-0.21.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b69cccd06a4a0a1d9fb3ec9a97600055cf03030ed7048d4bcb88c574f7895773"}, - {file = "zstandard-0.21.0-cp38-cp38-win32.whl", hash = "sha256:9980489f066a391c5572bc7dc471e903fb134e0b0001ea9b1d3eff85af0a6f1b"}, - {file = "zstandard-0.21.0-cp38-cp38-win_amd64.whl", hash = "sha256:0e1e94a9d9e35dc04bf90055e914077c80b1e0c15454cc5419e82529d3e70728"}, - {file = "zstandard-0.21.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d2d61675b2a73edcef5e327e38eb62bdfc89009960f0e3991eae5cc3d54718de"}, - {file = "zstandard-0.21.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:25fbfef672ad798afab12e8fd204d122fca3bc8e2dcb0a2ba73bf0a0ac0f5f07"}, - {file = "zstandard-0.21.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:62957069a7c2626ae80023998757e27bd28d933b165c487ab6f83ad3337f773d"}, - {file = "zstandard-0.21.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14e10ed461e4807471075d4b7a2af51f5234c8f1e2a0c1d37d5ca49aaaad49e8"}, - {file = "zstandard-0.21.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:9cff89a036c639a6a9299bf19e16bfb9ac7def9a7634c52c257166db09d950e7"}, - {file = "zstandard-0.21.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:52b2b5e3e7670bd25835e0e0730a236f2b0df87672d99d3bf4bf87248aa659fb"}, - {file = "zstandard-0.21.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", 
hash = "sha256:b1367da0dde8ae5040ef0413fb57b5baeac39d8931c70536d5f013b11d3fc3a5"}, - {file = "zstandard-0.21.0-cp39-cp39-win32.whl", hash = "sha256:db62cbe7a965e68ad2217a056107cc43d41764c66c895be05cf9c8b19578ce9c"}, - {file = "zstandard-0.21.0-cp39-cp39-win_amd64.whl", hash = "sha256:a8d200617d5c876221304b0e3fe43307adde291b4a897e7b0617a61611dfff6a"}, - {file = "zstandard-0.21.0.tar.gz", hash = "sha256:f08e3a10d01a247877e4cb61a82a319ea746c356a3786558bed2481e6c405546"}, + {file = "zstandard-0.23.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf0a05b6059c0528477fba9054d09179beb63744355cab9f38059548fedd46a9"}, + {file = "zstandard-0.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fc9ca1c9718cb3b06634c7c8dec57d24e9438b2aa9a0f02b8bb36bf478538880"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77da4c6bfa20dd5ea25cbf12c76f181a8e8cd7ea231c673828d0386b1740b8dc"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b2170c7e0367dde86a2647ed5b6f57394ea7f53545746104c6b09fc1f4223573"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c16842b846a8d2a145223f520b7e18b57c8f476924bda92aeee3a88d11cfc391"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:157e89ceb4054029a289fb504c98c6a9fe8010f1680de0201b3eb5dc20aa6d9e"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:203d236f4c94cd8379d1ea61db2fce20730b4c38d7f1c34506a31b34edc87bdd"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:dc5d1a49d3f8262be192589a4b72f0d03b72dcf46c51ad5852a4fdc67be7b9e4"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:752bf8a74412b9892f4e5b58f2f890a039f57037f52c89a740757ebd807f33ea"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:80080816b4f52a9d886e67f1f96912891074903238fe54f2de8b786f86baded2"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:84433dddea68571a6d6bd4fbf8ff398236031149116a7fff6f777ff95cad3df9"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ab19a2d91963ed9e42b4e8d77cd847ae8381576585bad79dbd0a8837a9f6620a"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:59556bf80a7094d0cfb9f5e50bb2db27fefb75d5138bb16fb052b61b0e0eeeb0"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:27d3ef2252d2e62476389ca8f9b0cf2bbafb082a3b6bfe9d90cbcbb5529ecf7c"}, + {file = "zstandard-0.23.0-cp310-cp310-win32.whl", hash = "sha256:5d41d5e025f1e0bccae4928981e71b2334c60f580bdc8345f824e7c0a4c2a813"}, + {file = "zstandard-0.23.0-cp310-cp310-win_amd64.whl", hash = "sha256:519fbf169dfac1222a76ba8861ef4ac7f0530c35dd79ba5727014613f91613d4"}, + {file = "zstandard-0.23.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:34895a41273ad33347b2fc70e1bff4240556de3c46c6ea430a7ed91f9042aa4e"}, + {file = "zstandard-0.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77ea385f7dd5b5676d7fd943292ffa18fbf5c72ba98f7d09fc1fb9e819b34c23"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:983b6efd649723474f29ed42e1467f90a35a74793437d0bc64a5bf482bedfa0a"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:80a539906390591dd39ebb8d773771dc4db82ace6372c4d41e2d293f8e32b8db"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:445e4cb5048b04e90ce96a79b4b63140e3f4ab5f662321975679b5f6360b90e2"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd30d9c67d13d891f2360b2a120186729c111238ac63b43dbd37a5a40670b8ca"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d20fd853fbb5807c8e84c136c278827b6167ded66c72ec6f9a14b863d809211c"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ed1708dbf4d2e3a1c5c69110ba2b4eb6678262028afd6c6fbcc5a8dac9cda68e"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:be9b5b8659dff1f913039c2feee1aca499cfbc19e98fa12bc85e037c17ec6ca5"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:65308f4b4890aa12d9b6ad9f2844b7ee42c7f7a4fd3390425b242ffc57498f48"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:98da17ce9cbf3bfe4617e836d561e433f871129e3a7ac16d6ef4c680f13a839c"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8ed7d27cb56b3e058d3cf684d7200703bcae623e1dcc06ed1e18ecda39fee003"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:b69bb4f51daf461b15e7b3db033160937d3ff88303a7bc808c67bbc1eaf98c78"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:034b88913ecc1b097f528e42b539453fa82c3557e414b3de9d5632c80439a473"}, + {file = "zstandard-0.23.0-cp311-cp311-win32.whl", hash = "sha256:f2d4380bf5f62daabd7b751ea2339c1a21d1c9463f1feb7fc2bdcea2c29c3160"}, + {file = "zstandard-0.23.0-cp311-cp311-win_amd64.whl", hash = "sha256:62136da96a973bd2557f06ddd4e8e807f9e13cbb0bfb9cc06cfe6d98ea90dfe0"}, + {file = "zstandard-0.23.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b4567955a6bc1b20e9c31612e615af6b53733491aeaa19a6b3b37f3b65477094"}, + {file = "zstandard-0.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1e172f57cd78c20f13a3415cc8dfe24bf388614324d25539146594c16d78fcc8"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b0e166f698c5a3e914947388c162be2583e0c638a4703fc6a543e23a88dea3c1"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12a289832e520c6bd4dcaad68e944b86da3bad0d339ef7989fb7e88f92e96072"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d50d31bfedd53a928fed6707b15a8dbeef011bb6366297cc435accc888b27c20"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72c68dda124a1a138340fb62fa21b9bf4848437d9ca60bd35db36f2d3345f373"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:53dd9d5e3d29f95acd5de6802e909ada8d8d8cfa37a3ac64836f3bc4bc5512db"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:6a41c120c3dbc0d81a8e8adc73312d668cd34acd7725f036992b1b72d22c1772"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:40b33d93c6eddf02d2c19f5773196068d875c41ca25730e8288e9b672897c105"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:9206649ec587e6b02bd124fb7799b86cddec350f6f6c14bc82a2b70183e708ba"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76e79bc28a65f467e0409098fa2c4376931fd3207fbeb6b956c7c476d53746dd"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:66b689c107857eceabf2cf3d3fc699c3c0fe8ccd18df2219d978c0283e4c508a"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9c236e635582742fee16603042553d276cca506e824fa2e6489db04039521e90"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a8fffdbd9d1408006baaf02f1068d7dd1f016c6bcb7538682622c556e7b68e35"}, + {file = "zstandard-0.23.0-cp312-cp312-win32.whl", hash = "sha256:dc1d33abb8a0d754ea4763bad944fd965d3d95b5baef6b121c0c9013eaf1907d"}, + {file = "zstandard-0.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:64585e1dba664dc67c7cdabd56c1e5685233fbb1fc1966cfba2a340ec0dfff7b"}, + {file = "zstandard-0.23.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:576856e8594e6649aee06ddbfc738fec6a834f7c85bf7cadd1c53d4a58186ef9"}, + {file = "zstandard-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38302b78a850ff82656beaddeb0bb989a0322a8bbb1bf1ab10c17506681d772a"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2240ddc86b74966c34554c49d00eaafa8200a18d3a5b6ffbf7da63b11d74ee2"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ef230a8fd217a2015bc91b74f6b3b7d6522ba48be29ad4ea0ca3a3775bf7dd5"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:774d45b1fac1461f48698a9d4b5fa19a69d47ece02fa469825b442263f04021f"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f77fa49079891a4aab203d0b1744acc85577ed16d767b52fc089d83faf8d8ed"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ac184f87ff521f4840e6ea0b10c0ec90c6b1dcd0bad2f1e4a9a1b4fa177982ea"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c363b53e257246a954ebc7c488304b5592b9c53fbe74d03bc1c64dda153fb847"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e7792606d606c8df5277c32ccb58f29b9b8603bf83b48639b7aedf6df4fe8171"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a0817825b900fcd43ac5d05b8b3079937073d2b1ff9cf89427590718b70dd840"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:9da6bc32faac9a293ddfdcb9108d4b20416219461e4ec64dfea8383cac186690"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:fd7699e8fd9969f455ef2926221e0233f81a2542921471382e77a9e2f2b57f4b"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d477ed829077cd945b01fc3115edd132c47e6540ddcd96ca169facff28173057"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa6ce8b52c5987b3e34d5674b0ab529a4602b632ebab0a93b07bfb4dfc8f8a33"}, + {file = "zstandard-0.23.0-cp313-cp313-win32.whl", hash = "sha256:a9b07268d0c3ca5c170a385a0ab9fb7fdd9f5fd866be004c4ea39e44edce47dd"}, + {file = "zstandard-0.23.0-cp313-cp313-win_amd64.whl", hash = "sha256:f3513916e8c645d0610815c257cbfd3242adfd5c4cfa78be514e5a3ebb42a41b"}, + {file = "zstandard-0.23.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:2ef3775758346d9ac6214123887d25c7061c92afe1f2b354f9388e9e4d48acfc"}, + {file = "zstandard-0.23.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4051e406288b8cdbb993798b9a45c59a4896b6ecee2f875424ec10276a895740"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2d1a054f8f0a191004675755448d12be47fa9bebbcffa3cdf01db19f2d30a54"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f83fa6cae3fff8e98691248c9320356971b59678a17f20656a9e59cd32cee6d8"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:32ba3b5ccde2d581b1e6aa952c836a6291e8435d788f656fe5976445865ae045"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f146f50723defec2975fb7e388ae3a024eb7151542d1599527ec2aa9cacb152"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1bfe8de1da6d104f15a60d4a8a768288f66aa953bbe00d027398b93fb9680b26"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:29a2bc7c1b09b0af938b7a8343174b987ae021705acabcbae560166567f5a8db"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:61f89436cbfede4bc4e91b4397eaa3e2108ebe96d05e93d6ccc95ab5714be512"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:53ea7cdc96c6eb56e76bb06894bcfb5dfa93b7adcf59d61c6b92674e24e2dd5e"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:a4ae99c57668ca1e78597d8b06d5af837f377f340f4cce993b551b2d7731778d"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:379b378ae694ba78cef921581ebd420c938936a153ded602c4fea612b7eaa90d"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:50a80baba0285386f97ea36239855f6020ce452456605f262b2d33ac35c7770b"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:61062387ad820c654b6a6b5f0b94484fa19515e0c5116faf29f41a6bc91ded6e"}, + {file = "zstandard-0.23.0-cp38-cp38-win32.whl", hash = "sha256:b8c0bd73aeac689beacd4e7667d48c299f61b959475cdbb91e7d3d88d27c56b9"}, + {file = "zstandard-0.23.0-cp38-cp38-win_amd64.whl", hash = "sha256:a05e6d6218461eb1b4771d973728f0133b2a4613a6779995df557f70794fd60f"}, + {file = "zstandard-0.23.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3aa014d55c3af933c1315eb4bb06dd0459661cc0b15cd61077afa6489bec63bb"}, + {file = "zstandard-0.23.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a7f0804bb3799414af278e9ad51be25edf67f78f916e08afdb983e74161b916"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb2b1ecfef1e67897d336de3a0e3f52478182d6a47eda86cbd42504c5cbd009a"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:837bb6764be6919963ef41235fd56a6486b132ea64afe5fafb4cb279ac44f259"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1516c8c37d3a053b01c1c15b182f3b5f5eef19ced9b930b684a73bad121addf4"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48ef6a43b1846f6025dde6ed9fee0c24e1149c1c25f7fb0a0585572b2f3adc58"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:11e3bf3c924853a2d5835b24f03eeba7fc9b07d8ca499e247e06ff5676461a15"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2fb4535137de7e244c230e24f9d1ec194f61721c86ebea04e1581d9d06ea1269"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8c24f21fa2af4bb9f2c492a86fe0c34e6d2c63812a839590edaf177b7398f700"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a8c86881813a78a6f4508ef9daf9d4995b8ac2d147dcb1a450448941398091c9"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fe3b385d996ee0822fd46528d9f0443b880d4d05528fd26a9119a54ec3f91c69"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:82d17e94d735c99621bf8ebf9995f870a6b3e6d14543b99e201ae046dfe7de70"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c7c517d74bea1a6afd39aa612fa025e6b8011982a0897768a2f7c8ab4ebb78a2"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1fd7e0f1cfb70eb2f95a19b472ee7ad6d9a0a992ec0ae53286870c104ca939e5"}, + {file = "zstandard-0.23.0-cp39-cp39-win32.whl", hash = "sha256:43da0f0092281bf501f9c5f6f3b4c975a8a0ea82de49ba3f7100e64d422a1274"}, + {file = "zstandard-0.23.0-cp39-cp39-win_amd64.whl", hash = "sha256:f8346bfa098532bc1fb6c7ef06783e969d87a99dd1d2a5a18a892c1d7a643c58"}, + {file = "zstandard-0.23.0.tar.gz", hash = "sha256:b2d8c62d08e7255f68f7a740bae85b3c9b8e5466baa9cbf7f57f1cde0ac6bc09"}, ] [package.dependencies] @@ -3522,6 +3818,6 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = "^3.11" -content-hash = "426c385df93f578ba3537c40a269535e27fbcca1978b3cf266096ecbc298c6a9" +content-hash = "9711c5479c867fa614ce3d352f1bbc63dba1cb2376d347f96fbeda6f512ee308" diff --git a/pre-commit.py b/pre-commit.py index c9567e0c50..09139459d5 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -29,12 +29,12 @@ def colorify( return f"{color.value}{s}{NC}" -def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: - cmd = "rustfmt --edition=2021" +def cargo_fmt(fix_inplace: bool = False, no_color: bool = False) -> str: + cmd = "cargo fmt" if not fix_inplace: cmd += " --check" if no_color: - cmd += " --color=never" + cmd += " -- --color=never" return cmd @@ -61,14 +61,23 @@ def get_commit_files() -> list[str]: return files.decode().splitlines() -def check(name: str, suffix: str, cmd: str, changed_files: list[str], no_color: bool = False): +def check( + name: str, + suffix: str, + cmd: str, + changed_files: list[str], + no_color: bool = False, + append_files_to_cmd: bool = True, +): print(f"Checking: {name} ", end="") applicable_files = list(filter(lambda fname: fname.strip().endswith(suffix), changed_files)) if not applicable_files: print(colorify("[NOT APPLICABLE]", Color.CYAN, no_color)) return - cmd = f'{cmd} {" ".join(applicable_files)}' + if append_files_to_cmd: + cmd = f"{cmd} {' '.join(applicable_files)}" + res = subprocess.run(cmd.split(), capture_output=True) if res.returncode != 0: print(colorify("[FAILED]", Color.RED, no_color)) @@ -100,15 +109,13 @@ if __name__ == "__main__": args = parser.parse_args() files = get_commit_files() - # we use rustfmt here because cargo fmt does not accept list of files - # it internally gathers project files and feeds them to rustfmt - # so because we want to check only files included in the commit we use rustfmt directly check( - 
name="rustfmt", + name="cargo fmt", suffix=".rs", - cmd=rustfmt(fix_inplace=args.fix_inplace, no_color=args.no_color), + cmd=cargo_fmt(fix_inplace=args.fix_inplace, no_color=args.no_color), changed_files=files, no_color=args.no_color, + append_files_to_cmd=False, ) check( name="ruff check", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 2f63ee3acc..5964b76ecf 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "proxy" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] @@ -24,9 +24,9 @@ bytes = { workspace = true, features = ["serde"] } camino.workspace = true chrono.workspace = true clap = { workspace = true, features = ["derive", "env"] } +clashmap.workspace = true compute_api.workspace = true consumption_metrics.workspace = true -dashmap.workspace = true env_logger.workspace = true framed-websockets.workspace = true futures.workspace = true @@ -36,12 +36,14 @@ hex.workspace = true hmac.workspace = true hostname.workspace = true http.workspace = true +http-utils.workspace = true humantime.workspace = true humantime-serde.workspace = true hyper0.workspace = true hyper = { workspace = true, features = ["server", "http1", "http2"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } +gettid = "0.1.3" indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true @@ -50,6 +52,8 @@ lasso = { workspace = true, features = ["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true +opentelemetry = { workspace = true, features = ["trace"] } +papaya = "0.1.8" parking_lot.workspace = true parquet.workspace = true parquet_derive.workspace = true @@ -58,7 +62,6 @@ postgres_backend.workspace = true postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true -prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } @@ -76,7 +79,6 @@ sha2 = { workspace = true, features = ["asm", "oid"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true -strum.workspace = true strum_macros.workspace = true subtle.workspace = true thiserror.workspace = true @@ -89,6 +91,8 @@ tokio = { workspace = true, features = ["signal"] } tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true +tracing-log.workspace = true +tracing-opentelemetry.workspace = true try-lock.workspace = true typed-json.workspace = true url.workspace = true @@ -106,11 +110,13 @@ jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] } signature = "2" ecdsa = "0.16" p256 = { version = "0.13", features = ["jwk"] } +ed25519-dalek = { version = "2", default-features = false, features = ["rand_core"] } rsa = "0.9" workspace_hack.workspace = true [dev-dependencies] +assert-json-diff.workspace = true camino-tempfile.workspace = true fallible-iterator.workspace = true flate2.workspace = true diff --git a/proxy/README.md b/proxy/README.md index 8d850737be..1156bfd352 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -37,8 +37,8 @@ To play with it locally one may start proxy over a local postgres installation If both postgres and proxy are running you may send a SQL query: ```console -curl 
-k -X POST 'https://proxy.localtest.me:4444/sql' \ - -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \ +curl -k -X POST 'https://proxy.local.neon.build:4444/sql' \ + -H 'Neon-Connection-String: postgres://stas:pass@proxy.local.neon.build:4444/postgres' \ -H 'Content-Type: application/json' \ --data '{ "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num", @@ -102,23 +102,39 @@ User can pass several optional headers that will affect resulting json. 2. `Neon-Array-Mode: true`. Return postgres rows as arrays instead of objects. That is more compact representation and also helps in some edge cases where it is hard to use rows represented as objects (e.g. when several fields have the same name). +## Test proxy locally -## Using SNI-based routing on localhost - -Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy: +Proxy determines the project name from the subdomain: a request to `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use `*.local.neon.build`, which resolves to `127.0.0.1`. +We will need a postgres instance. Assuming that Docker is set up, we can start one as follows: ```sh -openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" +docker run \ + --detach \ + --name proxy-postgres \ + --env POSTGRES_PASSWORD=proxy-postgres \ + --publish 5432:5432 \ + postgres:17-bookworm ``` -start proxy - +The next step is setting up the auth table and schema, as well as creating a role (without the JWT table): ```sh -./target/debug/proxy -c server.crt -k server.key +docker exec -it proxy-postgres psql -U postgres -c "CREATE SCHEMA IF NOT EXISTS neon_control_plane" +docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_plane.endpoints (endpoint_id VARCHAR(255) PRIMARY KEY, allowed_ips VARCHAR(255))" +docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';" ``` -and connect to it +Let's create a self-signed certificate by running: +```sh +openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.local.neon.build" +``` + +Then we need to build proxy with the 'testing' feature and run it, e.g.: +```sh +RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key +``` + +Now, from the client, you can start a new session: ```sh -PGSSLROOTCERT=./server.crt psql 'postgres://my-cluster-42.localtest.me:1234?sslmode=verify-full' +PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.local.neon.build:4432/postgres?sslmode=verify-full" ``` diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 575d60be85..dd48384c03 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,3 +1,5 @@ +use std::fmt; + use async_trait::async_trait; use postgres_client::config::SslMode; use pq_proto::BeMessage as Be; @@ -7,13 +9,17 @@ use tracing::{info, info_span}; use
super::ComputeCredentialKeys; use crate::auth::IpPattern; +use crate::auth::backend::ComputeUserInfo; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestContext; +use crate::control_plane::client::cplane_proxy_v1; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; +use crate::proxy::NeonOptions; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::stream::PqStream; +use crate::types::RoleName; use crate::{auth, compute, waiters}; #[derive(Debug, Error)] @@ -31,6 +37,13 @@ pub(crate) enum ConsoleRedirectError { #[derive(Debug)] pub struct ConsoleRedirectBackend { console_uri: reqwest::Url, + api: cplane_proxy_v1::NeonControlPlaneClient, +} + +impl fmt::Debug for cplane_proxy_v1::NeonControlPlaneClient { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "NeonControlPlaneClient") + } } impl UserFacingError for ConsoleRedirectError { @@ -72,8 +85,12 @@ pub(crate) fn new_psql_session_id() -> String { } impl ConsoleRedirectBackend { - pub fn new(console_uri: reqwest::Url) -> Self { - Self { console_uri } + pub fn new(console_uri: reqwest::Url, api: cplane_proxy_v1::NeonControlPlaneClient) -> Self { + Self { console_uri, api } + } + + pub(crate) fn get_api(&self) -> &cplane_proxy_v1::NeonControlPlaneClient { + &self.api } pub(crate) async fn authenticate( @@ -81,10 +98,16 @@ impl ConsoleRedirectBackend { ctx: &RequestContext, auth_config: &'static AuthenticationConfig, client: &mut PqStream, - ) -> auth::Result<(ConsoleRedirectNodeInfo, Option>)> { + ) -> auth::Result<( + ConsoleRedirectNodeInfo, + ComputeUserInfo, + Option>, + )> { authenticate(ctx, auth_config, &self.console_uri, client) .await - .map(|(node_info, ip_allowlist)| (ConsoleRedirectNodeInfo(node_info), ip_allowlist)) + .map(|(node_info, user_info, ip_allowlist)| { + (ConsoleRedirectNodeInfo(node_info), user_info, ip_allowlist) + }) } } @@ -109,7 +132,7 @@ async fn authenticate( auth_config: &'static AuthenticationConfig, link_uri: &reqwest::Url, client: &mut PqStream, -) -> auth::Result<(NodeInfo, Option>)> { +) -> auth::Result<(NodeInfo, ComputeUserInfo, Option>)> { ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect); // registering waiter can fail if we get unlucky with rng. @@ -117,9 +140,8 @@ async fn authenticate( let (psql_session_id, waiter) = loop { let psql_session_id = new_psql_session_id(); - match control_plane::mgmt::get_waiter(&psql_session_id) { - Ok(waiter) => break (psql_session_id, waiter), - Err(_e) => continue, + if let Ok(waiter) = control_plane::mgmt::get_waiter(&psql_session_id) { + break (psql_session_id, waiter); } }; @@ -157,6 +179,15 @@ async fn authenticate( } } + // Check if the access over the public internet is allowed, otherwise block. Note that + // the console redirect is not behind the VPC service endpoint, so we don't need to check + // the VPC endpoint ID. 
+ if let Some(public_access_allowed) = db_info.public_access_allowed { + if !public_access_allowed { + return Err(auth::AuthError::NetworkNotAllowed); + } + } + client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; // This config should be self-contained, because we won't @@ -164,8 +195,15 @@ async fn authenticate( let mut config = compute::ConnCfg::new(db_info.host.to_string(), db_info.port); config.dbname(&db_info.dbname).user(&db_info.user); + let user: RoleName = db_info.user.into(); + let user_info = ComputeUserInfo { + endpoint: db_info.aux.endpoint_id.as_str().into(), + user: user.clone(), + options: NeonOptions::default(), + }; + ctx.set_dbname(db_info.dbname.into()); - ctx.set_user(db_info.user.into()); + ctx.set_user(user); ctx.set_project(db_info.aux.clone()); info!("woken up a compute node"); @@ -187,8 +225,8 @@ async fn authenticate( NodeInfo { config, aux: db_info.aux, - allow_self_signed_compute: false, // caller may override }, + user_info, db_info.allowed_ips, )) } diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index a258090b15..942f1e13d1 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -4,11 +4,11 @@ use std::sync::Arc; use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; -use dashmap::DashMap; +use clashmap::ClashMap; use jose_jwk::crypto::KeyInfo; -use reqwest::{redirect, Client}; -use reqwest_retry::policies::ExponentialBackoff; +use reqwest::{Client, redirect}; use reqwest_retry::RetryTransientMiddleware; +use reqwest_retry::policies::ExponentialBackoff; use serde::de::Visitor; use serde::{Deserialize, Deserializer}; use serde_json::value::RawValue; @@ -64,7 +64,7 @@ pub(crate) struct AuthRule { pub struct JwkCache { client: reqwest_middleware::ClientWithMiddleware, - map: DashMap<(EndpointId, RoleName), Arc>, + map: ClashMap<(EndpointId, RoleName), Arc>, } pub(crate) struct JwkCacheEntry { @@ -220,11 +220,11 @@ async fn fetch_jwks( } impl JwkCacheEntryLock { - async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { + async fn acquire_permit(self: &Arc) -> JwkRenewalPermit<'_> { JwkRenewalPermit::acquire_permit(self).await } - fn try_acquire_permit<'a>(self: &'a Arc) -> Option> { + fn try_acquire_permit(self: &Arc) -> Option> { JwkRenewalPermit::try_acquire_permit(self) } @@ -393,7 +393,7 @@ impl JwkCacheEntryLock { verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } key => return Err(JwtError::UnsupportedKeyType(key.into())), - }; + } tracing::debug!(?payload, "JWT signature valid with claims"); @@ -469,7 +469,7 @@ impl Default for JwkCache { JwkCache { client, - map: DashMap::default(), + map: ClashMap::default(), } } } @@ -498,8 +498,8 @@ fn verify_rsa_signature( alg: &jose_jwa::Algorithm, ) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; - use rsa::pkcs1v15::{Signature, VerifyingKey}; use rsa::RsaPublicKey; + use rsa::pkcs1v15::{Signature, VerifyingKey}; let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; @@ -510,7 +510,7 @@ fn verify_rsa_signature( key.verify(data, &sig)?; } _ => return Err(JwtError::InvalidRsaSigningAlgorithm), - }; + } Ok(()) } @@ -776,6 +776,7 @@ impl From<&jose_jwk::Key> for KeyType { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::future::IntoFuture; use std::net::SocketAddr; diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index d4273fb521..9c3a3772cd 100644 --- a/proxy/src/auth/backend/local.rs +++ 
b/proxy/src/auth/backend/local.rs @@ -8,8 +8,8 @@ use crate::auth::backend::jwt::FetchAuthRulesError; use crate::compute::ConnCfg; use crate::compute_ctl::ComputeCtlApi; use crate::context::RequestContext; -use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::control_plane::NodeInfo; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::http; use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; use crate::types::EndpointId; @@ -37,7 +37,6 @@ impl LocalBackend { branch_id: BranchIdTag::get_interner().get_or_intern("local"), cold_start_info: ColdStartInfo::WarmCached, }, - allow_self_signed_compute: false, }, } } diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 1bad7b3086..83feed5094 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -12,23 +12,28 @@ pub(crate) use console_redirect::ConsoleRedirectError; use ipnet::{Ipv4Net, Ipv6Net}; use local::LocalBackend; use postgres_client::config::AuthKeys; +use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint}; +use crate::auth::{ + self, AuthError, ComputeUserInfoMaybeEndpoint, IpPattern, validate_password_and_exchange, +}; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestContext; use crate::control_plane::client::ControlPlaneClient; use crate::control_plane::errors::GetAuthInfoError; use crate::control_plane::{ - self, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, + self, AccessBlockerFlags, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, }; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; -use crate::proxy::connect_compute::ComputeConnectBackend; +use crate::protocol2::ConnectionInfoExtra; use crate::proxy::NeonOptions; +use crate::proxy::connect_compute::ComputeConnectBackend; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter}; use crate::stream::Stream; use crate::types::{EndpointCacheKey, EndpointId, RoleName}; @@ -74,10 +79,6 @@ impl std::fmt::Display for Backend<'_, ()> { .debug_tuple("ControlPlane::ProxyV1") .field(&endpoint.url()) .finish(), - ControlPlaneClient::Neon(endpoint) => fmt - .debug_tuple("ControlPlane::Neon") - .field(&endpoint.url()) - .finish(), #[cfg(any(test, feature = "testing"))] ControlPlaneClient::PostgresMock(endpoint) => fmt .debug_tuple("ControlPlane::PostgresMock") @@ -100,6 +101,17 @@ impl Backend<'_, T> { Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } + + pub(crate) fn get_api(&self) -> &ControlPlaneClient { + match self { + Self::ControlPlane(api, _) => api, + Self::Local(_) => panic!("Local backend has no API"), + } + } + + pub(crate) fn is_local_proxy(&self) -> bool { + matches!(self, Self::Local(_)) + } } impl<'a, T> Backend<'a, T> { @@ -135,7 +147,7 @@ pub(crate) struct ComputeUserInfoNoEndpoint { pub(crate) options: NeonOptions, } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub(crate) struct ComputeUserInfo { pub(crate) endpoint: EndpointId, pub(crate) user: RoleName, @@ -260,7 +272,7 @@ async fn auth_quirks( allow_cleartext: bool, config: &'static AuthenticationConfig, 
endpoint_rate_limiter: Arc, -) -> auth::Result { +) -> auth::Result<(ComputeCredentials, Option>)> { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. // We now expect to see a very specific payload in the place of password. @@ -274,23 +286,48 @@ async fn auth_quirks( Ok(info) => (info, None), }; - debug!("fetching user's authentication info"); - let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; + debug!("fetching authentication info and allowlists"); // check allowed list - if config.ip_allowlist_check_enabled - && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) - { - return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); + let allowed_ips = if config.ip_allowlist_check_enabled { + let allowed_ips = api.get_allowed_ips(ctx, &info).await?; + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); + } + allowed_ips + } else { + Cached::new_uncached(Arc::new(vec![])) + }; + + // check if a VPC endpoint ID is coming in and if yes, if it's allowed + let access_blocks = api.get_block_public_or_vpc_access(ctx, &info).await?; + if config.is_vpc_acccess_proxy { + if access_blocks.vpc_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + + let incoming_vpc_endpoint_id = match ctx.extra() { + None => return Err(AuthError::MissingEndpointName), + Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(), + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + let allowed_vpc_endpoint_ids = api.get_allowed_vpc_endpoint_ids(ctx, &info).await?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. + if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id) + { + return Err(AuthError::vpc_endpoint_id_not_allowed( + incoming_vpc_endpoint_id, + )); + } + } else if access_blocks.public_access_blocked { + return Err(AuthError::NetworkNotAllowed); } if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { return Err(AuthError::too_many_connections()); } - let cached_secret = match maybe_secret { - Some(secret) => secret, - None => api.get_role_secret(ctx, &info).await?, - }; + let cached_secret = api.get_role_secret(ctx, &info).await?; let (cached_entry, secret) = cached_secret.take_value(); let secret = if let Some(secret) = secret { @@ -319,7 +356,7 @@ async fn auth_quirks( ) .await { - Ok(keys) => Ok(keys), + Ok(keys) => Ok((keys, Some(allowed_ips.as_ref().clone()))), Err(e) => { if e.is_password_failed() { // The password could have been changed, so we invalidate the cache. 
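Note: the hunk above makes `auth_quirks` consult the control plane for network-access blocks before checking credentials. As a reading aid, here is a minimal, self-contained sketch of that decision logic; it is not part of this diff, and the names `AccessDecision` and `check_network_access` are invented for illustration only.

#[derive(Debug, PartialEq)]
enum AccessDecision {
    Allow,
    /// Corresponds to `AuthError::NetworkNotAllowed` in the hunk above.
    BlockedNetwork,
    /// Corresponds to `AuthError::MissingEndpointName` in the hunk above.
    MissingVpcEndpointId,
    /// Corresponds to `AuthError::VpcEndpointIdNotAllowed` in the hunk above.
    VpcEndpointNotAllowed(String),
}

fn check_network_access(
    is_vpc_access_proxy: bool,
    vpc_access_blocked: bool,
    public_access_blocked: bool,
    incoming_vpc_endpoint_id: Option<&str>,
    allowed_vpc_endpoint_ids: &[String],
) -> AccessDecision {
    if is_vpc_access_proxy {
        // Connection arrives via a VPC endpoint: reject if VPC access is blocked.
        if vpc_access_blocked {
            return AccessDecision::BlockedNetwork;
        }
        let Some(id) = incoming_vpc_endpoint_id else {
            return AccessDecision::MissingVpcEndpointId;
        };
        // An empty allowlist currently means "allow all" (see the TODO in the hunk above).
        if !allowed_vpc_endpoint_ids.is_empty()
            && !allowed_vpc_endpoint_ids.iter().any(|allowed| allowed.as_str() == id)
        {
            return AccessDecision::VpcEndpointNotAllowed(id.to_string());
        }
    } else if public_access_blocked {
        // Connection arrives over the public internet: reject if public access is blocked.
        return AccessDecision::BlockedNetwork;
    }
    AccessDecision::Allow
}

// e.g. a public-internet connection to a project with public access blocked:
// assert_eq!(check_network_access(false, false, true, None, &[]), AccessDecision::BlockedNetwork);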
@@ -389,7 +426,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, - ) -> auth::Result> { + ) -> auth::Result<(Backend<'a, ComputeCredentials>, Option>)> { let res = match self { Self::ControlPlane(api, user_info) => { debug!( @@ -398,7 +435,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { "performing authentication using the console" ); - let credentials = auth_quirks( + let (credentials, ip_allowlist) = auth_quirks( ctx, &*api, user_info, @@ -408,16 +445,16 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { endpoint_rate_limiter, ) .await?; - Backend::ControlPlane(api, credentials) + Ok((Backend::ControlPlane(api, credentials), ip_allowlist)) } Self::Local(_) => { - return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) + return Err(auth::AuthError::bad_auth_method("invalid for local proxy")); } }; // TODO: replace with some metric info!("user successfully authenticated"); - Ok(res) + res } } @@ -432,15 +469,37 @@ impl Backend<'_, ComputeUserInfo> { } } - pub(crate) async fn get_allowed_ips_and_secret( + pub(crate) async fn get_allowed_ips( &self, ctx: &RequestContext, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + ) -> Result { + match self { + Self::ControlPlane(api, user_info) => api.get_allowed_ips(ctx, user_info).await, + Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), + } + } + + pub(crate) async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + ) -> Result { match self { Self::ControlPlane(api, user_info) => { - api.get_allowed_ips_and_secret(ctx, user_info).await + api.get_allowed_vpc_endpoint_ids(ctx, user_info).await } - Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))), + } + } + + pub(crate) async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + ) -> Result { + match self { + Self::ControlPlane(api, user_info) => { + api.get_block_public_or_vpc_access(ctx, user_info).await + } + Self::Local(_) => Ok(Cached::new_uncached(AccessBlockerFlags::default())), } } } @@ -467,6 +526,8 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { #[cfg(test)] mod tests { + #![allow(clippy::unimplemented, clippy::unwrap_used)] + use std::net::IpAddr; use std::sync::Arc; use std::time::Duration; @@ -481,20 +542,25 @@ mod tests { use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use super::jwt::JwkCache; - use super::{auth_quirks, AuthRateLimiter}; + use super::{AuthRateLimiter, auth_quirks}; use crate::auth::backend::MaskedIp; use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; use crate::config::AuthenticationConfig; use crate::context::RequestContext; - use crate::control_plane::{self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret}; + use crate::control_plane::{ + self, AccessBlockerFlags, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, + }; use crate::proxy::NeonOptions; use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; - use crate::scram::threadpool::ThreadPool; use crate::scram::ServerSecret; + use crate::scram::threadpool::ThreadPool; use crate::stream::{PqStream, Stream}; struct Auth { ips: Vec, + vpc_endpoint_ids: Vec, + access_blocker_flags: AccessBlockerFlags, secret: AuthSecret, } @@ -507,17 +573,31 @@ mod tests { Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) } - async fn 
get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, _ctx: &RequestContext, _user_info: &super::ComputeUserInfo, - ) -> Result< - (CachedAllowedIps, Option), - control_plane::errors::GetAuthInfoError, - > { - Ok(( - CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())), - Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))), + ) -> Result { + Ok(CachedAllowedIps::new_uncached(Arc::new(self.ips.clone()))) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + _ctx: &RequestContext, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedAllowedVpcEndpointIds::new_uncached(Arc::new( + self.vpc_endpoint_ids.clone(), + ))) + } + + async fn get_block_public_or_vpc_access( + &self, + _ctx: &RequestContext, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedAccessBlockerFlags::new_uncached( + self.access_blocker_flags.clone(), )) } @@ -547,6 +627,7 @@ mod tests { rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, + is_vpc_acccess_proxy: false, is_auth_broker: false, accept_jwts: false, console_redirect_confirmation_timeout: std::time::Duration::from_secs(5), @@ -614,6 +695,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; @@ -680,6 +763,9 @@ mod tests { .await .unwrap(); + // flush the final server message + stream.flush().await.unwrap(); + handle.await.unwrap(); } @@ -691,6 +777,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; @@ -743,6 +831,8 @@ mod tests { let ctx = RequestContext::test(); let api = Auth { ips: vec![], + vpc_endpoint_ids: vec![], + access_blocker_flags: AccessBlockerFlags::default(), secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), }; @@ -785,7 +875,7 @@ mod tests { .await .unwrap(); - assert_eq!(creds.info.endpoint, "my-endpoint"); + assert_eq!(creds.0.info.endpoint, "my-endpoint"); handle.await.unwrap(); } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index f6bce9f2d8..c1b7718e4f 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -197,7 +197,10 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern { type Value = IpPattern; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask") + write!( + formatter, + "comma separated list with ip address, ip address range, or ip address subnet mask" + ) } fn visit_str(self, v: &str) -> Result @@ -250,9 +253,10 @@ fn project_name_valid(name: &str) -> bool { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { - use serde_json::json; use ComputeUserInfoParseError::*; + use serde_json::json; use super::*; diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 60d1962d7f..0992c6d875 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -10,7 +10,6 @@ use tracing::info; use super::backend::ComputeCredentialKeys; use super::{AuthError, PasswordHackPayload}; -use crate::config::TlsServerEndPoint; use crate::context::RequestContext; use 
crate::control_plane::AuthSecret; use crate::intern::EndpointIdInt; @@ -18,6 +17,7 @@ use crate::sasl; use crate::scram::threadpool::ThreadPool; use crate::scram::{self}; use crate::stream::{PqStream, Stream}; +use crate::tls::TlsServerEndPoint; /// Every authentication selector is supposed to implement this trait. pub(crate) trait AuthMethod { diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 0198cc306e..5670f8e43d 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -5,13 +5,13 @@ pub use backend::Backend; mod credentials; pub(crate) use credentials::{ - check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, - ComputeUserInfoParseError, IpPattern, + ComputeUserInfoMaybeEndpoint, ComputeUserInfoParseError, IpPattern, check_peer_addr_is_in_list, + endpoint_sni, }; mod password_hack; -pub(crate) use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; +pub(crate) use password_hack::parse_endpoint_param; mod flow; use std::io; @@ -55,6 +55,12 @@ pub(crate) enum AuthError { )] MissingEndpointName, + #[error( + "VPC endpoint ID is not specified. \ + This endpoint requires a VPC endpoint ID to connect." + )] + MissingVPCEndpointId, + #[error("password authentication failed for user '{0}'")] PasswordFailed(Box), @@ -69,6 +75,15 @@ pub(crate) enum AuthError { )] IpAddressNotAllowed(IpAddr), + #[error("This connection is trying to access this endpoint from a blocked network.")] + NetworkNotAllowed, + + #[error( + "This VPC endpoint id {0} is not allowed to connect to this endpoint. \ + Please add it to the allowed list in the Neon console." + )] + VpcEndpointIdNotAllowed(String), + #[error("Too many connections to this endpoint. Please try again later.")] TooManyConnections, @@ -95,6 +110,10 @@ impl AuthError { AuthError::IpAddressNotAllowed(ip) } + pub(crate) fn vpc_endpoint_id_not_allowed(id: String) -> Self { + AuthError::VpcEndpointIdNotAllowed(id) + } + pub(crate) fn too_many_connections() -> Self { AuthError::TooManyConnections } @@ -122,8 +141,11 @@ impl UserFacingError for AuthError { Self::BadAuthMethod(_) => self.to_string(), Self::MalformedPassword(_) => self.to_string(), Self::MissingEndpointName => self.to_string(), + Self::MissingVPCEndpointId => self.to_string(), Self::Io(_) => "Internal error".to_string(), Self::IpAddressNotAllowed(_) => self.to_string(), + Self::NetworkNotAllowed => self.to_string(), + Self::VpcEndpointIdNotAllowed(_) => self.to_string(), Self::TooManyConnections => self.to_string(), Self::UserTimeout(_) => self.to_string(), Self::ConfirmationTimeout(_) => self.to_string(), @@ -142,8 +164,11 @@ impl ReportableError for AuthError { Self::BadAuthMethod(_) => crate::error::ErrorKind::User, Self::MalformedPassword(_) => crate::error::ErrorKind::User, Self::MissingEndpointName => crate::error::ErrorKind::User, + Self::MissingVPCEndpointId => crate::error::ErrorKind::User, Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, Self::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + Self::NetworkNotAllowed => crate::error::ErrorKind::User, + Self::VpcEndpointIdNotAllowed(_) => crate::error::ErrorKind::User, Self::TooManyConnections => crate::error::ErrorKind::RateLimit, Self::UserTimeout(_) => crate::error::ErrorKind::User, Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User, diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 968682cf0f..8f225dc1e0 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -1,414 +1,7 @@ -use 
std::net::SocketAddr; -use std::pin::pin; -use std::str::FromStr; -use std::sync::Arc; -use std::time::Duration; - -use anyhow::{bail, ensure, Context}; -use camino::{Utf8Path, Utf8PathBuf}; -use compute_api::spec::LocalProxySpec; -use dashmap::DashMap; -use futures::future::Either; -use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; -use proxy::auth::{self}; -use proxy::cancellation::CancellationHandlerMain; -use proxy::config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}; -use proxy::control_plane::locks::ApiLocks; -use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings}; -use proxy::http::health_server::AppMetrics; -use proxy::intern::RoleNameInt; -use proxy::metrics::{Metrics, ThreadPoolMetrics}; -use proxy::rate_limiter::{ - BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, -}; -use proxy::scram::threadpool::ThreadPool; -use proxy::serverless::cancel_set::CancelSet; -use proxy::serverless::{self, GlobalConnPoolOptions}; -use proxy::types::RoleName; -use proxy::url::ApiUrl; - -project_git_version!(GIT_VERSION); -project_build_tag!(BUILD_TAG); - -use clap::Parser; -use thiserror::Error; -use tokio::net::TcpListener; -use tokio::sync::Notify; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn}; -use utils::sentry_init::init_sentry; -use utils::{pid_file, project_build_tag, project_git_version}; - #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Neon proxy/router -#[derive(Parser)] -#[command(version = GIT_VERSION, about)] -struct LocalProxyCliArgs { - /// listen for incoming metrics connections on ip:port - #[clap(long, default_value = "127.0.0.1:7001")] - metrics: String, - /// listen for incoming http connections on ip:port - #[clap(long)] - http: String, - /// timeout for the TLS handshake - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - handshake_timeout: tokio::time::Duration, - /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] - connect_compute_lock: String, - #[clap(flatten)] - sql_over_http: SqlOverHttpArgs, - /// User rate limiter max number of requests per second. - /// - /// Provided in the form `@`. - /// Can be given multiple times for different bucket sizes. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] - user_rps_limit: Vec, - /// Whether the auth rate limiter actually takes effect (for testing) - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - auth_rate_limit_enabled: bool, - /// Authentication rate limiter max number of hashes per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] - auth_rate_limit: Vec, - /// The IP subnet to use when considering whether two IP addresses are considered the same. 
- #[clap(long, default_value_t = 64)] - auth_rate_limit_ip_subnet: u8, - /// Whether to retry the connection to the compute node - #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] - connect_to_compute_retry: String, - /// Address of the postgres server - #[clap(long, default_value = "127.0.0.1:5432")] - postgres: SocketAddr, - /// Address of the compute-ctl api service - #[clap(long, default_value = "http://127.0.0.1:3080/")] - compute_ctl: ApiUrl, - /// Path of the local proxy config file - #[clap(long, default_value = "./local_proxy.json")] - config_path: Utf8PathBuf, - /// Path of the local proxy PID file - #[clap(long, default_value = "./local_proxy.pid")] - pid_path: Utf8PathBuf, -} - -#[derive(clap::Args, Clone, Copy, Debug)] -struct SqlOverHttpArgs { - /// How many connections to pool for each endpoint. Excess connections are discarded - #[clap(long, default_value_t = 200)] - sql_over_http_pool_max_total_conns: usize, - - /// How long pooled connections should remain idle for before closing - #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] - sql_over_http_idle_timeout: tokio::time::Duration, - - #[clap(long, default_value_t = 100)] - sql_over_http_client_conn_threshold: u64, - - #[clap(long, default_value_t = 16)] - sql_over_http_cancel_set_shards: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_request_size_bytes: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_response_size_bytes: usize, -} - #[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = proxy::logging::init_local_proxy()?; - let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - - Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); - - // TODO: refactor these to use labels - debug!("Version: {GIT_VERSION}"); - debug!("Build_tag: {BUILD_TAG}"); - let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { - revision: GIT_VERSION, - build_tag: BUILD_TAG, - }); - - let jemalloc = match proxy::jemalloc::MetricRecorder::new() { - Ok(t) => Some(t), - Err(e) => { - tracing::error!(error = ?e, "could not start jemalloc metrics loop"); - None - } - }; - - let args = LocalProxyCliArgs::parse(); - let config = build_config(&args)?; - let auth_backend = build_auth_backend(&args)?; - - // before we bind to any ports, write the process ID to a file - // so that compute-ctl can find our process later - // in order to trigger the appropriate SIGHUP on config change. - // - // This also claims a "lock" that makes sure only one instance - // of local_proxy runs at a time. - let _process_guard = loop { - match pid_file::claim_for_current_process(&args.pid_path) { - Ok(guard) => break guard, - Err(e) => { - // compute-ctl might have tried to read the pid-file to let us - // know about some config change. We should try again. 
- error!(path=?args.pid_path, "could not claim PID file guard: {e:?}"); - tokio::time::sleep(Duration::from_secs(1)).await; - } - } - }; - - let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?; - let http_listener = TcpListener::bind(args.http).await?; - let shutdown = CancellationToken::new(); - - // todo: should scale with CU - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( - LeakyBucketConfig { - rps: 10.0, - max: 100.0, - }, - 16, - )); - - let mut maintenance_tasks = JoinSet::new(); - - let refresh_config_notify = Arc::new(Notify::new()); - maintenance_tasks.spawn(proxy::signals::handle(shutdown.clone(), { - let refresh_config_notify = Arc::clone(&refresh_config_notify); - move || { - refresh_config_notify.notify_one(); - } - })); - - // trigger the first config load **after** setting up the signal hook - // to avoid the race condition where: - // 1. No config file registered when local_proxy starts up - // 2. The config file is written but the signal hook is not yet received - // 3. local_proxy completes startup but has no config loaded, despite there being a registerd config. - refresh_config_notify.notify_one(); - tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); - - maintenance_tasks.spawn(proxy::http::health_server::task_main( - metrics_listener, - AppMetrics { - jemalloc, - neon_metrics, - proxy: proxy::metrics::Metrics::get(), - }, - )); - - let task = serverless::task_main( - config, - auth_backend, - http_listener, - shutdown.clone(), - Arc::new(CancellationHandlerMain::new( - Arc::new(DashMap::new()), - None, - proxy::metrics::CancellationSource::Local, - )), - endpoint_rate_limiter, - ); - - match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await { - // exit immediately on maintenance task completion - Either::Left((Some(res), _)) => match proxy::error::flatten_err(res)? {}, - // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) - Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), - // exit immediately on client task error - Either::Right((res, _)) => res?, - } - - Ok(()) -} - -/// ProxyConfig is created at proxy startup, and lives forever. 
-fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.connect_compute_lock.parse()?; - info!( - ?limiter, - shards, - ?epoch, - "Using NodeLocks (connect_compute)" - ); - let connect_compute_locks = ApiLocks::new( - "connect_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().proxy.connect_compute_lock, - )?; - - let http_config = HttpConfig { - accept_websockets: false, - pool_options: GlobalConnPoolOptions { - gc_epoch: Duration::from_secs(60), - pool_shards: 2, - idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, - opt_in: false, - - max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns, - max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, - }, - cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), - client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, - max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, - max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, - }; - - Ok(Box::leak(Box::new(ProxyConfig { - tls_config: None, - metric_collection: None, - allow_self_signed_compute: false, - http_config, - authentication_config: AuthenticationConfig { - jwks_cache: JwkCache::default(), - thread_pool: ThreadPool::new(0), - scram_protocol_timeout: Duration::from_secs(10), - rate_limiter_enabled: false, - rate_limiter: BucketRateLimiter::new(vec![]), - rate_limit_ip_subnet: 64, - ip_allowlist_check_enabled: true, - is_auth_broker: false, - accept_jwts: true, - console_redirect_confirmation_timeout: Duration::ZERO, - }, - proxy_protocol_v2: config::ProxyProtocolV2::Rejected, - handshake_timeout: Duration::from_secs(10), - region: "local".into(), - wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, - connect_compute_locks, - connect_to_compute_retry_config: RetryConfig::parse( - RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES, - )?, - }))) -} - -/// auth::Backend is created at proxy startup, and lives forever. -fn build_auth_backend( - args: &LocalProxyCliArgs, -) -> anyhow::Result<&'static auth::Backend<'static, ()>> { - let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( - LocalBackend::new(args.postgres, args.compute_ctl.clone()), - )); - - Ok(Box::leak(Box::new(auth_backend))) -} - -#[derive(Error, Debug)] -enum RefreshConfigError { - #[error(transparent)] - Read(#[from] std::io::Error), - #[error(transparent)] - Parse(#[from] serde_json::Error), - #[error(transparent)] - Validate(anyhow::Error), -} - -async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { - let mut init = true; - loop { - rx.notified().await; - - match refresh_config_inner(&path).await { - Ok(()) => {} - // don't log for file not found errors if this is the first time we are checking - // for computes that don't use local_proxy, this is not an error. 
- Err(RefreshConfigError::Read(e)) - if init && e.kind() == std::io::ErrorKind::NotFound => - { - debug!(error=?e, ?path, "could not read config file"); - } - Err(e) => { - error!(error=?e, ?path, "could not read config file"); - } - } - - init = false; - } -} - -async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> { - let bytes = tokio::fs::read(&path).await?; - let data: LocalProxySpec = serde_json::from_slice(&bytes)?; - - let mut jwks_set = vec![]; - - fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result { - let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; - - ensure!( - jwks_url.has_authority() - && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"), - "Invalid JWKS url. Must be HTTP", - ); - - ensure!( - jwks_url.host().is_some_and(|h| h != url::Host::Domain("")), - "Invalid JWKS url. No domain listed", - ); - - // clear username, password and ports - jwks_url - .set_username("") - .expect("url can be a base and has a valid host and is not a file. should not error"); - jwks_url - .set_password(None) - .expect("url can be a base and has a valid host and is not a file. should not error"); - // local testing is hard if we need to have a specific restricted port - if cfg!(not(feature = "testing")) { - jwks_url.set_port(None).expect( - "url can be a base and has a valid host and is not a file. should not error", - ); - } - - // clear query params - jwks_url.set_fragment(None); - jwks_url.query_pairs_mut().clear().finish(); - - if jwks_url.scheme() != "https" { - // local testing is hard if we need to set up https support. - if cfg!(not(feature = "testing")) { - jwks_url - .set_scheme("https") - .expect("should not error to set the scheme to https if it was http"); - } else { - warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS"); - } - } - - Ok(JwksSettings { - id: jwks.id, - jwks_url, - provider_name: jwks.provider_name, - jwt_audience: jwks.jwt_audience, - role_names: jwks - .role_names - .into_iter() - .map(RoleName::from) - .map(|s| RoleNameInt::from(&s)) - .collect(), - }) - } - - for jwks in data.jwks.into_iter().flatten() { - jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?); - } - - info!("successfully loaded new config"); - JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); - - Ok(()) + proxy::binary::local_proxy::run().await } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 623a0fd3b2..0c3326af85 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -1,299 +1,10 @@ -/// A stand-alone program that routes connections, e.g. from -/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. -/// -/// This allows connecting to pods/services running in the same Kubernetes cluster from -/// the outside. Similar to an ingress controller for HTTPS. 
-use std::{net::SocketAddr, sync::Arc}; - -use anyhow::{anyhow, bail, ensure, Context}; -use clap::Arg; -use futures::future::Either; -use futures::TryFutureExt; -use itertools::Itertools; -use proxy::config::TlsServerEndPoint; -use proxy::context::RequestContext; -use proxy::metrics::{Metrics, ThreadPoolMetrics}; -use proxy::protocol2::ConnectionInfo; -use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use proxy::stream::{PqStream, Stream}; -use rustls::crypto::ring; -use rustls::pki_types::PrivateKeyDer; -use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::net::TcpListener; -use tokio_util::sync::CancellationToken; -use tracing::{error, info, Instrument}; -use utils::project_git_version; -use utils::sentry_init::init_sentry; - -project_git_version!(GIT_VERSION); - -fn cli() -> clap::Command { - clap::Command::new("Neon proxy/router") - .version(GIT_VERSION) - .arg( - Arg::new("listen") - .short('l') - .long("listen") - .help("listen for incoming client connections on ip:port") - .default_value("127.0.0.1:4432"), - ) - .arg( - Arg::new("tls-key") - .short('k') - .long("tls-key") - .help("path to TLS key for client postgres connections") - .required(true), - ) - .arg( - Arg::new("tls-cert") - .short('c') - .long("tls-cert") - .help("path to TLS cert for client postgres connections") - .required(true), - ) - .arg( - Arg::new("dest") - .short('d') - .long("destination") - .help("append this domain zone to the SNI hostname to get the destination address") - .required(true), - ) -} +//! A stand-alone program that routes connections, e.g. from +//! `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. +//! +//! This allows connecting to pods/services running in the same Kubernetes cluster from +//! the outside. Similar to an ingress controller for HTTPS. #[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = proxy::logging::init().await?; - let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - - Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); - - let args = cli().get_matches(); - let destination: String = args.get_one::("dest").unwrap().parse()?; - - // Configure TLS - let (tls_config, tls_server_end_point): (Arc, TlsServerEndPoint) = match ( - args.get_one::("tls-key"), - args.get_one::("tls-cert"), - ) { - (Some(key_path), Some(cert_path)) => { - let key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - - let mut keys = - rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); - - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - PrivateKeyDer::Pkcs8( - keys.pop() - .unwrap() - .context(format!("Failed to read TLS keys at '{key_path}'"))?, - ) - }; - - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain: Vec<_> = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? 
- }; - - // needed for channel bindings - let first_cert = cert_chain.first().context("missing certificate")?; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - - let tls_config = - rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("ring should support TLS1.2 and TLS1.3")? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? - .into(); - - (tls_config, tls_server_end_point) - } - _ => bail!("tls-key and tls-cert must be specified"), - }; - - // Start listening for incoming client connections - let proxy_address: SocketAddr = args.get_one::("listen").unwrap().parse()?; - info!("Starting sni router on {proxy_address}"); - let proxy_listener = TcpListener::bind(proxy_address).await?; - - let cancellation_token = CancellationToken::new(); - - let main = tokio::spawn(task_main( - Arc::new(destination), - tls_config, - tls_server_end_point, - proxy_listener, - cancellation_token.clone(), - )); - let signals_task = tokio::spawn(proxy::signals::handle(cancellation_token, || {})); - - // the signal task cant ever succeed. - // the main task can error, or can succeed on cancellation. - // we want to immediately exit on either of these cases - let signal = match futures::future::select(signals_task, main).await { - Either::Left((res, _)) => proxy::error::flatten_err(res)?, - Either::Right((res, _)) => return proxy::error::flatten_err(res), - }; - - // maintenance tasks return `Infallible` success values, this is an impossible value - // so this match statically ensures that there are no possibilities for that value - match signal {} -} - -async fn task_main( - dest_suffix: Arc, - tls_config: Arc, - tls_server_end_point: TlsServerEndPoint, - listener: tokio::net::TcpListener, - cancellation_token: CancellationToken, -) -> anyhow::Result<()> { - // When set for the server socket, the keepalive setting - // will be inherited by all accepted client sockets. - socket2::SockRef::from(&listener).set_keepalive(true)?; - - let connections = tokio_util::task::task_tracker::TaskTracker::new(); - - while let Some(accept_result) = - run_until_cancelled(listener.accept(), &cancellation_token).await - { - let (socket, peer_addr) = accept_result?; - - let session_id = uuid::Uuid::new_v4(); - let tls_config = Arc::clone(&tls_config); - let dest_suffix = Arc::clone(&dest_suffix); - - connections.spawn( - async move { - socket - .set_nodelay(true) - .context("failed to set socket option")?; - - info!(%peer_addr, "serving"); - let ctx = RequestContext::new( - session_id, - ConnectionInfo { - addr: peer_addr, - extra: None, - }, - proxy::metrics::Protocol::SniRouter, - "sni", - ); - handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await - } - .unwrap_or_else(|e| { - // Acknowledge that the task has finished with an error. 
- error!("per-client task finished with an error: {e:#}"); - }) - .instrument(tracing::info_span!("handle_client", ?session_id)), - ); - } - - connections.close(); - drop(listener); - - connections.wait().await; - - info!("all client connections have finished"); - Ok(()) -} - -const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; - -async fn ssl_handshake( - ctx: &RequestContext, - raw_stream: S, - tls_config: Arc, - tls_server_end_point: TlsServerEndPoint, -) -> anyhow::Result> { - let mut stream = PqStream::new(Stream::from_raw(raw_stream)); - - let msg = stream.read_startup_packet().await?; - use pq_proto::FeStartupPacket::*; - - match msg { - SslRequest { direct: false } => { - stream - .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) - .await?; - - // Upgrade raw stream into a secure TLS-backed stream. - // NOTE: We've consumed `tls`; this fact will be used later. - - let (raw, read_buf) = stream.into_inner(); - // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empy. - // However, you could imagine pipelining of postgres - // SSLRequest + TLS ClientHello in one hunk similar to - // pipelining in our node js driver. We should probably - // support that by chaining read_buf with the stream. - if !read_buf.is_empty() { - bail!("data is sent before server replied with EncryptionResponse"); - } - - Ok(Stream::Tls { - tls: Box::new( - raw.upgrade(tls_config, !ctx.has_private_peer_addr()) - .await?, - ), - tls_server_end_point, - }) - } - unexpected => { - info!( - ?unexpected, - "unexpected startup packet, rejecting connection" - ); - stream - .throw_error_str(ERR_INSECURE_CONNECTION, proxy::error::ErrorKind::User) - .await? - } - } -} - -async fn handle_client( - ctx: RequestContext, - dest_suffix: Arc, - tls_config: Arc, - tls_server_end_point: TlsServerEndPoint, - stream: impl AsyncRead + AsyncWrite + Unpin, -) -> anyhow::Result<()> { - let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; - - // Cut off first part of the SNI domain - // We receive required destination details in the format of - // `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain` - let sni = tls_stream.sni_hostname().ok_or(anyhow!("SNI missing"))?; - let dest: Vec<&str> = sni - .split_once('.') - .context("invalid SNI")? - .0 - .splitn(3, "--") - .collect(); - let port = dest[2].parse::().context("invalid port")?; - let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port); - - info!("destination: {}", destination); - - let mut client = tokio::net::TcpStream::connect(destination).await?; - - // doesn't yet matter as pg-sni-router doesn't report analytics logs - ctx.set_success(); - ctx.log_connect(); - - // Starting from here we only proxy the client's traffic. 
- info!("performing the proxy pass..."); - - match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await { - Ok(_) => Ok(()), - Err(ErrorSource::Client(err)) => Err(err).context("client"), - Err(ErrorSource::Compute(err)) => Err(err).context("compute"), - } + proxy::binary::pg_sni_router::run().await } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 99144acef0..7d4b44841d 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,871 +1,7 @@ -use std::net::SocketAddr; -use std::pin::pin; -use std::sync::Arc; - -use anyhow::bail; -use futures::future::Either; -use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; -use proxy::cancellation::{CancelMap, CancellationHandler}; -use proxy::config::{ - self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, HttpConfig, - ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, -}; -use proxy::context::parquet::ParquetUploadArgs; -use proxy::http::health_server::AppMetrics; -use proxy::metrics::Metrics; -use proxy::rate_limiter::{ - EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, -}; -use proxy::redis::cancellation_publisher::RedisPublisherClient; -use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use proxy::redis::{elasticache, notifications}; -use proxy::scram::threadpool::ThreadPool; -use proxy::serverless::cancel_set::CancelSet; -use proxy::serverless::GlobalConnPoolOptions; -use proxy::{auth, control_plane, http, serverless, usage_metrics}; -use remote_storage::RemoteStorageConfig; -use tokio::net::TcpListener; -use tokio::sync::Mutex; -use tokio::task::JoinSet; -use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; -use utils::sentry_init::init_sentry; -use utils::{project_build_tag, project_git_version}; - -project_git_version!(GIT_VERSION); -project_build_tag!(BUILD_TAG); - -use clap::{Parser, ValueEnum}; - #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -#[derive(Clone, Debug, ValueEnum)] -enum AuthBackendType { - #[value(name("console"), alias("cplane"))] - ControlPlane, - - #[value(name("cplane-v1"), alias("control-plane"))] - ControlPlaneV1, - - #[value(name("link"), alias("control-redirect"))] - ConsoleRedirect, - - #[cfg(feature = "testing")] - Postgres, -} - -/// Neon proxy/router -#[derive(Parser)] -#[command(version = GIT_VERSION, about)] -struct ProxyCliArgs { - /// Name of the region this proxy is deployed in - #[clap(long, default_value_t = String::new())] - region: String, - /// listen for incoming client connections on ip:port - #[clap(short, long, default_value = "127.0.0.1:4432")] - proxy: String, - #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] - auth_backend: AuthBackendType, - /// listen for management callback connection on ip:port - #[clap(short, long, default_value = "127.0.0.1:7000")] - mgmt: String, - /// listen for incoming http connections (metrics, etc) on ip:port - #[clap(long, default_value = "127.0.0.1:7001")] - http: String, - /// listen for incoming wss connections on ip:port - #[clap(long)] - wss: Option, - /// redirect unauthenticated users to the given uri in case of console redirect auth - #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] - uri: String, - /// cloud API endpoint for authenticating users - #[clap( - short, - long, - default_value = 
"http://localhost:3000/authenticate_proxy_request/" - )] - auth_endpoint: String, - /// JWT used to connect to control plane. - #[clap( - long, - value_name = "JWT", - default_value = "", - env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN" - )] - control_plane_token: Arc, - /// if this is not local proxy, this toggles whether we accept jwt or passwords for http - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - is_auth_broker: bool, - /// path to TLS key for client postgres connections - /// - /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir - #[clap(short = 'k', long, alias = "ssl-key")] - tls_key: Option, - /// path to TLS cert for client postgres connections - /// - /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir - #[clap(short = 'c', long, alias = "ssl-cert")] - tls_cert: Option, - /// path to directory with TLS certificates for client postgres connections - #[clap(long)] - certs_dir: Option, - /// timeout for the TLS handshake - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - handshake_timeout: tokio::time::Duration, - /// http endpoint to receive periodic metric updates - #[clap(long)] - metric_collection_endpoint: Option, - /// how often metrics should be sent to a collection endpoint - #[clap(long)] - metric_collection_interval: Option, - /// cache for `wake_compute` api method (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - wake_compute_cache: String, - /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] - wake_compute_lock: String, - /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] - connect_compute_lock: String, - /// Allow self-signed certificates for compute nodes (for testing) - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - allow_self_signed_compute: bool, - #[clap(flatten)] - sql_over_http: SqlOverHttpArgs, - /// timeout for scram authentication protocol - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - scram_protocol_timeout: tokio::time::Duration, - /// size of the threadpool for password hashing - #[clap(long, default_value_t = 4)] - scram_thread_pool_size: u8, - /// Endpoint rate limiter max number of requests per second. - /// - /// Provided in the form `@`. - /// Can be given multiple times for different bucket sizes. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] - endpoint_rps_limit: Vec, - /// Wake compute rate limiter max number of requests per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] - wake_compute_limit: Vec, - /// Whether the auth rate limiter actually takes effect (for testing) - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - auth_rate_limit_enabled: bool, - /// Authentication rate limiter max number of hashes per second. 
- #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] - auth_rate_limit: Vec, - /// The IP subnet to use when considering whether two IP addresses are considered the same. - #[clap(long, default_value_t = 64)] - auth_rate_limit_ip_subnet: u8, - /// Redis rate limiter max number of requests per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] - redis_rps_limit: Vec, - /// cache for `allowed_ips` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - allowed_ips_cache: String, - /// cache for `role_secret` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - role_secret_cache: String, - /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) - #[clap(long)] - redis_notifications: Option, - /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". - #[clap(long, default_value = "irsa")] - redis_auth_type: String, - /// redis host for streaming connections (might be different from the notifications host) - #[clap(long)] - redis_host: Option, - /// redis port for streaming connections (might be different from the notifications host) - #[clap(long)] - redis_port: Option, - /// redis cluster name, used in aws elasticache - #[clap(long)] - redis_cluster_name: Option, - /// redis user_id, used in aws elasticache - #[clap(long)] - redis_user_id: Option, - /// aws region to retrieve credentials - #[clap(long, default_value_t = String::new())] - aws_region: String, - /// cache for `project_info` (use `size=0` to disable) - #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] - project_info_cache: String, - /// cache for all valid endpoints - #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] - endpoint_cache_config: String, - #[clap(flatten)] - parquet_upload: ParquetUploadArgs, - - /// interval for backup metric collection - #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] - metric_backup_collection_interval: std::time::Duration, - /// remote storage configuration for backup metric collection - /// Encoded as toml (same format as pageservers), eg - /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, value_parser = remote_storage_from_toml)] - metric_backup_collection_remote_storage: Option, - /// chunk size for backup metric collection - /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. - #[clap(long, default_value = "4194304")] - metric_backup_collection_chunk_size: usize, - /// Whether to retry the connection to the compute node - #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] - connect_to_compute_retry: String, - /// Whether to retry the wake_compute request - #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] - wake_compute_retry: String, - - /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - is_private_access_proxy: bool, - - /// Configure whether all incoming requests have a Proxy Protocol V2 packet. 
- // TODO(conradludgate): switch default to rejected or required once we've updated all deployments - #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)] - proxy_protocol_v2: ProxyProtocolV2, - - /// Time the proxy waits for the webauth session to be confirmed by the control plane. - // TODO: rename to `console_redirect_confirmation_timeout`. - #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] - webauth_confirmation_timeout: std::time::Duration, -} - -#[derive(clap::Args, Clone, Copy, Debug)] -struct SqlOverHttpArgs { - /// timeout for http connection requests - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - sql_over_http_timeout: tokio::time::Duration, - - /// Whether the SQL over http pool is opt-in - #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - sql_over_http_pool_opt_in: bool, - - /// How many connections to pool for each endpoint. Excess connections are discarded - #[clap(long, default_value_t = 20)] - sql_over_http_pool_max_conns_per_endpoint: usize, - - /// How many connections to pool for each endpoint. Excess connections are discarded - #[clap(long, default_value_t = 20000)] - sql_over_http_pool_max_total_conns: usize, - - /// How long pooled connections should remain idle for before closing - #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] - sql_over_http_idle_timeout: tokio::time::Duration, - - /// Duration each shard will wait on average before a GC sweep. - /// A longer time will causes sweeps to take longer but will interfere less frequently. - #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] - sql_over_http_pool_gc_epoch: tokio::time::Duration, - - /// How many shards should the global pool have. Must be a power of two. 
- /// More shards will introduce less contention for pool operations, but can - /// increase memory used by the pool - #[clap(long, default_value_t = 128)] - sql_over_http_pool_shards: usize, - - #[clap(long, default_value_t = 10000)] - sql_over_http_client_conn_threshold: u64, - - #[clap(long, default_value_t = 64)] - sql_over_http_cancel_set_shards: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_request_size_bytes: usize, - - #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB - sql_over_http_max_response_size_bytes: usize, -} - #[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = proxy::logging::init().await?; - let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - - // TODO: refactor these to use labels - info!("Version: {GIT_VERSION}"); - info!("Build_tag: {BUILD_TAG}"); - let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { - revision: GIT_VERSION, - build_tag: BUILD_TAG, - }); - - let jemalloc = match proxy::jemalloc::MetricRecorder::new() { - Ok(t) => Some(t), - Err(e) => { - tracing::error!(error = ?e, "could not start jemalloc metrics loop"); - None - } - }; - - let args = ProxyCliArgs::parse(); - let config = build_config(&args)?; - let auth_backend = build_auth_backend(&args)?; - - match auth_backend { - Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), - Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), - }; - info!("Using region: {}", args.aws_region); - - // TODO: untangle the config args - let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { - ("plain", redis_url) => match redis_url { - None => { - bail!("plain auth requires redis_notifications to be set"); - } - Some(url) => Some( - ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), - ), - }, - ("irsa", _) => match (&args.redis_host, args.redis_port) { - (Some(host), Some(port)) => Some( - ConnectionWithCredentialsProvider::new_with_credentials_provider( - host.to_string(), - port, - elasticache::CredentialsProvider::new( - args.aws_region, - args.redis_cluster_name, - args.redis_user_id, - ) - .await, - ), - ), - (None, None) => { - warn!("irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client"); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }, - _ => { - bail!("unknown auth type given"); - } - }; - - let redis_notifications_client = if let Some(url) = args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string())) - } else { - regional_redis_client.clone() - }; - - // Check that we can bind to address before further initialization - let http_address: SocketAddr = args.http.parse()?; - info!("Starting http on {http_address}"); - let http_listener = TcpListener::bind(http_address).await?.into_std()?; - - let mgmt_address: SocketAddr = args.mgmt.parse()?; - info!("Starting mgmt on {mgmt_address}"); - let mgmt_listener = TcpListener::bind(mgmt_address).await?; - - let proxy_listener = if !args.is_auth_broker { - let proxy_address: SocketAddr = args.proxy.parse()?; - info!("Starting proxy on {proxy_address}"); - - Some(TcpListener::bind(proxy_address).await?) - } else { - None - }; - - // TODO: rename the argument to something like serverless. 
- // It now covers more than just websockets, it also covers SQL over HTTP. - let serverless_listener = if let Some(serverless_address) = args.wss { - let serverless_address: SocketAddr = serverless_address.parse()?; - info!("Starting wss on {serverless_address}"); - Some(TcpListener::bind(serverless_address).await?) - } else if args.is_auth_broker { - bail!("wss arg must be present for auth-broker") - } else { - None - }; - - let cancellation_token = CancellationToken::new(); - - let cancel_map = CancelMap::default(); - - let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); - RateBucketInfo::validate(redis_rps_limit)?; - - let redis_publisher = match ®ional_redis_client { - Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( - redis_publisher.clone(), - args.region.clone(), - redis_rps_limit, - )?))), - None => None, - }; - - let cancellation_handler = Arc::new(CancellationHandler::< - Option>>, - >::new( - cancel_map.clone(), - redis_publisher, - proxy::metrics::CancellationSource::FromClient, - )); - - // bit of a hack - find the min rps and max rps supported and turn it into - // leaky bucket config instead - let max = args - .endpoint_rps_limit - .iter() - .map(|x| x.rps()) - .max_by(f64::total_cmp) - .unwrap_or(EndpointRateLimiter::DEFAULT.max); - let rps = args - .endpoint_rps_limit - .iter() - .map(|x| x.rps()) - .min_by(f64::total_cmp) - .unwrap_or(EndpointRateLimiter::DEFAULT.rps); - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( - LeakyBucketConfig { rps, max }, - 64, - )); - - // client facing tasks. these will exit on error or on cancellation - // cancellation returns Ok(()) - let mut client_tasks = JoinSet::new(); - match auth_backend { - Either::Left(auth_backend) => { - if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(proxy::proxy::task_main( - config, - auth_backend, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - } - - if let Some(serverless_listener) = serverless_listener { - client_tasks.spawn(serverless::task_main( - config, - auth_backend, - serverless_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - } - } - Either::Right(auth_backend) => { - if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(proxy::console_redirect_proxy::task_main( - config, - auth_backend, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - )); - } - } - } - - client_tasks.spawn(proxy::context::parquet::worker( - cancellation_token.clone(), - args.parquet_upload, - )); - - // maintenance tasks. these never return unless there's an error - let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::signals::handle(cancellation_token.clone(), || {})); - maintenance_tasks.spawn(http::health_server::task_main( - http_listener, - AppMetrics { - jemalloc, - neon_metrics, - proxy: proxy::metrics::Metrics::get(), - }, - )); - maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener)); - - if let Some(metrics_config) = &config.metric_collection { - // TODO: Add gc regardles of the metric collection being enabled. 
- maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); - } - - if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { - if let proxy::control_plane::client::ControlPlaneClient::Neon(api) = &**api { - match (redis_notifications_client, regional_redis_client.clone()) { - (None, None) => {} - (client1, client2) => { - let cache = api.caches.project_info.clone(); - if let Some(client) = client1 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - cancel_map.clone(), - args.region.clone(), - )); - } - if let Some(client) = client2 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - cancel_map.clone(), - args.region.clone(), - )); - } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - } - } - if let Some(regional_redis_client) = regional_redis_client { - let cache = api.caches.endpoints_cache.clone(); - let con = regional_redis_client; - let span = tracing::info_span!("endpoints_cache"); - maintenance_tasks.spawn( - async move { cache.do_read(con, cancellation_token.clone()).await } - .instrument(span), - ); - } - } else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { - match (redis_notifications_client, regional_redis_client.clone()) { - (None, None) => {} - (client1, client2) => { - let cache = api.caches.project_info.clone(); - if let Some(client) = client1 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - cancel_map.clone(), - args.region.clone(), - )); - } - if let Some(client) = client2 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - cancel_map.clone(), - args.region.clone(), - )); - } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - } - } - if let Some(regional_redis_client) = regional_redis_client { - let cache = api.caches.endpoints_cache.clone(); - let con = regional_redis_client; - let span = tracing::info_span!("endpoints_cache"); - maintenance_tasks.spawn( - async move { cache.do_read(con, cancellation_token.clone()).await } - .instrument(span), - ); - } - } - } - - let maintenance = loop { - // get one complete task - match futures::future::select( - pin!(maintenance_tasks.join_next()), - pin!(client_tasks.join_next()), - ) - .await - { - // exit immediately on maintenance task completion - Either::Left((Some(res), _)) => break proxy::error::flatten_err(res)?, - // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) - Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), - // exit immediately on client task error - Either::Right((Some(res), _)) => proxy::error::flatten_err(res)?, - // exit if all our client tasks have shutdown gracefully - Either::Right((None, _)) => return Ok(()), - } - }; - - // maintenance tasks return Infallible success values, this is an impossible value - // so this match statically ensures that there are no possibilities for that value - match maintenance {} -} - -/// ProxyConfig is created at proxy startup, and lives forever. 
-fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { - let thread_pool = ThreadPool::new(args.scram_thread_pool_size); - Metrics::install(thread_pool.metrics.clone()); - - let tls_config = match (&args.tls_key, &args.tls_cert) { - (Some(key_path), Some(cert_path)) => Some(config::configure_tls( - key_path, - cert_path, - args.certs_dir.as_ref(), - )?), - (None, None) => None, - _ => bail!("either both or neither tls-key and tls-cert must be specified"), - }; - - if args.allow_self_signed_compute { - warn!("allowing self-signed compute certificates"); - } - let backup_metric_collection_config = config::MetricBackupCollectionConfig { - interval: args.metric_backup_collection_interval, - remote_storage_config: args.metric_backup_collection_remote_storage.clone(), - chunk_size: args.metric_backup_collection_chunk_size, - }; - - let metric_collection = match ( - &args.metric_collection_endpoint, - &args.metric_collection_interval, - ) { - (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { - endpoint: endpoint.parse()?, - interval: humantime::parse_duration(interval)?, - backup_metric_collection_config, - }), - (None, None) => None, - _ => bail!( - "either both or neither metric-collection-endpoint \ - and metric-collection-interval must be specified" - ), - }; - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.connect_compute_lock.parse()?; - info!( - ?limiter, - shards, - ?epoch, - "Using NodeLocks (connect_compute)" - ); - let connect_compute_locks = control_plane::locks::ApiLocks::new( - "connect_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().proxy.connect_compute_lock, - )?; - - let http_config = HttpConfig { - accept_websockets: !args.is_auth_broker, - pool_options: GlobalConnPoolOptions { - max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, - gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, - pool_shards: args.sql_over_http.sql_over_http_pool_shards, - idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, - opt_in: args.sql_over_http.sql_over_http_pool_opt_in, - max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, - }, - cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), - client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, - max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, - max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, - }; - let authentication_config = AuthenticationConfig { - jwks_cache: JwkCache::default(), - thread_pool, - scram_protocol_timeout: args.scram_protocol_timeout, - rate_limiter_enabled: args.auth_rate_limit_enabled, - rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), - rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, - ip_allowlist_check_enabled: !args.is_private_access_proxy, - is_auth_broker: args.is_auth_broker, - accept_jwts: args.is_auth_broker, - console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, - }; - - let config = ProxyConfig { - tls_config, - metric_collection, - allow_self_signed_compute: args.allow_self_signed_compute, - http_config, - authentication_config, - proxy_protocol_v2: args.proxy_protocol_v2, - handshake_timeout: args.handshake_timeout, - region: args.region.clone(), - wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, - connect_compute_locks, - 
connect_to_compute_retry_config: config::RetryConfig::parse( - &args.connect_to_compute_retry, - )?, - }; - - let config = Box::leak(Box::new(config)); - - tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); - - Ok(config) -} - -/// auth::Backend is created at proxy startup, and lives forever. -fn build_auth_backend( - args: &ProxyCliArgs, -) -> anyhow::Result, &'static ConsoleRedirectBackend>> { - match &args.auth_backend { - AuthBackendType::ControlPlaneV1 => { - let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; - let project_info_cache_config: ProjectInfoCacheOptions = - args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; - - info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); - info!( - "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" - ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); - let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( - wake_compute_cache_config, - project_info_cache_config, - endpoint_cache_config, - ))); - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.wake_compute_lock.parse()?; - info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( - "wake_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().wake_compute_lock, - )?)); - tokio::spawn(locks.garbage_collect_worker()); - - let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; - - let endpoint = http::Endpoint::new(url, http::new_client()); - - let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); - RateBucketInfo::validate(&mut wake_compute_rps_limit)?; - let wake_compute_endpoint_rate_limiter = - Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); - - let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( - endpoint, - args.control_plane_token.clone(), - caches, - locks, - wake_compute_endpoint_rate_limiter, - ); - - let api = control_plane::client::ControlPlaneClient::ProxyV1(api); - let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - let config = Box::leak(Box::new(auth_backend)); - - Ok(Either::Left(config)) - } - - AuthBackendType::ControlPlane => { - let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; - let project_info_cache_config: ProjectInfoCacheOptions = - args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; - - info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); - info!( - "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" - ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); - let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( - wake_compute_cache_config, - project_info_cache_config, - endpoint_cache_config, - ))); - - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.wake_compute_lock.parse()?; - info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( - "wake_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().wake_compute_lock, - )?)); - 
tokio::spawn(locks.garbage_collect_worker()); - - let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; - - let endpoint = http::Endpoint::new(url, http::new_client()); - - let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); - RateBucketInfo::validate(&mut wake_compute_rps_limit)?; - let wake_compute_endpoint_rate_limiter = - Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); - - let api = control_plane::client::neon::NeonControlPlaneClient::new( - endpoint, - args.control_plane_token.clone(), - caches, - locks, - wake_compute_endpoint_rate_limiter, - ); - let api = control_plane::client::ControlPlaneClient::Neon(api); - let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - - let config = Box::leak(Box::new(auth_backend)); - - Ok(Either::Left(config)) - } - - #[cfg(feature = "testing")] - AuthBackendType::Postgres => { - let url = args.auth_endpoint.parse()?; - let api = control_plane::client::mock::MockControlPlane::new( - url, - !args.is_private_access_proxy, - ); - let api = control_plane::client::ControlPlaneClient::PostgresMock(api); - - let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - - let config = Box::leak(Box::new(auth_backend)); - - Ok(Either::Left(config)) - } - - AuthBackendType::ConsoleRedirect => { - let url = args.uri.parse()?; - let backend = ConsoleRedirectBackend::new(url); - - let config = Box::leak(Box::new(backend)); - - Ok(Either::Right(config)) - } - } -} - -#[cfg(test)] -mod tests { - use std::time::Duration; - - use clap::Parser; - use proxy::rate_limiter::RateBucketInfo; - - #[test] - fn parse_endpoint_rps_limit() { - let config = super::ProxyCliArgs::parse_from([ - "proxy", - "--endpoint-rps-limit", - "100@1s", - "--endpoint-rps-limit", - "20@30s", - ]); - - assert_eq!( - config.endpoint_rps_limit, - vec![ - RateBucketInfo::new(100, Duration::from_secs(1)), - RateBucketInfo::new(20, Duration::from_secs(30)), - ] - ); - } + proxy::binary::proxy::run().await } diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs new file mode 100644 index 0000000000..dedd225cba --- /dev/null +++ b/proxy/src/binary/local_proxy.rs @@ -0,0 +1,410 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::{Context, bail, ensure}; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::Parser; +use compute_api::spec::LocalProxySpec; +use futures::future::Either; +use thiserror::Error; +use tokio::net::TcpListener; +use tokio::sync::Notify; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, warn}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; + +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::local::{JWKS_ROLE_MAP, LocalBackend}; +use crate::auth::{self}; +use crate::cancellation::CancellationHandler; +use crate::config::{ + self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, +}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use crate::http::health_server::AppMetrics; +use crate::intern::RoleNameInt; +use crate::metrics::{Metrics, ThreadPoolMetrics}; +use crate::rate_limiter::{ + BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, +}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::cancel_set::CancelSet; +use crate::serverless::{self, 
GlobalConnPoolOptions}; +use crate::tls::client_config::compute_client_config_with_root_certs; +use crate::types::RoleName; +use crate::url::ApiUrl; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + +/// Neon proxy/router +#[derive(Parser)] +#[command(version = GIT_VERSION, about)] +struct LocalProxyCliArgs { + /// listen for incoming metrics connections on ip:port + #[clap(long, default_value = "127.0.0.1:7001")] + metrics: String, + /// listen for incoming http connections on ip:port + #[clap(long)] + http: String, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, + /// User rate limiter max number of requests per second. + /// + /// Provided in the form `@`. + /// Can be given multiple times for different bucket sizes. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + user_rps_limit: Vec, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. + #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Address of the postgres server + #[clap(long, default_value = "127.0.0.1:5432")] + postgres: SocketAddr, + /// Address of the internal compute-ctl api service + #[clap(long, default_value = "http://127.0.0.1:3081/")] + compute_ctl: ApiUrl, + /// Path of the local proxy config file + #[clap(long, default_value = "./local_proxy.json")] + config_path: Utf8PathBuf, + /// Path of the local proxy PID file + #[clap(long, default_value = "./local_proxy.pid")] + pid_path: Utf8PathBuf, +} + +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// How many connections to pool for each endpoint. 
Excess connections are discarded + #[clap(long, default_value_t = 200)] + sql_over_http_pool_max_total_conns: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + #[clap(long, default_value_t = 100)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 16)] + sql_over_http_cancel_set_shards: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_request_size_bytes: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_response_size_bytes: usize, +} + +pub async fn run() -> anyhow::Result<()> { + let _logging_guard = crate::logging::init_local_proxy()?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + + // TODO: refactor these to use labels + debug!("Version: {GIT_VERSION}"); + debug!("Build_tag: {BUILD_TAG}"); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); + + let jemalloc = match crate::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None + } + }; + + let args = LocalProxyCliArgs::parse(); + let config = build_config(&args)?; + let auth_backend = build_auth_backend(&args); + + // before we bind to any ports, write the process ID to a file + // so that compute-ctl can find our process later + // in order to trigger the appropriate SIGHUP on config change. + // + // This also claims a "lock" that makes sure only one instance + // of local_proxy runs at a time. + let _process_guard = loop { + match pid_file::claim_for_current_process(&args.pid_path) { + Ok(guard) => break guard, + Err(e) => { + // compute-ctl might have tried to read the pid-file to let us + // know about some config change. We should try again. + error!(path=?args.pid_path, "could not claim PID file guard: {e:?}"); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + }; + + let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?; + let http_listener = TcpListener::bind(args.http).await?; + let shutdown = CancellationToken::new(); + + // todo: should scale with CU + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { + rps: 10.0, + max: 100.0, + }, + 16, + )); + + let mut maintenance_tasks = JoinSet::new(); + + let refresh_config_notify = Arc::new(Notify::new()); + maintenance_tasks.spawn(crate::signals::handle(shutdown.clone(), { + let refresh_config_notify = Arc::clone(&refresh_config_notify); + move || { + refresh_config_notify.notify_one(); + } + })); + + // trigger the first config load **after** setting up the signal hook + // to avoid the race condition where: + // 1. No config file registered when local_proxy starts up + // 2. The config file is written but the signal hook is not yet received + // 3. local_proxy completes startup but has no config loaded, despite there being a registerd config. 
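+    //
+    // Rough sequence with the explicit notify below (illustrative): install the signal
+    // hook, force one initial load of the config file via notify_one(), then reload on
+    // each subsequent config-change signal sent by compute-ctl.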
+ refresh_config_notify.notify_one(); + tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); + + maintenance_tasks.spawn(crate::http::health_server::task_main( + metrics_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: crate::metrics::Metrics::get(), + }, + )); + + let task = serverless::task_main( + config, + auth_backend, + http_listener, + shutdown.clone(), + Arc::new(CancellationHandler::new(&config.connect_to_compute, None)), + endpoint_rate_limiter, + ); + + match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await { + // exit immediately on maintenance task completion + Either::Left((Some(res), _)) => match crate::error::flatten_err(res)? {}, + // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) + Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), + // exit immediately on client task error + Either::Right((res, _)) => res?, + } + + Ok(()) +} + +/// ProxyConfig is created at proxy startup, and lives forever. +fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + ); + + let http_config = HttpConfig { + accept_websockets: false, + pool_options: GlobalConnPoolOptions { + gc_epoch: Duration::from_secs(60), + pool_shards: 2, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: false, + + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, + }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, + }; + + let compute_config = ComputeConfig { + retry: RetryConfig::parse(RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)?, + tls: Arc::new(compute_client_config_with_root_certs()?), + timeout: Duration::from_secs(2), + }; + + Ok(Box::leak(Box::new(ProxyConfig { + tls_config: None, + metric_collection: None, + http_config, + authentication_config: AuthenticationConfig { + jwks_cache: JwkCache::default(), + thread_pool: ThreadPool::new(0), + scram_protocol_timeout: Duration::from_secs(10), + rate_limiter_enabled: false, + rate_limiter: BucketRateLimiter::new(vec![]), + rate_limit_ip_subnet: 64, + ip_allowlist_check_enabled: true, + is_vpc_acccess_proxy: false, + is_auth_broker: false, + accept_jwts: true, + console_redirect_confirmation_timeout: Duration::ZERO, + }, + proxy_protocol_v2: config::ProxyProtocolV2::Rejected, + handshake_timeout: Duration::from_secs(10), + region: "local".into(), + wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, + connect_compute_locks, + connect_to_compute: compute_config, + }))) +} + +/// auth::Backend is created at proxy startup, and lives forever. 
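+/// For local_proxy this is always the `Local` backend, pointing at the `--postgres`
+/// and `--compute-ctl` addresses supplied on the command line.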
+fn build_auth_backend(args: &LocalProxyCliArgs) -> &'static auth::Backend<'static, ()> { + let auth_backend = crate::auth::Backend::Local(crate::auth::backend::MaybeOwned::Owned( + LocalBackend::new(args.postgres, args.compute_ctl.clone()), + )); + + Box::leak(Box::new(auth_backend)) +} + +#[derive(Error, Debug)] +enum RefreshConfigError { + #[error(transparent)] + Read(#[from] std::io::Error), + #[error(transparent)] + Parse(#[from] serde_json::Error), + #[error(transparent)] + Validate(anyhow::Error), +} + +async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { + let mut init = true; + loop { + rx.notified().await; + + match refresh_config_inner(&path).await { + Ok(()) => {} + // don't log for file not found errors if this is the first time we are checking + // for computes that don't use local_proxy, this is not an error. + Err(RefreshConfigError::Read(e)) + if init && e.kind() == std::io::ErrorKind::NotFound => + { + debug!(error=?e, ?path, "could not read config file"); + } + Err(e) => { + error!(error=?e, ?path, "could not read config file"); + } + } + + init = false; + } +} + +async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> { + let bytes = tokio::fs::read(&path).await?; + let data: LocalProxySpec = serde_json::from_slice(&bytes)?; + + let mut jwks_set = vec![]; + + fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result { + let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; + + ensure!( + jwks_url.has_authority() + && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"), + "Invalid JWKS url. Must be HTTP", + ); + + ensure!( + jwks_url.host().is_some_and(|h| h != url::Host::Domain("")), + "Invalid JWKS url. No domain listed", + ); + + // clear username, password and ports + jwks_url + .set_username("") + .expect("url can be a base and has a valid host and is not a file. should not error"); + jwks_url + .set_password(None) + .expect("url can be a base and has a valid host and is not a file. should not error"); + // local testing is hard if we need to have a specific restricted port + if cfg!(not(feature = "testing")) { + jwks_url.set_port(None).expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + } + + // clear query params + jwks_url.set_fragment(None); + jwks_url.query_pairs_mut().clear().finish(); + + if jwks_url.scheme() != "https" { + // local testing is hard if we need to set up https support. + if cfg!(not(feature = "testing")) { + jwks_url + .set_scheme("https") + .expect("should not error to set the scheme to https if it was http"); + } else { + warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS"); + } + } + + Ok(JwksSettings { + id: jwks.id, + jwks_url, + _provider_name: jwks.provider_name, + jwt_audience: jwks.jwt_audience, + role_names: jwks + .role_names + .into_iter() + .map(RoleName::from) + .map(|s| RoleNameInt::from(&s)) + .collect(), + }) + } + + for jwks in data.jwks.into_iter().flatten() { + jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?); + } + + info!("successfully loaded new config"); + JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); + + Ok(()) +} diff --git a/proxy/src/binary/mod.rs b/proxy/src/binary/mod.rs new file mode 100644 index 0000000000..dc07d3e675 --- /dev/null +++ b/proxy/src/binary/mod.rs @@ -0,0 +1,7 @@ +//! All binaries have the body of their main() defined here, so that the code +//! 
is also covered by code style configs in lib.rs and the unused-code check is +//! more effective when practically all modules are private to the lib. + +pub mod local_proxy; +pub mod pg_sni_router; +pub mod proxy; diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs new file mode 100644 index 0000000000..1aa290399c --- /dev/null +++ b/proxy/src/binary/pg_sni_router.rs @@ -0,0 +1,305 @@ +/// A stand-alone program that routes connections, e.g. from +/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. +/// +/// This allows connecting to pods/services running in the same Kubernetes cluster from +/// the outside. Similar to an ingress controller for HTTPS. +use std::{net::SocketAddr, sync::Arc}; + +use anyhow::{Context, anyhow, bail, ensure}; +use clap::Arg; +use futures::TryFutureExt; +use futures::future::Either; +use itertools::Itertools; +use rustls::crypto::ring; +use rustls::pki_types::PrivateKeyDer; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::TcpListener; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, error, info}; +use utils::project_git_version; +use utils::sentry_init::init_sentry; + +use crate::context::RequestContext; +use crate::metrics::{Metrics, ThreadPoolMetrics}; +use crate::protocol2::ConnectionInfo; +use crate::proxy::{ErrorSource, copy_bidirectional_client_compute, run_until_cancelled}; +use crate::stream::{PqStream, Stream}; +use crate::tls::TlsServerEndPoint; + +project_git_version!(GIT_VERSION); + +fn cli() -> clap::Command { + clap::Command::new("Neon proxy/router") + .version(GIT_VERSION) + .arg( + Arg::new("listen") + .short('l') + .long("listen") + .help("listen for incoming client connections on ip:port") + .default_value("127.0.0.1:4432"), + ) + .arg( + Arg::new("tls-key") + .short('k') + .long("tls-key") + .help("path to TLS key for client postgres connections") + .required(true), + ) + .arg( + Arg::new("tls-cert") + .short('c') + .long("tls-cert") + .help("path to TLS cert for client postgres connections") + .required(true), + ) + .arg( + Arg::new("dest") + .short('d') + .long("destination") + .help("append this domain zone to the SNI hostname to get the destination address") + .required(true), + ) +} + +pub async fn run() -> anyhow::Result<()> { + let _logging_guard = crate::logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + + let args = cli().get_matches(); + let destination: String = args + .get_one::("dest") + .expect("string argument defined") + .parse()?; + + // Configure TLS + let (tls_config, tls_server_end_point): (Arc, TlsServerEndPoint) = match ( + args.get_one::("tls-key"), + args.get_one::("tls-cert"), + ) { + (Some(key_path), Some(cert_path)) => { + let key = { + let key_bytes = std::fs::read(key_path).context("TLS key file")?; + + let mut keys = + rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); + + ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); + PrivateKeyDer::Pkcs8( + keys.pop() + .expect("keys should not be empty") + .context(format!("Failed to read TLS keys at '{key_path}'"))?, + ) + }; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain: Vec<_> = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + 
format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") + })? + }; + + // needed for channel bindings + let first_cert = cert_chain.first().context("missing certificate")?; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let tls_config = + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("ring should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); + + (tls_config, tls_server_end_point) + } + _ => bail!("tls-key and tls-cert must be specified"), + }; + + // Start listening for incoming client connections + let proxy_address: SocketAddr = args + .get_one::("listen") + .expect("string argument defined") + .parse()?; + info!("Starting sni router on {proxy_address}"); + let proxy_listener = TcpListener::bind(proxy_address).await?; + + let cancellation_token = CancellationToken::new(); + + let main = tokio::spawn(task_main( + Arc::new(destination), + tls_config, + tls_server_end_point, + proxy_listener, + cancellation_token.clone(), + )); + let signals_task = tokio::spawn(crate::signals::handle(cancellation_token, || {})); + + // the signal task cant ever succeed. + // the main task can error, or can succeed on cancellation. + // we want to immediately exit on either of these cases + let signal = match futures::future::select(signals_task, main).await { + Either::Left((res, _)) => crate::error::flatten_err(res)?, + Either::Right((res, _)) => return crate::error::flatten_err(res), + }; + + // maintenance tasks return `Infallible` success values, this is an impossible value + // so this match statically ensures that there are no possibilities for that value + match signal {} +} + +async fn task_main( + dest_suffix: Arc, + tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, + listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. + socket2::SockRef::from(&listener).set_keepalive(true)?; + + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + + while let Some(accept_result) = + run_until_cancelled(listener.accept(), &cancellation_token).await + { + let (socket, peer_addr) = accept_result?; + + let session_id = uuid::Uuid::new_v4(); + let tls_config = Arc::clone(&tls_config); + let dest_suffix = Arc::clone(&dest_suffix); + + connections.spawn( + async move { + socket + .set_nodelay(true) + .context("failed to set socket option")?; + + info!(%peer_addr, "serving"); + let ctx = RequestContext::new( + session_id, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + crate::metrics::Protocol::SniRouter, + "sni", + ); + handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await + } + .unwrap_or_else(|e| { + // Acknowledge that the task has finished with an error. 
+ error!("per-client task finished with an error: {e:#}"); + }) + .instrument(tracing::info_span!("handle_client", ?session_id)), + ); + } + + connections.close(); + drop(listener); + + connections.wait().await; + + info!("all client connections have finished"); + Ok(()) +} + +const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; + +async fn ssl_handshake( + ctx: &RequestContext, + raw_stream: S, + tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, +) -> anyhow::Result> { + let mut stream = PqStream::new(Stream::from_raw(raw_stream)); + + let msg = stream.read_startup_packet().await?; + use pq_proto::FeStartupPacket::SslRequest; + + match msg { + SslRequest { direct: false } => { + stream + .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) + .await?; + + // Upgrade raw stream into a secure TLS-backed stream. + // NOTE: We've consumed `tls`; this fact will be used later. + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empty. + // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + bail!("data is sent before server replied with EncryptionResponse"); + } + + Ok(Stream::Tls { + tls: Box::new( + raw.upgrade(tls_config, !ctx.has_private_peer_addr()) + .await?, + ), + tls_server_end_point, + }) + } + unexpected => { + info!( + ?unexpected, + "unexpected startup packet, rejecting connection" + ); + stream + .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User) + .await? + } + } +} + +async fn handle_client( + ctx: RequestContext, + dest_suffix: Arc, + tls_config: Arc, + tls_server_end_point: TlsServerEndPoint, + stream: impl AsyncRead + AsyncWrite + Unpin, +) -> anyhow::Result<()> { + let mut tls_stream = ssl_handshake(&ctx, stream, tls_config, tls_server_end_point).await?; + + // Cut off first part of the SNI domain + // We receive required destination details in the format of + // `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain` + let sni = tls_stream.sni_hostname().ok_or(anyhow!("SNI missing"))?; + let dest: Vec<&str> = sni + .split_once('.') + .context("invalid SNI")? + .0 + .splitn(3, "--") + .collect(); + let port = dest[2].parse::().context("invalid port")?; + let destination = format!("{}.{}.{}:{}", dest[0], dest[1], dest_suffix, port); + + info!("destination: {}", destination); + + let mut client = tokio::net::TcpStream::connect(destination).await?; + + // doesn't yet matter as pg-sni-router doesn't report analytics logs + ctx.set_success(); + ctx.log_connect(); + + // Starting from here we only proxy the client's traffic. 
+ info!("performing the proxy pass..."); + + match copy_bidirectional_client_compute(&mut tls_stream, &mut client).await { + Ok(_) => Ok(()), + Err(ErrorSource::Client(err)) => Err(err).context("client"), + Err(ErrorSource::Compute(err)) => Err(err).context("compute"), + } +} diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs new file mode 100644 index 0000000000..eec0bf8f99 --- /dev/null +++ b/proxy/src/binary/proxy.rs @@ -0,0 +1,831 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::bail; +use futures::future::Either; +use remote_storage::RemoteStorageConfig; +use tokio::net::TcpListener; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, info, warn}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; + +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; +use crate::cancellation::{CancellationHandler, handle_cancel_messages}; +use crate::config::{ + self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions, + ProxyConfig, ProxyProtocolV2, remote_storage_from_toml, +}; +use crate::context::parquet::ParquetUploadArgs; +use crate::http::health_server::AppMetrics; +use crate::metrics::Metrics; +use crate::rate_limiter::{ + EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, +}; +use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use crate::redis::kv_ops::RedisKVClient; +use crate::redis::{elasticache, notifications}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::GlobalConnPoolOptions; +use crate::serverless::cancel_set::CancelSet; +use crate::tls::client_config::compute_client_config_with_root_certs; +use crate::{auth, control_plane, http, serverless, usage_metrics}; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + +use clap::{Parser, ValueEnum}; + +#[derive(Clone, Debug, ValueEnum)] +enum AuthBackendType { + #[value(name("cplane-v1"), alias("control-plane"))] + ControlPlaneV1, + + #[value(name("link"), alias("control-redirect"))] + ConsoleRedirect, + + #[cfg(any(test, feature = "testing"))] + Postgres, +} + +/// Neon proxy/router +#[derive(Parser)] +#[command(version = GIT_VERSION, about)] +struct ProxyCliArgs { + /// Name of the region this proxy is deployed in + #[clap(long, default_value_t = String::new())] + region: String, + /// listen for incoming client connections on ip:port + #[clap(short, long, default_value = "127.0.0.1:4432")] + proxy: String, + #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] + auth_backend: AuthBackendType, + /// listen for management callback connection on ip:port + #[clap(short, long, default_value = "127.0.0.1:7000")] + mgmt: String, + /// listen for incoming http connections (metrics, etc) on ip:port + #[clap(long, default_value = "127.0.0.1:7001")] + http: String, + /// listen for incoming wss connections on ip:port + #[clap(long)] + wss: Option, + /// redirect unauthenticated users to the given uri in case of console redirect auth + #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] + uri: String, + /// cloud API endpoint for authenticating users + #[clap( + short, + long, + default_value = "http://localhost:3000/authenticate_proxy_request/" + )] + auth_endpoint: String, + /// JWT used to connect to control plane. 
+ #[clap( + long, + value_name = "JWT", + default_value = "", + env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN" + )] + control_plane_token: Arc, + /// if this is not local proxy, this toggles whether we accept jwt or passwords for http + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_auth_broker: bool, + /// path to TLS key for client postgres connections + /// + /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir + #[clap(short = 'k', long, alias = "ssl-key")] + tls_key: Option, + /// path to TLS cert for client postgres connections + /// + /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir + #[clap(short = 'c', long, alias = "ssl-cert")] + tls_cert: Option, + /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. + #[clap(long, alias = "allow-ssl-keylogfile")] + allow_tls_keylogfile: bool, + /// path to directory with TLS certificates for client postgres connections + #[clap(long)] + certs_dir: Option, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, + /// http endpoint to receive periodic metric updates + #[clap(long)] + metric_collection_endpoint: Option, + /// how often metrics should be sent to a collection endpoint + #[clap(long)] + metric_collection_interval: Option, + /// cache for `wake_compute` api method (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + wake_compute_cache: String, + /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] + wake_compute_lock: String, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, + /// timeout for scram authentication protocol + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + scram_protocol_timeout: tokio::time::Duration, + /// size of the threadpool for password hashing + #[clap(long, default_value_t = 4)] + scram_thread_pool_size: u8, + /// Endpoint rate limiter max number of requests per second. + /// + /// Provided in the form `@`. + /// Can be given multiple times for different bucket sizes. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] + endpoint_rps_limit: Vec, + /// Wake compute rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + wake_compute_limit: Vec, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. 
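+    /// For example, with the default of 64, addresses sharing the same 64-bit prefix
+    /// are counted as one client by the auth rate limiter.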
+ #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, + /// Redis rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] + redis_rps_limit: Vec, + /// Cancellation channel size (max queue size for redis kv client) + #[clap(long, default_value = "1024")] + cancellation_ch_size: usize, + /// cache for `allowed_ips` (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + allowed_ips_cache: String, + /// cache for `role_secret` (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + role_secret_cache: String, + /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) + #[clap(long)] + redis_notifications: Option, + /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". + #[clap(long, default_value = "irsa")] + redis_auth_type: String, + /// redis host for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_host: Option, + /// redis port for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_port: Option, + /// redis cluster name, used in aws elasticache + #[clap(long)] + redis_cluster_name: Option, + /// redis user_id, used in aws elasticache + #[clap(long)] + redis_user_id: Option, + /// aws region to retrieve credentials + #[clap(long, default_value_t = String::new())] + aws_region: String, + /// cache for `project_info` (use `size=0` to disable) + #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] + project_info_cache: String, + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, + #[clap(flatten)] + parquet_upload: ParquetUploadArgs, + + /// interval for backup metric collection + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + metric_backup_collection_interval: std::time::Duration, + /// remote storage configuration for backup metric collection + /// Encoded as toml (same format as pageservers), eg + /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` + #[clap(long, value_parser = remote_storage_from_toml)] + metric_backup_collection_remote_storage: Option, + /// chunk size for backup metric collection + /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. + #[clap(long, default_value = "4194304")] + metric_backup_collection_chunk_size: usize, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Whether to retry the wake_compute request + #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] + wake_compute_retry: String, + + /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_private_access_proxy: bool, + + /// Configure whether all incoming requests have a Proxy Protocol V2 packet. 
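+    /// `Supported` is the current default; `Rejected` and `Required` also exist
+    /// (see the TODO below about tightening the default).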
+ // TODO(conradludgate): switch default to rejected or required once we've updated all deployments + #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)] + proxy_protocol_v2: ProxyProtocolV2, + + /// Time the proxy waits for the webauth session to be confirmed by the control plane. + // TODO: rename to `console_redirect_confirmation_timeout`. + #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] + webauth_confirmation_timeout: std::time::Duration, +} + +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// timeout for http connection requests + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + sql_over_http_timeout: tokio::time::Duration, + + /// Whether the SQL over http pool is opt-in + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + sql_over_http_pool_opt_in: bool, + + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 20)] + sql_over_http_pool_max_conns_per_endpoint: usize, + + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 20000)] + sql_over_http_pool_max_total_conns: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + /// Duration each shard will wait on average before a GC sweep. + /// A longer time will causes sweeps to take longer but will interfere less frequently. + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + sql_over_http_pool_gc_epoch: tokio::time::Duration, + + /// How many shards should the global pool have. Must be a power of two. 
+ /// More shards will introduce less contention for pool operations, but can + /// increase memory used by the pool + #[clap(long, default_value_t = 128)] + sql_over_http_pool_shards: usize, + + #[clap(long, default_value_t = 10000)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 64)] + sql_over_http_cancel_set_shards: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_request_size_bytes: usize, + + #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB + sql_over_http_max_response_size_bytes: usize, +} + +pub async fn run() -> anyhow::Result<()> { + let _logging_guard = crate::logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + // TODO: refactor these to use labels + info!("Version: {GIT_VERSION}"); + info!("Build_tag: {BUILD_TAG}"); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); + + let jemalloc = match crate::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None + } + }; + + let args = ProxyCliArgs::parse(); + let config = build_config(&args)?; + let auth_backend = build_auth_backend(&args)?; + + match auth_backend { + Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), + Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), + } + info!("Using region: {}", args.aws_region); + + // TODO: untangle the config args + let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { + ("plain", redis_url) => match redis_url { + None => { + bail!("plain auth requires redis_notifications to be set"); + } + Some(url) => Some( + ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()), + ), + }, + ("irsa", _) => match (&args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host.to_string(), + port, + elasticache::CredentialsProvider::new( + args.aws_region, + args.redis_cluster_name, + args.redis_user_id, + ) + .await, + ), + ), + (None, None) => { + warn!( + "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client" + ); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }, + _ => { + bail!("unknown auth type given"); + } + }; + + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } else { + regional_redis_client.clone() + }; + + // Check that we can bind to address before further initialization + let http_address: SocketAddr = args.http.parse()?; + info!("Starting http on {http_address}"); + let http_listener = TcpListener::bind(http_address).await?.into_std()?; + + let mgmt_address: SocketAddr = args.mgmt.parse()?; + info!("Starting mgmt on {mgmt_address}"); + let mgmt_listener = TcpListener::bind(mgmt_address).await?; + + let proxy_listener = if args.is_auth_broker { + None + } else { + let proxy_address: SocketAddr = args.proxy.parse()?; + info!("Starting proxy on {proxy_address}"); + + Some(TcpListener::bind(proxy_address).await?) + }; + + // TODO: rename the argument to something like serverless. 
+ // It now covers more than just websockets, it also covers SQL over HTTP. + let serverless_listener = if let Some(serverless_address) = args.wss { + let serverless_address: SocketAddr = serverless_address.parse()?; + info!("Starting wss on {serverless_address}"); + Some(TcpListener::bind(serverless_address).await?) + } else if args.is_auth_broker { + bail!("wss arg must be present for auth-broker") + } else { + None + }; + + let cancellation_token = CancellationToken::new(); + + let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); + RateBucketInfo::validate(redis_rps_limit)?; + + let redis_kv_client = regional_redis_client + .as_ref() + .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); + + // channel size should be higher than redis client limit to avoid blocking + let cancel_ch_size = args.cancellation_ch_size; + let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size); + let cancellation_handler = Arc::new(CancellationHandler::new( + &config.connect_to_compute, + Some(tx_cancel), + )); + + // bit of a hack - find the min rps and max rps supported and turn it into + // leaky bucket config instead + let max = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .max_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.max); + let rps = args + .endpoint_rps_limit + .iter() + .map(|x| x.rps()) + .min_by(f64::total_cmp) + .unwrap_or(EndpointRateLimiter::DEFAULT.rps); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( + LeakyBucketConfig { rps, max }, + 64, + )); + + // client facing tasks. these will exit on error or on cancellation + // cancellation returns Ok(()) + let mut client_tasks = JoinSet::new(); + match auth_backend { + Either::Left(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(crate::proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + + if let Some(serverless_listener) = serverless_listener { + client_tasks.spawn(serverless::task_main( + config, + auth_backend, + serverless_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + } + Either::Right(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(crate::console_redirect_proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + )); + } + } + } + + client_tasks.spawn(crate::context::parquet::worker( + cancellation_token.clone(), + args.parquet_upload, + )); + + // maintenance tasks. these never return unless there's an error + let mut maintenance_tasks = JoinSet::new(); + maintenance_tasks.spawn(crate::signals::handle(cancellation_token.clone(), || {})); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: crate::metrics::Metrics::get(), + }, + )); + maintenance_tasks.spawn(control_plane::mgmt::task_main(mgmt_listener)); + + if let Some(metrics_config) = &config.metric_collection { + // TODO: Add gc regardles of the metric collection being enabled. 
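+        // Note: usage_metrics::task_main only runs when both --metric-collection-endpoint
+        // and --metric-collection-interval are configured (see build_config below).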
+ maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); + } + + #[cfg_attr(not(any(test, feature = "testing")), expect(irrefutable_let_patterns))] + if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend { + if let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + } + + if let Some(mut redis_kv_client) = redis_kv_client { + maintenance_tasks.spawn(async move { + redis_kv_client.try_connect().await?; + handle_cancel_messages(&mut redis_kv_client, rx_cancel).await + }); + } + + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); + } + } + } + + let maintenance = loop { + // get one complete task + match futures::future::select( + pin!(maintenance_tasks.join_next()), + pin!(client_tasks.join_next()), + ) + .await + { + // exit immediately on maintenance task completion + Either::Left((Some(res), _)) => break crate::error::flatten_err(res)?, + // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) + Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), + // exit immediately on client task error + Either::Right((Some(res), _)) => crate::error::flatten_err(res)?, + // exit if all our client tasks have shutdown gracefully + Either::Right((None, _)) => return Ok(()), + } + }; + + // maintenance tasks return Infallible success values, this is an impossible value + // so this match statically ensures that there are no possibilities for that value + match maintenance {} +} + +/// ProxyConfig is created at proxy startup, and lives forever. 
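+/// The returned reference is `Box::leak`ed, so it can be shared freely with spawned
+/// tasks without reference counting.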
+fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let thread_pool = ThreadPool::new(args.scram_thread_pool_size); + Metrics::install(thread_pool.metrics.clone()); + + let tls_config = match (&args.tls_key, &args.tls_cert) { + (Some(key_path), Some(cert_path)) => Some(config::configure_tls( + key_path, + cert_path, + args.certs_dir.as_ref(), + args.allow_tls_keylogfile, + )?), + (None, None) => None, + _ => bail!("either both or neither tls-key and tls-cert must be specified"), + }; + + let backup_metric_collection_config = config::MetricBackupCollectionConfig { + remote_storage_config: args.metric_backup_collection_remote_storage.clone(), + chunk_size: args.metric_backup_collection_chunk_size, + }; + + let metric_collection = match ( + &args.metric_collection_endpoint, + &args.metric_collection_interval, + ) { + (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { + endpoint: endpoint.parse()?, + interval: humantime::parse_duration(interval)?, + backup_metric_collection_config, + }), + (None, None) => None, + _ => bail!( + "either both or neither metric-collection-endpoint \ + and metric-collection-interval must be specified" + ), + }; + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = control_plane::locks::ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + ); + + let http_config = HttpConfig { + accept_websockets: !args.is_auth_broker, + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, + gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, + pool_shards: args.sql_over_http.sql_over_http_pool_shards, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, + }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, + }; + let authentication_config = AuthenticationConfig { + jwks_cache: JwkCache::default(), + thread_pool, + scram_protocol_timeout: args.scram_protocol_timeout, + rate_limiter_enabled: args.auth_rate_limit_enabled, + rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), + rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, + ip_allowlist_check_enabled: !args.is_private_access_proxy, + is_vpc_acccess_proxy: args.is_private_access_proxy, + is_auth_broker: args.is_auth_broker, + accept_jwts: args.is_auth_broker, + console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, + }; + + let compute_config = ComputeConfig { + retry: config::RetryConfig::parse(&args.connect_to_compute_retry)?, + tls: Arc::new(compute_client_config_with_root_certs()?), + timeout: Duration::from_secs(2), + }; + + let config = ProxyConfig { + tls_config, + metric_collection, + http_config, + authentication_config, + proxy_protocol_v2: args.proxy_protocol_v2, + handshake_timeout: args.handshake_timeout, + region: args.region.clone(), + wake_compute_retry_config: 
config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_compute_locks, + connect_to_compute: compute_config, + }; + + let config = Box::leak(Box::new(config)); + + tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); + + Ok(config) +} + +/// auth::Backend is created at proxy startup, and lives forever. +fn build_auth_backend( + args: &ProxyCliArgs, +) -> anyhow::Result, &'static ConsoleRedirectBackend>> { + match &args.auth_backend { + AuthBackendType::ControlPlaneV1 => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + ))); + tokio::spawn(locks.garbage_collect_worker()); + + let url: crate::url::ApiUrl = args.auth_endpoint.parse()?; + + let endpoint = http::Endpoint::new(url, http::new_client()); + + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let api = control_plane::client::ControlPlaneClient::ProxyV1(api); + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + + #[cfg(any(test, feature = "testing"))] + AuthBackendType::Postgres => { + let url = args.auth_endpoint.parse()?; + let api = control_plane::client::mock::MockControlPlane::new( + url, + !args.is_private_access_proxy, + ); + let api = control_plane::client::ControlPlaneClient::PostgresMock(api); + + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + + AuthBackendType::ConsoleRedirect => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + 
wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + ))); + + let url = args.uri.clone().parse()?; + let ep_url: crate::url::ApiUrl = args.auth_endpoint.parse()?; + let endpoint = http::Endpoint::new(ep_url, http::new_client()); + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + // Since we use only get_allowed_ips_and_secret() wake_compute_endpoint_rate_limiter + // and locks are not used in ConsoleRedirectBackend, + // but they are required by the NeonControlPlaneClient + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let backend = ConsoleRedirectBackend::new(url, api); + let config = Box::leak(Box::new(backend)); + + Ok(Either::Right(config)) + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use clap::Parser; + + use crate::rate_limiter::RateBucketInfo; + + #[test] + fn parse_endpoint_rps_limit() { + let config = super::ProxyCliArgs::parse_from([ + "proxy", + "--endpoint-rps-limit", + "100@1s", + "--endpoint-rps-limit", + "20@30s", + ]); + + assert_eq!( + config.endpoint_rps_limit, + vec![ + RateBucketInfo::new(100, Duration::from_secs(1)), + RateBucketInfo::new(20, Duration::from_secs(30)), + ] + ); + } +} diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 20db1fbb14..8ec1a4648b 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -3,7 +3,7 @@ use std::future::pending; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; -use dashmap::DashSet; +use clashmap::ClashSet; use redis::streams::{StreamReadOptions, StreamReadReply}; use redis::{AsyncCommands, FromRedisValue, Value}; use serde::Deserialize; @@ -12,6 +12,7 @@ use tracing::info; use crate::config::EndpointCacheConfig; use crate::context::RequestContext; +use crate::ext::LockExt; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; use crate::rate_limiter::GlobalRateLimiter; @@ -54,9 +55,9 @@ impl TryFrom<&Value> for ControlPlaneEvent { pub struct EndpointsCache { config: EndpointCacheConfig, - endpoints: DashSet, - branches: DashSet, - projects: DashSet, + endpoints: ClashSet, + branches: ClashSet, + projects: ClashSet, ready: AtomicBool, limiter: Arc>, } @@ -68,9 +69,9 @@ impl EndpointsCache { config.limiter_info.clone(), ))), config, - endpoints: DashSet::new(), - branches: DashSet::new(), - projects: DashSet::new(), + endpoints: ClashSet::new(), + branches: ClashSet::new(), + projects: ClashSet::new(), ready: AtomicBool::new(false), } } @@ -96,7 +97,7 @@ impl EndpointsCache { // If the limiter allows, we can pretend like it's valid // (incase it is, due to redis channel lag). 
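// In other words: while this cache is still catching up with the Redis stream, a small
// rate-limited budget of optimistic "assume it exists" answers avoids rejecting endpoints
// that were created only moments ago; the GlobalRateLimiter check below bounds how often
// that guess is made.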
- if self.limiter.lock().unwrap().check() { + if self.limiter.lock_propagate_poison().check() { return true; } @@ -241,7 +242,7 @@ impl EndpointsCache { }); tracing::error!("error parsing value {value:?}: {err:?}"); } - }; + } } if total.is_power_of_two() { tracing::debug!("endpoints read {}", total); @@ -258,6 +259,7 @@ impl EndpointsCache { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 84430dc812..e153e9f61f 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -1,12 +1,12 @@ use std::collections::HashSet; use std::convert::Infallible; -use std::sync::atomic::AtomicU64; use std::sync::Arc; +use std::sync::atomic::AtomicU64; use std::time::Duration; use async_trait::async_trait; -use dashmap::DashMap; -use rand::{thread_rng, Rng}; +use clashmap::ClashMap; +use rand::{Rng, thread_rng}; use smol_str::SmolStr; use tokio::sync::Mutex; use tokio::time::Instant; @@ -15,13 +15,16 @@ use tracing::{debug, info}; use super::{Cache, Cached}; use crate::auth::IpPattern; use crate::config::ProjectInfoCacheOptions; -use crate::control_plane::AuthSecret; -use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::control_plane::{AccessBlockerFlags, AuthSecret}; +use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::types::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); + fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec); + fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt); + fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt); fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); async fn decrement_active_listeners(&self); async fn increment_active_listeners(&self); @@ -51,6 +54,8 @@ impl From for Entry { struct EndpointInfo { secret: std::collections::HashMap>>, allowed_ips: Option>>>, + block_public_or_vpc_access: Option>, + allowed_vpc_endpoint_ids: Option>>>, } impl EndpointInfo { @@ -92,9 +97,52 @@ impl EndpointInfo { } None } + pub(crate) fn get_allowed_vpc_endpoint_ids( + &self, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(Arc>, bool)> { + if let Some(allowed_vpc_endpoint_ids) = &self.allowed_vpc_endpoint_ids { + if valid_since < allowed_vpc_endpoint_ids.created_at { + return Some(( + allowed_vpc_endpoint_ids.value.clone(), + Self::check_ignore_cache( + ignore_cache_since, + allowed_vpc_endpoint_ids.created_at, + ), + )); + } + } + None + } + pub(crate) fn get_block_public_or_vpc_access( + &self, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(AccessBlockerFlags, bool)> { + if let Some(block_public_or_vpc_access) = &self.block_public_or_vpc_access { + if valid_since < block_public_or_vpc_access.created_at { + return Some(( + block_public_or_vpc_access.value.clone(), + Self::check_ignore_cache( + ignore_cache_since, + block_public_or_vpc_access.created_at, + ), + )); + } + } + None + } + pub(crate) fn invalidate_allowed_ips(&mut self) { self.allowed_ips = None; } + pub(crate) fn invalidate_allowed_vpc_endpoint_ids(&mut self) { + self.allowed_vpc_endpoint_ids = None; + } + pub(crate) fn invalidate_block_public_or_vpc_access(&mut self) { + self.block_public_or_vpc_access = None; + } pub(crate) fn invalidate_role_secret(&mut self, 
role_name: RoleNameInt) { self.secret.remove(&role_name); } @@ -108,9 +156,11 @@ impl EndpointInfo { /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available? /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache. pub struct ProjectInfoCacheImpl { - cache: DashMap, + cache: ClashMap, - project2ep: DashMap>, + project2ep: ClashMap>, + // FIXME(stefan): we need a way to GC the account2ep map. + account2ep: ClashMap>, config: ProjectInfoCacheOptions, start_time: Instant, @@ -120,6 +170,63 @@ pub struct ProjectInfoCacheImpl { #[async_trait] impl ProjectInfoCache for ProjectInfoCacheImpl { + fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec) { + info!( + "invalidating allowed vpc endpoint ids for projects `{}`", + project_ids + .iter() + .map(|id| id.to_string()) + .collect::>() + .join(", ") + ); + for project_id in project_ids { + let endpoints = self + .project2ep + .get(&project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + } + } + + fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt) { + info!( + "invalidating allowed vpc endpoint ids for org `{}`", + account_id + ); + let endpoints = self + .account2ep + .get(&account_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + } + + fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt) { + info!( + "invalidating block public or vpc access for project `{}`", + project_id + ); + let endpoints = self + .project2ep + .get(&project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_block_public_or_vpc_access(); + } + } + } + fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { info!("invalidating allowed ips for project `{}`", project_id); let endpoints = self @@ -176,8 +283,9 @@ impl ProjectInfoCache for ProjectInfoCacheImpl { impl ProjectInfoCacheImpl { pub(crate) fn new(config: ProjectInfoCacheOptions) -> Self { Self { - cache: DashMap::new(), - project2ep: DashMap::new(), + cache: ClashMap::new(), + project2ep: ClashMap::new(), + account2ep: ClashMap::new(), config, ttl_disabled_since_us: AtomicU64::new(u64::MAX), start_time: Instant::now(), @@ -226,6 +334,49 @@ impl ProjectInfoCacheImpl { } Some(Cached::new_uncached(value)) } + pub(crate) fn get_allowed_vpc_endpoint_ids( + &self, + endpoint_id: &EndpointId, + ) -> Option>>> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(&endpoint_id)?; + let value = endpoint_info.get_allowed_vpc_endpoint_ids(valid_since, ignore_cache_since); + let (value, ignore_cache) = value?; + if !ignore_cache { + let cached = Cached { + token: Some(( + self, + CachedLookupInfo::new_allowed_vpc_endpoint_ids(endpoint_id), + )), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub(crate) fn get_block_public_or_vpc_access( + &self, + endpoint_id: &EndpointId, 
+ ) -> Option> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(&endpoint_id)?; + let value = endpoint_info.get_block_public_or_vpc_access(valid_since, ignore_cache_since); + let (value, ignore_cache) = value?; + if !ignore_cache { + let cached = Cached { + token: Some(( + self, + CachedLookupInfo::new_block_public_or_vpc_access(endpoint_id), + )), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub(crate) fn insert_role_secret( &self, project_id: ProjectIdInt, @@ -256,6 +407,43 @@ impl ProjectInfoCacheImpl { self.insert_project2endpoint(project_id, endpoint_id); self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into()); } + pub(crate) fn insert_allowed_vpc_endpoint_ids( + &self, + account_id: Option, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + allowed_vpc_endpoint_ids: Arc>, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. + return; + } + if let Some(account_id) = account_id { + self.insert_account2endpoint(account_id, endpoint_id); + } + self.insert_project2endpoint(project_id, endpoint_id); + self.cache + .entry(endpoint_id) + .or_default() + .allowed_vpc_endpoint_ids = Some(allowed_vpc_endpoint_ids.into()); + } + pub(crate) fn insert_block_public_or_vpc_access( + &self, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + access_blockers: AccessBlockerFlags, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. + return; + } + self.insert_project2endpoint(project_id, endpoint_id); + self.cache + .entry(endpoint_id) + .or_default() + .block_public_or_vpc_access = Some(access_blockers.into()); + } + fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) { if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) { endpoints.insert(endpoint_id); @@ -264,6 +452,14 @@ impl ProjectInfoCacheImpl { .insert(project_id, HashSet::from([endpoint_id])); } } + fn insert_account2endpoint(&self, account_id: AccountIdInt, endpoint_id: EndpointIdInt) { + if let Some(mut endpoints) = self.account2ep.get_mut(&account_id) { + endpoints.insert(endpoint_id); + } else { + self.account2ep + .insert(account_id, HashSet::from([endpoint_id])); + } + } fn get_cache_times(&self) -> (Instant, Option) { let mut valid_since = Instant::now() - self.config.ttl; // Only ignore cache if ttl is disabled. 
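// A rough usage sketch for the new VPC-endpoint entries (identifiers and the literal value
// below are hypothetical; real call sites pass data fetched from the control plane):
//
//     cache.insert_allowed_vpc_endpoint_ids(
//         Some(account_id),                      // also indexes the entry in account2ep
//         project_id,
//         endpoint_id,                           // interned EndpointIdInt
//         Arc::new(vec!["vpce-0123456789".to_string()]),
//     );
//     // Lookups go through the public EndpointId type instead:
//     if let Some(cached) = cache.get_allowed_vpc_endpoint_ids(&endpoint) {
//         // `cached` wraps an Arc<Vec<String>>; per the TODO in cancellation.rs, an empty
//         // list is currently treated as "all VPC endpoints allowed".
//     }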
@@ -302,7 +498,7 @@ impl ProjectInfoCacheImpl { let mut removed = 0; let shard = self.project2ep.shards()[shard].write(); for (_, endpoints) in shard.iter() { - for endpoint in endpoints.get() { + for endpoint in endpoints { self.cache.remove(endpoint); removed += 1; } @@ -334,11 +530,25 @@ impl CachedLookupInfo { lookup_type: LookupType::AllowedIps, } } + pub(self) fn new_allowed_vpc_endpoint_ids(endpoint_id: EndpointIdInt) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::AllowedVpcEndpointIds, + } + } + pub(self) fn new_block_public_or_vpc_access(endpoint_id: EndpointIdInt) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::BlockPublicOrVpcAccess, + } + } } enum LookupType { RoleSecret(RoleNameInt), AllowedIps, + AllowedVpcEndpointIds, + BlockPublicOrVpcAccess, } impl Cache for ProjectInfoCacheImpl { @@ -360,11 +570,22 @@ impl Cache for ProjectInfoCacheImpl { endpoint_info.invalidate_allowed_ips(); } } + LookupType::AllowedVpcEndpointIds => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_allowed_vpc_endpoint_ids(); + } + } + LookupType::BlockPublicOrVpcAccess => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_block_public_or_vpc_access(); + } + } } } } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; use crate::scram::ServerSecret; diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 06eaeb9a30..7cfe5100ea 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -11,11 +11,11 @@ use std::time::{Duration, Instant}; // This severely hinders its usage both in terms of creating wrappers and supported key types. // // On the other hand, `hashlink` has good download stats and appears to be maintained. -use hashlink::{linked_hash_map::RawEntryMut, LruCache}; +use hashlink::{LruCache, linked_hash_map::RawEntryMut}; use tracing::debug; use super::common::Cached; -use super::{timed_lru, Cache}; +use super::{Cache, timed_lru}; /// An implementation of timed LRU cache with fixed capacity. 
/// Key properties: diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index ed717507ee..8263e5aa2a 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,41 +1,142 @@ +use std::convert::Infallible; use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; -use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; -use postgres_client::{CancelToken, NoTls}; +use postgres_client::CancelToken; +use postgres_client::tls::MakeTlsConnect; use pq_proto::CancelKeyData; +use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; -use tokio::sync::Mutex; +use tokio::sync::{mpsc, oneshot}; use tracing::{debug, info}; -use uuid::Uuid; -use crate::auth::{check_peer_addr_is_in_list, IpPattern}; +use crate::auth::backend::ComputeUserInfo; +use crate::auth::{AuthError, check_peer_addr_is_in_list}; +use crate::config::ComputeConfig; +use crate::context::RequestContext; +use crate::control_plane::ControlPlaneApi; use crate::error::ReportableError; -use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::ext::LockExt; +use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, RedisMsgKind}; +use crate::protocol2::ConnectionInfoExtra; use crate::rate_limiter::LeakyBucketRateLimiter; -use crate::redis::cancellation_publisher::{ - CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, -}; - -pub type CancelMap = Arc>>; -pub type CancellationHandlerMain = CancellationHandler>>>; -pub(crate) type CancellationHandlerMainInternal = Option>>; +use crate::redis::keys::KeyPrefix; +use crate::redis::kv_ops::RedisKVClient; +use crate::tls::postgres_rustls::MakeRustlsConnect; type IpSubnetKey = IpNet; +const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time +const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10); + +// Message types for sending through mpsc channel +pub enum CancelKeyOp { + StoreCancelKey { + key: String, + field: String, + value: String, + resp_tx: Option>>, + _guard: CancelChannelSizeGuard<'static>, + expire: i64, // TTL for key + }, + GetCancelData { + key: String, + resp_tx: oneshot::Sender>>, + _guard: CancelChannelSizeGuard<'static>, + }, + RemoveCancelKey { + key: String, + field: String, + resp_tx: Option>>, + _guard: CancelChannelSizeGuard<'static>, + }, +} + +// Running as a separate task to accept messages through the rx channel +// In case of problems with RTT: switch to recv_many() + redis pipeline +pub async fn handle_cancel_messages( + client: &mut RedisKVClient, + mut rx: mpsc::Receiver, +) -> anyhow::Result { + loop { + if let Some(msg) = rx.recv().await { + match msg { + CancelKeyOp::StoreCancelKey { + key, + field, + value, + resp_tx, + _guard, + expire, + } => { + let res = client.hset(&key, field, value).await; + if let Some(resp_tx) = resp_tx { + if res.is_ok() { + resp_tx + .send(client.expire(key, expire).await) + .inspect_err(|e| { + tracing::debug!( + "failed to send StoreCancelKey response: {:?}", + e + ); + }) + .ok(); + } else { + resp_tx + .send(res) + .inspect_err(|e| { + tracing::debug!( + "failed to send StoreCancelKey response: {:?}", + e + ); + }) + .ok(); + } + } else if res.is_ok() { + drop(client.expire(key, expire).await); + } else { + tracing::warn!("failed to store cancel key: {:?}", res); + } + } + CancelKeyOp::GetCancelData { + key, + resp_tx, + _guard, + } => { + drop(resp_tx.send(client.hget_all(key).await)); + } + CancelKeyOp::RemoveCancelKey { + key, + field, + resp_tx, + _guard, + 
} => { + if let Some(resp_tx) = resp_tx { + resp_tx + .send(client.hdel(key, field).await) + .inspect_err(|e| { + tracing::debug!("failed to send StoreCancelKey response: {:?}", e); + }) + .ok(); + } else { + drop(client.hdel(key, field).await); + } + } + } + } + } +} + /// Enables serving `CancelRequest`s. /// /// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances. -pub struct CancellationHandler
<P>
{ - map: CancelMap, - client: P, - /// This field used for the monitoring purposes. - /// Represents the source of the cancellation request. - from: CancellationSource, +pub struct CancellationHandler { + compute_config: &'static ComputeConfig, // rate limiter of cancellation requests limiter: Arc>>, + tx: Option>, // send messages to the redis KV client task } #[derive(Debug, Error)] @@ -51,6 +152,18 @@ pub(crate) enum CancelError { #[error("IP is not allowed")] IpNotAllowed, + + #[error("VPC endpoint id is not allowed to connect")] + VpcEndpointIdNotAllowed, + + #[error("Authentication backend error")] + AuthError(#[from] AuthError), + + #[error("key not found")] + NotFound, + + #[error("proxy service error")] + InternalError, } impl ReportableError for CancelError { @@ -62,261 +175,336 @@ impl ReportableError for CancelError { } CancelError::Postgres(_) => crate::error::ErrorKind::Compute, CancelError::RateLimit => crate::error::ErrorKind::RateLimit, - CancelError::IpNotAllowed => crate::error::ErrorKind::User, + CancelError::IpNotAllowed + | CancelError::VpcEndpointIdNotAllowed + | CancelError::NotFound => crate::error::ErrorKind::User, + CancelError::AuthError(_) => crate::error::ErrorKind::ControlPlane, + CancelError::InternalError => crate::error::ErrorKind::Service, } } } -impl CancellationHandler
<P>
{ - /// Run async action within an ephemeral session identified by [`CancelKeyData`]. - pub(crate) fn get_session(self: Arc) -> Session
<P>
{ +impl CancellationHandler { + pub fn new( + compute_config: &'static ComputeConfig, + tx: Option>, + ) -> Self { + Self { + compute_config, + tx, + limiter: Arc::new(std::sync::Mutex::new( + LeakyBucketRateLimiter::::new_with_shards( + LeakyBucketRateLimiter::::DEFAULT, + 64, + ), + )), + } + } + + pub(crate) fn get_key(self: &Arc) -> Session { // we intentionally generate a random "backend pid" and "secret key" here. // we use the corresponding u64 as an identifier for the // actual endpoint+pid+secret for postgres/pgbouncer. // // if we forwarded the backend_pid from postgres to the client, there would be a lot // of overlap between our computes as most pids are small (~100). - let key = loop { - let key = rand::random(); - // Random key collisions are unlikely to happen here, but they're still possible, - // which is why we have to take care not to rewrite an existing key. - match self.map.entry(key) { - dashmap::mapref::entry::Entry::Occupied(_) => continue, - dashmap::mapref::entry::Entry::Vacant(e) => { - e.insert(None); - } - } - break key; - }; + let key: CancelKeyData = rand::random(); + + let prefix_key: KeyPrefix = KeyPrefix::Cancel(key); + let redis_key = prefix_key.build_redis_key(); debug!("registered new query cancellation key {key}"); Session { key, - cancellation_handler: self, + redis_key, + cancellation_handler: Arc::clone(self), } } - /// Try to cancel a running query for the corresponding connection. - /// If the cancellation key is not found, it will be published to Redis. - /// check_allowed - if true, check if the IP is allowed to cancel the query - /// return Result primarily for tests - pub(crate) async fn cancel_session( + async fn get_cancel_key( &self, key: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - check_allowed: bool, - ) -> Result<(), CancelError> { - // TODO: check for unspecified address is only for backward compatibility, should be removed - if !peer_addr.is_unspecified() { - let subnet_key = match peer_addr { - IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here - IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), - }; - if !self.limiter.lock().unwrap().check(subnet_key, 1) { - // log only the subnet part of the IP address to know which subnet is rate limited - tracing::warn!("Rate limit exceeded. 
Skipping cancellation message, {subnet_key}"); - Metrics::get() - .proxy - .cancellation_requests_total - .inc(CancellationRequest { - source: self.from, - kind: crate::metrics::CancellationOutcome::RateLimitExceeded, - }); - return Err(CancelError::RateLimit); + ) -> Result, CancelError> { + let prefix_key: KeyPrefix = KeyPrefix::Cancel(key); + let redis_key = prefix_key.build_redis_key(); + + let (resp_tx, resp_rx) = tokio::sync::oneshot::channel(); + let op = CancelKeyOp::GetCancelData { + key: redis_key, + resp_tx, + _guard: Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::HGetAll), + }; + + let Some(tx) = &self.tx else { + tracing::warn!("cancellation handler is not available"); + return Err(CancelError::InternalError); + }; + + tx.send_timeout(op, REDIS_SEND_TIMEOUT) + .await + .map_err(|e| { + tracing::warn!("failed to send GetCancelData for {key}: {e}"); + }) + .map_err(|()| CancelError::InternalError)?; + + let result = resp_rx.await.map_err(|e| { + tracing::warn!("failed to receive GetCancelData response: {e}"); + CancelError::InternalError + })?; + + let cancel_state_str: Option = match result { + Ok(mut state) => { + if state.len() == 1 { + Some(state.remove(0).1) + } else { + tracing::warn!("unexpected number of entries in cancel state: {state:?}"); + return Err(CancelError::InternalError); + } } + Err(e) => { + tracing::warn!("failed to receive cancel state from redis: {e}"); + return Err(CancelError::InternalError); + } + }; + + let cancel_state: Option = match cancel_state_str { + Some(state) => { + let cancel_closure: CancelClosure = serde_json::from_str(&state).map_err(|e| { + tracing::warn!("failed to deserialize cancel state: {e}"); + CancelError::InternalError + })?; + Some(cancel_closure) + } + None => None, + }; + Ok(cancel_state) + } + /// Try to cancel a running query for the corresponding connection. + /// If the cancellation key is not found, it will be published to Redis. + /// check_allowed - if true, check if the IP is allowed to cancel the query. + /// Will fetch IP allowlist internally. + /// + /// return Result primarily for tests + pub(crate) async fn cancel_session( + &self, + key: CancelKeyData, + ctx: RequestContext, + check_ip_allowed: bool, + check_vpc_allowed: bool, + auth_backend: &T, + ) -> Result<(), CancelError> { + let subnet_key = match ctx.peer_addr() { + IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here + IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), + }; + if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { + // log only the subnet part of the IP address to know which subnet is rate limited + tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + kind: crate::metrics::CancellationOutcome::RateLimitExceeded, + }); + return Err(CancelError::RateLimit); } - // NB: we should immediately release the lock after cloning the token. 
- let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { + let cancel_state = self.get_cancel_key(key).await.map_err(|e| { + tracing::warn!("failed to receive RedisOp response: {e}"); + CancelError::InternalError + })?; + + let Some(cancel_closure) = cancel_state else { tracing::warn!("query cancellation key not found: {key}"); Metrics::get() .proxy .cancellation_requests_total .inc(CancellationRequest { - source: self.from, kind: crate::metrics::CancellationOutcome::NotFound, }); - - if session_id == Uuid::nil() { - // was already published, do not publish it again - return Ok(()); - } - - match self.client.try_publish(key, session_id, peer_addr).await { - Ok(()) => {} // do nothing - Err(e) => { - // log it here since cancel_session could be spawned in a task - tracing::error!("failed to publish cancellation key: {key}, error: {e}"); - return Err(CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - e.to_string(), - ))); - } - } - return Ok(()); + return Err(CancelError::NotFound); }; - if check_allowed - && !check_peer_addr_is_in_list(&peer_addr, cancel_closure.ip_allowlist.as_slice()) - { - // log it here since cancel_session could be spawned in a task - tracing::warn!("IP is not allowed to cancel the query: {key}"); - return Err(CancelError::IpNotAllowed); + if check_ip_allowed { + let ip_allowlist = auth_backend + .get_allowed_ips(&ctx, &cancel_closure.user_info) + .await + .map_err(|e| CancelError::AuthError(e.into()))?; + + if !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) { + // log it here since cancel_session could be spawned in a task + tracing::warn!( + "IP is not allowed to cancel the query: {key}, address: {}", + ctx.peer_addr() + ); + return Err(CancelError::IpNotAllowed); + } + } + + // check if a VPC endpoint ID is coming in and if yes, if it's allowed + let access_blocks = auth_backend + .get_block_public_or_vpc_access(&ctx, &cancel_closure.user_info) + .await + .map_err(|e| CancelError::AuthError(e.into()))?; + + if check_vpc_allowed { + if access_blocks.vpc_access_blocked { + return Err(CancelError::AuthError(AuthError::NetworkNotAllowed)); + } + + let incoming_vpc_endpoint_id = match ctx.extra() { + None => return Err(CancelError::AuthError(AuthError::MissingVPCEndpointId)), + Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(), + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + + let allowed_vpc_endpoint_ids = auth_backend + .get_allowed_vpc_endpoint_ids(&ctx, &cancel_closure.user_info) + .await + .map_err(|e| CancelError::AuthError(e.into()))?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. + if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id) + { + return Err(CancelError::VpcEndpointIdNotAllowed); + } + } else if access_blocks.public_access_blocked { + return Err(CancelError::VpcEndpointIdNotAllowed); } Metrics::get() .proxy .cancellation_requests_total .inc(CancellationRequest { - source: self.from, kind: crate::metrics::CancellationOutcome::Found, }); info!("cancelling query per user's request using key {key}"); - cancel_closure.try_cancel_query().await - } - - #[cfg(test)] - fn contains(&self, session: &Session
<P>
) -> bool { - self.map.contains_key(&session.key) - } - - #[cfg(test)] - fn is_empty(&self) -> bool { - self.map.is_empty() - } -} - -impl CancellationHandler<()> { - pub fn new(map: CancelMap, from: CancellationSource) -> Self { - Self { - map, - client: (), - from, - limiter: Arc::new(std::sync::Mutex::new( - LeakyBucketRateLimiter::::new_with_shards( - LeakyBucketRateLimiter::::DEFAULT, - 64, - ), - )), - } - } -} - -impl CancellationHandler>>> { - pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { - Self { - map, - client, - from, - limiter: Arc::new(std::sync::Mutex::new( - LeakyBucketRateLimiter::::new_with_shards( - LeakyBucketRateLimiter::::DEFAULT, - 64, - ), - )), - } + cancel_closure.try_cancel_query(self.compute_config).await } } /// This should've been a [`std::future::Future`], but /// it's impossible to name a type of an unboxed future /// (we'd need something like `#![feature(type_alias_impl_trait)]`). -#[derive(Clone)] +#[derive(Clone, Serialize, Deserialize)] pub struct CancelClosure { socket_addr: SocketAddr, cancel_token: CancelToken, - ip_allowlist: Vec, + hostname: String, // for pg_sni router + user_info: ComputeUserInfo, } impl CancelClosure { pub(crate) fn new( socket_addr: SocketAddr, cancel_token: CancelToken, - ip_allowlist: Vec, + hostname: String, + user_info: ComputeUserInfo, ) -> Self { Self { socket_addr, cancel_token, - ip_allowlist, + hostname, + user_info, } } /// Cancels the query running on user's compute node. - pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> { + pub(crate) async fn try_cancel_query( + self, + compute_config: &ComputeConfig, + ) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; - self.cancel_token.cancel_query_raw(socket, NoTls).await?; + + let mut mk_tls = + crate::tls::postgres_rustls::MakeRustlsConnect::new(compute_config.tls.clone()); + let tls = >::make_tls_connect( + &mut mk_tls, + &self.hostname, + ) + .map_err(|e| { + CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + )) + })?; + + self.cancel_token.cancel_query_raw(socket, tls).await?; debug!("query was cancelled"); Ok(()) } - pub(crate) fn set_ip_allowlist(&mut self, ip_allowlist: Vec) { - self.ip_allowlist = ip_allowlist; - } } /// Helper for registering query cancellation tokens. -pub(crate) struct Session
<P>
{ +pub(crate) struct Session { /// The user-facing key identifying this session. key: CancelKeyData, - /// The [`CancelMap`] this session belongs to. - cancellation_handler: Arc>, + redis_key: String, + cancellation_handler: Arc, } -impl
<P>
Session
<P>
{ - /// Store the cancel token for the given session. - /// This enables query cancellation in `crate::proxy::prepare_client_connection`. - pub(crate) fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { - debug!("enabling query cancellation for this session"); - self.cancellation_handler - .map - .insert(self.key, Some(cancel_closure)); - - self.key +impl Session { + pub(crate) fn key(&self) -> &CancelKeyData { + &self.key } -} -impl
<P>
Drop for Session
<P>
{ - fn drop(&mut self) { - self.cancellation_handler.map.remove(&self.key); - debug!("dropped query cancellation key {}", &self.key); - } -} + // Send the store key op to the cancellation handler and set TTL for the key + pub(crate) async fn write_cancel_key( + &self, + cancel_closure: CancelClosure, + ) -> Result<(), CancelError> { + let Some(tx) = &self.cancellation_handler.tx else { + tracing::warn!("cancellation handler is not available"); + return Err(CancelError::InternalError); + }; -#[cfg(test)] -mod tests { - use super::*; + let closure_json = serde_json::to_string(&cancel_closure).map_err(|e| { + tracing::warn!("failed to serialize cancel closure: {e}"); + CancelError::InternalError + })?; - #[tokio::test] - async fn check_session_drop() -> anyhow::Result<()> { - let cancellation_handler = Arc::new(CancellationHandler::<()>::new( - CancelMap::default(), - CancellationSource::FromRedis, - )); - - let session = cancellation_handler.clone().get_session(); - assert!(cancellation_handler.contains(&session)); - drop(session); - // Check that the session has been dropped. - assert!(cancellation_handler.is_empty()); + let op = CancelKeyOp::StoreCancelKey { + key: self.redis_key.clone(), + field: "data".to_string(), + value: closure_json, + resp_tx: None, + _guard: Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::HSet), + expire: CANCEL_KEY_TTL, + }; + let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let key = self.key; + tracing::warn!("failed to send StoreCancelKey for {key}: {e}"); + }); Ok(()) } - #[tokio::test] - async fn cancel_session_noop_regression() { - let handler = - CancellationHandler::<()>::new(CancelMap::default(), CancellationSource::Local); - handler - .cancel_session( - CancelKeyData { - backend_pid: 0, - cancel_key: 0, - }, - Uuid::new_v4(), - "127.0.0.1".parse().unwrap(), - true, - ) - .await - .unwrap(); + pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> { + let Some(tx) = &self.cancellation_handler.tx else { + tracing::warn!("cancellation handler is not available"); + return Err(CancelError::InternalError); + }; + + let op = CancelKeyOp::RemoveCancelKey { + key: self.redis_key.clone(), + field: "data".to_string(), + resp_tx: None, + _guard: Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::HDel), + }; + + let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let key = self.key; + tracing::warn!("failed to send RemoveCancelKey for {key}: {e}"); + }); + Ok(()) } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 4113b5bb80..5447a4a4c0 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,32 +1,30 @@ use std::io; use std::net::SocketAddr; -use std::sync::Arc; use std::time::Duration; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use once_cell::sync::OnceCell; use postgres_client::tls::MakeTlsConnect; use postgres_client::{CancelToken, RawConnection}; use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; -use rustls::client::danger::ServerCertVerifier; -use rustls::crypto::ring; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; use tracing::{debug, error, info, warn}; +use crate::auth::backend::ComputeUserInfo; use crate::auth::parse_endpoint_param; use crate::cancellation::CancelClosure; +use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::client::ApiLockError; use 
crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::MetricsAuxInfo; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; -use crate::postgres_rustls::MakeRustlsConnect; use crate::proxy::neon_option; +use crate::tls::postgres_rustls::MakeRustlsConnect; use crate::types::Host; pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; @@ -41,9 +39,6 @@ pub(crate) enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] CouldNotConnect(#[from] io::Error), - #[error("Couldn't load native TLS certificates: {0:?}")] - TlsCertificateError(Vec), - #[error("{COULD_NOT_CONNECT}: {0}")] TlsError(#[from] InvalidDnsNameError), @@ -90,7 +85,6 @@ impl ReportableError for ConnectionError { } ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, - ConnectionError::TlsCertificateError(_) => crate::error::ErrorKind::Service, ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), @@ -143,8 +137,8 @@ impl ConnCfg { match k { // Only set `user` if it's not present in the config. // Console redirect auth flow takes username from the console's response. - "user" if self.user_is_set() => continue, - "database" if self.db_is_set() => continue, + "user" if self.user_is_set() => {} + "database" if self.db_is_set() => {} "options" => { if let Some(options) = filtered_options(v) { self.set_param(k, &options); @@ -200,11 +194,15 @@ impl ConnCfg { let connect_once = |host, port| { debug!("trying to connect to compute node at {host}:{port}"); - connect_with_timeout(host, port).and_then(|socket| async { - let socket_addr = socket.peer_addr()?; + connect_with_timeout(host, port).and_then(|stream| async { + let socket_addr = stream.peer_addr()?; + let socket = socket2::SockRef::from(&stream); + // Disable Nagle's algorithm to not introduce latency between + // client and compute. + socket.set_nodelay(true)?; // This prevents load balancer from severing the connection. - socket2::SockRef::from(&socket).set_keepalive(true)?; - Ok((socket_addr, socket)) + socket.set_keepalive(true)?; + Ok((socket_addr, stream)) }) }; @@ -251,35 +249,15 @@ impl ConnCfg { pub(crate) async fn connect( &self, ctx: &RequestContext, - allow_self_signed_compute: bool, aux: MetricsAuxInfo, - timeout: Duration, + config: &ComputeConfig, + user_info: ComputeUserInfo, ) -> Result { let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let (socket_addr, stream, host) = self.connect_raw(timeout).await?; + let (socket_addr, stream, host) = self.connect_raw(config.timeout).await?; drop(pause); - let client_config = if allow_self_signed_compute { - // Allow all certificates for creating the connection - let verifier = Arc::new(AcceptEverythingVerifier); - rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_safe_default_protocol_versions() - .expect("ring should support the default protocol versions") - .dangerous() - .with_custom_certificate_verifier(verifier) - } else { - let root_store = TLS_ROOTS - .get_or_try_init(load_certs) - .map_err(ConnectionError::TlsCertificateError)? 
- .clone(); - rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_safe_default_protocol_versions() - .expect("ring should support the default protocol versions") - .with_root_certificates(root_store) - }; - let client_config = client_config.with_no_client_auth(); - - let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config); + let mut mk_tls = crate::tls::postgres_rustls::MakeRustlsConnect::new(config.tls.clone()); let tls = >::make_tls_connect( &mut mk_tls, host, @@ -318,7 +296,8 @@ impl ConnCfg { process_id, secret_key, }, - vec![], + host.to_string(), + user_info, ); let connection = PostgresConnection { @@ -350,63 +329,6 @@ fn filtered_options(options: &str) -> Option { Some(options) } -fn load_certs() -> Result, Vec> { - let der_certs = rustls_native_certs::load_native_certs(); - - if !der_certs.errors.is_empty() { - return Err(der_certs.errors); - } - - let mut store = rustls::RootCertStore::empty(); - store.add_parsable_certificates(der_certs.certs); - Ok(Arc::new(store)) -} -static TLS_ROOTS: OnceCell> = OnceCell::new(); - -#[derive(Debug)] -struct AcceptEverythingVerifier; -impl ServerCertVerifier for AcceptEverythingVerifier { - fn supported_verify_schemes(&self) -> Vec { - use rustls::SignatureScheme; - // The schemes for which `SignatureScheme::supported_in_tls13` returns true. - vec![ - SignatureScheme::ECDSA_NISTP521_SHA512, - SignatureScheme::ECDSA_NISTP384_SHA384, - SignatureScheme::ECDSA_NISTP256_SHA256, - SignatureScheme::RSA_PSS_SHA512, - SignatureScheme::RSA_PSS_SHA384, - SignatureScheme::RSA_PSS_SHA256, - SignatureScheme::ED25519, - ] - } - fn verify_server_cert( - &self, - _end_entity: &rustls::pki_types::CertificateDer<'_>, - _intermediates: &[rustls::pki_types::CertificateDer<'_>], - _server_name: &rustls::pki_types::ServerName<'_>, - _ocsp_response: &[u8], - _now: rustls::pki_types::UnixTime, - ) -> Result { - Ok(rustls::client::danger::ServerCertVerified::assertion()) - } - fn verify_tls12_signature( - &self, - _message: &[u8], - _cert: &rustls::pki_types::CertificateDer<'_>, - _dss: &rustls::DigitallySignedStruct, - ) -> Result { - Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) - } - fn verify_tls13_signature( - &self, - _message: &[u8], - _cert: &rustls::pki_types::CertificateDer<'_>, - _dss: &rustls::DigitallySignedStruct, - ) -> Result { - Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/compute_ctl/mod.rs b/proxy/src/compute_ctl/mod.rs index 60fdf107d4..ab3179afb2 100644 --- a/proxy/src/compute_ctl/mod.rs +++ b/proxy/src/compute_ctl/mod.rs @@ -42,14 +42,14 @@ pub enum Privilege { #[derive(Error, Debug)] pub enum ComputeCtlError { #[error("connection error: {0}")] - ConnectionError(#[source] reqwest_middleware::Error), + Connection(#[source] reqwest_middleware::Error), #[error("request error [{status}]: {body:?}")] - RequestError { + Request { status: StatusCode, body: Option, }, #[error("response parsing error: {0}")] - ResponseError(#[source] reqwest::Error), + Response(#[source] reqwest::Error), } impl ComputeCtlApi { @@ -89,14 +89,14 @@ impl ComputeCtlApi { .json(req) .send() .await - .map_err(ComputeCtlError::ConnectionError)?; + .map_err(ComputeCtlError::Connection)?; let status = resp.status(); if status.is_client_error() || status.is_server_error() { let body = resp.json().await.ok(); - return Err(ComputeCtlError::RequestError { status, body }); + return Err(ComputeCtlError::Request { status, 
body }); } - resp.json().await.map_err(ComputeCtlError::ResponseError) + resp.json().await.map_err(ComputeCtlError::Response) } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 8bc8e3f96f..1bcd22e98f 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,31 +1,24 @@ -use std::collections::{HashMap, HashSet}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{bail, ensure, Context, Ok}; +use anyhow::{Context, Ok, bail, ensure}; use clap::ValueEnum; -use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::crypto::ring::{self, sign}; -use rustls::pki_types::{CertificateDer, PrivateKeyDer}; -use sha2::{Digest, Sha256}; -use tracing::{error, info}; -use x509_parser::oid_registry; -use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::AuthRateLimiter; +use crate::auth::backend::jwt::JwkCache; use crate::control_plane::locks::ApiLocks; use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}; use crate::scram::threadpool::ThreadPool; -use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; +use crate::serverless::cancel_set::CancelSet; +pub use crate::tls::server_config::{TlsConfig, configure_tls}; use crate::types::Host; pub struct ProxyConfig { pub tls_config: Option, pub metric_collection: Option, - pub allow_self_signed_compute: bool, pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub proxy_protocol_v2: ProxyProtocolV2, @@ -33,7 +26,13 @@ pub struct ProxyConfig { pub handshake_timeout: Duration, pub wake_compute_retry_config: RetryConfig, pub connect_compute_locks: ApiLocks, - pub connect_to_compute_retry_config: RetryConfig, + pub connect_to_compute: ComputeConfig, +} + +pub struct ComputeConfig { + pub retry: RetryConfig, + pub tls: Arc, + pub timeout: Duration, } #[derive(Copy, Clone, Debug, ValueEnum, PartialEq)] @@ -53,12 +52,6 @@ pub struct MetricCollectionConfig { pub backup_metric_collection_config: MetricBackupCollectionConfig, } -pub struct TlsConfig { - pub config: Arc, - pub common_names: HashSet, - pub cert_resolver: Arc, -} - pub struct HttpConfig { pub accept_websockets: bool, pub pool_options: GlobalConnPoolOptions, @@ -75,277 +68,13 @@ pub struct AuthenticationConfig { pub rate_limiter: AuthRateLimiter, pub rate_limit_ip_subnet: u8, pub ip_allowlist_check_enabled: bool, + pub is_vpc_acccess_proxy: bool, pub jwks_cache: JwkCache, pub is_auth_broker: bool, pub accept_jwts: bool, pub console_redirect_confirmation_timeout: tokio::time::Duration, } -impl TlsConfig { - pub fn to_server_config(&self) -> Arc { - self.config.clone() - } -} - -/// -pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; - -/// Configure TLS for the main endpoint. -pub fn configure_tls( - key_path: &str, - cert_path: &str, - certs_dir: Option<&String>, -) -> anyhow::Result { - let mut cert_resolver = CertResolver::new(); - - // add default certificate - cert_resolver.add_cert_path(key_path, cert_path, true)?; - - // add extra certificates - if let Some(certs_dir) = certs_dir { - for entry in std::fs::read_dir(certs_dir)? 
{ - let entry = entry?; - let path = entry.path(); - if path.is_dir() { - // file names aligned with default cert-manager names - let key_path = path.join("tls.key"); - let cert_path = path.join("tls.crt"); - if key_path.exists() && cert_path.exists() { - cert_resolver.add_cert_path( - &key_path.to_string_lossy(), - &cert_path.to_string_lossy(), - false, - )?; - } - } - } - } - - let common_names = cert_resolver.get_common_names(); - - let cert_resolver = Arc::new(cert_resolver); - - // allow TLS 1.2 to be compatible with older client libraries - let mut config = - rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("ring should support TLS1.2 and TLS1.3")? - .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()); - - config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; - - Ok(TlsConfig { - config: Arc::new(config), - common_names, - cert_resolver, - }) -} - -/// Channel binding parameter -/// -/// -/// Description: The hash of the TLS server's certificate as it -/// appears, octet for octet, in the server's Certificate message. Note -/// that the Certificate message contains a certificate_list, in which -/// the first element is the server's certificate. -/// -/// The hash function is to be selected as follows: -/// -/// * if the certificate's signatureAlgorithm uses a single hash -/// function, and that hash function is either MD5 or SHA-1, then use SHA-256; -/// -/// * if the certificate's signatureAlgorithm uses a single hash -/// function and that hash function neither MD5 nor SHA-1, then use -/// the hash function associated with the certificate's -/// signatureAlgorithm; -/// -/// * if the certificate's signatureAlgorithm uses no hash functions or -/// uses multiple hash functions, then this channel binding type's -/// channel bindings are undefined at this time (updates to is channel -/// binding type may occur to address this issue if it ever arises). -#[derive(Debug, Clone, Copy)] -pub enum TlsServerEndPoint { - Sha256([u8; 32]), - Undefined, -} - -impl TlsServerEndPoint { - pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result { - let sha256_oids = [ - // I'm explicitly not adding MD5 or SHA1 here... They're bad. - oid_registry::OID_SIG_ECDSA_WITH_SHA256, - oid_registry::OID_PKCS1_SHA256WITHRSA, - ]; - - let pem = x509_parser::parse_x509_certificate(cert) - .context("Failed to parse PEM object from cerficiate")? 
- .1; - - info!(subject = %pem.subject, "parsing TLS certificate"); - - let reg = oid_registry::OidRegistry::default().with_all_crypto(); - let oid = pem.signature_algorithm.oid(); - let alg = reg.get(oid); - if sha256_oids.contains(oid) { - let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); - info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); - Ok(Self::Sha256(tls_server_end_point)) - } else { - error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding"); - Ok(Self::Undefined) - } - } - - pub fn supported(&self) -> bool { - !matches!(self, TlsServerEndPoint::Undefined) - } -} - -#[derive(Default, Debug)] -pub struct CertResolver { - certs: HashMap, TlsServerEndPoint)>, - default: Option<(Arc, TlsServerEndPoint)>, -} - -impl CertResolver { - pub fn new() -> Self { - Self::default() - } - - fn add_cert_path( - &mut self, - key_path: &str, - cert_path: &str, - is_default: bool, - ) -> anyhow::Result<()> { - let priv_key = { - let key_bytes = std::fs::read(key_path) - .context(format!("Failed to read TLS keys at '{key_path}'"))?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); - - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - PrivateKeyDer::Pkcs8( - keys.pop() - .unwrap() - .context(format!("Failed to parse TLS keys at '{key_path}'"))?, - ) - }; - - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? - }; - - self.add_cert(priv_key, cert_chain, is_default) - } - - pub fn add_cert( - &mut self, - priv_key: PrivateKeyDer<'static>, - cert_chain: Vec>, - is_default: bool, - ) -> anyhow::Result<()> { - let key = sign::any_supported_type(&priv_key).context("invalid private key")?; - - let first_cert = &cert_chain[0]; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let pem = x509_parser::parse_x509_certificate(first_cert) - .context("Failed to parse PEM object from cerficiate")? - .1; - - let common_name = pem.subject().to_string(); - - // We need to get the canonical name for this certificate so we can match them against any domain names - // seen within the proxy codebase. - // - // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. - // We need to remove the wildcard prefix for the purposes of certificate selection. - // - // auth-broker does not use SNI and instead uses the Neon-Connection-String header. - // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. 
- // - // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string - // validation, so let's we can continue with any common-name - let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=") { - s.to_string() - } else { - bail!("Failed to parse common name from certificate") - }; - - let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); - - if is_default { - self.default = Some((cert.clone(), tls_server_end_point)); - } - - self.certs.insert(common_name, (cert, tls_server_end_point)); - - Ok(()) - } - - pub fn get_common_names(&self) -> HashSet { - self.certs.keys().map(|s| s.to_string()).collect() - } -} - -impl rustls::server::ResolvesServerCert for CertResolver { - fn resolve( - &self, - client_hello: rustls::server::ClientHello<'_>, - ) -> Option> { - self.resolve(client_hello.server_name()).map(|x| x.0) - } -} - -impl CertResolver { - pub fn resolve( - &self, - server_name: Option<&str>, - ) -> Option<(Arc, TlsServerEndPoint)> { - // loop here and cut off more and more subdomains until we find - // a match to get a proper wildcard support. OTOH, we now do not - // use nested domains, so keep this simple for now. - // - // With the current coding foo.com will match *.foo.com and that - // repeats behavior of the old code. - if let Some(mut sni_name) = server_name { - loop { - if let Some(cert) = self.certs.get(sni_name) { - return Some(cert.clone()); - } - if let Some((_, rest)) = sni_name.split_once('.') { - sni_name = rest; - } else { - return None; - } - } - } else { - // No SNI, use the default certificate, otherwise we can't get to - // options parameter which can be used to set endpoint name too. - // That means that non-SNI flow will not work for CNAME domains in - // verify-full mode. - // - // If that will be a problem we can: - // - // a) Instead of multi-cert approach use single cert with extra - // domains listed in Subject Alternative Name (SAN). - // b) Deploy separate proxy instances for extra domains. - self.default.clone() - } - } -} - #[derive(Debug)] pub struct EndpointCacheConfig { /// Batch size to receive all endpoints on the startup. @@ -368,8 +97,7 @@ pub struct EndpointCacheConfig { impl EndpointCacheConfig { /// Default options for [`crate::control_plane::NodeInfoCache`]. /// Notice that by default the limiter is empty, which means that cache is disabled. - pub const CACHE_DEFAULT_OPTIONS: &'static str = - "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + pub const CACHE_DEFAULT_OPTIONS: &'static str = "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; /// Parse cache options passed via cmdline. /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. 
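// Standalone sketch of the suffix-stripping lookup performed by `resolve`
// above: try the full SNI name first, then repeatedly drop the leftmost DNS
// label, so an entry stored under "foo.com" also matches "bar.foo.com". The
// map value type is simplified for illustration.
use std::collections::HashMap;

fn resolve_sni<'a>(certs: &'a HashMap<String, String>, sni: &str) -> Option<&'a String> {
    let mut name = sni;
    loop {
        if let Some(cert) = certs.get(name) {
            return Some(cert);
        }
        match name.split_once('.') {
            Some((_, rest)) => name = rest, // cut one subdomain level and retry
            None => return None,            // no labels left: no match
        }
    }
}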
@@ -422,7 +150,6 @@ impl FromStr for EndpointCacheConfig { } #[derive(Debug)] pub struct MetricBackupCollectionConfig { - pub interval: Duration, pub remote_storage_config: Option, pub chunk_size: usize, } diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 65702e0e4c..4662860b3f 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -3,20 +3,20 @@ use std::sync::Arc; use futures::{FutureExt, TryFutureExt}; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, Instrument}; +use tracing::{Instrument, debug, error, info}; use crate::auth::backend::ConsoleRedirectBackend; -use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; -use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism}; -use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol}; +use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute}; +use crate::proxy::handshake::{HandshakeData, handshake}; use crate::proxy::passthrough::ProxyPassthrough; use crate::proxy::{ - prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, + ClientRequestError, ErrorSource, prepare_client_connection, run_until_cancelled, }; pub async fn task_main( @@ -24,7 +24,7 @@ pub async fn task_main( backend: &'static ConsoleRedirectBackend, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, - cancellation_handler: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ info!("proxy has shut down"); @@ -64,25 +64,37 @@ pub async fn task_main( debug!("healthcheck received"); return; } - Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + Ok((_socket, ConnectHeader::Missing)) + if config.proxy_protocol_v2 == ProxyProtocolV2::Required => + { error!("missing required proxy protocol header"); return; } - Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + Ok((_socket, ConnectHeader::Proxy(_))) + if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => + { error!("proxy protocol header not supported"); return; } Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), - Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo{ addr: peer_addr, extra: None }), + Ok((socket, ConnectHeader::Missing)) => ( + socket, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + ), }; match socket.inner.set_nodelay(true) { Ok(()) => {} Err(e) => { - error!("per-client task finished with an error: failed to set socket option: {e:#}"); + error!( + "per-client task finished with an error: failed to set socket option: {e:#}" + ); return; } - }; + } let ctx = RequestContext::new( session_id, @@ -115,13 +127,19 @@ pub async fn task_main( Ok(Some(p)) => { ctx.set_success(); let _disconnect = ctx.log_connect(); - match p.proxy_pass().await { + match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { - error!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); + error!( + ?session_id, + "per-client task finished with an IO error from the client: {e:#}" + ); } Err(ErrorSource::Compute(e)) => { - error!(?session_id, "per-client task finished with an IO error from the compute: {e:#}"); + error!( + ?session_id, + "per-client task finished with an IO error from the compute: {e:#}" + ); } } } @@ -140,15 +158,16 @@ pub async fn task_main( Ok(()) } +#[allow(clippy::too_many_arguments)] pub(crate) async fn handle_client( config: &'static ProxyConfig, backend: &'static ConsoleRedirectBackend, ctx: &RequestContext, - cancellation_handler: Arc, + cancellation_handler: Arc, stream: S, conn_gauge: NumClientConnectionsGuard<'static>, cancellations: tokio_util::task::task_tracker::TaskTracker, -) -> Result>, ClientRequestError> { +) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), "handling interactive connection from client" @@ -159,6 +178,7 @@ pub(crate) async fn handle_client( let request_gauge = metrics.connection_requests.guard(proto); let tls = config.tls_config.as_ref(); + let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); let do_handshake = handshake(ctx, stream, tls, record_handshake_error); @@ -170,24 +190,22 @@ pub(crate) async fn handle_client( HandshakeData::Cancel(cancel_key_data) => { // spawn a task to cancel the session, but don't wait for it cancellations.spawn({ - let cancellation_handler_clone = Arc::clone(&cancellation_handler); - let session_id = ctx.session_id(); - let peer_ip = ctx.peer_addr(); - let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id); + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let ctx = ctx.clone(); + let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id()); cancel_span.follows_from(tracing::Span::current()); async move { - 
drop( - cancellation_handler_clone - .cancel_session( - cancel_key_data, - session_id, - peer_ip, - config.authentication_config.ip_allowlist_check_enabled, - ) - .instrument(cancel_span) - .await, - ); - } + cancellation_handler_clone + .cancel_session( + cancel_key_data, + ctx, + config.authentication_config.ip_allowlist_check_enabled, + config.authentication_config.is_vpc_acccess_proxy, + backend.get_api(), + ) + .await + .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); + }.instrument(cancel_span) }); return Ok(None); @@ -197,7 +215,7 @@ pub(crate) async fn handle_client( ctx.set_db_options(params.clone()); - let (user_info, ip_allowlist) = match backend + let (node_info, user_info, _ip_allowlist) = match backend .authenticate(ctx, &config.authentication_config, &mut stream) .await { @@ -210,22 +228,26 @@ pub(crate) async fn handle_client( let mut node = connect_to_compute( ctx, &TcpMechanism { + user_info, params_compat: true, params: ¶ms, locks: &config.connect_compute_locks, }, - &user_info, - config.allow_self_signed_compute, + &node_info, config.wake_compute_retry_config, - config.connect_to_compute_retry_config, + &config.connect_to_compute, ) .or_else(|e| stream.throw_error(e)) .await?; - node.cancel_closure - .set_ip_allowlist(ip_allowlist.unwrap_or_default()); - let session = cancellation_handler.get_session(); - prepare_client_connection(&node, &session, &mut stream).await?; + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session = cancellation_handler_clone.get_key(); + + session + .write_cancel_key(node.cancel_closure.clone()) + .await?; + + prepare_client_connection(&node, *session.key(), &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the // PqStream input buffer. 
Normally there is none, but our serverless npm @@ -237,10 +259,11 @@ pub(crate) async fn handle_client( Ok(Some(ProxyPassthrough { client: stream, aux: node.aux.clone(), + private_link_id: None, compute: node, session_id: ctx.session_id(), + cancel: session, _req: request_gauge, _conn: conn_gauge, - _cancel: session, })) } diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index a9fb513d3c..74b48a1bea 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -8,7 +8,7 @@ use pq_proto::StartupMessageParams; use smol_str::SmolStr; use tokio::sync::mpsc; use tracing::field::display; -use tracing::{debug, error, info_span, Span}; +use tracing::{Span, debug, error, info_span}; use try_lock::TryLock; use uuid::Uuid; @@ -19,7 +19,7 @@ use crate::intern::{BranchIdInt, ProjectIdInt}; use crate::metrics::{ ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, }; -use crate::protocol2::ConnectionInfo; +use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra}; use crate::types::{DbName, EndpointId, RoleName}; pub mod parquet; @@ -312,6 +312,15 @@ impl RequestContext { .ip() } + pub(crate) fn extra(&self) -> Option { + self.0 + .try_lock() + .expect("should not deadlock") + .conn_info + .extra + .clone() + } + pub(crate) fn cold_start_info(&self) -> ColdStartInfo { self.0 .try_lock() diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 3105d08526..f029327266 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -8,7 +8,7 @@ use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; use parquet::basic::Compression; use parquet::file::metadata::RowGroupMetaDataPtr; -use parquet::file::properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}; +use parquet::file::properties::{DEFAULT_PAGE_SIZE, WriterProperties, WriterPropertiesPtr}; use parquet::file::writer::SerializedFileWriter; use parquet::record::RecordWriter; use pq_proto::StartupMessageParams; @@ -17,12 +17,13 @@ use serde::ser::SerializeMap; use tokio::sync::mpsc; use tokio::time; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, Span}; +use tracing::{Span, debug, info}; use utils::backoff; -use super::{RequestContextInner, LOG_CHAN}; +use super::{LOG_CHAN, RequestContextInner}; use crate::config::remote_storage_from_toml; use crate::context::LOG_CHAN_DISCONNECT; +use crate::ext::TaskExt; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -171,7 +172,9 @@ pub async fn worker( }; let (tx, mut rx) = mpsc::unbounded_channel(); - LOG_CHAN.set(tx.downgrade()).unwrap(); + LOG_CHAN + .set(tx.downgrade()) + .expect("only one worker should set the channel"); // setup row stream that will close on cancellation let cancellation_token2 = cancellation_token.clone(); @@ -207,7 +210,9 @@ pub async fn worker( config.parquet_upload_disconnect_events_remote_storage { let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel(); - LOG_CHAN_DISCONNECT.set(tx_disconnect.downgrade()).unwrap(); + LOG_CHAN_DISCONNECT + .set(tx_disconnect.downgrade()) + .expect("only one worker should set the channel"); // setup row stream that will close on cancellation tokio::spawn(async move { @@ -326,7 +331,7 @@ where Ok::<_, parquet::errors::ParquetError>((rows, w, rg_meta)) }) .await - .unwrap()?; + .propagate_task_panic()?; rows.clear(); Ok((rows, w, rg_meta)) @@ -352,7 +357,7 @@ async fn upload_parquet( Ok((buffer, metadata)) }) .await - .unwrap()?; + .propagate_task_panic()?; let data = 
buffer.split().freeze(); @@ -398,17 +403,18 @@ async fn upload_parquet( .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) - .context("request_data_upload") + .with_context(|| format!("request_data_upload: path={path}")) .err(); if let Some(err) = maybe_err { - tracing::error!(%id, error = ?err, "failed to upload request data"); + tracing::error!(%id, %path, error = ?err, "failed to upload request data"); } Ok(buffer.writer()) } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::net::Ipv4Addr; use std::num::NonZeroUsize; @@ -419,20 +425,20 @@ mod tests { use futures::{Stream, StreamExt}; use itertools::Itertools; use parquet::basic::{Compression, ZstdLevel}; - use parquet::file::properties::{WriterProperties, DEFAULT_PAGE_SIZE}; + use parquet::file::properties::{DEFAULT_PAGE_SIZE, WriterProperties}; use parquet::file::reader::FileReader; use parquet::file::serialized_reader::SerializedFileReader; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use remote_storage::{ - GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, }; use tokio::sync::mpsc; use tokio::time; use walkdir::WalkDir; - use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData}; + use super::{ParquetConfig, ParquetUploadArgs, RequestData, worker_inner}; #[derive(Parser)] struct ProxyCliArgs { @@ -508,26 +514,26 @@ mod tests { fn generate_request_data(rng: &mut impl Rng) -> RequestData { RequestData { - session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(), - peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(), + session_id: uuid::Builder::from_random_bytes(rng.r#gen()).into_uuid(), + peer_addr: Ipv4Addr::from(rng.r#gen::<[u8; 4]>()).to_string(), timestamp: chrono::DateTime::from_timestamp_millis( rng.gen_range(1703862754..1803862754), ) .unwrap() .naive_utc(), application_name: Some("test".to_owned()), - username: Some(hex::encode(rng.gen::<[u8; 4]>())), - endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), - database: Some(hex::encode(rng.gen::<[u8; 16]>())), - project: Some(hex::encode(rng.gen::<[u8; 16]>())), - branch: Some(hex::encode(rng.gen::<[u8; 16]>())), + username: Some(hex::encode(rng.r#gen::<[u8; 4]>())), + endpoint_id: Some(hex::encode(rng.r#gen::<[u8; 16]>())), + database: Some(hex::encode(rng.r#gen::<[u8; 16]>())), + project: Some(hex::encode(rng.r#gen::<[u8; 16]>())), + branch: Some(hex::encode(rng.r#gen::<[u8; 16]>())), pg_options: None, auth_method: None, jwt_issuer: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, - success: rng.gen(), + success: rng.r#gen(), cold_start_info: "no", duration_us: rng.gen_range(0..30_000_000), disconnect_timestamp: None, diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index e33a37f643..977fcf4727 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -3,16 +3,16 @@ use std::sync::Arc; use std::time::Duration; -use ::http::header::AUTHORIZATION; use ::http::HeaderName; +use ::http::header::AUTHORIZATION; use futures::TryFutureExt; use postgres_client::config::SslMode; use tokio::time::Instant; -use tracing::{debug, info, info_span, warn, Instrument}; +use tracing::{Instrument, debug, info, info_span, warn}; use 
super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute}; -use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::ComputeUserInfo; +use crate::auth::backend::jwt::AuthRule; use crate::cache::Cached; use crate::context::RequestContext; use crate::control_plane::caches::ApiCaches; @@ -22,14 +22,15 @@ use crate::control_plane::errors::{ use crate::control_plane::locks::ApiLocks; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; use crate::control_plane::{ - AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, + AccessBlockerFlags, AuthInfo, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps, + CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, NodeInfo, }; use crate::metrics::{CacheOutcome, Metrics}; use crate::rate_limiter::WakeComputeRateLimiter; use crate::types::{EndpointCacheKey, EndpointId}; use crate::{compute, http, scram}; -const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); +pub(crate) const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); #[derive(Clone)] pub struct NeonControlPlaneClient { @@ -78,15 +79,30 @@ impl NeonControlPlaneClient { info!("endpoint is not valid, skipping the request"); return Ok(AuthInfo::default()); } - let request_id = ctx.session_id().to_string(); - let application_name = ctx.console_application_name(); + self.do_get_auth_req(user_info, &ctx.session_id(), Some(ctx)) + .await + } + + async fn do_get_auth_req( + &self, + user_info: &ComputeUserInfo, + session_id: &uuid::Uuid, + ctx: Option<&RequestContext>, + ) -> Result { + let request_id: String = session_id.to_string(); + let application_name = if let Some(ctx) = ctx { + ctx.console_application_name() + } else { + "auth_cancellation".to_string() + }; + async { let request = self .endpoint .get_path("get_endpoint_access_control") .header(X_REQUEST_ID, &request_id) .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id())]) + .query(&[("session_id", session_id)]) .query(&[ ("application_name", application_name.as_str()), ("endpointish", user_info.endpoint.as_str()), @@ -96,9 +112,16 @@ impl NeonControlPlaneClient { debug!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self.endpoint.execute(request).await?; - drop(pause); + let response = match ctx { + Some(ctx) => { + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let rsp = self.endpoint.execute(request).await; + drop(pause); + rsp? 
+ } + None => self.endpoint.execute(request).await?, + }; + info!(duration = ?start.elapsed(), "received http response"); let body = match parse_body::(response).await { Ok(body) => body, @@ -115,9 +138,6 @@ impl NeonControlPlaneClient { } }; - // Ivan: don't know where it will be used, so I leave it here - let _endpoint_vpc_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); - let secret = if body.role_secret.is_empty() { None } else { @@ -131,10 +151,23 @@ impl NeonControlPlaneClient { .proxy .allowed_ips_number .observe(allowed_ips.len() as f64); + let allowed_vpc_endpoint_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); + Metrics::get() + .proxy + .allowed_vpc_endpoint_ids + .observe(allowed_vpc_endpoint_ids.len() as f64); + let block_public_connections = body.block_public_connections.unwrap_or_default(); + let block_vpc_connections = body.block_vpc_connections.unwrap_or_default(); Ok(AuthInfo { secret, allowed_ips, + allowed_vpc_endpoint_ids, project_id: body.project_id, + account_id: body.account_id, + access_blocker_flags: AccessBlockerFlags { + public_access_blocked: block_public_connections, + vpc_access_blocked: block_vpc_connections, + }, }) } .inspect_err(|e| tracing::debug!(error = ?e)) @@ -250,7 +283,6 @@ impl NeonControlPlaneClient { let node = NodeInfo { config, aux: body.aux, - allow_self_signed_compute: false, }; Ok(node) @@ -278,6 +310,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { return Ok(role_secret); } let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let account_id = auth_info.account_id; if let Some(project_id) = auth_info.project_id { let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( @@ -291,24 +324,35 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { normalized_ep_int, Arc::new(auth_info.allowed_ips), ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + Arc::new(auth_info.allowed_vpc_endpoint_ids), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + auth_info.access_blocker_flags, + ); ctx.set_project_id(project_id); } // When we just got a secret, we don't need to invalidate it. Ok(Cached::new_uncached(auth_info.secret)) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + ) -> Result { let normalized_ep = &user_info.endpoint.normalize(); if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { Metrics::get() .proxy - .allowed_ips_cache_misses + .allowed_ips_cache_misses // TODO SR: Should we rename this variable to something like allowed_ip_cache_stats? 
.inc(CacheOutcome::Hit); - return Ok((allowed_ips, None)); + return Ok(allowed_ips); } Metrics::get() .proxy @@ -316,7 +360,10 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { .inc(CacheOutcome::Miss); let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; let user = &user_info.user; + let account_id = auth_info.account_id; if let Some(project_id) = auth_info.project_id { let normalized_ep_int = normalized_ep.into(); self.caches.project_info.insert_role_secret( @@ -330,12 +377,136 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { normalized_ep_int, allowed_ips.clone(), ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags, + ); ctx.set_project_id(project_id); } - Ok(( - Cached::new_uncached(allowed_ips), - Some(Cached::new_uncached(auth_info.secret)), - )) + Ok(Cached::new_uncached(allowed_ips)) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_vpc_endpoint_ids) = self + .caches + .project_info + .get_allowed_vpc_endpoint_ids(normalized_ep) + { + Metrics::get() + .proxy + .vpc_endpoint_id_cache_stats + .inc(CacheOutcome::Hit); + return Ok(allowed_vpc_endpoint_ids); + } + + Metrics::get() + .proxy + .vpc_endpoint_id_cache_stats + .inc(CacheOutcome::Miss); + + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = auth_info.access_blocker_flags; + let user = &user_info.user; + let account_id = auth_info.account_id; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags, + ); + ctx.set_project_id(project_id); + } + Ok(Cached::new_uncached(allowed_vpc_endpoint_ids)) + } + + async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(access_blocker_flags) = self + .caches + .project_info + .get_block_public_or_vpc_access(normalized_ep) + { + Metrics::get() + .proxy + .access_blocker_flags_cache_stats + .inc(CacheOutcome::Hit); + return Ok(access_blocker_flags); + } + + Metrics::get() + .proxy + .access_blocker_flags_cache_stats + .inc(CacheOutcome::Miss); + + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids); + let access_blocker_flags = 
auth_info.access_blocker_flags; + let user = &user_info.user; + let account_id = auth_info.account_id; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + self.caches.project_info.insert_allowed_vpc_endpoint_ids( + account_id, + project_id, + normalized_ep_int, + allowed_vpc_endpoint_ids.clone(), + ); + self.caches.project_info.insert_block_public_or_vpc_access( + project_id, + normalized_ep_int, + access_blocker_flags.clone(), + ); + ctx.set_project_id(project_id); + } + Ok(Cached::new_uncached(access_blocker_flags)) } #[tracing::instrument(skip_all)] diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index eaf692ab27..7da5464aa5 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -6,19 +6,21 @@ use std::sync::Arc; use futures::TryFutureExt; use thiserror::Error; use tokio_postgres::Client; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{Instrument, error, info, info_span, warn}; -use crate::auth::backend::jwt::AuthRule; -use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; +use crate::auth::backend::ComputeUserInfo; +use crate::auth::backend::jwt::AuthRule; use crate::cache::Cached; use crate::context::RequestContext; -use crate::control_plane::client::{CachedAllowedIps, CachedRoleSecret}; +use crate::control_plane::client::{ + CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedRoleSecret, +}; use crate::control_plane::errors::{ ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, }; use crate::control_plane::messages::MetricsAuxInfo; -use crate::control_plane::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; +use crate::control_plane::{AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; use crate::error::io_error; use crate::intern::RoleNameInt; use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; @@ -102,7 +104,9 @@ impl MockControlPlane { Some(s) => { info!("got allowed_ips: {s}"); s.split(',') - .map(|s| IpPattern::from_str(s).unwrap()) + .map(|s| { + IpPattern::from_str(s).expect("mocked ip pattern should be correct") + }) .collect() } None => vec![], @@ -119,7 +123,10 @@ impl MockControlPlane { Ok(AuthInfo { secret, allowed_ips, + allowed_vpc_endpoint_ids: vec![], project_id: None, + account_id: None, + access_blocker_flags: AccessBlockerFlags::default(), }) } @@ -174,7 +181,6 @@ impl MockControlPlane { branch_id: (&BranchId::from("branch")).into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, - allow_self_signed_compute: false, }; Ok(node) @@ -213,16 +219,35 @@ impl super::ControlPlaneApi for MockControlPlane { )) } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, _ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - Ok(( - Cached::new_uncached(Arc::new( - self.do_get_auth_info(user_info).await?.allowed_ips, - )), - None, + ) -> Result { + Ok(Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info).await?.allowed_ips, + ))) + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + _ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + Ok(Cached::new_uncached(Arc::new( + 
self.do_get_auth_info(user_info) + .await? + .allowed_vpc_endpoint_ids, + ))) + } + + async fn get_block_public_or_vpc_access( + &self, + _ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + Ok(Cached::new_uncached( + self.do_get_auth_info(user_info).await?.access_blocker_flags, )) } diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index 7ef5a9c9fd..746595de38 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -1,24 +1,24 @@ pub mod cplane_proxy_v1; #[cfg(any(test, feature = "testing"))] pub mod mock; -pub mod neon; use std::hash::Hash; use std::sync::Arc; use std::time::Duration; -use dashmap::DashMap; +use clashmap::ClashMap; use tokio::time::Instant; use tracing::{debug, info}; -use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; use crate::auth::backend::ComputeUserInfo; +use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; use crate::cache::endpoints::EndpointsCache; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; use crate::context::RequestContext; use crate::control_plane::{ - errors, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, + CachedAccessBlockerFlags, CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedNodeInfo, + CachedRoleSecret, ControlPlaneApi, NodeInfoCache, errors, }; use crate::error::ReportableError; use crate::metrics::ApiLockMetrics; @@ -28,10 +28,8 @@ use crate::types::EndpointId; #[non_exhaustive] #[derive(Clone)] pub enum ControlPlaneClient { - /// New Proxy V1 control plane API + /// Proxy V1 control plane API ProxyV1(cplane_proxy_v1::NeonControlPlaneClient), - /// Current Management API (V2). - Neon(neon::NeonControlPlaneClient), /// Local mock control plane. 
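// The ProxyV1 client methods above (`get_allowed_ips`,
// `get_allowed_vpc_endpoint_ids`, `get_block_public_or_vpc_access`) share one
// cache-aside shape: consult the per-endpoint cache, and on a miss fetch the
// AuthInfo once and repopulate every related cache before returning. A
// simplified synchronous sketch with placeholder types (the real code goes
// through the project-info cache and `Cached` wrappers):
use std::collections::HashMap;
use std::hash::Hash;

fn cache_aside<K: Eq + Hash + Clone, V: Clone>(
    cache: &mut HashMap<K, V>,
    key: &K,
    fetch: impl FnOnce() -> V,
) -> V {
    if let Some(hit) = cache.get(key) {
        return hit.clone(); // cache hit: no control-plane round trip
    }
    let value = fetch(); // cache miss: fetch once, remember for next time
    cache.insert(key.clone(), value.clone());
    value
}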
#[cfg(any(test, feature = "testing"))] PostgresMock(mock::MockControlPlane), @@ -49,7 +47,6 @@ impl ControlPlaneApi for ControlPlaneClient { ) -> Result { match self { Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await, - Self::Neon(api) => api.get_role_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await, #[cfg(test)] @@ -59,18 +56,45 @@ impl ControlPlaneApi for ControlPlaneClient { } } - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { + ) -> Result { match self { - Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::ProxyV1(api) => api.get_allowed_ips(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] - Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Self::PostgresMock(api) => api.get_allowed_ips(ctx, user_info).await, #[cfg(test)] - Self::Test(api) => api.get_allowed_ips_and_secret(), + Self::Test(api) => api.get_allowed_ips(), + } + } + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + match self { + Self::ProxyV1(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => api.get_allowed_vpc_endpoint_ids(), + } + } + + async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + match self { + Self::ProxyV1(api) => api.get_block_public_or_vpc_access(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.get_block_public_or_vpc_access(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => api.get_block_public_or_vpc_access(), } } @@ -81,7 +105,6 @@ impl ControlPlaneApi for ControlPlaneClient { ) -> Result, errors::GetEndpointJwksError> { match self { Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await, - Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(test)] @@ -96,7 +119,6 @@ impl ControlPlaneApi for ControlPlaneClient { ) -> Result { match self { Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await, - Self::Neon(api) => api.wake_compute(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await, #[cfg(test)] @@ -109,9 +131,15 @@ impl ControlPlaneApi for ControlPlaneClient { pub(crate) trait TestControlPlaneClient: Send + Sync + 'static { fn wake_compute(&self) -> Result; - fn get_allowed_ips_and_secret( + fn get_allowed_ips(&self) -> Result; + + fn get_allowed_vpc_endpoint_ids( &self, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + ) -> Result; + + fn get_block_public_or_vpc_access( + &self, + ) -> Result; fn dyn_clone(&self) -> Box; } @@ -155,7 +183,7 @@ impl ApiCaches { /// Various caches for [`control_plane`](super). 
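// Sketch of the per-key locking scheme used by the `ApiLocks` type that
// follows: one semaphore per endpoint key handed out as `Arc` clones, plus a
// periodic sweep that drops semaphores only the map itself still references
// (`strong_count == 1`), mirroring the `extract_if` reclamation pass further
// below. A plain `HashMap` stands in for `ClashMap`, and the permit count is
// illustrative.
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::Semaphore;

#[derive(Default)]
struct NodeLocks {
    locks: HashMap<String, Arc<Semaphore>>,
}

impl NodeLocks {
    fn get(&mut self, key: &str) -> Arc<Semaphore> {
        self.locks
            .entry(key.to_owned())
            .or_insert_with(|| Arc::new(Semaphore::new(8)))
            .clone()
    }

    fn sweep(&mut self) {
        // Callers waiting on or holding a permit also hold an Arc clone, so
        // their entries survive; idle entries are reclaimed.
        self.locks.retain(|_, sem| Arc::strong_count(sem) > 1);
    }
}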
pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, + node_locks: ClashMap>, config: RateLimiterConfig, timeout: Duration, epoch: std::time::Duration, @@ -184,15 +212,15 @@ impl ApiLocks { timeout: Duration, epoch: std::time::Duration, metrics: &'static ApiLockMetrics, - ) -> prometheus::Result { - Ok(Self { + ) -> Self { + Self { name, - node_locks: DashMap::with_shard_amount(shards), + node_locks: ClashMap::with_shard_amount(shards), config, timeout, epoch, metrics, - }) + } } pub(crate) async fn get_permit(&self, key: &K) -> Result { @@ -245,7 +273,7 @@ impl ApiLocks { let mut lock = shard.write(); let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock - .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) + .extract_if(|(_, semaphore)| Arc::strong_count(semaphore) == 1) .count(); drop(lock); self.metrics.semaphores_unregistered.inc_by(count as u64); diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs deleted file mode 100644 index bf62c0d6ab..0000000000 --- a/proxy/src/control_plane/client/neon.rs +++ /dev/null @@ -1,511 +0,0 @@ -//! Stale console backend, remove after migrating to Proxy V1 API (#15245). - -use std::sync::Arc; -use std::time::Duration; - -use ::http::header::AUTHORIZATION; -use ::http::HeaderName; -use futures::TryFutureExt; -use postgres_client::config::SslMode; -use tokio::time::Instant; -use tracing::{debug, info, info_span, warn, Instrument}; - -use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute}; -use crate::auth::backend::jwt::AuthRule; -use crate::auth::backend::ComputeUserInfo; -use crate::cache::Cached; -use crate::context::RequestContext; -use crate::control_plane::caches::ApiCaches; -use crate::control_plane::errors::{ - ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, -}; -use crate::control_plane::locks::ApiLocks; -use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; -use crate::control_plane::{ - AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, -}; -use crate::metrics::{CacheOutcome, Metrics}; -use crate::rate_limiter::WakeComputeRateLimiter; -use crate::types::{EndpointCacheKey, EndpointId}; -use crate::{compute, http, scram}; - -const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); - -#[derive(Clone)] -pub struct NeonControlPlaneClient { - endpoint: http::Endpoint, - pub caches: &'static ApiCaches, - pub(crate) locks: &'static ApiLocks, - pub(crate) wake_compute_endpoint_rate_limiter: Arc, - // put in a shared ref so we don't copy secrets all over in memory - jwt: Arc, -} - -impl NeonControlPlaneClient { - /// Construct an API object containing the auth parameters. - pub fn new( - endpoint: http::Endpoint, - jwt: Arc, - caches: &'static ApiCaches, - locks: &'static ApiLocks, - wake_compute_endpoint_rate_limiter: Arc, - ) -> Self { - Self { - endpoint, - caches, - locks, - wake_compute_endpoint_rate_limiter, - jwt, - } - } - - pub(crate) fn url(&self) -> &str { - self.endpoint.url().as_str() - } - - async fn do_get_auth_info( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> Result { - if !self - .caches - .endpoints_cache - .is_valid(ctx, &user_info.endpoint.normalize()) - { - // TODO: refactor this because it's weird - // this is a failure to authenticate but we return Ok. 
- info!("endpoint is not valid, skipping the request"); - return Ok(AuthInfo::default()); - } - let request_id = ctx.session_id().to_string(); - let application_name = ctx.console_application_name(); - async { - let request = self - .endpoint - .get_path("proxy_get_role_secret") - .header(X_REQUEST_ID, &request_id) - .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id())]) - .query(&[ - ("application_name", application_name.as_str()), - ("project", user_info.endpoint.as_str()), - ("role", user_info.user.as_str()), - ]) - .build()?; - - debug!(url = request.url().as_str(), "sending http request"); - let start = Instant::now(); - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self.endpoint.execute(request).await?; - drop(pause); - info!(duration = ?start.elapsed(), "received http response"); - let body = match parse_body::(response).await { - Ok(body) => body, - // Error 404 is special: it's ok not to have a secret. - // TODO(anna): retry - Err(e) => { - return if e.get_reason().is_not_found() { - // TODO: refactor this because it's weird - // this is a failure to authenticate but we return Ok. - Ok(AuthInfo::default()) - } else { - Err(e.into()) - }; - } - }; - - let secret = if body.role_secret.is_empty() { - None - } else { - let secret = scram::ServerSecret::parse(&body.role_secret) - .map(AuthSecret::Scram) - .ok_or(GetAuthInfoError::BadSecret)?; - Some(secret) - }; - let allowed_ips = body.allowed_ips.unwrap_or_default(); - Metrics::get() - .proxy - .allowed_ips_number - .observe(allowed_ips.len() as f64); - Ok(AuthInfo { - secret, - allowed_ips, - project_id: body.project_id, - }) - } - .inspect_err(|e| tracing::debug!(error = ?e)) - .instrument(info_span!("do_get_auth_info")) - .await - } - - async fn do_get_endpoint_jwks( - &self, - ctx: &RequestContext, - endpoint: EndpointId, - ) -> Result, GetEndpointJwksError> { - if !self - .caches - .endpoints_cache - .is_valid(ctx, &endpoint.normalize()) - { - return Err(GetEndpointJwksError::EndpointNotFound); - } - let request_id = ctx.session_id().to_string(); - async { - let request = self - .endpoint - .get_with_url(|url| { - url.path_segments_mut() - .push("endpoints") - .push(endpoint.as_str()) - .push("jwks"); - }) - .header(X_REQUEST_ID, &request_id) - .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id())]) - .build() - .map_err(GetEndpointJwksError::RequestBuild)?; - - debug!(url = request.url().as_str(), "sending http request"); - let start = Instant::now(); - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self - .endpoint - .execute(request) - .await - .map_err(GetEndpointJwksError::RequestExecute)?; - drop(pause); - info!(duration = ?start.elapsed(), "received http response"); - - let body = parse_body::(response).await?; - - let rules = body - .jwks - .into_iter() - .map(|jwks| AuthRule { - id: jwks.id, - jwks_url: jwks.jwks_url, - audience: jwks.jwt_audience, - role_names: jwks.role_names, - }) - .collect(); - - Ok(rules) - } - .inspect_err(|e| tracing::debug!(error = ?e)) - .instrument(info_span!("do_get_endpoint_jwks")) - .await - } - - async fn do_wake_compute( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> Result { - let request_id = ctx.session_id().to_string(); - let application_name = ctx.console_application_name(); - async { - let mut request_builder = self - .endpoint - .get_path("proxy_wake_compute") - .header("X-Request-ID", 
&request_id) - .header("Authorization", format!("Bearer {}", &self.jwt)) - .query(&[("session_id", ctx.session_id())]) - .query(&[ - ("application_name", application_name.as_str()), - ("project", user_info.endpoint.as_str()), - ]); - - let options = user_info.options.to_deep_object(); - if !options.is_empty() { - request_builder = request_builder.query(&options); - } - - let request = request_builder.build()?; - - debug!(url = request.url().as_str(), "sending http request"); - let start = Instant::now(); - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self.endpoint.execute(request).await?; - drop(pause); - info!(duration = ?start.elapsed(), "received http response"); - let body = parse_body::(response).await?; - - // Unfortunately, ownership won't let us use `Option::ok_or` here. - let (host, port) = match parse_host_port(&body.address) { - None => return Err(WakeComputeError::BadComputeAddress(body.address)), - Some(x) => x, - }; - - // Don't set anything but host and port! This config will be cached. - // We'll set username and such later using the startup message. - // TODO: add more type safety (in progress). - let mut config = compute::ConnCfg::new(host.to_owned(), port); - config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. - - let node = NodeInfo { - config, - aux: body.aux, - allow_self_signed_compute: false, - }; - - Ok(node) - } - .inspect_err(|e| tracing::debug!(error = ?e)) - .instrument(info_span!("do_wake_compute")) - .await - } -} - -impl super::ControlPlaneApi for NeonControlPlaneClient { - #[tracing::instrument(skip_all)] - async fn get_role_secret( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> Result { - let normalized_ep = &user_info.endpoint.normalize(); - let user = &user_info.user; - if let Some(role_secret) = self - .caches - .project_info - .get_role_secret(normalized_ep, user) - { - return Ok(role_secret); - } - let auth_info = self.do_get_auth_info(ctx, user_info).await?; - if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); - self.caches.project_info.insert_role_secret( - project_id, - normalized_ep_int, - user.into(), - auth_info.secret.clone(), - ); - self.caches.project_info.insert_allowed_ips( - project_id, - normalized_ep_int, - Arc::new(auth_info.allowed_ips), - ); - ctx.set_project_id(project_id); - } - // When we just got a secret, we don't need to invalidate it. 
- Ok(Cached::new_uncached(auth_info.secret)) - } - - async fn get_allowed_ips_and_secret( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { - let normalized_ep = &user_info.endpoint.normalize(); - if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { - Metrics::get() - .proxy - .allowed_ips_cache_misses - .inc(CacheOutcome::Hit); - return Ok((allowed_ips, None)); - } - Metrics::get() - .proxy - .allowed_ips_cache_misses - .inc(CacheOutcome::Miss); - let auth_info = self.do_get_auth_info(ctx, user_info).await?; - let allowed_ips = Arc::new(auth_info.allowed_ips); - let user = &user_info.user; - if let Some(project_id) = auth_info.project_id { - let normalized_ep_int = normalized_ep.into(); - self.caches.project_info.insert_role_secret( - project_id, - normalized_ep_int, - user.into(), - auth_info.secret.clone(), - ); - self.caches.project_info.insert_allowed_ips( - project_id, - normalized_ep_int, - allowed_ips.clone(), - ); - ctx.set_project_id(project_id); - } - Ok(( - Cached::new_uncached(allowed_ips), - Some(Cached::new_uncached(auth_info.secret)), - )) - } - - #[tracing::instrument(skip_all)] - async fn get_endpoint_jwks( - &self, - ctx: &RequestContext, - endpoint: EndpointId, - ) -> Result, GetEndpointJwksError> { - self.do_get_endpoint_jwks(ctx, endpoint).await - } - - #[tracing::instrument(skip_all)] - async fn wake_compute( - &self, - ctx: &RequestContext, - user_info: &ComputeUserInfo, - ) -> Result { - let key = user_info.endpoint_cache_key(); - - macro_rules! check_cache { - () => { - if let Some(cached) = self.caches.node_info.get(&key) { - let (cached, info) = cached.take_value(); - let info = info.map_err(|c| { - info!(key = &*key, "found cached wake_compute error"); - WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c))) - })?; - - debug!(key = &*key, "found cached compute node info"); - ctx.set_project(info.aux.clone()); - return Ok(cached.map(|()| info)); - } - }; - } - - // Every time we do a wakeup http request, the compute node will stay up - // for some time (highly depends on the console's scale-to-zero policy); - // The connection info remains the same during that period of time, - // which means that we might cache it to reduce the load and latency. - check_cache!(); - - let permit = self.locks.get_permit(&key).await?; - - // after getting back a permit - it's possible the cache was filled - // double check - if permit.should_check_cache() { - // TODO: if there is something in the cache, mark the permit as success. 
- check_cache!(); - } - - // check rate limit - if !self - .wake_compute_endpoint_rate_limiter - .check(user_info.endpoint.normalize_intern(), 1) - { - return Err(WakeComputeError::TooManyConnections); - } - - let node = permit.release_result(self.do_wake_compute(ctx, user_info).await); - match node { - Ok(node) => { - ctx.set_project(node.aux.clone()); - debug!(key = &*key, "created a cache entry for woken compute node"); - - let mut stored_node = node.clone(); - // store the cached node as 'warm_cached' - stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; - - let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); - - Ok(cached.map(|()| node)) - } - Err(err) => match err { - WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => { - let Some(status) = &err.status else { - return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( - err, - ))); - }; - - let reason = status - .details - .error_info - .map_or(Reason::Unknown, |x| x.reason); - - // if we can retry this error, do not cache it. - if reason.can_retry() { - return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( - err, - ))); - } - - // at this point, we should only have quota errors. - debug!( - key = &*key, - "created a cache entry for the wake compute error" - ); - - self.caches.node_info.insert_ttl( - key, - Err(err.clone()), - Duration::from_secs(30), - ); - - Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( - err, - ))) - } - err => return Err(err), - }, - } - } -} - -/// Parse http response body, taking status code into account. -async fn parse_body serde::Deserialize<'a>>( - response: http::Response, -) -> Result { - let status = response.status(); - if status.is_success() { - // We shouldn't log raw body because it may contain secrets. - info!("request succeeded, processing the body"); - return Ok(response.json().await?); - } - let s = response.bytes().await?; - // Log plaintext to be able to detect, whether there are some cases not covered by the error struct. - info!("response_error plaintext: {:?}", s); - - // Don't throw an error here because it's not as important - // as the fact that the request itself has failed. 
- let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| { - warn!("failed to parse error body: {e}"); - ControlPlaneErrorMessage { - error: "reason unclear (malformed error message)".into(), - http_status_code: status, - status: None, - } - }); - body.http_status_code = status; - - warn!("console responded with an error ({status}): {body:?}"); - Err(ControlPlaneError::Message(Box::new(body))) -} - -fn parse_host_port(input: &str) -> Option<(&str, u16)> { - let (host, port) = input.rsplit_once(':')?; - let ipv6_brackets: &[_] = &['[', ']']; - Some((host.trim_matches(ipv6_brackets), port.parse().ok()?)) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parse_host_port_v4() { - let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); - assert_eq!(host, "127.0.0.1"); - assert_eq!(port, 5432); - } - - #[test] - fn test_parse_host_port_v6() { - let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse"); - assert_eq!(host, "2001:db8::1"); - assert_eq!(port, 5432); - } - - #[test] - fn test_parse_host_port_url() { - let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432") - .expect("failed to parse"); - assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local"); - assert_eq!(port, 5432); - } -} diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs index d6f565e34a..bc30cffd27 100644 --- a/proxy/src/control_plane/errors.rs +++ b/proxy/src/control_plane/errors.rs @@ -2,7 +2,7 @@ use thiserror::Error; use crate::control_plane::client::ApiLockError; use crate::control_plane::messages::{self, ControlPlaneErrorMessage, Reason}; -use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError}; +use crate::error::{ErrorKind, ReportableError, UserFacingError, io_error}; use crate::proxy::retry::CouldRetry; /// A go-to error message which doesn't leak any detail. diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 2662ab85f9..8d6b2e96f5 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -4,7 +4,7 @@ use measured::FixedCardinalityLabel; use serde::{Deserialize, Serialize}; use crate::auth::IpPattern; -use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::intern::{AccountIdInt, BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::proxy::retry::CouldRetry; /// Generic error response with human-readable description. @@ -221,30 +221,17 @@ pub(crate) struct UserFacingMessage { pub(crate) message: Box, } -/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. -/// Returned by the `/proxy_get_role_secret` API method. -#[derive(Deserialize)] -pub(crate) struct GetRoleSecret { - pub(crate) role_secret: Box, - pub(crate) allowed_ips: Option>, - pub(crate) project_id: Option, -} - /// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. /// Returned by the `/get_endpoint_access_control` API method. #[derive(Deserialize)] pub(crate) struct GetEndpointAccessControl { pub(crate) role_secret: Box, pub(crate) allowed_ips: Option>, + pub(crate) allowed_vpc_endpoint_ids: Option>, pub(crate) project_id: Option, - pub(crate) allowed_vpc_endpoint_ids: Option>, -} - -// Manually implement debug to omit sensitive info. 
-impl fmt::Debug for GetRoleSecret { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("GetRoleSecret").finish_non_exhaustive() - } + pub(crate) account_id: Option, + pub(crate) block_public_connections: Option, + pub(crate) block_vpc_connections: Option, } /// Response which holds compute node's `host:port` pair. @@ -298,6 +285,10 @@ pub(crate) struct DatabaseInfo { pub(crate) aux: MetricsAuxInfo, #[serde(default)] pub(crate) allowed_ips: Option>, + #[serde(default)] + pub(crate) allowed_vpc_endpoint_ids: Option>, + #[serde(default)] + pub(crate) public_access_allowed: Option, } // Manually implement debug to omit sensitive info. @@ -309,6 +300,7 @@ impl fmt::Debug for DatabaseInfo { .field("dbname", &self.dbname) .field("user", &self.user) .field("allowed_ips", &self.allowed_ips) + .field("allowed_vpc_endpoint_ids", &self.allowed_vpc_endpoint_ids) .finish_non_exhaustive() } } @@ -369,7 +361,8 @@ pub struct EndpointJwksResponse { pub struct JwksSettings { pub id: String, pub jwks_url: url::Url, - pub provider_name: String, + #[serde(rename = "provider_name")] + pub _provider_name: String, pub jwt_audience: Option, pub role_names: Vec, } @@ -473,22 +466,34 @@ mod tests { #[test] fn parse_get_role_secret() -> anyhow::Result<()> { - // Empty `allowed_ips` field. + // Empty `allowed_ips` and `allowed_vpc_endpoint_ids` field. let json = json!({ "role_secret": "secret", }); - serde_json::from_str::(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], }); - serde_json::from_str::(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; + let json = json!({ + "role_secret": "secret", + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], + }); + serde_json::from_str::(&json.to_string())?; let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], + }); + serde_json::from_str::(&json.to_string())?; + let json = json!({ + "role_secret": "secret", + "allowed_ips": ["8.8.8.8"], + "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"], "project_id": "project", }); - serde_json::from_str::(&json.to_string())?; + serde_json::from_str::(&json.to_string())?; Ok(()) } diff --git a/proxy/src/control_plane/mgmt.rs b/proxy/src/control_plane/mgmt.rs index 2f7359240d..df31abcc8c 100644 --- a/proxy/src/control_plane/mgmt.rs +++ b/proxy/src/control_plane/mgmt.rs @@ -6,7 +6,7 @@ use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, Instrument}; +use tracing::{Instrument, error, info, info_span}; use crate::control_plane::messages::{DatabaseInfo, KickSession}; use crate::waiters::{self, Waiter, Waiters}; diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 41972e4e44..d592223be1 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -10,16 +10,16 @@ pub mod client; pub(crate) mod errors; use std::sync::Arc; -use std::time::Duration; +use crate::auth::IpPattern; use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::auth::IpPattern; use crate::cache::project_info::ProjectInfoCacheImpl; use crate::cache::{Cached, TimedLru}; +use crate::config::ComputeConfig; use crate::context::RequestContext; use 
crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; -use crate::intern::ProjectIdInt; +use crate::intern::{AccountIdInt, ProjectIdInt}; use crate::types::{EndpointCacheKey, EndpointId}; use crate::{compute, scram}; @@ -52,8 +52,14 @@ pub(crate) struct AuthInfo { pub(crate) secret: Option, /// List of IP addresses allowed for the autorization. pub(crate) allowed_ips: Vec, + /// List of VPC endpoints allowed for the autorization. + pub(crate) allowed_vpc_endpoint_ids: Vec, /// Project ID. This is used for cache invalidation. pub(crate) project_id: Option, + /// Account ID. This is used for cache invalidation. + pub(crate) account_id: Option, + /// Are public connections or VPC connections blocked? + pub(crate) access_blocker_flags: AccessBlockerFlags, } /// Info for establishing a connection to a compute node. @@ -67,28 +73,21 @@ pub(crate) struct NodeInfo { /// Labels for proxy's metrics. pub(crate) aux: MetricsAuxInfo, - - /// Whether we should accept self-signed certificates (for testing) - pub(crate) allow_self_signed_compute: bool, } impl NodeInfo { pub(crate) async fn connect( &self, ctx: &RequestContext, - timeout: Duration, + config: &ComputeConfig, + user_info: ComputeUserInfo, ) -> Result { self.config - .connect( - ctx, - self.allow_self_signed_compute, - self.aux.clone(), - timeout, - ) + .connect(ctx, self.aux.clone(), config, user_info) .await } + pub(crate) fn reuse_settings(&mut self, other: Self) { - self.allow_self_signed_compute = other.allow_self_signed_compute; self.config.reuse_password(other.config); } @@ -102,11 +101,21 @@ impl NodeInfo { } } +#[derive(Clone, Default, Eq, PartialEq, Debug)] +pub(crate) struct AccessBlockerFlags { + pub public_access_blocked: bool, + pub vpc_access_blocked: bool, +} + pub(crate) type NodeInfoCache = TimedLru>>; pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; +pub(crate) type CachedAllowedVpcEndpointIds = + Cached<&'static ProjectInfoCacheImpl, Arc>>; +pub(crate) type CachedAccessBlockerFlags = + Cached<&'static ProjectInfoCacheImpl, AccessBlockerFlags>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. @@ -120,11 +129,23 @@ pub(crate) trait ControlPlaneApi { user_info: &ComputeUserInfo, ) -> Result; - async fn get_allowed_ips_and_secret( + async fn get_allowed_ips( &self, ctx: &RequestContext, user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + ) -> Result; + + async fn get_allowed_vpc_endpoint_ids( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result; + + async fn get_block_public_or_vpc_access( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result; async fn get_endpoint_jwks( &self, diff --git a/proxy/src/ext.rs b/proxy/src/ext.rs new file mode 100644 index 0000000000..8d00afbf51 --- /dev/null +++ b/proxy/src/ext.rs @@ -0,0 +1,41 @@ +use std::panic::resume_unwind; +use std::sync::{Mutex, MutexGuard}; + +use tokio::task::JoinError; + +pub(crate) trait LockExt { + fn lock_propagate_poison(&self) -> MutexGuard<'_, T>; +} + +impl LockExt for Mutex { + /// Lock the mutex and panic if the mutex was poisoned. 
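// Illustrative use of the two extension traits introduced in this new `ext`
// module, replacing the `.lock().unwrap()` / `.await.unwrap()` patterns that
// the newly denied `clippy::unwrap_used` lint no longer allows. The function
// and its types below are made up for the example.
async fn example(shared: &Mutex<u64>) -> u64 {
    let snapshot = *shared.lock_propagate_poison();
    tokio::task::spawn_blocking(move || snapshot * 2)
        .await
        .propagate_task_panic()
}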
+ #[track_caller] + fn lock_propagate_poison(&self) -> MutexGuard<'_, T> { + match self.lock() { + Ok(guard) => guard, + // poison occurs when another thread panicked while holding the lock guard. + // since panicking is often unrecoverable, propagating the poison panic is reasonable. + Err(poison) => panic!("{poison}"), + } + } +} + +pub(crate) trait TaskExt { + fn propagate_task_panic(self) -> T; +} + +impl TaskExt for Result { + /// Unwrap the result and panic if the inner task panicked. + /// Also panics if the task was cancelled + #[track_caller] + fn propagate_task_panic(self) -> T { + match self { + Ok(t) => t, + // Using resume_unwind prevents the panic hook being called twice. + // Since we use this for structured concurrency, there is only + // 1 logical panic, so this is more correct. + Err(e) if e.is_panic() => resume_unwind(e.into_panic()), + Err(e) => panic!("unexpected task error: {e}"), + } + } +} diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 978ad9f761..5278fe2a3e 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -3,17 +3,18 @@ use std::net::TcpListener; use std::sync::{Arc, Mutex}; use anyhow::{anyhow, bail}; +use http_utils::endpoint::{self, request_span}; +use http_utils::error::ApiError; +use http_utils::json::json_response; +use http_utils::{RouterBuilder, RouterService}; use hyper0::header::CONTENT_TYPE; use hyper0::{Body, Request, Response, StatusCode}; -use measured::text::BufferedTextEncoder; use measured::MetricGroup; +use measured::text::BufferedTextEncoder; use metrics::NeonMetrics; use tracing::{info, info_span}; -use utils::http::endpoint::{self, request_span}; -use utils::http::error::ApiError; -use utils::http::json::json_response; -use utils::http::{RouterBuilder, RouterService}; +use crate::ext::{LockExt, TaskExt}; use crate::jemalloc; async fn status_handler(_: Request) -> Result, ApiError> { @@ -76,7 +77,7 @@ async fn prometheus_metrics_handler( let body = tokio::task::spawn_blocking(move || { let _span = span.entered(); - let mut state = state.lock().unwrap(); + let mut state = state.lock_propagate_poison(); let PrometheusHandler { encoder, metrics } = &mut *state; metrics @@ -94,13 +95,13 @@ async fn prometheus_metrics_handler( body }) .await - .unwrap(); + .propagate_task_panic(); let response = Response::builder() .status(200) .header(CONTENT_TYPE, "text/plain; version=0.0.4") .body(Body::from(body)) - .unwrap(); + .expect("response headers should be valid"); Ok(response) } diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index ed88c77256..96f600d836 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -13,8 +13,8 @@ use hyper::body::Body; pub(crate) use reqwest::{Request, Response}; use reqwest_middleware::RequestBuilder; pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; -pub(crate) use reqwest_retry::policies::ExponentialBackoff; pub(crate) use reqwest_retry::RetryTransientMiddleware; +pub(crate) use reqwest_retry::policies::ExponentialBackoff; use thiserror::Error; use crate::metrics::{ConsoleRequest, Metrics}; diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index f56d92a6b3..0d1382679c 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -7,7 +7,7 @@ use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; -use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; +use crate::types::{AccountId, BranchId, EndpointId, ProjectId, RoleName}; pub trait InternId: 
Sized + 'static { fn get_interner() -> &'static StringInterner; @@ -83,7 +83,7 @@ impl StringInterner { pub(crate) fn new() -> Self { StringInterner { inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher( - Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()), + Capacity::new(2500, NonZeroUsize::new(1 << 16).expect("value is nonzero")), // unbounded MemoryLimits::for_memory_usage(usize::MAX), BuildHasherDefault::::default(), @@ -206,7 +206,28 @@ impl From for ProjectIdInt { } } +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct AccountIdTag; +impl InternId for AccountIdTag { + fn get_interner() -> &'static StringInterner { + static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type AccountIdInt = InternedString; +impl From<&AccountId> for AccountIdInt { + fn from(value: &AccountId) -> Self { + AccountIdTag::get_interner().get_or_intern(value) + } +} +impl From for AccountIdInt { + fn from(value: AccountId) -> Self { + AccountIdTag::get_interner().get_or_intern(&value) + } +} + #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::sync::OnceLock; diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index ba69f9cf2d..a9e5fbc85b 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -22,8 +22,8 @@ clippy::string_add, clippy::string_to_string, clippy::todo, - // TODO: consider clippy::unimplemented - // TODO: consider clippy::unwrap_used + clippy::unimplemented, + clippy::unwrap_used, )] // List of permanently allowed lints. #![allow( @@ -72,33 +72,36 @@ // List of temporarily allowed lints to unblock beta/nightly. #![allow(unknown_lints)] -pub mod auth; -pub mod cache; -pub mod cancellation; -pub mod compute; -pub mod compute_ctl; -pub mod config; -pub mod console_redirect_proxy; -pub mod context; -pub mod control_plane; -pub mod error; -pub mod http; -pub mod intern; -pub mod jemalloc; -pub mod logging; -pub mod metrics; -pub mod parse; -pub mod postgres_rustls; -pub mod protocol2; -pub mod proxy; -pub mod rate_limiter; -pub mod redis; -pub mod sasl; -pub mod scram; -pub mod serverless; -pub mod signals; -pub mod stream; -pub mod types; -pub mod url; -pub mod usage_metrics; -pub mod waiters; +pub mod binary; + +mod auth; +mod cache; +mod cancellation; +mod compute; +mod compute_ctl; +mod config; +mod console_redirect_proxy; +mod context; +mod control_plane; +mod error; +mod ext; +mod http; +mod intern; +mod jemalloc; +mod logging; +mod metrics; +mod parse; +mod protocol2; +mod proxy; +mod rate_limiter; +mod redis; +mod sasl; +mod scram; +mod serverless; +mod signals; +mod stream; +mod tls; +mod types; +mod url; +mod usage_metrics; +mod waiters; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 74d2b9a1d0..3c34918d84 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,10 +1,22 @@ -use tracing::Subscriber; +use std::cell::{Cell, RefCell}; +use std::collections::HashMap; +use std::hash::BuildHasher; +use std::{env, io}; + +use chrono::{DateTime, Utc}; +use opentelemetry::trace::TraceContextExt; +use scopeguard::defer; +use serde::ser::{SerializeMap, Serializer}; +use tracing::subscriber::Interest; +use tracing::{Event, Metadata, Span, Subscriber, callsite, span}; +use tracing_opentelemetry::OpenTelemetrySpanExt; use tracing_subscriber::filter::{EnvFilter, LevelFilter}; use tracing_subscriber::fmt::format::{Format, Full}; use tracing_subscriber::fmt::time::SystemTime; use tracing_subscriber::fmt::{FormatEvent, FormatFields}; +use tracing_subscriber::layer::{Context, Layer}; use 
tracing_subscriber::prelude::*; -use tracing_subscriber::registry::LookupSpan; +use tracing_subscriber::registry::{LookupSpan, SpanRef}; /// Initialize logging and OpenTelemetry tracing and exporter. /// @@ -15,23 +27,52 @@ use tracing_subscriber::registry::LookupSpan; /// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. /// See pub async fn init() -> anyhow::Result { + let logfmt = LogFormat::from_env()?; + let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy() - .add_directive("aws_config=info".parse().unwrap()) - .add_directive("azure_core::policies::transport=off".parse().unwrap()); - - let fmt_layer = tracing_subscriber::fmt::layer() - .with_ansi(false) - .with_writer(std::io::stderr) - .with_target(false); + .add_directive( + "aws_config=info" + .parse() + .expect("this should be a valid filter directive"), + ) + .add_directive( + "azure_core::policies::transport=off" + .parse() + .expect("this should be a valid filter directive"), + ); let otlp_layer = tracing_utils::init_tracing("proxy").await; + let json_log_layer = if logfmt == LogFormat::Json { + Some(JsonLoggingLayer { + clock: RealClock, + skipped_field_indices: papaya::HashMap::default(), + writer: StderrWriter { + stderr: std::io::stderr(), + }, + }) + } else { + None + }; + + let text_log_layer = if logfmt == LogFormat::Text { + Some( + tracing_subscriber::fmt::layer() + .with_ansi(false) + .with_writer(std::io::stderr) + .with_target(false), + ) + } else { + None + }; + tracing_subscriber::registry() .with(env_filter) .with(otlp_layer) - .with(fmt_layer) + .with(json_log_layer) + .with(text_log_layer) .try_init()?; Ok(LoggingGuard) @@ -86,3 +127,857 @@ impl Drop for LoggingGuard { tracing_utils::shutdown_tracing(); } } + +// TODO: make JSON the default +#[derive(Copy, Clone, PartialEq, Eq, Default, Debug)] +enum LogFormat { + #[default] + Text = 1, + Json, +} + +impl LogFormat { + fn from_env() -> anyhow::Result { + let logfmt = env::var("LOGFMT"); + Ok(match logfmt.as_deref() { + Err(_) => LogFormat::default(), + Ok("text") => LogFormat::Text, + Ok("json") => LogFormat::Json, + Ok(logfmt) => anyhow::bail!("unknown log format: {logfmt}"), + }) + } +} + +trait MakeWriter { + fn make_writer(&self) -> impl io::Write; +} + +struct StderrWriter { + stderr: io::Stderr, +} + +impl MakeWriter for StderrWriter { + #[inline] + fn make_writer(&self) -> impl io::Write { + self.stderr.lock() + } +} + +// TODO: move into separate module or even separate crate. +trait Clock { + fn now(&self) -> DateTime; +} + +struct RealClock; + +impl Clock for RealClock { + #[inline] + fn now(&self) -> DateTime { + Utc::now() + } +} + +/// Name of the field used by tracing crate to store the event message. +const MESSAGE_FIELD: &str = "message"; + +thread_local! { + /// Protects against deadlocks and double panics during log writing. + /// The current panic handler will use tracing to log panic information. + static REENTRANCY_GUARD: Cell = const { Cell::new(false) }; + /// Thread-local instance with per-thread buffer for log writing. + static EVENT_FORMATTER: RefCell = RefCell::new(EventFormatter::new()); + /// Cached OS thread ID. + static THREAD_ID: u64 = gettid::gettid(); +} + +/// Implements tracing layer to handle events specific to logging. 
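// Illustrative sketch (not part of the diff): output format selection is
// driven by the LOGFMT environment variable parsed by `LogFormat::from_env`
// above; unset defaults to text, and anything other than "text"/"json" is an
// error.
fn example_pick_log_format() -> anyhow::Result<()> {
    match LogFormat::from_env()? {
        // e.g. the proxy was started with `LOGFMT=json`
        LogFormat::Json => { /* JsonLoggingLayer is installed */ }
        // LOGFMT unset or `LOGFMT=text`
        LogFormat::Text => { /* the classic fmt layer is installed */ }
    }
    Ok(())
}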
+struct JsonLoggingLayer { + clock: C, + skipped_field_indices: papaya::HashMap, + writer: W, +} + +impl Layer for JsonLoggingLayer +where + S: Subscriber + for<'a> LookupSpan<'a>, +{ + fn on_event(&self, event: &Event<'_>, ctx: Context<'_, S>) { + use std::io::Write; + + // TODO: consider special tracing subscriber to grab timestamp very + // early, before OTel machinery, and add as event extension. + let now = self.clock.now(); + + let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| { + if entered.get() { + let mut formatter = EventFormatter::new(); + formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + self.writer.make_writer().write_all(formatter.buffer()) + } else { + entered.set(true); + defer!(entered.set(false);); + + EVENT_FORMATTER.with_borrow_mut(move |formatter| { + formatter.reset(); + formatter.format(now, event, &ctx, &self.skipped_field_indices)?; + self.writer.make_writer().write_all(formatter.buffer()) + }) + } + }); + + // In case logging fails we generate a simpler JSON object. + if let Err(err) = res { + if let Ok(mut line) = serde_json::to_vec(&serde_json::json!( { + "timestamp": now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + "level": "ERROR", + "message": format_args!("cannot log event: {err:?}"), + "fields": { + "event": format_args!("{event:?}"), + }, + })) { + line.push(b'\n'); + self.writer.make_writer().write_all(&line).ok(); + } + } + } + + /// Registers a SpanFields instance as span extension. + fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("span must exist"); + let fields = SpanFields::default(); + fields.record_fields(attrs); + // This could deadlock when there's a panic somewhere in the tracing + // event handling and a read or write guard is still held. This includes + // the OTel subscriber. + span.extensions_mut().insert(fields); + } + + fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) { + let span = ctx.span(id).expect("span must exist"); + let ext = span.extensions(); + if let Some(data) = ext.get::() { + data.record_fields(values); + } + } + + /// Called (lazily) whenever a new log call is executed. We quickly check + /// for duplicate field names and record duplicates as skippable. Last one + /// wins. + fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest { + if !metadata.is_event() { + // Must not be never because we wouldn't get trace and span data. + return Interest::always(); + } + + let mut field_indices = SkippedFieldIndices::default(); + let mut seen_fields = HashMap::<&'static str, usize>::new(); + for field in metadata.fields() { + use std::collections::hash_map::Entry; + match seen_fields.entry(field.name()) { + Entry::Vacant(entry) => { + // field not seen yet + entry.insert(field.index()); + } + Entry::Occupied(mut entry) => { + // replace currently stored index + let old_index = entry.insert(field.index()); + // ... and append it to list of skippable indices + field_indices.push(old_index); + } + } + } + + if !field_indices.is_empty() { + self.skipped_field_indices + .pin() + .insert(metadata.callsite(), field_indices); + } + + Interest::always() + } +} + +/// Stores span field values recorded during the spans lifetime. +#[derive(Default)] +struct SpanFields { + // TODO: Switch to custom enum with lasso::Spur for Strings? 
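// Illustrative sketch (not part of the diff): `register_callsite` above marks
// duplicate field names as skippable, so the last occurrence wins. For the
// event below only `retries = 2` reaches the JSON output; the index of the
// first `retries` is stored in `SkippedFieldIndices` and ignored later.
fn example_duplicate_fields() {
    tracing::info!(retries = 1, retries = 2, "retrying wake_compute");
}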
+ fields: papaya::HashMap<&'static str, serde_json::Value>, +} + +impl SpanFields { + #[inline] + fn record_fields(&self, fields: R) { + fields.record(&mut SpanFieldsRecorder { + fields: self.fields.pin(), + }); + } +} + +/// Implements a tracing field visitor to convert and store values. +struct SpanFieldsRecorder<'m, S, G> { + fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>, +} + +impl tracing::field::Visit for SpanFieldsRecorder<'_, S, G> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if let Ok(value) = i64::try_from(value) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } else { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if let Ok(value) = u64::try_from(value) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } else { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + self.fields + .insert(field.name(), serde_json::Value::from(value)); + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value:?}"))); + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + self.fields + .insert(field.name(), serde_json::Value::from(format!("{value}"))); + } +} + +/// List of field indices skipped during logging. Can list duplicate fields or +/// metafields not meant to be logged. +#[derive(Clone, Default)] +struct SkippedFieldIndices { + bits: u64, +} + +impl SkippedFieldIndices { + #[inline] + fn is_empty(&self) -> bool { + self.bits == 0 + } + + #[inline] + fn push(&mut self, index: usize) { + self.bits |= 1u64 + .checked_shl(index as u32) + .expect("field index too large"); + } + + #[inline] + fn contains(&self, index: usize) -> bool { + self.bits + & 1u64 + .checked_shl(index as u32) + .expect("field index too large") + != 0 + } +} + +/// Formats a tracing event and writes JSON to its internal buffer including a newline. 
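// Illustrative sketch (not part of the diff): `SkippedFieldIndices` above is a
// 64-bit set keyed by field index; `push` panics ("field index too large") if
// a callsite ever declares more than 64 fields.
fn example_skipped_indices() {
    let mut skipped = SkippedFieldIndices::default();
    skipped.push(2);
    assert!(skipped.contains(2));
    assert!(!skipped.contains(3));
    assert!(!skipped.is_empty());
}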
+// TODO: buffer capacity management, truncate if too large +struct EventFormatter { + logline_buffer: Vec, +} + +impl EventFormatter { + #[inline] + fn new() -> Self { + EventFormatter { + logline_buffer: Vec::new(), + } + } + + #[inline] + fn buffer(&self) -> &[u8] { + &self.logline_buffer + } + + #[inline] + fn reset(&mut self) { + self.logline_buffer.clear(); + } + + fn format( + &mut self, + now: DateTime, + event: &Event<'_>, + ctx: &Context<'_, S>, + skipped_field_indices: &papaya::HashMap, + ) -> io::Result<()> + where + S: Subscriber + for<'a> LookupSpan<'a>, + { + let timestamp = now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true); + + use tracing_log::NormalizeEvent; + let normalized_meta = event.normalized_metadata(); + let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata()); + + let skipped_field_indices = skipped_field_indices.pin(); + let skipped_field_indices = skipped_field_indices.get(&meta.callsite()); + + let mut serialize = || { + let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer); + + let mut serializer = serializer.serialize_map(None)?; + + // Timestamp comes first, so raw lines can be sorted by timestamp. + serializer.serialize_entry("timestamp", ×tamp)?; + + // Level next. + serializer.serialize_entry("level", &meta.level().as_str())?; + + // Message next. + serializer.serialize_key("message")?; + let mut message_extractor = + MessageFieldExtractor::new(serializer, skipped_field_indices); + event.record(&mut message_extractor); + let mut serializer = message_extractor.into_serializer()?; + + let mut fields_present = FieldsPresent(false, skipped_field_indices); + event.record(&mut fields_present); + if fields_present.0 { + serializer.serialize_entry( + "fields", + &SerializableEventFields(event, skipped_field_indices), + )?; + } + + let pid = std::process::id(); + if pid != 1 { + serializer.serialize_entry("process_id", &pid)?; + } + + THREAD_ID.with(|tid| serializer.serialize_entry("thread_id", tid))?; + + // TODO: tls cache? name could change + if let Some(thread_name) = std::thread::current().name() { + if !thread_name.is_empty() && thread_name != "tokio-runtime-worker" { + serializer.serialize_entry("thread_name", thread_name)?; + } + } + + if let Some(task_id) = tokio::task::try_id() { + serializer.serialize_entry("task_id", &format_args!("{task_id}"))?; + } + + serializer.serialize_entry("target", meta.target())?; + + if let Some(module) = meta.module_path() { + if module != meta.target() { + serializer.serialize_entry("module", module)?; + } + } + + if let Some(file) = meta.file() { + if let Some(line) = meta.line() { + serializer.serialize_entry("src", &format_args!("{file}:{line}"))?; + } else { + serializer.serialize_entry("src", file)?; + } + } + + { + let otel_context = Span::current().context(); + let otel_spanref = otel_context.span(); + let span_context = otel_spanref.span_context(); + if span_context.is_valid() { + serializer.serialize_entry( + "trace_id", + &format_args!("{}", span_context.trace_id()), + )?; + } + } + + serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?; + + serializer.end() + }; + + serialize().map_err(io::Error::other)?; + self.logline_buffer.push(b'\n'); + Ok(()) + } +} + +/// Extracts the message field that's mixed will other fields. 
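// Illustrative sketch (not part of the diff): rough shape of a single line
// produced by `format` above, mirroring the expectations of the test at the
// end of this file; every value here is made up.
fn example_log_line() -> serde_json::Value {
    serde_json::json!({
        "timestamp": "2024-01-01T00:00:00.000000Z",
        "level": "ERROR",
        "message": "explicit message field",
        "fields": { "a": 3 },
        "process_id": 12345,
        "thread_id": 140001,
        "thread_name": "main",
        "target": "proxy::logging::tests",
        "src": "proxy/src/logging.rs:42",
        "spans": {
            "00": { "span_id": "0000000000000001", "span_name": "span1", "x": 42 },
            "01": { "span_id": "0000000000000002", "span_name": "span2" }
        }
    })
}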
+struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> { + serializer: S, + skipped_field_indices: Option<&'a SkippedFieldIndices>, + state: Option>, +} + +impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> { + #[inline] + fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self { + Self { + serializer, + skipped_field_indices, + state: None, + } + } + + #[inline] + fn into_serializer(mut self) -> Result { + match self.state { + Some(Ok(())) => {} + Some(Err(err)) => return Err(err), + None => self.serializer.serialize_value("")?, + } + Ok(self.serializer) + } + + #[inline] + fn accept_field(&self, field: &tracing::field::Field) -> bool { + self.state.is_none() + && field.name() == MESSAGE_FIELD + && !self + .skipped_field_indices + .is_some_and(|i| i.contains(field.index())) + } +} + +impl tracing::field::Visit for MessageFieldExtractor<'_, S> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value:x?}"))); + } + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&value)); + } + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value:?}"))); + } + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + if self.accept_field(field) { + self.state = Some(self.serializer.serialize_value(&format_args!("{value}"))); + } + } +} + +/// Checks if there's any fields and field values present. If not, the JSON subobject +/// can be skipped. +// This is entirely optional and only cosmetic, though maybe helps a +// bit during log parsing in dashboards when there's no field with empty object. +struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>); + +// Even though some methods have an overhead (error, bytes) it is assumed the +// compiler won't include this since we ignore the value entirely. 
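// Illustrative sketch (not part of the diff): with the JSON layer installed,
// the first event below is emitted without a "fields" sub-object, while the
// second carries `"fields": {"query_id": 7}`.
fn example_fields_presence() {
    tracing::info!("plain message, no extra fields");
    tracing::info!(query_id = 7, "message with one field");
}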
+impl tracing::field::Visit for FieldsPresent<'_> { + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) { + if !self.1.is_some_and(|i| i.contains(field.index())) + && field.name() != MESSAGE_FIELD + && !field.name().starts_with("log.") + { + self.0 |= true; + } + } +} + +/// Serializes the fields directly supplied with a log event. +struct SerializableEventFields<'a, 'event>( + &'a tracing::Event<'event>, + Option<&'a SkippedFieldIndices>, +); + +impl serde::ser::Serialize for SerializableEventFields<'_, '_> { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + use serde::ser::SerializeMap; + let serializer = serializer.serialize_map(None)?; + let mut message_skipper = MessageFieldSkipper::new(serializer, self.1); + self.0.record(&mut message_skipper); + let serializer = message_skipper.into_serializer()?; + serializer.end() + } +} + +/// A tracing field visitor that skips the message field. +struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> { + serializer: S, + skipped_field_indices: Option<&'a SkippedFieldIndices>, + state: Result<(), S::Error>, +} + +impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> { + #[inline] + fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self { + Self { + serializer, + skipped_field_indices, + state: Ok(()), + } + } + + #[inline] + fn accept_field(&self, field: &tracing::field::Field) -> bool { + self.state.is_ok() + && field.name() != MESSAGE_FIELD + && !field.name().starts_with("log.") + && !self + .skipped_field_indices + .is_some_and(|i| i.contains(field.index())) + } + + #[inline] + fn into_serializer(self) -> Result { + self.state?; + Ok(self.serializer) + } +} + +impl tracing::field::Visit for MessageFieldSkipper<'_, S> { + #[inline] + fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) { + if self.accept_field(field) { + self.state = self + .serializer + .serialize_entry(field.name(), &format_args!("{value:x?}")); + } + } + + #[inline] + fn record_str(&mut self, field: &tracing::field::Field, value: &str) { + if self.accept_field(field) { + self.state = self.serializer.serialize_entry(field.name(), &value); + } + } + + #[inline] + fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { + if self.accept_field(field) { 
+ self.state = self + .serializer + .serialize_entry(field.name(), &format_args!("{value:?}")); + } + } + + #[inline] + fn record_error( + &mut self, + field: &tracing::field::Field, + value: &(dyn std::error::Error + 'static), + ) { + if self.accept_field(field) { + self.state = self.serializer.serialize_value(&format_args!("{value}")); + } + } +} + +/// Serializes the span stack from root to leaf (parent of event) enumerated +/// inside an object where the keys are just the number padded with zeroes +/// to retain sorting order. +// The object is necessary because Loki cannot flatten arrays. +struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>) +where + Span: Subscriber + for<'lookup> LookupSpan<'lookup>; + +impl serde::ser::Serialize for SerializableSpanStack<'_, '_, Span> +where + Span: Subscriber + for<'lookup> LookupSpan<'lookup>, +{ + fn serialize(&self, serializer: Ser) -> Result + where + Ser: serde::ser::Serializer, + { + let mut serializer = serializer.serialize_map(None)?; + + if let Some(leaf_span) = self.0.lookup_current() { + for (i, span) in leaf_span.scope().from_root().enumerate() { + serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?; + } + } + + serializer.end() + } +} + +/// Serializes a single span. Include the span ID, name and its fields as +/// recorded up to this point. +struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>) +where + Span: for<'lookup> LookupSpan<'lookup>; + +impl serde::ser::Serialize for SerializableSpan<'_, '_, Span> +where + Span: for<'lookup> LookupSpan<'lookup>, +{ + fn serialize(&self, serializer: Ser) -> Result + where + Ser: serde::ser::Serializer, + { + let mut serializer = serializer.serialize_map(None)?; + // TODO: the span ID is probably only useful for debugging tracing. 
+ serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?; + serializer.serialize_entry("span_name", self.0.metadata().name())?; + + let ext = self.0.extensions(); + if let Some(data) = ext.get::() { + for (key, value) in &data.fields.pin() { + serializer.serialize_entry(key, value)?; + } + } + + serializer.end() + } +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use std::sync::{Arc, Mutex, MutexGuard}; + + use assert_json_diff::assert_json_eq; + use tracing::info_span; + + use super::*; + + struct TestClock { + current_time: Mutex>, + } + + impl Clock for Arc { + fn now(&self) -> DateTime { + *self.current_time.lock().expect("poisoned") + } + } + + struct VecWriter<'a> { + buffer: MutexGuard<'a, Vec>, + } + + impl MakeWriter for Arc>> { + fn make_writer(&self) -> impl io::Write { + VecWriter { + buffer: self.lock().expect("poisoned"), + } + } + } + + impl io::Write for VecWriter<'_> { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.buffer.write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } + } + + #[test] + fn test_field_collection() { + let clock = Arc::new(TestClock { + current_time: Mutex::new(Utc::now()), + }); + let buffer = Arc::new(Mutex::new(Vec::new())); + let log_layer = JsonLoggingLayer { + clock: clock.clone(), + skipped_field_indices: papaya::HashMap::default(), + writer: buffer.clone(), + }; + + let registry = tracing_subscriber::Registry::default().with(log_layer); + + tracing::subscriber::with_default(registry, || { + info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| { + info_span!("span2").in_scope(|| { + tracing::error!( + a = 1, + a = 2, + a = 3, + message = "explicit message field", + "implicit message field" + ); + }); + }); + }); + + let buffer = Arc::try_unwrap(buffer) + .expect("no other reference") + .into_inner() + .expect("poisoned"); + let actual: serde_json::Value = serde_json::from_slice(&buffer).expect("valid JSON"); + let expected: serde_json::Value = serde_json::json!( + { + "timestamp": clock.now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true), + "level": "ERROR", + "message": "explicit message field", + "fields": { + "a": 3, + }, + "spans": { + "00":{ + "span_id": "0000000000000001", + "span_name": "span1", + "x": 42, + }, + "01": { + "span_id": "0000000000000002", + "span_name": "span2", + } + }, + "src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(), + "target": "proxy::logging::tests", + "process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(), + "thread_id": actual.as_object().unwrap().get("thread_id").unwrap().as_number().unwrap(), + "thread_name": "logging::tests::test_field_collection", + } + ); + + assert_json_eq!(actual, expected); + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 659c57c865..db1f096de1 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -56,6 +56,8 @@ pub struct ProxyMetrics { pub connection_requests: CounterPairVec, #[metric(flatten)] pub http_endpoint_pools: HttpEndpointPools, + #[metric(flatten)] + pub cancel_channel_size: CounterPairVec, /// Time it took for proxy to establish a connection to the compute endpoint. // largest bucket = 2^16 * 0.5ms = 32s @@ -94,6 +96,16 @@ pub struct ProxyMetrics { #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_ips_number: Histogram<10>, + /// Number of cache hits/misses for VPC endpoint IDs. 
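// Illustrative sketch (not part of the diff): how the new per-request VPC
// endpoint metrics added here are expected to be recorded, mirroring the
// existing `allowed_ips_number` histogram. The `observe(f64)` call is an
// assumption about the measured crate's plain `Histogram` API.
fn example_record_allowed_vpc_endpoints(count: usize) {
    Metrics::get()
        .proxy
        .allowed_vpc_endpoint_ids
        .observe(count as f64);
}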
+ pub vpc_endpoint_id_cache_stats: CounterVec>, + + /// Number of cache hits/misses for access blocker flags. + pub access_blocker_flags_cache_stats: CounterVec>, + + /// Number of allowed VPC endpoints IDs + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_vpc_endpoint_ids: Histogram<10>, + /// Number of connections (per sni). pub accepted_connections_by_sni: CounterVec>, @@ -193,7 +205,7 @@ pub enum Protocol { } impl Protocol { - pub fn as_str(&self) -> &'static str { + pub fn as_str(self) -> &'static str { match self { Protocol::Http => "http", Protocol::Ws => "ws", @@ -294,6 +306,16 @@ impl CounterPairAssoc for NumConnectionRequestsGauge { pub type NumConnectionRequestsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; +pub struct CancelChannelSizeGauge; +impl CounterPairAssoc for CancelChannelSizeGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_msgs_cancel_channel_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_msgs_cancel_channel_total"); + const INC_HELP: &'static str = "Number of processing messages in the cancellation channel."; + const DEC_HELP: &'static str = "Number of closed messages in the cancellation channel."; + type LabelGroupSet = StaticLabelSet; +} +pub type CancelChannelSizeGuard<'a> = metrics::MeasuredCounterPairGuard<'a, CancelChannelSizeGauge>; + #[derive(LabelGroup)] #[label(set = ComputeConnectionLatencySet)] pub struct ComputeConnectionLatencyGroup { @@ -340,13 +362,6 @@ pub struct RedisErrors<'a> { pub channel: &'a str, } -#[derive(FixedCardinalityLabel, Copy, Clone)] -pub enum CancellationSource { - FromClient, - FromRedis, - Local, -} - #[derive(FixedCardinalityLabel, Copy, Clone)] pub enum CancellationOutcome { NotFound, @@ -357,7 +372,6 @@ pub enum CancellationOutcome { #[derive(LabelGroup)] #[label(set = CancellationRequestSet)] pub struct CancellationRequest { - pub source: CancellationSource, pub kind: CancellationOutcome, } @@ -369,6 +383,17 @@ pub enum Waiting { RetryTimeout, } +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +#[allow(clippy::enum_variant_names)] +pub enum RedisMsgKind { + HSet, + HSetMultiple, + HGet, + HGetAll, + HDel, +} + #[derive(Default)] struct Accumulated { cplane: time::Duration, @@ -518,11 +543,7 @@ impl Drop for LatencyTimer { impl From for Bool { fn from(value: bool) -> Self { - if value { - Bool::True - } else { - Bool::False - } + if value { Bool::True } else { Bool::False } } } @@ -556,6 +577,9 @@ pub enum RedisEventsCount { CancelSession, PasswordUpdate, AllowedIpsUpdate, + AllowedVpcEndpointIdsUpdateForProjects, + AllowedVpcEndpointIdsUpdateForAllProjectsInOrg, + BlockPublicOrVpcAccessUpdate, } pub struct ThreadPoolWorkers(usize); diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs index 8c0f251066..095d6278cc 100644 --- a/proxy/src/parse.rs +++ b/proxy/src/parse.rs @@ -8,14 +8,6 @@ pub(crate) fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { Some((cstr, other)) } -/// See . 
-pub(crate) fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { - (bytes.len() >= N).then(|| { - let (head, tail) = bytes.split_at(N); - (head.try_into().unwrap(), tail) - }) -} - #[cfg(test)] mod tests { use super::*; @@ -33,11 +25,4 @@ mod tests { assert_eq!(cstr.to_bytes(), b"foo"); assert_eq!(rest, b"bar"); } - - #[test] - fn test_split_at_const() { - assert!(split_at_const::<0>(b"").is_some()); - assert!(split_at_const::<1>(b"").is_none()); - assert!(matches!(split_at_const::<1>(b"ok"), Some((b"o", b"k")))); - } } diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 33a5eb5e1e..41180fa6c1 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -9,6 +9,7 @@ use std::task::{Context, Poll}; use bytes::{Buf, Bytes, BytesMut}; use pin_project_lite::pin_project; +use smol_str::SmolStr; use strum_macros::FromRepr; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; use zerocopy::{FromBytes, FromZeroes}; @@ -99,7 +100,7 @@ impl fmt::Display for ConnectionInfo { #[derive(PartialEq, Eq, Clone, Debug)] pub enum ConnectionInfoExtra { - Aws { vpce_id: Bytes }, + Aws { vpce_id: SmolStr }, Azure { link_id: u32 }, } @@ -119,7 +120,7 @@ pub(crate) async fn read_proxy_protocol( // if no more bytes available then exit if bytes_read == 0 { return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing)); - }; + } // check if we have enough bytes to continue if let Some(header) = buf.try_get::() { @@ -169,7 +170,7 @@ fn process_proxy_payload( header.version_and_command ), )), - }; + } let size_err = "invalid proxy protocol length. payload not large enough to fit requested IP addresses"; @@ -193,7 +194,7 @@ fn process_proxy_payload( return Err(io::Error::new( io::ErrorKind::Other, "invalid proxy protocol address family/transport protocol.", - )) + )); } }; @@ -207,9 +208,14 @@ fn process_proxy_payload( } let subtype = tlv.value.get_u8(); match Pp2AwsType::from_repr(subtype) { - Some(Pp2AwsType::VpceId) => { - extra = Some(ConnectionInfoExtra::Aws { vpce_id: tlv.value }); - } + Some(Pp2AwsType::VpceId) => match std::str::from_utf8(&tlv.value) { + Ok(s) => { + extra = Some(ConnectionInfoExtra::Aws { vpce_id: s.into() }); + } + Err(e) => { + tracing::warn!("invalid aws vpce id: {e}"); + } + }, None => { tracing::warn!("unknown aws tlv: subtype={subtype}"); } @@ -396,11 +402,12 @@ impl NetworkEndianIpv6 { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use tokio::io::AsyncReadExt; use crate::protocol2::{ - read_proxy_protocol, ConnectHeader, LOCAL_V2, PROXY_V2, TCP_OVER_IPV4, UDP_OVER_IPV6, + ConnectHeader, LOCAL_V2, PROXY_V2, TCP_OVER_IPV4, UDP_OVER_IPV6, read_proxy_protocol, }; #[tokio::test] diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index a3027abd7c..b8b39fa121 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -4,9 +4,9 @@ use tokio::time; use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; -use crate::auth::backend::ComputeCredentialKeys; -use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; -use crate::config::RetryConfig; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::compute::{self, COULD_NOT_CONNECT, PostgresConnection}; +use crate::config::{ComputeConfig, RetryConfig}; use crate::context::RequestContext; use crate::control_plane::errors::WakeComputeError; use crate::control_plane::locks::ApiLocks; @@ -15,12 +15,10 @@ use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, 
ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, }; -use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; +use crate::proxy::retry::{CouldRetry, retry_after, should_retry}; use crate::proxy::wake_compute::wake_compute; use crate::types::Host; -const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); - /// If we couldn't connect, a cached connection info might be to blame /// (e.g. the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. @@ -49,7 +47,7 @@ pub(crate) trait ConnectMechanism { &self, ctx: &RequestContext, node_info: &control_plane::CachedNodeInfo, - timeout: time::Duration, + config: &ComputeConfig, ) -> Result; fn update_connect_config(&self, conf: &mut compute::ConnCfg); @@ -73,6 +71,8 @@ pub(crate) struct TcpMechanism<'a> { /// connect_to_compute concurrency lock pub(crate) locks: &'static ApiLocks, + + pub(crate) user_info: ComputeUserInfo, } #[async_trait] @@ -86,11 +86,11 @@ impl ConnectMechanism for TcpMechanism<'_> { &self, ctx: &RequestContext, node_info: &control_plane::CachedNodeInfo, - timeout: time::Duration, + config: &ComputeConfig, ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; - permit.release_result(node_info.connect(ctx, timeout).await) + permit.release_result(node_info.connect(ctx, config, self.user_info.clone()).await) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { @@ -104,9 +104,8 @@ pub(crate) async fn connect_to_compute Result where M::ConnectError: CouldRetry + ShouldRetryWakeCompute + std::fmt::Debug, @@ -117,14 +116,10 @@ where wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; node_info.set_keys(user_info.get_keys()); - node_info.allow_self_signed_compute = allow_self_signed_compute; mechanism.update_connect_config(&mut node_info.config); // try once - let err = match mechanism - .connect_once(ctx, &node_info, CONNECT_TIMEOUT) - .await - { + let err = match mechanism.connect_once(ctx, &node_info, compute).await { Ok(res) => { ctx.success(); Metrics::get().proxy.retries_metric.observe( @@ -144,7 +139,7 @@ where let node_info = if !node_info.cached() || !err.should_retry_wake_compute() { // If we just recieved this from cplane and didn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. - if should_retry(&err, num_retries, connect_to_compute_retry_config) { + if should_retry(&err, num_retries, compute.retry) { Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, @@ -174,10 +169,7 @@ where debug!("wake_compute success. 
attempting to connect"); num_retries = 1; loop { - match mechanism - .connect_once(ctx, &node_info, CONNECT_TIMEOUT) - .await - { + match mechanism.connect_once(ctx, &node_info, compute).await { Ok(res) => { ctx.success(); Metrics::get().proxy.retries_metric.observe( @@ -192,7 +184,7 @@ where return Ok(res); } Err(e) => { - if !should_retry(&e, num_retries, connect_to_compute_retry_config) { + if !should_retry(&e, num_retries, compute.retry) { // Don't log an error here, caller will print the error Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { @@ -206,9 +198,9 @@ where warn!(error = ?e, num_retries, retriable = true, COULD_NOT_CONNECT); } - }; + } - let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); + let wait_duration = retry_after(num_retries, compute.retry); num_retries += 1; let pause = ctx.latency_timer_pause(crate::metrics::Waiting::RetryTimeout); diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 4e4af88634..6f8b972348 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -1,7 +1,7 @@ use std::future::poll_fn; use std::io; use std::pin::Pin; -use std::task::{ready, Context, Poll}; +use std::task::{Context, Poll, ready}; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tracing::info; @@ -201,25 +201,26 @@ impl CopyBuffer { W: AsyncWrite + ?Sized, { loop { - // If our buffer is empty, then we need to read some data to - // continue. - if self.pos == self.cap && !self.read_done { - self.pos = 0; - self.cap = 0; - + // If there is some space left in our buffer, then we try to read some + // data to continue, thus maximizing the chances of a large write. + if self.cap < self.buf.len() && !self.read_done { match self.poll_fill_buf(cx, reader.as_mut()) { Poll::Ready(Ok(())) => (), Poll::Ready(Err(err)) => return Poll::Ready(Err(ErrorDirection::Read(err))), Poll::Pending => { - // Try flushing when the reader has no progress to avoid deadlock - // when the reader depends on buffered writer. - if self.need_flush { - ready!(writer.as_mut().poll_flush(cx)) - .map_err(ErrorDirection::Write)?; - self.need_flush = false; - } + // Ignore pending reads when our buffer is not empty, because + // we can try to write data immediately. + if self.pos == self.cap { + // Try flushing when the reader has no progress to avoid deadlock + // when the reader depends on buffered writer. + if self.need_flush { + ready!(writer.as_mut().poll_flush(cx)) + .map_err(ErrorDirection::Write)?; + self.need_flush = false; + } - return Poll::Pending; + return Poll::Pending; + } } } } @@ -246,9 +247,13 @@ impl CopyBuffer { "writer returned length larger than input slice" ); + // All data has been written, the buffer can be considered empty again + self.pos = 0; + self.cap = 0; + // If we've written all the data and we've seen EOF, flush out the // data and finish the transfer. 
- if self.pos == self.cap && self.read_done { + if self.read_done { ready!(writer.as_mut().poll_flush(cx)).map_err(ErrorDirection::Write)?; return Poll::Ready(Ok(self.amt)); } @@ -257,6 +262,7 @@ impl CopyBuffer { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use tokio::io::AsyncWriteExt; diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index e27c211932..955f754497 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -8,12 +8,13 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, warn}; use crate::auth::endpoint_sni; -use crate::config::{TlsConfig, PG_ALPN_PROTOCOL}; +use crate::config::TlsConfig; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::Metrics; use crate::proxy::ERR_INSECURE_CONNECTION; use crate::stream::{PqStream, Stream, StreamUpgradeError}; +use crate::tls::PG_ALPN_PROTOCOL; #[derive(Error, Debug)] pub(crate) enum HandshakeError { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index cc04bc5e5c..0c6d352600 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -9,27 +9,28 @@ pub(crate) mod retry; pub(crate) mod wake_compute; use std::sync::Arc; -pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; +pub use copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute}; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; -use pq_proto::{BeMessage as Be, StartupMessageParams}; +use pq_proto::{BeMessage as Be, CancelKeyData, StartupMessageParams}; use regex::Regex; -use smol_str::{format_smolstr, SmolStr}; +use serde::{Deserialize, Serialize}; +use smol_str::{SmolStr, ToSmolStr, format_smolstr}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn, Instrument}; +use tracing::{Instrument, debug, error, info, warn}; -use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::connect_compute::{TcpMechanism, connect_to_compute}; use self::passthrough::ProxyPassthrough; -use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::cancellation::{self, CancellationHandler}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; -use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol}; +use crate::proxy::handshake::{HandshakeData, handshake}; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; use crate::types::EndpointCacheKey; @@ -57,7 +58,7 @@ pub async fn task_main( auth_backend: &'static auth::Backend<'static, ()>, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! 
{ @@ -99,25 +100,37 @@ pub async fn task_main( debug!("healthcheck received"); return; } - Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + Ok((_socket, ConnectHeader::Missing)) + if config.proxy_protocol_v2 == ProxyProtocolV2::Required => + { warn!("missing required proxy protocol header"); return; } - Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + Ok((_socket, ConnectHeader::Proxy(_))) + if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => + { warn!("proxy protocol header not supported"); return; } Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), - Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo { addr: peer_addr, extra: None }), + Ok((socket, ConnectHeader::Missing)) => ( + socket, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + ), }; match socket.inner.set_nodelay(true) { Ok(()) => {} Err(e) => { - error!("per-client task finished with an error: failed to set socket option: {e:#}"); + error!( + "per-client task finished with an error: failed to set socket option: {e:#}" + ); return; } - }; + } let ctx = RequestContext::new( session_id, @@ -152,13 +165,19 @@ pub async fn task_main( Ok(Some(p)) => { ctx.set_success(); let _disconnect = ctx.log_connect(); - match p.proxy_pass().await { + match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { - warn!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); + warn!( + ?session_id, + "per-client task finished with an IO error from the client: {e:#}" + ); } Err(ErrorSource::Compute(e)) => { - error!(?session_id, "per-client task finished with an IO error from the compute: {e:#}"); + error!( + ?session_id, + "per-client task finished with an IO error from the compute: {e:#}" + ); } } } @@ -191,13 +210,6 @@ impl ClientMode { } } - pub(crate) fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { - match self { - ClientMode::Tcp => config.allow_self_signed_compute, - ClientMode::Websockets { .. 
} => false, - } - } - fn hostname<'a, S>(&'a self, s: &'a Stream) -> Option<&'a str> { match self { ClientMode::Tcp => s.sni_hostname(), @@ -250,13 +262,13 @@ pub(crate) async fn handle_client( config: &'static ProxyConfig, auth_backend: &'static auth::Backend<'static, ()>, ctx: &RequestContext, - cancellation_handler: Arc, + cancellation_handler: Arc, stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, cancellations: tokio_util::task::task_tracker::TaskTracker, -) -> Result>, ClientRequestError> { +) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), "handling interactive connection from client" @@ -280,23 +292,21 @@ pub(crate) async fn handle_client( // spawn a task to cancel the session, but don't wait for it cancellations.spawn({ let cancellation_handler_clone = Arc::clone(&cancellation_handler); - let session_id = ctx.session_id(); - let peer_ip = ctx.peer_addr(); - let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id); + let ctx = ctx.clone(); + let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id()); cancel_span.follows_from(tracing::Span::current()); async move { - drop( - cancellation_handler_clone - .cancel_session( - cancel_key_data, - session_id, - peer_ip, - config.authentication_config.ip_allowlist_check_enabled, - ) - .instrument(cancel_span) - .await, - ); - } + cancellation_handler_clone + .cancel_session( + cancel_key_data, + ctx, + config.authentication_config.ip_allowlist_check_enabled, + config.authentication_config.is_vpc_acccess_proxy, + auth_backend.get_api(), + ) + .await + .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); + }.instrument(cancel_span) }); return Ok(None); @@ -322,7 +332,7 @@ pub(crate) async fn handle_client( }; let user = user_info.get_user().to_owned(); - let user_info = match user_info + let (user_info, _ip_allowlist) = match user_info .authenticate( ctx, &mut stream, @@ -342,30 +352,38 @@ pub(crate) async fn handle_client( } }; - let params_compat = match &user_info { - auth::Backend::ControlPlane(_, info) => { - info.info.options.get(NeonOptions::PARAMS_COMPAT).is_some() - } - auth::Backend::Local(_) => false, + let compute_user_info = match &user_info { + auth::Backend::ControlPlane(_, info) => &info.info, + auth::Backend::Local(_) => unreachable!("local proxy does not run tcp proxy service"), }; + let params_compat = compute_user_info + .options + .get(NeonOptions::PARAMS_COMPAT) + .is_some(); let mut node = connect_to_compute( ctx, &TcpMechanism { + user_info: compute_user_info.clone(), params_compat, params: ¶ms, locks: &config.connect_compute_locks, }, &user_info, - mode.allow_self_signed_compute(config), config.wake_compute_retry_config, - config.connect_to_compute_retry_config, + &config.connect_to_compute, ) .or_else(|e| stream.throw_error(e)) .await?; - let session = cancellation_handler.get_session(); - prepare_client_connection(&node, &session, &mut stream).await?; + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session = cancellation_handler_clone.get_key(); + + session + .write_cancel_key(node.cancel_closure.clone()) + .await?; + + prepare_client_connection(&node, *session.key(), &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the // PqStream input buffer. 
Normally there is none, but our serverless npm @@ -374,28 +392,31 @@ pub(crate) async fn handle_client( let (stream, read_buf) = stream.into_inner(); node.stream.write_all(&read_buf).await?; + let private_link_id = match ctx.extra() { + Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), + Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), + None => None, + }; + Ok(Some(ProxyPassthrough { client: stream, aux: node.aux.clone(), + private_link_id, compute: node, session_id: ctx.session_id(), + cancel: session, _req: request_gauge, _conn: conn_gauge, - _cancel: session, })) } /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -pub(crate) async fn prepare_client_connection
( +pub(crate) async fn prepare_client_connection( node: &compute::PostgresConnection, - session: &cancellation::Session, + cancel_key_data: CancelKeyData, stream: &mut PqStream, ) -> Result<(), std::io::Error> { - // Register compute's query cancellation token and produce a new, unique one. - // The new token (cancel_key_data) will be sent to the client. - let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone()); - // Forward all deferred notices to the client. for notice in &node.delayed_notice { stream.write_message_noflush(&Be::Raw(b'N', notice.as_bytes()))?; @@ -417,7 +438,7 @@ pub(crate) async fn prepare_client_connection
( Ok(()) } -#[derive(Debug, Clone, PartialEq, Eq, Default)] +#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)] pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>); impl NeonOptions { @@ -494,7 +515,7 @@ impl NeonOptions { pub(crate) fn neon_option(bytes: &str) -> Option<(&str, &str)> { static RE: OnceCell = OnceCell::new(); - let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap()); + let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").expect("regex should be correct")); let cap = re.captures(bytes)?; let (_, [k, v]) = cap.extract(); diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index dcaa81e5cd..23b9897155 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,3 +1,4 @@ +use smol_str::SmolStr; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::debug; use utils::measured_stream::MeasuredStream; @@ -5,10 +6,11 @@ use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; use crate::cancellation; use crate::compute::PostgresConnection; +use crate::config::ComputeConfig; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; use crate::stream::Stream; -use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounterRecorder, TrafficDirection, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] @@ -16,10 +18,14 @@ pub(crate) async fn proxy_pass( client: impl AsyncRead + AsyncWrite + Unpin, compute: impl AsyncRead + AsyncWrite + Unpin, aux: MetricsAuxInfo, + private_link_id: Option, ) -> Result<(), ErrorSource> { - let usage = USAGE_METRICS.register(Ids { + // we will report ingress at a later date + let usage_tx = USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, + direction: TrafficDirection::Egress, + private_link_id, }); let metrics = &Metrics::get().proxy.io_bytes; @@ -30,7 +36,7 @@ pub(crate) async fn proxy_pass( |cnt| { // Number of bytes we sent to the client (outbound). metrics.get_metric(m_sent).inc_by(cnt as u64); - usage.record_egress(cnt as u64); + usage_tx.record_egress(cnt as u64); }, ); @@ -55,23 +61,41 @@ pub(crate) async fn proxy_pass( Ok(()) } -pub(crate) struct ProxyPassthrough { +pub(crate) struct ProxyPassthrough { pub(crate) client: Stream, pub(crate) compute: PostgresConnection, pub(crate) aux: MetricsAuxInfo, pub(crate) session_id: uuid::Uuid, + pub(crate) private_link_id: Option, + pub(crate) cancel: cancellation::Session, pub(crate) _req: NumConnectionRequestsGuard<'static>, pub(crate) _conn: NumClientConnectionsGuard<'static>, - pub(crate) _cancel: cancellation::Session
, } -impl ProxyPassthrough { - pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> { - let res = proxy_pass(self.client, self.compute.stream, self.aux).await; - if let Err(err) = self.compute.cancel_closure.try_cancel_query().await { +impl ProxyPassthrough { + pub(crate) async fn proxy_pass( + self, + compute_config: &ComputeConfig, + ) -> Result<(), ErrorSource> { + let res = proxy_pass( + self.client, + self.compute.stream, + self.aux, + self.private_link_id, + ) + .await; + if let Err(err) = self + .compute + .cancel_closure + .try_cancel_query(compute_config) + .await + { tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database"); } + + drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error + res } } diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 911b349416..171f539b1e 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -1,15 +1,16 @@ //! A group of high-level tests for connection establishing logic and auth. +#![allow(clippy::unimplemented, clippy::unwrap_used)] mod mitm; use std::time::Duration; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use async_trait::async_trait; use http::StatusCode; use postgres_client::config::SslMode; use postgres_client::tls::{MakeTlsConnect, NoTls}; -use retry::{retry_after, ShouldRetryWakeCompute}; +use retry::{ShouldRetryWakeCompute, retry_after}; use rstest::rstest; use rustls::crypto::ring; use rustls::pki_types; @@ -21,14 +22,16 @@ use super::*; use crate::auth::backend::{ ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, }; -use crate::config::{CertResolver, RetryConfig}; +use crate::config::{ComputeConfig, RetryConfig}; use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; use crate::control_plane::{ - self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache, + self, CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedNodeInfo, NodeInfo, NodeInfoCache, }; use crate::error::ErrorKind; -use crate::postgres_rustls::MakeRustlsConnect; +use crate::tls::client_config::compute_client_config_with_certs; +use crate::tls::postgres_rustls::MakeRustlsConnect; +use crate::tls::server_config::CertResolver; use crate::types::{BranchId, EndpointId, ProjectId}; use crate::{sasl, scram}; @@ -66,7 +69,7 @@ fn generate_certs( } struct ClientConfig<'a> { - config: rustls::ClientConfig, + config: Arc, hostname: &'a str, } @@ -109,16 +112,7 @@ fn generate_tls_config<'a>( }; let client_config = { - let config = - rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_safe_default_protocol_versions() - .context("ring should support the default protocol versions")? 
- .with_root_certificates({ - let mut store = rustls::RootCertStore::empty(); - store.add(ca)?; - store - }) - .with_no_client_auth(); + let config = Arc::new(compute_client_config_with_certs([ca])); ClientConfig { config, hostname } }; @@ -340,8 +334,8 @@ async fn scram_auth_mock() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); - use rand::distributions::Alphanumeric; use rand::Rng; + use rand::distributions::Alphanumeric; let password: String = rand::thread_rng() .sample_iter(&Alphanumeric) .take(rand::random::() as usize) @@ -467,7 +461,7 @@ impl ConnectMechanism for TestConnectMechanism { &self, _ctx: &RequestContext, _node_info: &control_plane::CachedNodeInfo, - _timeout: std::time::Duration, + _config: &ComputeConfig, ) -> Result { let mut counter = self.counter.lock().unwrap(); let action = self.sequence[*counter]; @@ -532,9 +526,19 @@ impl TestControlPlaneClient for TestConnectMechanism { } } - fn get_allowed_ips_and_secret( + fn get_allowed_ips(&self) -> Result { + unimplemented!("not used in tests") + } + + fn get_allowed_vpc_endpoint_ids( &self, - ) -> Result<(CachedAllowedIps, Option), control_plane::errors::GetAuthInfoError> + ) -> Result { + unimplemented!("not used in tests") + } + + fn get_block_public_or_vpc_access( + &self, + ) -> Result { unimplemented!("not used in tests") } @@ -553,7 +557,6 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn branch_id: (&BranchId::from("branch")).into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, - allow_self_signed_compute: false, }; let (_, node2) = cache.insert_unit("key".into(), Ok(node.clone())); node2.map(|()| node) @@ -576,6 +579,20 @@ fn helper_create_connect_info( user_info } +fn config() -> ComputeConfig { + let retry = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + + ComputeConfig { + retry, + tls: Arc::new(compute_client_config_with_certs(std::iter::empty())), + timeout: Duration::from_secs(2), + } +} + #[tokio::test] async fn connect_to_compute_success() { let _ = env_logger::try_init(); @@ -583,12 +600,8 @@ async fn connect_to_compute_success() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); @@ -601,12 +614,8 @@ async fn connect_to_compute_retry() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); @@ -620,12 +629,8 @@ async fn connect_to_compute_non_retry_1() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); let user_info = helper_create_connect_info(&mechanism); - 
let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap_err(); mechanism.verify(); @@ -639,12 +644,8 @@ async fn connect_to_compute_non_retry_2() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); @@ -665,18 +666,13 @@ async fn connect_to_compute_non_retry_3() { max_retries: 1, backoff_factor: 2.0, }; - let connect_to_compute_retry_config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; + let config = config(); connect_to_compute( &ctx, &mechanism, &user_info, - false, wake_compute_retry_config, - connect_to_compute_retry_config, + &config, ) .await .unwrap_err(); @@ -691,12 +687,8 @@ async fn wake_retry() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap(); mechanism.verify(); @@ -710,12 +702,8 @@ async fn wake_non_retry() { let ctx = RequestContext::test(); let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); let user_info = helper_create_connect_info(&mechanism); - let config = RetryConfig { - base_delay: Duration::from_secs(1), - max_retries: 5, - backoff_factor: 2.0, - }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + let config = config(); + connect_to_compute(&ctx, &mechanism, &user_info, config.retry, &config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 4e9206feff..9d8915e24a 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -3,8 +3,8 @@ use tracing::{error, info}; use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestContext; -use crate::control_plane::errors::{ControlPlaneError, WakeComputeError}; use crate::control_plane::CachedNodeInfo; +use crate::control_plane::errors::{ControlPlaneError, WakeComputeError}; use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index 45f9630dde..b3853d48e4 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -2,8 +2,8 @@ use std::hash::Hash; use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; -use dashmap::DashMap; -use rand::{thread_rng, Rng}; +use clashmap::ClashMap; +use rand::{Rng, thread_rng}; use tokio::time::Instant; use tracing::info; use 
utils::leaky_bucket::LeakyBucketState; @@ -14,7 +14,7 @@ use crate::intern::EndpointIdInt; pub type EndpointRateLimiter = LeakyBucketRateLimiter; pub struct LeakyBucketRateLimiter { - map: DashMap, + map: ClashMap, config: utils::leaky_bucket::LeakyBucketConfig, access_count: AtomicUsize, } @@ -27,7 +27,7 @@ impl LeakyBucketRateLimiter { pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { Self { - map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards), + map: ClashMap::with_hasher_and_shard_amount(RandomState::new(), shards), config: config.into(), access_count: AtomicUsize::new(0), } @@ -58,7 +58,7 @@ impl LeakyBucketRateLimiter { let shard = thread_rng().gen_range(0..n); self.map.shards()[shard] .write() - .retain(|_, value| !value.get().bucket_is_empty(now)); + .retain(|(_, value)| !value.bucket_is_empty(now)); } } @@ -83,7 +83,7 @@ impl From for utils::leaky_bucket::LeakyBucketConfig { } #[cfg(test)] -#[allow(clippy::float_cmp)] +#[allow(clippy::float_cmp, clippy::unwrap_used)] mod tests { use std::time::Duration; diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index b74a9ab17e..f8eeb89f05 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -5,8 +5,8 @@ use std::time::Duration; use parking_lot::Mutex; use tokio::sync::Notify; -use tokio::time::error::Elapsed; use tokio::time::Instant; +use tokio::time::error::Elapsed; use self::aimd::Aimd; diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 3000cc4c2a..04e136b6d5 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -63,6 +63,7 @@ impl LimitAlgorithm for Aimd { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::time::Duration; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index a048721e77..71e2a92da6 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,17 +1,18 @@ use std::borrow::Cow; use std::collections::hash_map::RandomState; use std::hash::{BuildHasher, Hash}; -use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Mutex; +use std::sync::atomic::{AtomicUsize, Ordering}; use anyhow::bail; -use dashmap::DashMap; +use clashmap::ClashMap; use itertools::Itertools; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; +use crate::ext::LockExt; use crate::intern::EndpointIdInt; pub struct GlobalRateLimiter { @@ -61,7 +62,7 @@ impl GlobalRateLimiter { pub type WakeComputeRateLimiter = BucketRateLimiter; pub struct BucketRateLimiter { - map: DashMap, Hasher>, + map: ClashMap, Hasher>, info: Cow<'static, [RateBucketInfo]>, access_count: AtomicUsize, rand: Mutex, @@ -137,6 +138,12 @@ impl RateBucketInfo { Self::new(200, Duration::from_secs(600)), ]; + // For all the sessions will be cancel key. So this limit is essentially global proxy limit. + pub const DEFAULT_REDIS_SET: [Self; 2] = [ + Self::new(100_000, Duration::from_secs(1)), + Self::new(50_000, Duration::from_secs(10)), + ]; + /// All of these are per endpoint-maskedip pair. /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). 
/// @@ -195,7 +202,7 @@ impl BucketRateLimiter { info!(buckets = ?info, "endpoint rate limiter"); Self { info, - map: DashMap::with_hasher_and_shard_amount(hasher, 64), + map: ClashMap::with_hasher_and_shard_amount(hasher, 64), access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request rand: Mutex::new(rand), } @@ -246,12 +253,13 @@ impl BucketRateLimiter { let n = self.map.shards().len(); // this lock is ok as the periodic cycle of do_gc makes this very unlikely to collide // (impossible, infact, unless we have 2048 threads) - let shard = self.rand.lock().unwrap().gen_range(0..n); + let shard = self.rand.lock_propagate_poison().gen_range(0..n); self.map.shards()[shard].write().clear(); } } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::hash::BuildHasherDefault; use std::time::Duration; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 228dbb7f64..186fece4b2 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -2,14 +2,9 @@ use core::net::IpAddr; use std::sync::Arc; use pq_proto::CancelKeyData; -use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; -use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; -use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; - pub trait CancellationPublisherMut: Send + Sync + 'static { #[allow(async_fn_in_trait)] async fn try_publish( @@ -81,94 +76,3 @@ impl CancellationPublisher for Arc> { .await } } - -pub struct RedisPublisherClient { - client: ConnectionWithCredentialsProvider, - region_id: String, - limiter: GlobalRateLimiter, -} - -impl RedisPublisherClient { - pub fn new( - client: ConnectionWithCredentialsProvider, - region_id: String, - info: &'static [RateBucketInfo], - ) -> anyhow::Result { - Ok(Self { - client, - region_id, - limiter: GlobalRateLimiter::new(info.into()), - }) - } - - async fn publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - let payload = serde_json::to_string(&Notification::Cancel(CancelSession { - region_id: Some(self.region_id.clone()), - cancel_key_data, - session_id, - peer_addr: Some(peer_addr), - }))?; - let _: () = self.client.publish(PROXY_CHANNEL_NAME, payload).await?; - Ok(()) - } - pub(crate) async fn try_connect(&mut self) -> anyhow::Result<()> { - match self.client.connect().await { - Ok(()) => {} - Err(e) => { - tracing::error!("failed to connect to redis: {e}"); - return Err(e); - } - } - Ok(()) - } - async fn try_publish_internal( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - // TODO: review redundant error duplication logs. - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping cancellation message"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - match self.publish(cancel_key_data, session_id, peer_addr).await { - Ok(()) => return Ok(()), - Err(e) => { - tracing::error!("failed to publish a message: {e}"); - } - } - tracing::info!("Publisher is disconnected. 
Reconnectiong..."); - self.try_connect().await?; - self.publish(cancel_key_data, session_id, peer_addr).await - } -} - -impl CancellationPublisherMut for RedisPublisherClient { - async fn try_publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - tracing::info!("publishing cancellation key to Redis"); - match self - .try_publish_internal(cancel_key_data, session_id, peer_addr) - .await - { - Ok(()) => { - tracing::debug!("cancellation key successfuly published to Redis"); - Ok(()) - } - Err(e) => { - tracing::error!("failed to publish a message: {e}"); - Err(e) - } - } - } -} diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 82139ea1d5..b5c3d13216 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -29,6 +29,7 @@ impl Clone for Credentials { /// Provides PubSub connection without credentials refresh. pub struct ConnectionWithCredentialsProvider { credentials: Credentials, + // TODO: with more load on the connection, we should consider using a connection pool con: Option, refresh_token_task: Option>, mutex: tokio::sync::Mutex<()>, @@ -69,7 +70,11 @@ impl ConnectionWithCredentialsProvider { pub fn new_with_static_credentials(params: T) -> Self { Self { - credentials: Credentials::Static(params.into_connection_info().unwrap()), + credentials: Credentials::Static( + params + .into_connection_info() + .expect("static configured redis credentials should be a valid format"), + ), con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs index bf6dde9332..58e3c889a7 100644 --- a/proxy/src/redis/elasticache.rs +++ b/proxy/src/redis/elasticache.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use std::time::{Duration, SystemTime}; +use aws_config::Region; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; @@ -8,7 +9,6 @@ use aws_config::meta::region::RegionProviderChain; use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; -use aws_config::Region; use aws_sdk_iam::config::ProvideCredentials; use aws_sigv4::http_request::{ self, SignableBody, SignableRequest, SignatureLocation, SigningSettings, diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs new file mode 100644 index 0000000000..7527bca6d0 --- /dev/null +++ b/proxy/src/redis/keys.rs @@ -0,0 +1,89 @@ +use std::io::ErrorKind; + +use anyhow::Ok; +use pq_proto::{CancelKeyData, id_to_cancel_key}; +use serde::{Deserialize, Serialize}; + +pub mod keyspace { + pub const CANCEL_PREFIX: &str = "cancel"; +} + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) enum KeyPrefix { + #[serde(untagged)] + Cancel(CancelKeyData), +} + +impl KeyPrefix { + pub(crate) fn build_redis_key(&self) -> String { + match self { + KeyPrefix::Cancel(key) => { + let hi = (key.backend_pid as u64) << 32; + let lo = (key.cancel_key as u64) & 0xffff_ffff; + let id = hi | lo; + let keyspace = keyspace::CANCEL_PREFIX; + format!("{keyspace}:{id:x}") + } + } + } + + #[allow(dead_code)] + pub(crate) fn as_str(&self) -> &'static str { + match self { + KeyPrefix::Cancel(_) => 
keyspace::CANCEL_PREFIX, + } + } +} + +#[allow(dead_code)] +pub(crate) fn parse_redis_key(key: &str) -> anyhow::Result { + let (prefix, key_str) = key.split_once(':').ok_or_else(|| { + anyhow::anyhow!(std::io::Error::new( + ErrorKind::InvalidData, + "missing prefix" + )) + })?; + + match prefix { + keyspace::CANCEL_PREFIX => { + let id = u64::from_str_radix(key_str, 16)?; + + Ok(KeyPrefix::Cancel(id_to_cancel_key(id))) + } + _ => Err(anyhow::anyhow!(std::io::Error::new( + ErrorKind::InvalidData, + "unknown prefix" + ))), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_build_redis_key() { + let cancel_key: KeyPrefix = KeyPrefix::Cancel(CancelKeyData { + backend_pid: 12345, + cancel_key: 54321, + }); + + let redis_key = cancel_key.build_redis_key(); + assert_eq!(redis_key, "cancel:30390000d431"); + } + + #[test] + fn test_parse_redis_key() { + let redis_key = "cancel:30390000d431"; + let key: KeyPrefix = parse_redis_key(redis_key).expect("Failed to parse key"); + + let ref_key = CancelKeyData { + backend_pid: 12345, + cancel_key: 54321, + }; + + assert_eq!(key.as_str(), KeyPrefix::Cancel(ref_key).as_str()); + let KeyPrefix::Cancel(cancel_key) = key; + assert_eq!(ref_key, cancel_key); + } +} diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs new file mode 100644 index 0000000000..3689bf7ae2 --- /dev/null +++ b/proxy/src/redis/kv_ops.rs @@ -0,0 +1,184 @@ +use redis::{AsyncCommands, ToRedisArgs}; + +use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; + +pub struct RedisKVClient { + client: ConnectionWithCredentialsProvider, + limiter: GlobalRateLimiter, +} + +impl RedisKVClient { + pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self { + Self { + client, + limiter: GlobalRateLimiter::new(info.into()), + } + } + + pub async fn try_connect(&mut self) -> anyhow::Result<()> { + match self.client.connect().await { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to connect to redis: {e}"); + return Err(e); + } + } + Ok(()) + } + + pub(crate) async fn hset(&mut self, key: K, field: F, value: V) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + F: ToRedisArgs + Send + Sync, + V: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hset"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hset(&key, &field, &value).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to set a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .hset(key, field, value) + .await + .map_err(anyhow::Error::new) + } + + #[allow(dead_code)] + pub(crate) async fn hset_multiple( + &mut self, + key: &str, + items: &[(K, V)], + ) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + V: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hset_multiple"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hset_multiple(key, items).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to set a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. 
Reconnectiong..."); + self.try_connect().await?; + self.client + .hset_multiple(key, items) + .await + .map_err(anyhow::Error::new) + } + + #[allow(dead_code)] + pub(crate) async fn expire(&mut self, key: K, seconds: i64) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping expire"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.expire(&key, seconds).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to set a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .expire(key, seconds) + .await + .map_err(anyhow::Error::new) + } + + #[allow(dead_code)] + pub(crate) async fn hget(&mut self, key: K, field: F) -> anyhow::Result + where + K: ToRedisArgs + Send + Sync, + F: ToRedisArgs + Send + Sync, + V: redis::FromRedisValue, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hget"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hget(&key, &field).await { + Ok(value) => return Ok(value), + Err(e) => { + tracing::error!("failed to get a value: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client + .hget(key, field) + .await + .map_err(anyhow::Error::new) + } + + pub(crate) async fn hget_all(&mut self, key: K) -> anyhow::Result + where + K: ToRedisArgs + Send + Sync, + V: redis::FromRedisValue, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hgetall"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hgetall(&key).await { + Ok(value) => return Ok(value), + Err(e) => { + tracing::error!("failed to get a value: {e}"); + } + } + + tracing::info!("Redis client is disconnected. Reconnectiong..."); + self.try_connect().await?; + self.client.hgetall(key).await.map_err(anyhow::Error::new) + } + + pub(crate) async fn hdel(&mut self, key: K, field: F) -> anyhow::Result<()> + where + K: ToRedisArgs + Send + Sync, + F: ToRedisArgs + Send + Sync, + { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping hdel"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + + match self.client.hdel(&key, &field).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to delete a key-value pair: {e}"); + } + } + + tracing::info!("Redis client is disconnected. 
Reconnectiong..."); + self.try_connect().await?; + self.client + .hdel(key, field) + .await + .map_err(anyhow::Error::new) + } +} diff --git a/proxy/src/redis/mod.rs b/proxy/src/redis/mod.rs index a322f0368c..8b46a8e6ca 100644 --- a/proxy/src/redis/mod.rs +++ b/proxy/src/redis/mod.rs @@ -1,4 +1,6 @@ pub mod cancellation_publisher; pub mod connection_with_credentials_provider; pub mod elasticache; +pub mod keys; +pub mod kv_ops; pub mod notifications; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index f3aa97c032..5f9f2509e2 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -10,13 +10,10 @@ use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::cache::project_info::ProjectInfoCache; -use crate::cancellation::{CancelMap, CancellationHandler}; -use crate::intern::{ProjectIdInt, RoleNameInt}; +use crate::intern::{AccountIdInt, ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; -use tracing::Instrument; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; -pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20); @@ -24,11 +21,14 @@ async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Resu let mut conn = client.get_async_pubsub().await?; tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); conn.subscribe(CPLANE_CHANNEL_NAME).await?; - tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`"); - conn.subscribe(PROXY_CHANNEL_NAME).await?; Ok(conn) } +#[derive(Debug, Deserialize)] +struct NotificationHeader<'a> { + topic: &'a str, +} + #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] #[serde(tag = "topic", content = "data")] pub(crate) enum Notification { @@ -39,23 +39,67 @@ pub(crate) enum Notification { AllowedIpsUpdate { allowed_ips_update: AllowedIpsUpdate, }, + #[serde( + rename = "/block_public_or_vpc_access_updated", + deserialize_with = "deserialize_json_string" + )] + BlockPublicOrVpcAccessUpdated { + block_public_or_vpc_access_updated: BlockPublicOrVpcAccessUpdated, + }, + #[serde( + rename = "/allowed_vpc_endpoints_updated_for_org", + deserialize_with = "deserialize_json_string" + )] + AllowedVpcEndpointsUpdatedForOrg { + allowed_vpc_endpoints_updated_for_org: AllowedVpcEndpointsUpdatedForOrg, + }, + #[serde( + rename = "/allowed_vpc_endpoints_updated_for_projects", + deserialize_with = "deserialize_json_string" + )] + AllowedVpcEndpointsUpdatedForProjects { + allowed_vpc_endpoints_updated_for_projects: AllowedVpcEndpointsUpdatedForProjects, + }, #[serde( rename = "/password_updated", deserialize_with = "deserialize_json_string" )] PasswordUpdate { password_update: PasswordUpdate }, - #[serde(rename = "/cancel_session")] - Cancel(CancelSession), + + #[serde( + other, + deserialize_with = "deserialize_unknown_topic", + skip_serializing + )] + UnknownTopic, } + #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub(crate) struct AllowedIpsUpdate { project_id: ProjectIdInt, } + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct BlockPublicOrVpcAccessUpdated { + project_id: ProjectIdInt, +} + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct AllowedVpcEndpointsUpdatedForOrg { + account_id: AccountIdInt, 
+} + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct AllowedVpcEndpointsUpdatedForProjects { + project_ids: Vec, +} + #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub(crate) struct PasswordUpdate { project_id: ProjectIdInt, role_name: RoleNameInt, } + #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub(crate) struct CancelSession { pub(crate) region_id: Option, @@ -73,9 +117,17 @@ where serde_json::from_str(&s).map_err(::custom) } +// https://github.com/serde-rs/serde/issues/1714 +fn deserialize_unknown_topic<'de, D>(deserializer: D) -> Result<(), D::Error> +where + D: serde::Deserializer<'de>, +{ + deserializer.deserialize_any(serde::de::IgnoredAny)?; + Ok(()) +} + struct MessageHandler { cache: Arc, - cancellation_handler: Arc>, region_id: String, } @@ -83,88 +135,63 @@ impl Clone for MessageHandler { fn clone(&self) -> Self { Self { cache: self.cache.clone(), - cancellation_handler: self.cancellation_handler.clone(), region_id: self.region_id.clone(), } } } impl MessageHandler { - pub(crate) fn new( - cache: Arc, - cancellation_handler: Arc>, - region_id: String, - ) -> Self { - Self { - cache, - cancellation_handler, - region_id, - } + pub(crate) fn new(cache: Arc, region_id: String) -> Self { + Self { cache, region_id } } + pub(crate) async fn increment_active_listeners(&self) { self.cache.increment_active_listeners().await; } + pub(crate) async fn decrement_active_listeners(&self) { self.cache.decrement_active_listeners().await; } + #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { let payload: String = msg.get_payload()?; tracing::debug!(?payload, "received a message payload"); let msg: Notification = match serde_json::from_str(&payload) { + Ok(Notification::UnknownTopic) => { + match serde_json::from_str::(&payload) { + // don't update the metric for redis errors if it's just a topic we don't know about. + Ok(header) => tracing::warn!(topic = header.topic, "unknown topic"), + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); + tracing::error!("broken message: {e}"); + } + } + return Ok(()); + } Ok(msg) => msg, Err(e) => { Metrics::get().proxy.redis_errors_total.inc(RedisErrors { channel: msg.get_channel_name(), }); - tracing::error!("broken message: {e}"); + match serde_json::from_str::(&payload) { + Ok(header) => tracing::error!(topic = header.topic, "broken message: {e}"), + Err(_) => tracing::error!("broken message: {e}"), + } return Ok(()); } }; + tracing::debug!(?msg, "received a message"); match msg { - Notification::Cancel(cancel_session) => { - tracing::Span::current().record( - "session_id", - tracing::field::display(cancel_session.session_id), - ); - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::CancelSession); - if let Some(cancel_region) = cancel_session.region_id { - // If the message is not for this region, ignore it. 
- if cancel_region != self.region_id { - return Ok(()); - } - } - - // TODO: Remove unspecified peer_addr after the complete migration to the new format - let peer_addr = cancel_session - .peer_addr - .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED)); - let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?cancel_session.session_id); - cancel_span.follows_from(tracing::Span::current()); - // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. - match self - .cancellation_handler - .cancel_session( - cancel_session.cancel_key_data, - uuid::Uuid::nil(), - peer_addr, - cancel_session.peer_addr.is_some(), - ) - .instrument(cancel_span) - .await - { - Ok(()) => {} - Err(e) => { - tracing::warn!("failed to cancel session: {e}"); - } - } - } - Notification::AllowedIpsUpdate { .. } | Notification::PasswordUpdate { .. } => { + Notification::AllowedIpsUpdate { .. } + | Notification::PasswordUpdate { .. } + | Notification::BlockPublicOrVpcAccessUpdated { .. } + | Notification::AllowedVpcEndpointsUpdatedForOrg { .. } + | Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => { invalidate_cache(self.cache.clone(), msg.clone()); if matches!(msg, Notification::AllowedIpsUpdate { .. }) { Metrics::get() @@ -176,7 +203,27 @@ impl MessageHandler { .proxy .redis_events_count .inc(RedisEventsCount::PasswordUpdate); + } else if matches!( + msg, + Notification::AllowedVpcEndpointsUpdatedForProjects { .. } + ) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForProjects); + } else if matches!(msg, Notification::AllowedVpcEndpointsUpdatedForOrg { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg); + } else if matches!(msg, Notification::BlockPublicOrVpcAccessUpdated { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BlockPublicOrVpcAccessUpdate); } + // TODO: add additional metrics for the other event types. + // It might happen that the invalid entry is on the way to be cached. // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. 
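// Illustrative sketch, not part of the change above: a minimal, self-contained version of
// the catch-all pattern that `Notification::UnknownTopic` relies on (serde-rs/serde#1714,
// referenced in the diff). The enum, topic names and values here are made up. In an
// adjacently tagged enum, a `#[serde(other)]` unit variant with a `deserialize_with` that
// swallows the content lets unrecognised topics deserialize cleanly instead of failing
// (and bumping the Redis error metric).
use serde::{Deserialize, Deserializer};

#[derive(Debug, Deserialize, PartialEq)]
#[serde(tag = "topic", content = "data")]
enum Event {
    #[serde(rename = "/known_topic")]
    Known { value: u32 },
    #[serde(other, deserialize_with = "ignore_unknown")]
    Unknown,
}

fn ignore_unknown<'de, D: Deserializer<'de>>(d: D) -> Result<(), D::Error> {
    d.deserialize_any(serde::de::IgnoredAny)?;
    Ok(())
}

fn main() {
    let known: Event =
        serde_json::from_str(r#"{"topic":"/known_topic","data":{"value":7}}"#).unwrap();
    assert_eq!(known, Event::Known { value: 7 });

    let unknown: Event =
        serde_json::from_str(r#"{"topic":"/not_a_topic","data":{"anything":true}}"#).unwrap();
    assert_eq!(unknown, Event::Unknown);
}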
@@ -186,6 +233,8 @@ impl MessageHandler { invalidate_cache(cache, msg); }); } + + Notification::UnknownTopic => unreachable!(), } Ok(()) @@ -197,12 +246,27 @@ fn invalidate_cache(cache: Arc, msg: Notification) { Notification::AllowedIpsUpdate { allowed_ips_update } => { cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id); } + Notification::BlockPublicOrVpcAccessUpdated { + block_public_or_vpc_access_updated, + } => cache.invalidate_block_public_or_vpc_access_for_project( + block_public_or_vpc_access_updated.project_id, + ), + Notification::AllowedVpcEndpointsUpdatedForOrg { + allowed_vpc_endpoints_updated_for_org, + } => cache.invalidate_allowed_vpc_endpoint_ids_for_org( + allowed_vpc_endpoints_updated_for_org.account_id, + ), + Notification::AllowedVpcEndpointsUpdatedForProjects { + allowed_vpc_endpoints_updated_for_projects, + } => cache.invalidate_allowed_vpc_endpoint_ids_for_projects( + allowed_vpc_endpoints_updated_for_projects.project_ids, + ), Notification::PasswordUpdate { password_update } => cache .invalidate_role_secret_for_project( password_update.project_id, password_update.role_name, ), - Notification::Cancel(_) => unreachable!("cancel message should be handled separately"), + Notification::UnknownTopic => unreachable!(), } } @@ -222,8 +286,8 @@ async fn handle_messages( } Err(e) => { tracing::error!( - "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" - ); + "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" + ); tokio::time::sleep(RECONNECT_TIMEOUT).await; continue; } @@ -251,17 +315,12 @@ async fn handle_messages( pub async fn task_main( redis: ConnectionWithCredentialsProvider, cache: Arc, - cancel_map: CancelMap, region_id: String, ) -> anyhow::Result where C: ProjectInfoCache + Send + Sync + 'static, { - let cancellation_handler = Arc::new(CancellationHandler::<()>::new( - cancel_map, - crate::metrics::CancellationSource::FromRedis, - )); - let handler = MessageHandler::new(cache, cancellation_handler, region_id); + let handler = MessageHandler::new(cache, region_id); // 6h - 1m. // There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost. 
let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60)); @@ -339,32 +398,29 @@ mod tests { Ok(()) } - #[test] - fn parse_cancel_session() -> anyhow::Result<()> { - let cancel_key_data = CancelKeyData { - backend_pid: 42, - cancel_key: 41, - }; - let uuid = uuid::Uuid::new_v4(); - let msg = Notification::Cancel(CancelSession { - cancel_key_data, - region_id: None, - session_id: uuid, - peer_addr: None, - }); - let text = serde_json::to_string(&msg)?; - let result: Notification = serde_json::from_str(&text)?; - assert_eq!(msg, result); - let msg = Notification::Cancel(CancelSession { - cancel_key_data, - region_id: Some("region".to_string()), - session_id: uuid, - peer_addr: None, - }); - let text = serde_json::to_string(&msg)?; - let result: Notification = serde_json::from_str(&text)?; - assert_eq!(msg, result,); + #[test] + fn parse_unknown_topic() -> anyhow::Result<()> { + let with_data = json!({ + "type": "message", + "topic": "/doesnotexist", + "data": { + "payload": "ignored" + }, + "extra_fields": "something" + }) + .to_string(); + let result: Notification = serde_json::from_str(&with_data)?; + assert_eq!(result, Notification::UnknownTopic); + + let without_data = json!({ + "type": "message", + "topic": "/doesnotexist", + "extra_fields": "something" + }) + .to_string(); + let result: Notification = serde_json::from_str(&without_data)?; + assert_eq!(result, Notification::UnknownTopic); Ok(()) } diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 1373dfba3d..4922ece615 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -2,7 +2,7 @@ use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; -use crate::parse::{split_at_const, split_cstr}; +use crate::parse::split_cstr; /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). #[derive(Debug)] @@ -19,7 +19,7 @@ impl<'a> FirstMessage<'a> { let (method_cstr, tail) = split_cstr(bytes)?; let method = method_cstr.to_str().ok()?; - let (len_bytes, bytes) = split_at_const(tail)?; + let (len_bytes, bytes) = tail.split_first_chunk()?; let len = u32::from_be_bytes(*len_bytes) as usize; if len != bytes.len() { return None; @@ -51,6 +51,7 @@ impl<'a> ServerMessage<&'a str> { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index f1c916daa2..46e6a439e5 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -5,8 +5,8 @@ use std::io; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; -use super::messages::ServerMessage; use super::Mechanism; +use super::messages::ServerMessage; use crate::stream::PqStream; /// Abstracts away all peculiarities of the libpq's protocol. @@ -50,6 +50,12 @@ impl SaslStream<'_, S> { self.stream.write_message(&msg.to_reply()).await?; Ok(()) } + + // Queue a SASL message for the client. + fn send_noflush(&mut self, msg: &ServerMessage<&str>) -> io::Result<()> { + self.stream.write_message_noflush(&msg.to_reply())?; + Ok(()) + } } /// SASL authentication outcome. 
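// Illustrative sketch, not part of the change above: the `split_first_chunk` call adopted in
// sasl/messages.rs replaces the local `split_at_const` helper with the std slice method that
// peels a fixed-size prefix off a slice. The bytes below are made up; as in the `FirstMessage`
// parsing above, the 4-byte big-endian length must match the size of the remaining payload.
fn parse_len_prefixed(tail: &[u8]) -> Option<&[u8]> {
    let (len_bytes, rest): (&[u8; 4], &[u8]) = tail.split_first_chunk()?;
    (u32::from_be_bytes(*len_bytes) as usize == rest.len()).then_some(rest)
}

fn main() {
    assert_eq!(parse_len_prefixed(&[0, 0, 0, 3, b'a', b'b', b'c']), Some(&b"abc"[..]));
    assert_eq!(parse_len_prefixed(&[0, 0, 0, 9, b'x']), None); // length mismatch
}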
@@ -85,7 +91,7 @@ impl SaslStream<'_, S> { continue; } Step::Success(result, reply) => { - self.send(&ServerMessage::Final(&reply)).await?; + self.send_noflush(&ServerMessage::Final(&reply))?; Outcome::Success(result) } Step::Failure(reason) => Outcome::Failure(reason), diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 87ab6e0d5f..9d56c465ec 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -90,7 +90,7 @@ mod tests { // number of insert operations let m = rng.gen_range(1..100); - let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid(); + let id = uuid::Builder::from_random_bytes(rng.r#gen()).into_uuid(); ids.push((id, n, m)); // N = sum(actual) diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 6a13f645a5..abd5aeae5b 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -5,6 +5,7 @@ use std::convert::Infallible; use hmac::{Hmac, Mac}; use sha2::Sha256; +use super::ScramKey; use super::messages::{ ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN, }; @@ -12,8 +13,6 @@ use super::pbkdf2::Pbkdf2; use super::secret::ServerSecret; use super::signature::SignatureBuilder; use super::threadpool::ThreadPool; -use super::ScramKey; -use crate::config; use crate::intern::EndpointIdInt; use crate::sasl::{self, ChannelBinding, Error as SaslError}; @@ -59,14 +58,14 @@ enum ExchangeState { pub(crate) struct Exchange<'a> { state: ExchangeState, secret: &'a ServerSecret, - tls_server_end_point: config::TlsServerEndPoint, + tls_server_end_point: crate::tls::TlsServerEndPoint, } impl<'a> Exchange<'a> { pub(crate) fn new( secret: &'a ServerSecret, nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], - tls_server_end_point: config::TlsServerEndPoint, + tls_server_end_point: crate::tls::TlsServerEndPoint, ) -> Self { Self { state: ExchangeState::Initial(SaslInitial { nonce }), @@ -120,7 +119,7 @@ impl SaslInitial { fn transition( &self, secret: &ServerSecret, - tls_server_end_point: &config::TlsServerEndPoint, + tls_server_end_point: &crate::tls::TlsServerEndPoint, input: &str, ) -> sasl::Result> { let client_first_message = ClientFirstMessage::parse(input) @@ -155,7 +154,7 @@ impl SaslSentInner { fn transition( &self, secret: &ServerSecret, - tls_server_end_point: &config::TlsServerEndPoint, + tls_server_end_point: &crate::tls::TlsServerEndPoint, input: &str, ) -> sasl::Result> { let Self { @@ -168,8 +167,8 @@ impl SaslSentInner { .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?; let channel_binding = cbind_flag.encode(|_| match tls_server_end_point { - config::TlsServerEndPoint::Sha256(x) => Ok(x), - config::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding), + crate::tls::TlsServerEndPoint::Sha256(x) => Ok(x), + crate::tls::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding), })?; // This might've been caused by a MITM attack @@ -209,8 +208,8 @@ impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { - use sasl::Step; use ExchangeState; + use sasl::Step; match &self.state { ExchangeState::Initial(init) => { match init.transition(self.secret, &self.tls_server_end_point, input)? 
{ diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 5ee3a51352..7b0b861ce9 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -4,7 +4,7 @@ use std::fmt; use std::ops::Range; use super::base64_decode_array; -use super::key::{ScramKey, SCRAM_KEY_LEN}; +use super::key::{SCRAM_KEY_LEN, ScramKey}; use super::signature::SignatureBuilder; use crate::sasl::ChannelBinding; @@ -185,6 +185,7 @@ impl fmt::Debug for OwnedServerFirstMessage { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index 718445f61d..24f991d4d9 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -15,7 +15,7 @@ mod secret; mod signature; pub mod threadpool; -pub(crate) use exchange::{exchange, Exchange}; +pub(crate) use exchange::{Exchange, exchange}; use hmac::{Hmac, Mac}; pub(crate) use key::ScramKey; pub(crate) use secret::ServerSecret; @@ -57,6 +57,7 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::threadpool::ThreadPool; use super::{Exchange, ServerSecret}; @@ -76,11 +77,8 @@ mod tests { const NONCE: [u8; 18] = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ]; - let mut exchange = Exchange::new( - &secret, - || NONCE, - crate::config::TlsServerEndPoint::Undefined, - ); + let mut exchange = + Exchange::new(&secret, || NONCE, crate::tls::TlsServerEndPoint::Undefined); let client_first = "n,,n=user,r=rOprNGfwEbeRWgbNEkqO"; let client_final = "c=biws,r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,p=rw1r5Kph5ThxmaUBC2GAQ6MfXbPnNkFiTIvdb/Rear0="; diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 8c6a08d432..eb21b26ab4 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -72,6 +72,7 @@ impl ServerSecret { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/scram/signature.rs b/proxy/src/scram/signature.rs index d3255cf2ca..a5b1c3e9f4 100644 --- a/proxy/src/scram/signature.rs +++ b/proxy/src/scram/signature.rs @@ -1,6 +1,6 @@ //! Tools for client/server signature management. -use super::key::{ScramKey, SCRAM_KEY_LEN}; +use super::key::{SCRAM_KEY_LEN, ScramKey}; /// A collection of message parts needed to derive the client's signature. #[derive(Debug)] diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index ebc6dd2a3c..8f1684c75b 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -33,14 +33,11 @@ thread_local! { } impl ThreadPool { - pub fn new(n_workers: u8) -> Arc { + pub fn new(mut n_workers: u8) -> Arc { // rayon would be nice here, but yielding in rayon does not work well afaict. 
if n_workers == 0 { - return Arc::new(Self { - runtime: None, - metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), - }); + n_workers = 1; } Arc::new_cyclic(|pool| { @@ -66,7 +63,7 @@ impl ThreadPool { }); }) .build() - .unwrap(); + .expect("password threadpool runtime should be configured correctly"); Self { runtime: Some(runtime), @@ -79,7 +76,7 @@ impl ThreadPool { JobHandle( self.runtime .as_ref() - .unwrap() + .expect("runtime is always set") .spawn(JobSpec { pbkdf2, endpoint }), ) } @@ -87,7 +84,10 @@ impl ThreadPool { impl Drop for ThreadPool { fn drop(&mut self) { - self.runtime.take().unwrap().shutdown_background(); + self.runtime + .take() + .expect("runtime is always set") + .shutdown_background(); } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 251aa47084..72029102e0 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -3,33 +3,34 @@ use std::sync::Arc; use std::time::Duration; use async_trait::async_trait; +use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use p256::ecdsa::SigningKey; -use p256::elliptic_curve::JwkEcKey; +use jose_jwk::jose_b64; use rand::rngs::OsRng; -use tokio::net::{lookup_host, TcpStream}; +use tokio::net::{TcpStream, lookup_host}; use tracing::field::display; use tracing::{debug, info}; use super::conn_pool::poll_client; use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool}; -use super::http_conn_pool::{self, poll_http2_client, HttpConnPool, Send}; -use super::local_conn_pool::{self, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION}; +use super::http_conn_pool::{self, HttpConnPool, Send, poll_http2_client}; +use super::local_conn_pool::{self, EXT_NAME, EXT_SCHEMA, EXT_VERSION, LocalConnPool}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; -use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::auth::{self, AuthError, check_peer_addr_is_in_list}; use crate::compute; use crate::compute_ctl::{ ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, }; -use crate::config::ProxyConfig; +use crate::config::{ComputeConfig, ProxyConfig}; use crate::context::RequestContext; +use crate::control_plane::CachedNodeInfo; use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; use crate::control_plane::locks::ApiLocks; -use crate::control_plane::CachedNodeInfo; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::intern::EndpointIdInt; +use crate::protocol2::ConnectionInfoExtra; use crate::proxy::connect_compute::ConnectMechanism; use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; use crate::rate_limiter::EndpointRateLimiter; @@ -57,23 +58,49 @@ impl PoolingBackend { let user_info = user_info.clone(); let backend = self.auth_backend.as_ref().map(|()| user_info.clone()); - let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; + let allowed_ips = backend.get_allowed_ips(ctx).await?; + if self.config.authentication_config.ip_allowlist_check_enabled && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); } + + let access_blocker_flags = backend.get_block_public_or_vpc_access(ctx).await?; + if self.config.authentication_config.is_vpc_acccess_proxy { + if access_blocker_flags.vpc_access_blocked { + return 
Err(AuthError::NetworkNotAllowed); + } + + let extra = ctx.extra(); + let incoming_endpoint_id = match extra { + None => String::new(), + Some(ConnectionInfoExtra::Aws { vpce_id }) => vpce_id.to_string(), + Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(), + }; + + if incoming_endpoint_id.is_empty() { + return Err(AuthError::MissingVPCEndpointId); + } + + let allowed_vpc_endpoint_ids = backend.get_allowed_vpc_endpoint_ids(ctx).await?; + // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that. + if !allowed_vpc_endpoint_ids.is_empty() + && !allowed_vpc_endpoint_ids.contains(&incoming_endpoint_id) + { + return Err(AuthError::vpc_endpoint_id_not_allowed(incoming_endpoint_id)); + } + } else if access_blocker_flags.public_access_blocked { + return Err(AuthError::NetworkNotAllowed); + } + if !self .endpoint_rate_limiter .check(user_info.endpoint.clone().into(), 1) { return Err(AuthError::too_many_connections()); } - let cached_secret = match maybe_secret { - Some(secret) => secret, - None => backend.get_role_secret(ctx).await?, - }; - + let cached_secret = backend.get_role_secret(ctx).await?; let secret = match cached_secret.value.clone() { Some(secret) => self.config.authentication_config.check_rate_limit( ctx, @@ -195,9 +222,8 @@ impl PoolingBackend { locks: &self.config.connect_compute_locks, }, &backend, - false, // do not allow self signed compute for http flow self.config.wake_compute_retry_config, - self.config.connect_to_compute_retry_config, + &self.config.connect_to_compute, ) .await } @@ -237,9 +263,8 @@ impl PoolingBackend { locks: &self.config.connect_compute_locks, }, &backend, - false, // do not allow self signed compute for http flow self.config.wake_compute_retry_config, - self.config.connect_to_compute_retry_config, + &self.config.connect_to_compute, ) .await } @@ -270,7 +295,11 @@ impl PoolingBackend { if !self.local_pool.initialized(&conn_info) { // only install and grant usage one at a time. 
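// Illustrative sketch, not part of the change above: a condensed restatement of the network
// access checks added to PoolingBackend. The helper and values are made up and only mirror
// the decision order: over a VPC (PrivateLink) connection the endpoint must not be
// VPC-blocked, must present a VPC endpoint ID, and that ID must be on the allow-list
// (an empty allow-list currently means "allow all", per the TODO above); over the public
// internet only the public-access block applies.
fn access_allowed(
    via_vpc: bool,
    vpc_access_blocked: bool,
    public_access_blocked: bool,
    incoming_vpce_id: Option<&str>,
    allowed_vpce_ids: &[String],
) -> bool {
    if via_vpc {
        !vpc_access_blocked
            && incoming_vpce_id.is_some_and(|id| {
                allowed_vpce_ids.is_empty() || allowed_vpce_ids.iter().any(|a| a.as_str() == id)
            })
    } else {
        !public_access_blocked
    }
}

fn main() {
    // hypothetical inputs, for illustration only
    assert!(access_allowed(true, false, false, Some("vpce-123"), &[]));
    assert!(!access_allowed(true, false, false, None, &[])); // missing VPC endpoint ID
    assert!(!access_allowed(false, false, true, None, &[])); // public access blocked
}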
- let _permit = local_backend.initialize.acquire().await.unwrap(); + let _permit = local_backend + .initialize + .acquire() + .await + .expect("semaphore should never be closed"); // check again for race if !self.local_pool.initialized(&conn_info) { @@ -340,7 +369,7 @@ impl PoolingBackend { debug!("setting up backend session state"); // initiates the auth session - if let Err(e) = client.execute("select auth.init()", &[]).await { + if let Err(e) = client.batch_execute("select auth.init();").await { discard.discard(); return Err(e.into()); } @@ -352,9 +381,15 @@ impl PoolingBackend { } } -fn create_random_jwk() -> (SigningKey, JwkEcKey) { - let key = SigningKey::random(&mut OsRng); - let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk(); +fn create_random_jwk() -> (SigningKey, jose_jwk::Key) { + let key = SigningKey::generate(&mut OsRng); + + let jwk = jose_jwk::Key::Okp(jose_jwk::Okp { + crv: jose_jwk::OkpCurves::Ed25519, + x: jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()), + d: None, + }); + (key, jwk) } @@ -362,9 +397,9 @@ fn create_random_jwk() -> (SigningKey, JwkEcKey) { pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), - #[error("could not connection to postgres in compute")] + #[error("could not connect to postgres in compute")] PostgresConnectionError(#[from] postgres_client::Error), - #[error("could not connection to local-proxy in compute")] + #[error("could not connect to local-proxy in compute")] LocalProxyConnectionError(#[from] LocalProxyConnError), #[error("could not parse JWT payload")] JwtPayloadError(serde_json::Error), @@ -500,7 +535,7 @@ impl ConnectMechanism for TokioMechanism { &self, ctx: &RequestContext, node_info: &CachedNodeInfo, - timeout: Duration, + compute_config: &ComputeConfig, ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; @@ -509,7 +544,7 @@ impl ConnectMechanism for TokioMechanism { let config = config .user(&self.conn_info.user_info.user) .dbname(&self.conn_info.dbname) - .connect_timeout(timeout); + .connect_timeout(compute_config.timeout); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(postgres_client::NoTls).await; @@ -550,7 +585,7 @@ impl ConnectMechanism for HyperMechanism { &self, ctx: &RequestContext, node_info: &CachedNodeInfo, - timeout: Duration, + config: &ComputeConfig, ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; @@ -558,7 +593,7 @@ impl ConnectMechanism for HyperMechanism { let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let port = node_info.config.get_port(); - let res = connect_http2(&host, port, timeout).await; + let res = connect_http2(&host, port, config.timeout).await; drop(pause); let (client, connection) = permit.release_result(res)?; @@ -613,7 +648,7 @@ async fn connect_http2( e, ))); } - }; + } }; let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new()) diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs index 6db986f1f7..ba8945afc5 100644 --- a/proxy/src/serverless/cancel_set.rs +++ b/proxy/src/serverless/cancel_set.rs @@ -6,7 +6,7 @@ use std::time::Duration; use indexmap::IndexMap; use parking_lot::Mutex; -use rand::{thread_rng, Rng}; +use rand::{Rng, thread_rng}; use rustc_hash::FxHasher; use tokio::time::Instant; use 
tokio_util::sync::CancellationToken; @@ -40,7 +40,7 @@ impl CancelSet { pub(crate) fn take(&self) -> Option { for _ in 0..4 { - if let Some(token) = self.take_raw(thread_rng().gen()) { + if let Some(token) = self.take_raw(thread_rng().r#gen()) { return Some(token); } tracing::trace!("failed to get cancel token"); @@ -68,7 +68,7 @@ impl CancelShard { fn take(&mut self, rng: usize) -> Option { NonZeroUsize::new(self.tokens.len()).and_then(|len| { // 10 second grace period so we don't cancel new connections - if self.tokens.get_index(rng % len)?.1 .0.elapsed() < Duration::from_secs(10) { + if self.tokens.get_index(rng % len)?.1.0.elapsed() < Duration::from_secs(10) { return None; } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index cac5a173cb..6a9089fc2a 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,17 +1,17 @@ use std::fmt; use std::pin::pin; use std::sync::{Arc, Weak}; -use std::task::{ready, Poll}; +use std::task::{Poll, ready}; -use futures::future::poll_fn; use futures::Future; -use postgres_client::tls::NoTlsStream; +use futures::future::poll_fn; use postgres_client::AsyncMessage; +use postgres_client::tls::NoTlsStream; use smallvec::SmallVec; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{Instrument, error, info, info_span, warn}; #[cfg(test)] use { super::conn_pool_lib::GlobalConnPoolOptions, @@ -186,8 +186,8 @@ impl ClientDataRemote { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { - use std::mem; use std::sync::atomic::AtomicBool; use super::*; @@ -269,39 +269,33 @@ mod tests { assert_eq!(0, pool.get_global_connections_count()); } { - let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + drop(client); assert_eq!(1, pool.get_global_connections_count()); } { - let mut closed_client = Client::new( + let closed_client = Client::new( create_inner_with(MockClient::new(true)), conn_info.clone(), ep_pool.clone(), ); - closed_client.do_drop().unwrap()(); - mem::forget(closed_client); // drop the client - // The closed client shouldn't be added to the pool. + drop(closed_client); assert_eq!(1, pool.get_global_connections_count()); } let is_closed: Arc = Arc::new(false.into()); { - let mut client = Client::new( + let client = Client::new( create_inner_with(MockClient(is_closed.clone())), conn_info.clone(), ep_pool.clone(), ); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client - + drop(client); // The client should be added to the pool. assert_eq!(2, pool.get_global_connections_count()); } { - let mut client = Client::new(create_inner(), conn_info, ep_pool); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info, ep_pool); + drop(client); // The client shouldn't be added to the pool. Because the ep-pool is full. 
assert_eq!(2, pool.get_global_connections_count()); @@ -319,15 +313,13 @@ mod tests { &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), ); { - let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + drop(client); assert_eq!(3, pool.get_global_connections_count()); } { - let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + drop(client); // The client shouldn't be added to the pool. Because the global pool is full. assert_eq!(3, pool.get_global_connections_count()); diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 2a46c8f9c5..933204994b 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -5,11 +5,12 @@ use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; use std::time::Duration; -use dashmap::DashMap; +use clashmap::ClashMap; use parking_lot::RwLock; use postgres_client::ReadyForQueryStatus; use rand::Rng; -use tracing::{debug, info, Span}; +use smol_str::ToSmolStr; +use tracing::{Span, debug, info}; use super::backend::HttpConnError; use super::conn_pool::ClientDataRemote; @@ -19,8 +20,9 @@ use crate::auth::backend::ComputeUserInfo; use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::protocol2::ConnectionInfoExtra; use crate::types::{DbName, EndpointCacheKey, RoleName}; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; #[derive(Debug, Clone)] pub(crate) struct ConnInfo { @@ -187,19 +189,22 @@ impl EndpointConnPool { pub(crate) fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInnerCommon) { let conn_id = client.get_conn_id(); - let pool_name = pool.read().get_name().to_string(); + let (max_conn, conn_count, pool_name) = { + let pool = pool.read(); + ( + pool.global_pool_size_max_conns, + pool.global_connections_count + .load(atomic::Ordering::Relaxed), + pool.get_name().to_string(), + ) + }; + if client.inner.is_closed() { info!(%conn_id, "{}: throwing away connection '{conn_info}' because connection is closed", pool_name); return; } - let global_max_conn = pool.read().global_pool_size_max_conns; - if pool - .read() - .global_connections_count - .load(atomic::Ordering::Relaxed) - >= global_max_conn - { + if conn_count >= max_conn { info!(%conn_id, "{}: throwing away connection '{conn_info}' because pool is full", pool_name); return; } @@ -348,11 +353,11 @@ where // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - pub(crate) global_pool: DashMap>>, + pub(crate) global_pool: ClashMap>>, /// Number of endpoint-connection pools /// - /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. + /// [`ClashMap::len`] iterates over all inner pools and acquires a read lock on each. /// That seems like far too much effort, so we're using a relaxed increment counter instead. /// It's only used for diagnostics. 
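The doc comment above, together with the `global_pool_size` field declared next, trades an exact `len()` for a relaxed counter. A self-contained sketch of that trade-off, using a plain `Mutex<HashMap>` in place of the sharded `ClashMap` so it compiles on its own (names are illustrative):

```rust
use std::collections::HashMap;
use std::sync::Mutex;
use std::sync::atomic::{AtomicUsize, Ordering};

#[derive(Default)]
struct GlobalPool {
    // Stand-in for the sharded per-endpoint pool map (ClashMap in the patch).
    pools: Mutex<HashMap<String, Vec<u32>>>,
    // Diagnostics-only counter: calling len() on a sharded map would lock every
    // shard, while a relaxed atomic is cheap and "roughly right" is good enough.
    global_pool_size: AtomicUsize,
}

impl GlobalPool {
    fn get_or_create(&self, endpoint: &str) {
        let mut pools = self.pools.lock().unwrap();
        if !pools.contains_key(endpoint) {
            pools.insert(endpoint.to_string(), Vec::new());
            self.global_pool_size.fetch_add(1, Ordering::Relaxed);
        }
    }

    fn approximate_size(&self) -> usize {
        self.global_pool_size.load(Ordering::Relaxed)
    }
}

fn main() {
    let pool = GlobalPool::default();
    pool.get_or_create("ep-1");
    pool.get_or_create("ep-2");
    pool.get_or_create("ep-1"); // already present, counter unchanged
    assert_eq!(pool.approximate_size(), 2);
}
```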
pub(crate) global_pool_size: AtomicUsize, @@ -393,7 +398,7 @@ where pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { - global_pool: DashMap::with_shard_amount(shards), + global_pool: ClashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), config, global_connections_count: Arc::new(AtomicUsize::new(0)), @@ -439,10 +444,10 @@ where .start_timer(); let current_len = shard.len(); let mut clients_removed = 0; - shard.retain(|endpoint, x| { + shard.retain(|(endpoint, x)| { // if the current endpoint pool is unique (no other strong or weak references) // then it is currently not in use by any connections. - if let Some(pool) = Arc::get_mut(x.get_mut()) { + if let Some(pool) = Arc::get_mut(x) { let endpoints = pool.get_mut(); clients_removed = endpoints.clear_closed(); @@ -470,7 +475,9 @@ where .http_pool_opened_connections .get_metric() .dec_by(clients_removed as i64); - info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); + info!( + "pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}" + ); } let removed = current_len - new_len; @@ -632,36 +639,43 @@ impl Client { (&mut inner.inner, Discard { conn_info, pool }) } - pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; + pub(crate) fn metrics( + &self, + direction: TrafficDirection, + ctx: &RequestContext, + ) -> Arc { + let aux = &self + .inner + .as_ref() + .expect("client inner should not be removed") + .aux; + + let private_link_id = match ctx.extra() { + None => None, + Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), + Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), + }; + USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, + direction, + private_link_id, }) } +} - pub(crate) fn do_drop(&mut self) -> Option> { +impl Drop for Client { + fn drop(&mut self) { let conn_info = self.conn_info.clone(); let client = self .inner .take() .expect("client inner should not be removed"); if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { - let current_span = self.span.clone(); + let _current_span = self.span.enter(); // return connection to the pool - return Some(move || { - let _span = current_span.enter(); - EndpointConnPool::put(&conn_pool, &conn_info, client); - }); - } - None - } -} - -impl Drop for Client { - fn drop(&mut self) { - if let Some(drop) = self.do_drop() { - tokio::task::spawn_blocking(drop); + EndpointConnPool::put(&conn_pool, &conn_info, client); } } } @@ -703,7 +717,9 @@ impl Discard<'_, C> { pub(crate) fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + info!( + "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state" + ); } } } diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index fde38d0de3..338a79b4b3 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -5,8 +5,9 @@ use std::sync::{Arc, Weak}; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; +use smol_str::ToSmolStr; use tokio::net::TcpStream; -use tracing::{debug, error, info, 
info_span, Instrument}; +use tracing::{Instrument, debug, error, info, info_span}; use super::backend::HttpConnError; use super::conn_pool_lib::{ @@ -16,8 +17,9 @@ use super::conn_pool_lib::{ use crate::context::RequestContext; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::protocol2::ConnectionInfoExtra; use crate::types::EndpointCacheKey; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = @@ -264,11 +266,24 @@ impl Client { Self { inner } } - pub(crate) fn metrics(&self) -> Arc { + pub(crate) fn metrics( + &self, + direction: TrafficDirection, + ctx: &RequestContext, + ) -> Arc { let aux = &self.inner.aux; + + let private_link_id = match ctx.extra() { + None => None, + Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), + Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), + }; + USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, + direction, + private_link_id, }) } } diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index c0208d4f68..95a28663a5 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -6,8 +6,8 @@ use bytes::Bytes; use http::{Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; +use http_utils::error::ApiError; use serde::Serialize; -use utils::http::error::ApiError; /// Like [`ApiError::into_response`] pub(crate) fn api_error_into_response(this: ApiError) -> Response> { @@ -59,14 +59,14 @@ pub(crate) fn api_error_into_response(this: ApiError) -> Response Response> { Response::builder() .status(status) .header(http::header::CONTENT_TYPE, "application/json") // we do not have nested maps with non string keys so serialization shouldn't fail .body( - Full::new(Bytes::from(serde_json::to_string(self).unwrap())) - .map_err(|x| match x {}) - .boxed(), + Full::new(Bytes::from( + serde_json::to_string(self) + .expect("serialising HttpErrorBody should never fail"), + )) + .map_err(|x| match x {}) + .boxed(), ) - .unwrap() + .expect("content-type header should be valid") } } -/// Same as [`utils::http::json::json_response`] +/// Same as [`http_utils::json::json_response`] pub(crate) fn json_response( status: StatusCode, data: T, diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 25b25c66d3..fbd12ad9cb 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,5 +1,5 @@ -use postgres_client::types::{Kind, Type}; use postgres_client::Row; +use postgres_client::types::{Kind, Type}; use serde_json::{Map, Value}; // @@ -204,7 +204,10 @@ fn pg_array_parse_inner( if c == '\\' { escaped = true; - (i, c) = pg_array_chr.next().unwrap(); + let Some(x) = pg_array_chr.next() else { + return Err(JsonConversionError::UnbalancedArray); + }; + (i, c) = x; } match c { @@ -253,6 +256,7 @@ fn pg_array_parse_inner( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use serde_json::json; diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index b84cde9e25..8426a0810e 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -11,26 +11,24 @@ use std::collections::HashMap; use std::pin::pin; -use 
std::sync::atomic::AtomicUsize; use std::sync::Arc; -use std::task::{ready, Poll}; +use std::sync::atomic::AtomicUsize; +use std::task::{Poll, ready}; use std::time::Duration; -use futures::future::poll_fn; +use ed25519_dalek::{Signature, Signer, SigningKey}; use futures::Future; +use futures::future::poll_fn; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; -use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; -use postgres_client::tls::NoTlsStream; -use postgres_client::types::ToSql; use postgres_client::AsyncMessage; +use postgres_client::tls::NoTlsStream; use serde_json::value::RawValue; -use signature::Signer; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, warn, Instrument}; +use tracing::{Instrument, debug, error, info, info_span, warn}; use super::backend::HttpConnError; use super::conn_pool_lib::{ @@ -42,7 +40,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; -pub(crate) const EXT_VERSION: &str = "0.1.2"; +pub(crate) const EXT_VERSION: &str = "0.2.0"; pub(crate) const EXT_SCHEMA: &str = "auth"; #[derive(Clone)] @@ -179,7 +177,6 @@ pub(crate) fn poll_client( info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); let pool = Arc::downgrade(&global_pool); - let pool_clone = pool.clone(); let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); @@ -273,11 +270,7 @@ pub(crate) fn poll_client( }), }; - Client::new( - inner, - conn_info, - Arc::downgrade(&pool_clone.upgrade().unwrap().global_pool), - ) + Client::new(inner, conn_info, Arc::downgrade(&global_pool.global_pool)) } impl ClientInnerCommon { @@ -286,14 +279,13 @@ impl ClientInnerCommon { local_data.jti += 1; let token = resign_jwt(&local_data.key, payload, local_data.jti)?; - // initiates the auth session + // discard all cannot run in a transaction. must be executed alone. self.inner.batch_execute("discard all").await?; - self.inner - .execute( - "select auth.jwt_session_init($1)", - &[&&*token as &(dyn ToSql + Sync)], - ) - .await?; + + // initiates the auth session + // this is safe from query injections as the jwt format free of any escape characters. + let query = format!("select auth.jwt_session_init('{token}')"); + self.inner.batch_execute(&query).await?; let pid = self.inner.get_process_id(); info!(pid, jti = local_data.jti, "user session state init"); @@ -321,7 +313,8 @@ fn resign_jwt(sk: &SigningKey, payload: &[u8], jti: u64) -> Result(buffer.format(jti)).unwrap(); + let jti = serde_json::from_str::<&RawValue>(buffer.format(jti)) + .expect("itoa formatted integer should be guaranteed valid json"); // update the jti in-place let payload = @@ -343,8 +336,8 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { let cap = jwt.capacity(); // we only need an empty header with the alg specified. 
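The hunk below swaps the hard-coded JOSE header from `{"alg":"ES256"}` to `{"alg":"EdDSA"}`, each stored as a precomputed base64url string. One way to sanity-check such a constant, assuming the same `base64` 0.13-style API (`encode_config`, `URL_SAFE_NO_PAD`) that `sign_jwt` already uses:

```rust
fn main() {
    // base64url (no padding) of the fixed JOSE header used by sign_jwt.
    let header = br#"{"alg":"EdDSA"}"#;
    let encoded = base64::encode_config(header, base64::URL_SAFE_NO_PAD);
    assert_eq!(encoded, "eyJhbGciOiJFZERTQSJ9");
}
```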
- // base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9" - jwt.push_str("eyJhbGciOiJFUzI1NiJ9."); + // base64url(r#"{"alg":"EdDSA"}"#) == "eyJhbGciOiJFZERTQSJ9" + jwt.push_str("eyJhbGciOiJFZERTQSJ9."); // encode the jwt payload in-place base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt); @@ -368,15 +361,16 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { - use p256::ecdsa::SigningKey; + use ed25519_dalek::SigningKey; use typed_json::json; use super::resign_jwt; #[test] fn jwt_token_snapshot() { - let key = SigningKey::from_bytes(&[1; 32].into()).unwrap(); + let key = SigningKey::from_bytes(&[1; 32]); let data = json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string(); @@ -384,12 +378,20 @@ mod tests { // To validate the JWT, copy the JWT string and paste it into https://jwt.io/. // In the public-key box, paste the following jwk public key - // `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}` + // `{"kty":"OKP","crv":"Ed25519","x":"iojj3XQJ8ZX9UtstPLpdcspnCb8dlBIb83SIAbQPb1w"}` + // Note - jwt.io doesn't support EdDSA :( + // https://github.com/jsonwebtoken/jsonwebtoken.github.io/issues/509 - // let pub_key = p256::ecdsa::VerifyingKey::from(&key); - // let pub_key = p256::PublicKey::from(pub_key); - // println!("{}", pub_key.to_jwk_string()); + // let jwk = jose_jwk::Key::Okp(jose_jwk::Okp { + // crv: jose_jwk::OkpCurves::Ed25519, + // x: jose_jwk::jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()), + // d: None, + // }); + // println!("{}", serde_json::to_string(&jwk).unwrap()); - assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA"); + assert_eq!( + jwt, + "eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg" + ); } } diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 80b42f9e55..dd0fb9c5b4 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -15,7 +15,7 @@ mod sql_over_http; mod websocket; use std::net::{IpAddr, SocketAddr}; -use std::pin::{pin, Pin}; +use std::pin::{Pin, pin}; use std::sync::Arc; use anyhow::Context; @@ -23,31 +23,32 @@ use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; pub use conn_pool_lib::GlobalConnPoolOptions; -use futures::future::{select, Either}; use futures::TryFutureExt; +use futures::future::{Either, select}; use http::{Method, Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Empty}; +use http_utils::error::ApiError; use hyper::body::Incoming; use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; -use rand::rngs::StdRng; use rand::SeedableRng; -use sql_over_http::{uuid_to_header_value, NEON_REQUEST_ID}; +use rand::rngs::StdRng; +use sql_over_http::{NEON_REQUEST_ID, uuid_to_header_value}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio::time::timeout; use tokio_rustls::TlsAcceptor; use tokio_util::sync::CancellationToken; use tokio_util::task::TaskTracker; -use tracing::{info, warn, Instrument}; -use utils::http::error::ApiError; +use tracing::{Instrument, info, warn}; -use 
crate::cancellation::CancellationHandlerMain; +use crate::cancellation::CancellationHandler; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; +use crate::ext::TaskExt; use crate::metrics::Metrics; -use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo}; +use crate::protocol2::{ChainRW, ConnectHeader, ConnectionInfo, read_proxy_protocol}; use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; @@ -60,7 +61,7 @@ pub async fn task_main( auth_backend: &'static crate::auth::Backend<'static, ()>, ws_listener: TcpListener, cancellation_token: CancellationToken, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { @@ -84,7 +85,7 @@ pub async fn task_main( cancellation_token.cancelled().await; tokio::task::spawn_blocking(move || conn_pool.shutdown()) .await - .unwrap(); + .propagate_task_panic(); } }); @@ -104,7 +105,7 @@ pub async fn task_main( cancellation_token.cancelled().await; tokio::task::spawn_blocking(move || http_conn_pool.shutdown()) .await - .unwrap(); + .propagate_task_panic(); } }); @@ -317,7 +318,7 @@ async fn connection_handler( backend: Arc, connections: TaskTracker, cancellations: TaskTracker, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, conn: AsyncRW, @@ -411,7 +412,7 @@ async fn request_handler( config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, - cancellation_handler: Arc, + cancellation_handler: Arc, session_id: uuid::Uuid, conn_info: ConnectionInfo, // used to cancel in-flight HTTP requests. not used to cancel websockets diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 5e85f5ec40..8babfb5cd2 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -2,27 +2,28 @@ use std::pin::pin; use std::sync::Arc; use bytes::Bytes; -use futures::future::{select, try_join, Either}; +use futures::future::{Either, select, try_join}; use futures::{StreamExt, TryFutureExt}; -use http::header::AUTHORIZATION; use http::Method; +use http::header::AUTHORIZATION; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; +use http_utils::error::ApiError; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; -use hyper::{header, HeaderMap, Request, Response, StatusCode}; +use hyper::{HeaderMap, Request, Response, StatusCode, header}; +use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; +use serde_json::value::RawValue; use tokio::time::{self, Instant}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; use typed_json::json; use url::Url; -use urlencoding; -use utils::http::error::ApiError; use uuid::Uuid; use super::backend::{LocalProxyConnError, PoolingBackend}; @@ -30,18 +31,18 @@ use super::conn_pool::{AuthData, ConnInfoWithAuth}; use super::conn_pool_lib::{self, ConnInfo}; use super::error::HttpCodeError; use super::http_util::json_response; -use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; +use super::json::{JsonConversionError, json_to_pg_text, pg_text_row_to_json}; use 
crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; +use crate::auth::{ComputeUserInfoParseError, endpoint_sni}; use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::http::{read_body_with_limit, ReadBodyError}; +use crate::http::{ReadBodyError, read_body_with_limit}; use crate::metrics::{HttpDirection, Metrics}; -use crate::proxy::{run_until_cancelled, NeonOptions}; +use crate::proxy::{NeonOptions, run_until_cancelled}; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; -use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; +use crate::usage_metrics::{MetricCounter, MetricCounterRecorder, TrafficDirection}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] @@ -208,7 +209,7 @@ fn get_conn_info( } } Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => { - return Err(ConnInfoError::MissingHostname) + return Err(ConnInfoError::MissingHostname); } }; ctx.set_endpoint_id(endpoint.clone()); @@ -249,6 +250,50 @@ pub(crate) async fn handle( let mut response = match result { Ok(r) => { ctx.set_success(); + + // Handling the error response from local proxy here + if config.authentication_config.is_auth_broker && r.status().is_server_error() { + let status = r.status(); + + let body_bytes = r + .collect() + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::Error::msg(format!( + "could not collect http body: {e}" + ))) + })? + .to_bytes(); + + if let Ok(mut json_map) = + serde_json::from_slice::>(&body_bytes) + { + let message = json_map.get("message"); + if let Some(message) = message { + let msg: String = match serde_json::from_str(message.get()) { + Ok(msg) => msg, + Err(_) => { + "Unable to parse the response message from server".to_string() + } + }; + + error!("Error response from local_proxy: {status} {msg}"); + + json_map.retain(|key, _| !key.starts_with("neon:")); // remove all the neon-related keys + + let resp_json = serde_json::to_string(&json_map) + .unwrap_or("failed to serialize the response message".to_string()); + + return json_response(status, resp_json); + } + } + + error!("Unable to parse the response message from local_proxy"); + return json_response( + status, + json!({ "message": "Unable to parse the response message from server".to_string() }), + ); + } r } Err(e @ SqlOverHttpError::Cancelled(_)) => { @@ -618,8 +663,6 @@ async fn handle_db_inner( let authenticate_and_connect = Box::pin( async { - let is_local_proxy = matches!(backend.auth_backend, crate::auth::Backend::Local(_)); - let keys = match auth { AuthData::Password(pw) => { backend @@ -634,7 +677,9 @@ async fn handle_db_inner( }; let client = match keys.keys { - ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => { + ComputeCredentialKeys::JwtPayload(payload) + if backend.auth_backend.is_local_proxy() => + { let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?; let (cli_inner, _dsc) = client.client_inner(); cli_inner.set_jwt_session(&payload).await?; @@ -700,7 +745,7 @@ async fn handle_db_inner( } }; - let metrics = client.metrics(); + let metrics = client.metrics(TrafficDirection::Egress, ctx); let len = json_output.len(); let response = response @@ -773,7 +818,7 @@ async fn handle_auth_broker_inner( .expect("all headers and params received via hyper should be valid for request"); // todo: map body to 
count egress - let _metrics = client.metrics(); + let _metrics = client.metrics(TrafficDirection::Egress, ctx); Ok(client .inner @@ -976,7 +1021,7 @@ async fn query_to_json( data: QueryData, current_size: &mut usize, parsed_headers: HttpHeaders, -) -> Result<(ReadyForQueryStatus, impl Serialize), SqlOverHttpError> { +) -> Result<(ReadyForQueryStatus, impl Serialize + use), SqlOverHttpError> { let query_start = Instant::now(); let query_params = data.params; @@ -1073,10 +1118,10 @@ enum Discard<'a> { } impl Client { - fn metrics(&self) -> Arc { + fn metrics(&self, direction: TrafficDirection, ctx: &RequestContext) -> Arc { match self { - Client::Remote(client) => client.metrics(), - Client::Local(local_client) => local_client.metrics(), + Client::Remote(client) => client.metrics(direction, ctx), + Client::Local(local_client) => local_client.metrics(direction, ctx), } } @@ -1110,6 +1155,7 @@ impl Discard<'_> { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index bdb83fe6be..c4baeeb5cc 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,6 +1,6 @@ use std::pin::Pin; use std::sync::Arc; -use std::task::{ready, Context, Poll}; +use std::task::{Context, Poll, ready}; use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; @@ -12,12 +12,12 @@ use pin_project_lite::pin_project; use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; -use crate::cancellation::CancellationHandlerMain; +use crate::cancellation::CancellationHandler; use crate::config::ProxyConfig; use crate::context::RequestContext; -use crate::error::{io_error, ReportableError}; +use crate::error::{ReportableError, io_error}; use crate::metrics::Metrics; -use crate::proxy::{handle_client, ClientMode, ErrorSource}; +use crate::proxy::{ClientMode, ErrorSource, handle_client}; use crate::rate_limiter::EndpointRateLimiter; pin_project! 
{ @@ -129,7 +129,7 @@ pub(crate) async fn serve_websocket( auth_backend: &'static crate::auth::Backend<'static, ()>, ctx: RequestContext, websocket: OnUpgrade, - cancellation_handler: Arc, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, hostname: Option, cancellations: tokio_util::task::task_tracker::TaskTracker, @@ -168,7 +168,7 @@ pub(crate) async fn serve_websocket( Ok(Some(p)) => { ctx.set_success(); ctx.log_connect(); - match p.proxy_pass().await { + match p.proxy_pass(&config.connect_to_compute).await { Ok(()) => Ok(()), Err(ErrorSource::Client(err)) => Err(err).context("client"), Err(ErrorSource::Compute(err)) => Err(err).context("compute"), @@ -178,16 +178,17 @@ pub(crate) async fn serve_websocket( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::pin::pin; use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use tokio::io::{duplex, AsyncReadExt, AsyncWriteExt}; + use tokio::io::{AsyncReadExt, AsyncWriteExt, duplex}; use tokio::task::JoinSet; - use tokio_tungstenite::tungstenite::protocol::Role; - use tokio_tungstenite::tungstenite::Message; use tokio_tungstenite::WebSocketStream; + use tokio_tungstenite::tungstenite::Message; + use tokio_tungstenite::tungstenite::protocol::Role; use super::WebSocketRw; diff --git a/proxy/src/signals.rs b/proxy/src/signals.rs index 0b675683c0..32b2344a1c 100644 --- a/proxy/src/signals.rs +++ b/proxy/src/signals.rs @@ -12,7 +12,7 @@ pub async fn handle( where F: FnMut(), { - use tokio::signal::unix::{signal, SignalKind}; + use tokio::signal::unix::{SignalKind, signal}; let mut hangup = signal(SignalKind::hangup())?; let mut interrupt = signal(SignalKind::interrupt())?; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 11f426819d..ace27a7284 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -11,9 +11,9 @@ use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; use tracing::debug; -use crate::config::TlsServerEndPoint; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::metrics::Metrics; +use crate::tls::TlsServerEndPoint; /// Stream wrapper which implements libpq's protocol. /// diff --git a/proxy/src/tls/client_config.rs b/proxy/src/tls/client_config.rs new file mode 100644 index 0000000000..a2d695aae1 --- /dev/null +++ b/proxy/src/tls/client_config.rs @@ -0,0 +1,42 @@ +use std::sync::Arc; + +use anyhow::bail; +use rustls::crypto::ring; + +pub(crate) fn load_certs() -> anyhow::Result> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + bail!("could not parse certificates: {:?}", der_certs.errors); + } + + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs.certs); + Ok(Arc::new(store)) +} + +/// Loads the root certificates and constructs a client config suitable for connecting to the neon compute. +/// This function is blocking. +pub fn compute_client_config_with_root_certs() -> anyhow::Result { + Ok( + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions") + .with_root_certificates(load_certs()?) 
+ .with_no_client_auth(), + ) +} + +#[cfg(test)] +pub fn compute_client_config_with_certs( + certs: impl IntoIterator>, +) -> rustls::ClientConfig { + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(certs); + + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions") + .with_root_certificates(store) + .with_no_client_auth() +} diff --git a/proxy/src/tls/mod.rs b/proxy/src/tls/mod.rs new file mode 100644 index 0000000000..d6ce6bd9fc --- /dev/null +++ b/proxy/src/tls/mod.rs @@ -0,0 +1,72 @@ +pub mod client_config; +pub mod postgres_rustls; +pub mod server_config; + +use anyhow::Context; +use rustls::pki_types::CertificateDer; +use sha2::{Digest, Sha256}; +use tracing::{error, info}; +use x509_parser::oid_registry; + +/// +pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; + +/// Channel binding parameter +/// +/// +/// Description: The hash of the TLS server's certificate as it +/// appears, octet for octet, in the server's Certificate message. Note +/// that the Certificate message contains a certificate_list, in which +/// the first element is the server's certificate. +/// +/// The hash function is to be selected as follows: +/// +/// * if the certificate's signatureAlgorithm uses a single hash +/// function, and that hash function is either MD5 or SHA-1, then use SHA-256; +/// +/// * if the certificate's signatureAlgorithm uses a single hash +/// function and that hash function neither MD5 nor SHA-1, then use +/// the hash function associated with the certificate's +/// signatureAlgorithm; +/// +/// * if the certificate's signatureAlgorithm uses no hash functions or +/// uses multiple hash functions, then this channel binding type's +/// channel bindings are undefined at this time (updates to is channel +/// binding type may occur to address this issue if it ever arises). +#[derive(Debug, Clone, Copy)] +pub enum TlsServerEndPoint { + Sha256([u8; 32]), + Undefined, +} + +impl TlsServerEndPoint { + pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result { + let sha256_oids = [ + // I'm explicitly not adding MD5 or SHA1 here... They're bad. + oid_registry::OID_SIG_ECDSA_WITH_SHA256, + oid_registry::OID_PKCS1_SHA256WITHRSA, + ]; + + let pem = x509_parser::parse_x509_certificate(cert) + .context("Failed to parse PEM object from cerficiate")? 
+ .1; + + info!(subject = %pem.subject, "parsing TLS certificate"); + + let reg = oid_registry::OidRegistry::default().with_all_crypto(); + let oid = pem.signature_algorithm.oid(); + let alg = reg.get(oid); + if sha256_oids.contains(oid) { + let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); + info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); + Ok(Self::Sha256(tls_server_end_point)) + } else { + error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding"); + Ok(Self::Undefined) + } + } + + pub fn supported(&self) -> bool { + !matches!(self, TlsServerEndPoint::Undefined) + } +} diff --git a/proxy/src/postgres_rustls/mod.rs b/proxy/src/tls/postgres_rustls.rs similarity index 96% rename from proxy/src/postgres_rustls/mod.rs rename to proxy/src/tls/postgres_rustls.rs index 5ef20991c3..f09e916a1d 100644 --- a/proxy/src/postgres_rustls/mod.rs +++ b/proxy/src/tls/postgres_rustls.rs @@ -2,8 +2,8 @@ use std::convert::TryFrom; use std::sync::Arc; use postgres_client::tls::MakeTlsConnect; -use rustls::pki_types::ServerName; use rustls::ClientConfig; +use rustls::pki_types::ServerName; use tokio::io::{AsyncRead, AsyncWrite}; mod private { @@ -15,10 +15,10 @@ mod private { use postgres_client::tls::{ChannelBinding, TlsConnect}; use rustls::pki_types::ServerName; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; - use tokio_rustls::client::TlsStream; use tokio_rustls::TlsConnector; + use tokio_rustls::client::TlsStream; - use crate::config::TlsServerEndPoint; + use crate::tls::TlsServerEndPoint; pub struct TlsConnectFuture { inner: tokio_rustls::Connect, @@ -126,16 +126,14 @@ mod private { /// That way you can connect to PostgreSQL using `rustls` as the TLS stack. #[derive(Clone)] pub struct MakeRustlsConnect { - config: Arc, + pub config: Arc, } impl MakeRustlsConnect { /// Creates a new `MakeRustlsConnect` from the provided `ClientConfig`. #[must_use] - pub fn new(config: ClientConfig) -> Self { - Self { - config: Arc::new(config), - } + pub fn new(config: Arc) -> Self { + Self { config } } } diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs new file mode 100644 index 0000000000..903c0b712b --- /dev/null +++ b/proxy/src/tls/server_config.rs @@ -0,0 +1,218 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use anyhow::{Context, bail}; +use itertools::Itertools; +use rustls::crypto::ring::{self, sign}; +use rustls::pki_types::{CertificateDer, PrivateKeyDer}; + +use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint}; + +pub struct TlsConfig { + pub config: Arc, + pub common_names: HashSet, + pub cert_resolver: Arc, +} + +impl TlsConfig { + pub fn to_server_config(&self) -> Arc { + self.config.clone() + } +} + +/// Configure TLS for the main endpoint. +pub fn configure_tls( + key_path: &str, + cert_path: &str, + certs_dir: Option<&String>, + allow_tls_keylogfile: bool, +) -> anyhow::Result { + let mut cert_resolver = CertResolver::new(); + + // add default certificate + cert_resolver.add_cert_path(key_path, cert_path, true)?; + + // add extra certificates + if let Some(certs_dir) = certs_dir { + for entry in std::fs::read_dir(certs_dir)? 
{ + let entry = entry?; + let path = entry.path(); + if path.is_dir() { + // file names aligned with default cert-manager names + let key_path = path.join("tls.key"); + let cert_path = path.join("tls.crt"); + if key_path.exists() && cert_path.exists() { + cert_resolver.add_cert_path( + &key_path.to_string_lossy(), + &cert_path.to_string_lossy(), + false, + )?; + } + } + } + } + + let common_names = cert_resolver.get_common_names(); + + let cert_resolver = Arc::new(cert_resolver); + + // allow TLS 1.2 to be compatible with older client libraries + let mut config = + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("ring should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_cert_resolver(cert_resolver.clone()); + + config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; + + if allow_tls_keylogfile { + // KeyLogFile will check for the SSLKEYLOGFILE environment variable. + config.key_log = Arc::new(rustls::KeyLogFile::new()); + } + + Ok(TlsConfig { + config: Arc::new(config), + common_names, + cert_resolver, + }) +} + +#[derive(Default, Debug)] +pub struct CertResolver { + certs: HashMap, TlsServerEndPoint)>, + default: Option<(Arc, TlsServerEndPoint)>, +} + +impl CertResolver { + pub fn new() -> Self { + Self::default() + } + + fn add_cert_path( + &mut self, + key_path: &str, + cert_path: &str, + is_default: bool, + ) -> anyhow::Result<()> { + let priv_key = { + let key_bytes = std::fs::read(key_path) + .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; + rustls_pemfile::private_key(&mut &key_bytes[..]) + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + }; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") + })? + }; + + self.add_cert(priv_key, cert_chain, is_default) + } + + pub fn add_cert( + &mut self, + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, + is_default: bool, + ) -> anyhow::Result<()> { + let key = sign::any_supported_type(&priv_key).context("invalid private key")?; + + let first_cert = &cert_chain[0]; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + let pem = x509_parser::parse_x509_certificate(first_cert) + .context("Failed to parse PEM object from cerficiate")? + .1; + + let common_name = pem.subject().to_string(); + + // We need to get the canonical name for this certificate so we can match them against any domain names + // seen within the proxy codebase. + // + // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. + // We need to remove the wildcard prefix for the purposes of certificate selection. + // + // auth-broker does not use SNI and instead uses the Neon-Connection-String header. + // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. 
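The `add_cert` code after this comment block strips a known prefix from the certificate's common name before using it for SNI matching. A standalone sketch of that selection order (the helper name is illustrative, not from the patch):

```rust
/// Reduce a common name like "CN=*.foo.example" to the domain suffix used for
/// lookups, trying prefixes in the same order as add_cert below: wildcard
/// certs, the auth-broker's `apiauth.` subdomain, then a plain CN.
fn common_name_suffix(cn: &str) -> Option<&str> {
    cn.strip_prefix("CN=*.")
        .or_else(|| cn.strip_prefix("CN=apiauth."))
        .or_else(|| cn.strip_prefix("CN="))
}

fn main() {
    assert_eq!(common_name_suffix("CN=*.foo.example"), Some("foo.example"));
    assert_eq!(common_name_suffix("CN=apiauth.foo.example"), Some("foo.example"));
    assert_eq!(common_name_suffix("CN=foo.example"), Some("foo.example"));
    assert_eq!(common_name_suffix("O=acme"), None); // the patch bails with an error here
}
```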
+ // + // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string + // validation, so let's we can continue with any common-name + let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=") { + s.to_string() + } else { + bail!("Failed to parse common name from certificate") + }; + + let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); + + if is_default { + self.default = Some((cert.clone(), tls_server_end_point)); + } + + self.certs.insert(common_name, (cert, tls_server_end_point)); + + Ok(()) + } + + pub fn get_common_names(&self) -> HashSet { + self.certs.keys().map(|s| s.to_string()).collect() + } +} + +impl rustls::server::ResolvesServerCert for CertResolver { + fn resolve( + &self, + client_hello: rustls::server::ClientHello<'_>, + ) -> Option> { + self.resolve(client_hello.server_name()).map(|x| x.0) + } +} + +impl CertResolver { + pub fn resolve( + &self, + server_name: Option<&str>, + ) -> Option<(Arc, TlsServerEndPoint)> { + // loop here and cut off more and more subdomains until we find + // a match to get a proper wildcard support. OTOH, we now do not + // use nested domains, so keep this simple for now. + // + // With the current coding foo.com will match *.foo.com and that + // repeats behavior of the old code. + if let Some(mut sni_name) = server_name { + loop { + if let Some(cert) = self.certs.get(sni_name) { + return Some(cert.clone()); + } + if let Some((_, rest)) = sni_name.split_once('.') { + sni_name = rest; + } else { + return None; + } + } + } else { + // No SNI, use the default certificate, otherwise we can't get to + // options parameter which can be used to set endpoint name too. + // That means that non-SNI flow will not work for CNAME domains in + // verify-full mode. + // + // If that will be a problem we can: + // + // a) Instead of multi-cert approach use single cert with extra + // domains listed in Subject Alternative Name (SAN). + // b) Deploy separate proxy instances for extra domains. + self.default.clone() + } + } +} diff --git a/proxy/src/types.rs b/proxy/src/types.rs index 6e0bd61c94..d5952d1d8b 100644 --- a/proxy/src/types.rs +++ b/proxy/src/types.rs @@ -97,6 +97,8 @@ smol_str_wrapper!(EndpointId); smol_str_wrapper!(BranchId); // 90% of project strings are 23 characters or less. smol_str_wrapper!(ProjectId); +// 90% of account strings are 23 characters or less. +smol_str_wrapper!(AccountId); // will usually equal endpoint ID smol_str_wrapper!(EndpointCacheKey); diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 270cd7c24d..d73a84057a 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -50,6 +50,7 @@ impl std::fmt::Display for ApiUrl { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 65e74466f2..004d268fa1 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -2,20 +2,21 @@ //! and push them to a HTTP endpoint. 
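The usage_metrics changes below add a `direction` field and an optional `private_link_id` that is serialized as an empty string when absent. A self-contained sketch of that serde shape, assuming `serde`/`serde_json`; the struct here only mimics the patch's `Ids` and is not the real type:

```rust
use serde::{Serialize, Serializer};

#[derive(Serialize)]
#[serde(rename_all = "lowercase")]
enum TrafficDirection {
    Ingress,
    Egress,
}

#[derive(Serialize)]
struct Ids {
    endpoint_id: String,
    branch_id: String,
    direction: TrafficDirection,
    #[serde(serialize_with = "none_as_empty_string")]
    private_link_id: Option<String>,
}

// None becomes "" on the wire, like the helper module added in the patch.
fn none_as_empty_string<S: Serializer>(v: &Option<String>, s: S) -> Result<S::Ok, S::Error> {
    s.serialize_str(v.as_deref().unwrap_or(""))
}

fn main() {
    let ids = Ids {
        endpoint_id: "e1".into(),
        branch_id: "b1".into(),
        direction: TrafficDirection::Egress,
        private_link_id: None,
    };
    // Prints: {"endpoint_id":"e1","branch_id":"b1","direction":"egress","private_link_id":""}
    println!("{}", serde_json::to_string(&ids).unwrap());
}
```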
use std::borrow::Cow; use std::convert::Infallible; -use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::time::Duration; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; -use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; -use dashmap::mapref::entry::Entry; -use dashmap::DashMap; +use clashmap::ClashMap; +use clashmap::mapref::entry::Entry; +use consumption_metrics::{CHUNK_SIZE, Event, EventChunk, EventType, idempotency_key}; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; +use smol_str::SmolStr; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace, warn}; @@ -43,6 +44,33 @@ const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60); pub(crate) struct Ids { pub(crate) endpoint_id: EndpointIdInt, pub(crate) branch_id: BranchIdInt, + pub(crate) direction: TrafficDirection, + #[serde(with = "none_as_empty_string")] + pub(crate) private_link_id: Option, +} + +mod none_as_empty_string { + use serde::Deserialize; + use smol_str::SmolStr; + + #[allow(clippy::ref_option)] + pub fn serialize(t: &Option, s: S) -> Result { + s.serialize_str(t.as_deref().unwrap_or("")) + } + + pub fn deserialize<'de, D: serde::Deserializer<'de>>( + d: D, + ) -> Result, D::Error> { + let s = SmolStr::deserialize(d)?; + if s.is_empty() { Ok(None) } else { Ok(Some(s)) } + } +} + +#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] +#[serde(rename_all = "lowercase")] +pub(crate) enum TrafficDirection { + Ingress, + Egress, } pub(crate) trait MetricCounterRecorder { @@ -137,7 +165,7 @@ type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub(crate) struct Metrics { - endpoints: DashMap, FastHasher>, + endpoints: ClashMap, FastHasher>, } impl Metrics { @@ -213,7 +241,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result( - endpoints: &DashMap, FastHasher>, + endpoints: &ClashMap, FastHasher>, ) -> Vec<(Ids, u64)> { let mut metrics_to_clear = Vec::new(); @@ -271,7 +299,7 @@ fn create_event_chunks<'a>( #[expect(clippy::too_many_arguments)] #[instrument(skip_all)] async fn collect_metrics_iteration( - endpoints: &DashMap, FastHasher>, + endpoints: &ClashMap, FastHasher>, client: &http::ClientWithMiddleware, metric_collection_endpoint: &reqwest::Url, storage: Option<&GenericRemoteStorage>, @@ -396,17 +424,18 @@ async fn upload_backup_events( TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_UPLOAD_MAX_RETRIES, - "request_data_upload", + "usage_metrics_upload", cancel, ) .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) - .context("request_data_upload")?; + .with_context(|| format!("usage_metrics_upload: path={remote_path}"))?; Ok(()) } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::fs; use std::io::BufReader; @@ -504,6 +533,8 @@ mod tests { let counter = metrics.register(Ids { endpoint_id: (&EndpointId::from("e1")).into(), branch_id: (&BranchId::from("b1")).into(), + direction: TrafficDirection::Egress, + private_link_id: None, }); // the counter should be observed despite 0 egress diff --git a/pyproject.toml b/pyproject.toml index 01d15ee6bb..c6e5073bcd 100644 --- a/pyproject.toml 
+++ b/pyproject.toml @@ -7,25 +7,25 @@ package-mode = false python = "^3.11" pytest = "^7.4.4" psycopg2-binary = "^2.9.10" -typing-extensions = "^4.6.1" +typing-extensions = "^4.12.2" PyJWT = {version = "^2.1.0", extras = ["crypto"]} requests = "^2.32.3" pytest-xdist = "^3.3.1" -asyncpg = "^0.29.0" +asyncpg = "^0.30.0" aiopg = "^1.4.0" -Jinja2 = "^3.1.4" +Jinja2 = "^3.1.5" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.20241019" boto3 = "^1.34.11" -boto3-stubs = {extras = ["s3"], version = "^1.26.16"} +boto3-stubs = {extras = ["s3", "kms"], version = "^1.26.16"} moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" -pytest-timeout = "^2.1.0" +pytest-timeout = "^2.3.1" Werkzeug = "^3.0.6" pytest-order = "^1.1.0" -allure-pytest = "^2.13.2" +allure-pytest = "^2.13.5" pytest-asyncio = "^0.21.0" toml = "^0.10.2" psutil = "^5.9.4" @@ -36,19 +36,20 @@ aiohttp = "3.10.11" pytest-rerunfailures = "^15.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" -zstandard = "^0.21.0" +zstandard = "^0.23.0" httpx = {extras = ["http2"], version = "^0.26.0"} pytest-repeat = "^0.9.3" websockets = "^12.0" clickhouse-connect = "^0.7.16" kafka-python = "^2.0.2" jwcrypto = "^1.5.6" -h2 = "^4.1.0" +h2 = {git = "https://github.com/python-hyper/h2"} types-jwcrypto = "^1.5.0.20240925" pyyaml = "^6.0.2" types-pyyaml = "^6.0.12.20240917" -testcontainers = "^4.8.1" -jsonnet = "^0.20.0" +testcontainers = "^4.9.0" +# Jsonnet doesn't support Python 3.13 yet +jsonnet = { version = "^0.20.0", markers = "python_version < '3.13'" } [tool.poetry.group.dev.dependencies] mypy = "==1.13.0" @@ -93,6 +94,7 @@ target-version = "py311" extend-exclude = [ "vendor/", "target/", + "test_runner/stubs/", # Autogenerated by mypy's stubgen ] line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter diff --git a/pytest.ini b/pytest.ini index 7197b078c6..237066b1f6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -11,7 +11,7 @@ markers = testpaths = test_runner minversion = 6.0 -log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s +log_format = %(asctime)s.%(msecs)03d %(levelname)s [%(filename)s:%(lineno)d] %(message)s log_date_format = %Y-%m-%d %H:%M:%S log_cli = true timeout = 300 diff --git a/rust-toolchain.toml b/rust-toolchain.toml index f0661a32e0..591d60ea79 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.83.0" +channel = "1.85.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 0422c46ab1..bb937ad56a 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "safekeeper" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] @@ -9,6 +9,7 @@ default = [] # Enables test-only APIs, incuding failpoints. 
In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions testing = ["fail/failpoints"] +benchmarking = [] [dependencies] async-stream.workspace = true @@ -25,11 +26,11 @@ hex.workspace = true humantime.workspace = true http.workspace = true hyper0.workspace = true +itertools.workspace = true futures.workspace = true once_cell.workspace = true parking_lot.workspace = true pageserver_api.workspace = true -postgres.workspace = true postgres-protocol.workspace = true pprof.workspace = true rand.workspace = true @@ -38,6 +39,7 @@ scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } serde.workspace = true serde_json.workspace = true +smallvec.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true @@ -55,12 +57,15 @@ postgres_ffi.workspace = true pq_proto.workspace = true remote_storage.workspace = true safekeeper_api.workspace = true +safekeeper_client.workspace = true sha2.workspace = true sd-notify.workspace = true storage_broker.workspace = true tokio-stream.workspace = true +http-utils.workspace = true utils.workspace = true wal_decoder.workspace = true +env_logger.workspace = true workspace_hack.workspace = true @@ -76,3 +81,4 @@ tracing-subscriber = { workspace = true, features = ["json"] } [[bench]] name = "receive_wal" harness = false +required-features = ["benchmarking"] diff --git a/safekeeper/benches/benchutils.rs b/safekeeper/benches/benchutils.rs deleted file mode 100644 index 48d796221b..0000000000 --- a/safekeeper/benches/benchutils.rs +++ /dev/null @@ -1,108 +0,0 @@ -use std::sync::Arc; - -use camino_tempfile::Utf8TempDir; -use safekeeper::rate_limit::RateLimiter; -use safekeeper::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory}; -use safekeeper::state::{TimelinePersistentState, TimelineState}; -use safekeeper::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; -use safekeeper::timelines_set::TimelinesSet; -use safekeeper::wal_backup::remote_timeline_path; -use safekeeper::{control_file, wal_storage, SafeKeeperConf}; -use tokio::fs::create_dir_all; -use utils::id::{NodeId, TenantTimelineId}; -use utils::lsn::Lsn; - -/// A Safekeeper benchmarking environment. Uses a tempdir for storage, removed on drop. -pub struct Env { - /// Whether to enable fsync. - pub fsync: bool, - /// Benchmark directory. Deleted when dropped. - pub tempdir: Utf8TempDir, -} - -impl Env { - /// Creates a new benchmarking environment in a temporary directory. fsync controls whether to - /// enable fsyncing. - pub fn new(fsync: bool) -> anyhow::Result { - let tempdir = camino_tempfile::tempdir()?; - Ok(Self { fsync, tempdir }) - } - - /// Constructs a Safekeeper config for the given node ID. - fn make_conf(&self, node_id: NodeId) -> SafeKeeperConf { - let mut conf = SafeKeeperConf::dummy(); - conf.my_id = node_id; - conf.no_sync = !self.fsync; - conf.workdir = self.tempdir.path().join(format!("safekeeper-{node_id}")); - conf - } - - /// Constructs a Safekeeper with the given node and tenant/timeline ID. - /// - /// TODO: we should support using in-memory storage, to measure non-IO costs. This would be - /// easier if SafeKeeper used trait objects for storage rather than generics. It's also not - /// currently possible to construct a timeline using non-file storage since StateSK only accepts - /// SafeKeeper. 
- pub async fn make_safekeeper( - &self, - node_id: NodeId, - ttid: TenantTimelineId, - ) -> anyhow::Result> { - let conf = self.make_conf(node_id); - - let timeline_dir = get_timeline_dir(&conf, &ttid); - create_dir_all(&timeline_dir).await?; - - let mut pstate = TimelinePersistentState::empty(); - pstate.tenant_id = ttid.tenant_id; - pstate.timeline_id = ttid.timeline_id; - - let wal = wal_storage::PhysicalStorage::new(&ttid, &timeline_dir, &pstate, conf.no_sync)?; - let ctrl = - control_file::FileStorage::create_new(&timeline_dir, pstate, conf.no_sync).await?; - let state = TimelineState::new(ctrl); - let mut safekeeper = SafeKeeper::new(state, wal, conf.my_id)?; - - // Emulate an initial election. - safekeeper - .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected { - term: 1, - start_streaming_at: Lsn(0), - term_history: TermHistory(vec![(1, Lsn(0)).into()]), - timeline_start_lsn: Lsn(0), - })) - .await?; - - Ok(safekeeper) - } - - /// Constructs a timeline, including a new Safekeeper with the given node ID, and spawns its - /// manager task. - pub async fn make_timeline( - &self, - node_id: NodeId, - ttid: TenantTimelineId, - ) -> anyhow::Result> { - let conf = Arc::new(self.make_conf(node_id)); - let timeline_dir = get_timeline_dir(&conf, &ttid); - let remote_path = remote_timeline_path(&ttid)?; - - let safekeeper = self.make_safekeeper(node_id, ttid).await?; - let shared_state = SharedState::new(StateSK::Loaded(safekeeper)); - - let timeline = Timeline::new( - ttid, - &timeline_dir, - &remote_path, - shared_state, - conf.clone(), - ); - timeline.bootstrap( - &mut timeline.write_shared_state().await, - &conf, - Arc::new(TimelinesSet::default()), // ignored for now - RateLimiter::new(0, 0), - ); - Ok(timeline) - } -} diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 313d945b94..122630d953 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -1,14 +1,10 @@ //! WAL ingestion benchmarks. -#[path = "benchutils.rs"] -mod benchutils; - use std::io::Write as _; -use benchutils::Env; use bytes::BytesMut; use camino_tempfile::tempfile; -use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; +use criterion::{BatchSize, Bencher, Criterion, criterion_group, criterion_main}; use itertools::Itertools as _; use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; use pprof::criterion::{Output, PProfProfiler}; @@ -16,6 +12,8 @@ use safekeeper::receive_wal::{self, WalAcceptor}; use safekeeper::safekeeper::{ AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, }; +use safekeeper::test_utils::Env; +use safekeeper_api::membership::SafekeeperGeneration as Generation; use tokio::io::AsyncWriteExt as _; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -24,14 +22,13 @@ const KB: usize = 1024; const MB: usize = 1024 * KB; const GB: usize = 1024 * MB; -/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. -/// This mirrors the configuration in bin/safekeeper.rs. +/// Use jemalloc and enable profiling, to mirror bin/safekeeper.rs. 
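The `malloc_conf` line just below bumps `lg_prof_sample` from 20 to 21. jemalloc reads this option as log2 of the average number of bytes between heap-profile samples, so the change roughly doubles the sampling interval from about 1 MiB to about 2 MiB; that interpretation is background knowledge about jemalloc, not something stated in the patch:

```rust
fn main() {
    // lg_prof_sample is a log2 value: 2^20 = 1 MiB (old), 2^21 = 2 MiB (new).
    for lg in [20u32, 21] {
        println!("lg_prof_sample:{lg} -> ~{} bytes between samples", 1u64 << lg);
    }
}
```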
#[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +#[unsafe(export_name = "malloc_conf")] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; // Register benchmarks with Criterion. criterion_group!( @@ -76,12 +73,15 @@ fn bench_process_msg(c: &mut Criterion) { assert!(size >= prefixlen); let message = vec![0; size - prefixlen]; - let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message)); + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), Lsn(0)); // Set up the Safekeeper. let env = Env::new(fsync)?; - let mut safekeeper = - runtime.block_on(env.make_safekeeper(NodeId(1), TenantTimelineId::generate()))?; + let mut safekeeper = runtime.block_on(env.make_safekeeper( + NodeId(1), + TenantTimelineId::generate(), + Lsn(0), + ))?; b.iter_batched_ref( // Pre-construct WAL records and requests. Criterion will batch them. @@ -89,13 +89,12 @@ fn bench_process_msg(c: &mut Criterion) { let (lsn, record) = walgen.next().expect("endless WAL"); ProposerAcceptorMessage::AppendRequest(AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }) @@ -134,7 +133,8 @@ fn bench_wal_acceptor(c: &mut Criterion) { let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded let env = Env::new(fsync)?; - let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message")); + let walgen = + &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message"), Lsn(0)); // Create buffered channels that can fit all requests, to avoid blocking on channels. let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(n); @@ -145,7 +145,7 @@ fn bench_wal_acceptor(c: &mut Criterion) { // TODO: WalAcceptor doesn't actually need a full timeline, only // Safekeeper::process_msg(). Consider decoupling them to simplify the setup. let tli = env - .make_timeline(NodeId(1), TenantTimelineId::generate()) + .make_timeline(NodeId(1), TenantTimelineId::generate(), Lsn(0)) .await? .wal_residence_guard() .await?; @@ -160,13 +160,12 @@ fn bench_wal_acceptor(c: &mut Criterion) { .take(n) .map(|(lsn, record)| AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }) @@ -239,7 +238,7 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) { assert!(size >= prefixlen); let message = vec![0; size - prefixlen]; - let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message)); + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), Lsn(0)); // Construct and spawn the WalAcceptor task. let env = Env::new(fsync)?; @@ -249,7 +248,7 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) { runtime.block_on(async { let tli = env - .make_timeline(NodeId(1), TenantTimelineId::generate()) + .make_timeline(NodeId(1), TenantTimelineId::generate(), Lsn(0)) .await? 
.wal_residence_guard() .await?; @@ -262,13 +261,12 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) { runtime.block_on(async { let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest { h: AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(0), begin_lsn: lsn, end_lsn: lsn + record.len() as u64, commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }, wal_data: record, }); diff --git a/safekeeper/client/Cargo.toml b/safekeeper/client/Cargo.toml new file mode 100644 index 0000000000..0b660aaf32 --- /dev/null +++ b/safekeeper/client/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "safekeeper_client" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +http-utils.workspace = true +safekeeper_api.workspace = true +thiserror.workspace = true +reqwest = { workspace = true, features = [ "stream" ] } +serde.workspace = true +utils.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/safekeeper/client/src/lib.rs b/safekeeper/client/src/lib.rs new file mode 100644 index 0000000000..3963fd466c --- /dev/null +++ b/safekeeper/client/src/lib.rs @@ -0,0 +1 @@ +pub mod mgmt_api; diff --git a/safekeeper/src/http/client.rs b/safekeeper/client/src/mgmt_api.rs similarity index 70% rename from safekeeper/src/http/client.rs rename to safekeeper/client/src/mgmt_api.rs index a166fc1ab9..5c305769dd 100644 --- a/safekeeper/src/http/client.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -2,21 +2,19 @@ //! //! Partially copied from pageserver client; some parts might be better to be //! united. -//! -//! It would be also good to move it out to separate crate, but this needs -//! duplication of internal-but-reported structs like WalSenderState, ServerInfo -//! etc. +use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; +use safekeeper_api::models::{ + PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + TimelineStatus, +}; use std::error::Error as _; use utils::{ - http::error::HttpErrorBody, id::{NodeId, TenantId, TimelineId}, logging::SecretString, }; -use super::routes::TimelineStatus; - #[derive(Debug, Clone)] pub struct Client { mgmt_api_endpoint: String, @@ -37,6 +35,9 @@ pub enum Error { /// Status is not ok; parsed error in body as `HttpErrorBody`. 
#[error("safekeeper API: {1}")] ApiError(StatusCode, String), + + #[error("Cancelled")] + Cancelled, } pub type Result = std::result::Result; @@ -81,6 +82,34 @@ impl Client { } } + pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}", + self.mgmt_api_endpoint, req.tenant_id, req.timeline_id + ); + let resp = self.post(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + + pub async fn pull_timeline(&self, req: &PullTimelineRequest) -> Result { + let uri = format!("{}/v1/pull_timeline", self.mgmt_api_endpoint); + let resp = self.post(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + + pub async fn delete_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.request(Method::DELETE, &uri, ()).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn timeline_status( &self, tenant_id: TenantId, @@ -107,6 +136,20 @@ impl Client { self.get(&uri).await } + pub async fn utilization(&self) -> Result { + let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); + let resp = self.get(&uri).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + + async fn post( + &self, + uri: U, + body: B, + ) -> Result { + self.request(Method::POST, uri, body).await + } + async fn get(&self, uri: U) -> Result { self.request(Method::GET, uri, ()).await } diff --git a/safekeeper/spec/MCProposerAcceptorReconfig.tla b/safekeeper/spec/MCProposerAcceptorReconfig.tla new file mode 100644 index 0000000000..a4b25e383a --- /dev/null +++ b/safekeeper/spec/MCProposerAcceptorReconfig.tla @@ -0,0 +1,41 @@ +---- MODULE MCProposerAcceptorReconfig ---- +EXTENDS TLC, ProposerAcceptorReconfig + +\* Augments the spec with model checking constraints. + +\* It slightly duplicates MCProposerAcceptorStatic, but we can't EXTENDS it +\* because it EXTENDS ProposerAcceptorStatic in turn. The duplication isn't big +\* anyway. + +\* For model checking. +CONSTANTS + max_entries, \* model constraint: max log entries acceptor/proposer can hold + max_term, \* model constraint: max allowed term + max_generation \* mode constraint: max config generation + +ASSUME max_entries \in Nat /\ max_term \in Nat /\ max_generation \in Nat + +\* Model space constraint. +StateConstraint == /\ \A p \in proposers: + /\ prop_state[p].term <= max_term + /\ Len(prop_state[p].wal) <= max_entries + /\ conf_store.generation <= max_generation + +\* Sets of proposers and acceptors and symmetric because we don't take any +\* actions depending on some concrete proposer/acceptor (like IF p = p1 THEN +\* ...) +ProposerAcceptorSymmetry == Permutations(proposers) \union Permutations(acceptors) + +\* enforce order of the vars in the error trace with ALIAS +\* Note that ALIAS is supported only since version 1.8.0 which is pre-release +\* as of writing this. 
+Alias == [ + prop_state |-> prop_state, + prop_conf |-> prop_conf, + acc_state |-> acc_state, + acc_conf |-> acc_conf, + committed |-> committed, + conf_store |-> conf_store + ] + +==== diff --git a/safekeeper/spec/MCProposerAcceptorStatic.tla b/safekeeper/spec/MCProposerAcceptorStatic.tla index be3d99c697..b4eca1965a 100644 --- a/safekeeper/spec/MCProposerAcceptorStatic.tla +++ b/safekeeper/spec/MCProposerAcceptorStatic.tla @@ -3,6 +3,9 @@ EXTENDS TLC, ProposerAcceptorStatic \* Augments the spec with model checking constraints. +\* Note that MCProposerAcceptorReconfig duplicates it and might need to +\* be updated as well. + \* For model checking. CONSTANTS max_entries, \* model constraint: max log entries acceptor/proposer can hold diff --git a/safekeeper/spec/ProposerAcceptorReconfig.tla b/safekeeper/spec/ProposerAcceptorReconfig.tla new file mode 100644 index 0000000000..78de231a39 --- /dev/null +++ b/safekeeper/spec/ProposerAcceptorReconfig.tla @@ -0,0 +1,350 @@ +---- MODULE ProposerAcceptorReconfig ---- + +(* + Spec for https://github.com/neondatabase/neon/blob/538e2312a617c65d489d391892c70b2e4d7407b5/docs/rfcs/035-safekeeper-dynamic-membership-change.md + + Simplifications: + - The ones inherited from ProposerAcceptorStatic. + - We don't model transient state of the configuration change driver process + (storage controller in the implementation). Its actions StartChange and FinishChange + are taken based on the persistent state of safekeepers and conf store. The + justification for that is the following: once new configuration n is + created (e.g with StartChange or FinishChange), any old configuration + change driver working on older conf < n will never be able to commit + it to the conf store because it is protected by CAS. The + propagation of these older confs is still possible though, and + spec allows to do it through acceptors. + Plus the model is already pretty huge. + - Previous point also means that the FinishChange action is + based only on the current state of safekeepers, not from + the past. That's ok because while individual + acceptor may go down, + quorum one never does. So the FinishChange + condition which collects max of the quorum may get + only more strict over time. + + The invariants expectedly break if any of FinishChange + required conditions are removed. +*) + +EXTENDS Integers, Sequences, FiniteSets, TLC + +VARIABLES + \* state which is the same in the static spec + prop_state, + acc_state, + committed, + elected_history, + \* reconfiguration only state + prop_conf, \* prop_conf[p] is current configuration of proposer p + acc_conf, \* acc_conf[a] is current configuration of acceptor a + conf_store \* configuration in the configuration store. + +CONSTANT + acceptors, + proposers + +CONSTANT NULL + +\* Import ProposerAcceptorStatic under PAS. +\* +\* Note that all vars and consts are named the same and thus substituted +\* implicitly. +PAS == INSTANCE ProposerAcceptorStatic + +\******************************************************************************** +\* Helpers +\******************************************************************************** + +\******************************************************************************** +\* Type assertion +\******************************************************************************** + +\* Is c a valid config? +IsConfig(c) == + /\ DOMAIN c = {"generation", "members", "newMembers"} + \* Unique id of the configuration. 
+ /\ c.generation \in Nat + /\ c.members \in SUBSET acceptors + \* newMembers is NULL when it is not a joint conf. + /\ \/ c.newMembers = NULL + \/ c.newMembers \in SUBSET acceptors + +TypeOk == + /\ PAS!TypeOk + /\ \A p \in proposers: IsConfig(prop_conf[p]) + /\ \A a \in acceptors: IsConfig(acc_conf[a]) + /\ IsConfig(conf_store) + +\******************************************************************************** +\* Initial +\******************************************************************************** + +Init == + /\ PAS!Init + /\ \E init_members \in SUBSET acceptors: + LET init_conf == [generation |-> 1, members |-> init_members, newMembers |-> NULL] IN + \* refer to RestartProposer why it is not NULL + /\ prop_conf = [p \in proposers |-> init_conf] + /\ acc_conf = [a \in acceptors |-> init_conf] + /\ conf_store = init_conf + \* We could start with anything, but to reduce state space state with + \* the most reasonable total acceptors - 1 conf size, which e.g. + \* makes basic {a1} -> {a2} change in {a1, a2} acceptors and {a1, a2, + \* a3} -> {a2, a3, a4} in {a1, a2, a3, a4} acceptors models even in + \* the smallest models with single change. + /\ Cardinality(init_members) = Cardinality(acceptors) - 1 + +\******************************************************************************** +\* Actions +\******************************************************************************** + +\* Proposer p loses all state, restarting. In the static spec we bump restarted +\* proposer term to max of some quorum + 1 which is a minimal term which can win +\* election. With reconfigurations it's harder to calculate such a term, so keep +\* it simple and take random acceptor one + 1. +\* +\* Also make proposer to adopt configuration of another random acceptor. In the +\* impl proposer starts with NULL configuration until handshake with first +\* acceptor. Removing this NULL special case makes the spec a bit simpler. +RestartProposer(p) == + /\ \E a \in acceptors: PAS!RestartProposerWithTerm(p, acc_state[a].term + 1) + /\ \E a \in acceptors: prop_conf' = [prop_conf EXCEPT ![p] = acc_conf[a]] + /\ UNCHANGED <> + +\* Acceptor a immediately votes for proposer p. +Vote(p, a) == + \* Configuration must be the same. + /\ prop_conf[p].generation = acc_conf[a].generation + \* And a is expected be a member of it. This is likely redundant as long as + \* becoming leader checks membership (though vote also contributes to max + \* calculation). + /\ \/ a \in prop_conf[p].members + \/ (prop_conf[p].newMembers /= NULL) /\ (a \in prop_conf[p].newMembers) + /\ PAS!Vote(p, a) + /\ UNCHANGED <> + +\* Proposer p gets elected. +BecomeLeader(p) == + /\ prop_state[p].state = "campaign" + \* Votes must form quorum in both sets (if the newMembers exists). + /\ PAS!FormsQuorum(DOMAIN prop_state[p].votes, prop_conf[p].members) + /\ \/ prop_conf[p].newMembers = NULL + \* TLA+ disjunction evaluation doesn't short-circuit for a good reason: + \* https://groups.google.com/g/tlaplus/c/U6tOJ4dsjVM/m/UdOznPCVBwAJ + \* so repeat the null check. + \/ (prop_conf[p].newMembers /= NULL) /\ (PAS!FormsQuorum(DOMAIN prop_state[p].votes, prop_conf[p].newMembers)) + \* DoBecomeLeader will copy WAL of the highest voter to proposer's WAL, so + \* ensure its conf is still the same. In the impl WAL fetching also has to + \* check the configuration. 
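The election rule above is the heart of the joint-consensus scheme: a candidate's votes must form a majority of the current member set and, while a joint configuration (newMembers /= NULL) is in effect, a majority of the new member set as well. A small Rust sketch of that dual-majority check, illustrative only and not taken from the implementation:

    // Illustrative only: the dual-majority rule used by BecomeLeader (and, for
    // acknowledgements, by CommitEntries) in the reconfiguration spec.
    use std::collections::HashSet;

    type NodeId = u64;

    /// Majority of `members`; voters outside `members` are ignored, as in FormsQuorum.
    fn forms_quorum(votes: &HashSet<NodeId>, members: &HashSet<NodeId>) -> bool {
        votes.intersection(members).count() >= members.len() / 2 + 1
    }

    /// Wins only with a majority of the old set and, if a new set exists, of the new set too.
    fn wins_election(
        votes: &HashSet<NodeId>,
        members: &HashSet<NodeId>,
        new_members: Option<&HashSet<NodeId>>,
    ) -> bool {
        forms_quorum(votes, members) && new_members.map_or(true, |nm| forms_quorum(votes, nm))
    }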
+ /\ prop_conf[p].generation = acc_conf[PAS!MaxVoteAcc(p)].generation + /\ \A a \in DOMAIN prop_state[p].votes: prop_conf[p].generation = acc_conf[a].generation + /\ PAS!DoBecomeLeader(p) + /\ UNCHANGED <> + +UpdateTerm(p, a) == + /\ PAS!UpdateTerm(p, a) + /\ UNCHANGED <> + +TruncateWal(p, a) == + /\ prop_state[p].state = "leader" + \* Configuration must be the same. + /\ prop_conf[p].generation = acc_conf[a].generation + /\ PAS!TruncateWal(p, a) + /\ UNCHANGED <> + +NewEntry(p) == + /\ PAS!NewEntry(p) + /\ UNCHANGED <> + +AppendEntry(p, a) == + /\ prop_state[p].state = "leader" + \* Configuration must be the same. + /\ prop_conf[p].generation = acc_conf[a].generation + \* And a is member of it. Ignoring this likely wouldn't hurt, but not useful + \* either. + /\ \/ a \in prop_conf[p].members + \/ (prop_conf[p].newMembers /= NULL) /\ (a \in prop_conf[p].newMembers) + /\ PAS!AppendEntry(p, a) + /\ UNCHANGED <> + +\* see PAS!CommitEntries for comments. +CommitEntries(p) == + /\ prop_state[p].state = "leader" + /\ \E q1 \in PAS!AllMinQuorums(prop_conf[p].members): + LET q1_commit_lsn == PAS!QuorumCommitLsn(p, q1) IN + \* Configuration must be the same. + /\ \A a \in q1: prop_conf[p].generation = acc_conf[a].generation + /\ q1_commit_lsn /= NULL + \* We must collect acks from both quorums, if newMembers is present. + /\ IF prop_conf[p].newMembers = NULL THEN + PAS!DoCommitEntries(p, q1_commit_lsn) + ELSE + \E q2 \in PAS!AllMinQuorums(prop_conf[p].newMembers): + LET q2_commit_lsn == PAS!QuorumCommitLsn(p, q2) IN + \* Configuration must be the same. + /\ \A a \in q1: prop_conf[p].generation = acc_conf[a].generation + /\ q2_commit_lsn /= NULL + /\ PAS!DoCommitEntries(p, PAS!Min(q1_commit_lsn, q2_commit_lsn)) + /\ UNCHANGED <> + +\* Proposer p adopts higher conf c from conf store or from some acceptor. +ProposerSwitchConf(p) == + /\ \E c \in ({conf_store} \union {acc_conf[a]: a \in acceptors}): + \* p's conf is lower than c. + /\ (c.generation > prop_conf[p].generation) + \* We allow to bump conf without restart only when wp is already elected. + \* If it isn't, the votes it has already collected are from the previous + \* configuration and can't be used. + \* + \* So if proposer is in 'campaign' in the impl we would restart preserving + \* conf and increasing term. In the spec this transition is already covered + \* by more a generic RestartProposer, so we don't specify it here. + /\ prop_state[p].state = "leader" + /\ prop_conf' = [prop_conf EXCEPT ![p] = c] + /\ UNCHANGED <> + +\* Do CAS on the conf store, starting change into the new_members conf. +StartChange(new_members) == + \* Possible only if we don't already have the change in progress. + /\ conf_store.newMembers = NULL + \* Not necessary, but reduces space a bit. + /\ new_members /= conf_store.members + /\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.members, newMembers |-> new_members] + /\ UNCHANGED <> + +\* Acceptor's last_log_term. +AccLastLogTerm(acc) == + PAS!LastLogTerm(PAS!AcceptorTermHistory(acc)) + +\* Do CAS on the conf store, transferring joint conf into the newMembers only. +FinishChange == + \* have joint conf + /\ conf_store.newMembers /= NULL + \* The conditions for finishing the change are: + /\ \E qo \in PAS!AllMinQuorums(conf_store.members): + \* 1) Old majority must be aware of the joint conf. + \* Note: generally the driver can't know current acceptor + \* generation, it can only know that it once had been the + \* expected one, but it might have advanced since then. 
+ \* But as explained at the top of the file if acceptor gen + \* advanced, FinishChange will never be able to complete + \* due to CAS anyway. We use strict equality here because + \* that's what makes sense conceptually (old driver should + \* abandon its attempt if it observes that conf has advanced). + /\ \A a \in qo: conf_store.generation = acc_conf[a].generation + \* 2) New member set must have log synced, i.e. some its majority needs + \* to have at least as high as max of some + \* old majority. + \* 3) Term must be synced, i.e. some majority of the new set must + \* have term >= than max term of some old majority. + \* This ensures that two leaders are never elected with the same + \* term even after config change (which would be bad unless we treat + \* generation as a part of term which we don't). + \* 4) A majority of the new set must be aware of the joint conf. + \* This allows to safely destoy acceptor state if it is not a + \* member of its current conf (which is useful for cleanup after + \* migration as well as for aborts). + /\ LET sync_pos == PAS!MaxTermLsn({[term |-> AccLastLogTerm(a), lsn |-> PAS!FlushLsn(a)]: a \in qo}) + sync_term == PAS!Maximum({acc_state[a].term: a \in qo}) + IN + \E qn \in PAS!AllMinQuorums(conf_store.newMembers): + \A a \in qn: + /\ PAS!TermLsnGE([term |-> AccLastLogTerm(a), lsn |-> PAS!FlushLsn(a)], sync_pos) + /\ acc_state[a].term >= sync_term + \* The same note as above about strict equality applies here. + /\ conf_store.generation = acc_conf[a].generation + /\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.newMembers, newMembers |-> NULL] + /\ UNCHANGED <> + +\* Do CAS on the conf store, aborting the change in progress. +AbortChange == + \* have joint conf + /\ conf_store.newMembers /= NULL + /\ conf_store' = [generation |-> conf_store.generation + 1, members |-> conf_store.members, newMembers |-> NULL] + /\ UNCHANGED <> + +\* Acceptor a switches to higher configuration from the conf store +\* or from some proposer. +AccSwitchConf(a) == + /\ \E c \in ({conf_store} \union {prop_conf[p]: p \in proposers}): + /\ acc_conf[a].generation < c.generation + /\ acc_conf' = [acc_conf EXCEPT ![a] = c] + /\ UNCHANGED <> + +\* Nuke all acceptor state if it is not a member of its current conf. Models +\* cleanup after migration/abort. +AccReset(a) == + /\ \/ (acc_conf[a].newMembers = NULL) /\ (a \notin acc_conf[a].members) + \/ (acc_conf[a].newMembers /= NULL) /\ (a \notin (acc_conf[a].members \union acc_conf[a].newMembers)) + /\ acc_state' = [acc_state EXCEPT ![a] = PAS!InitAcc] + \* Set nextSendLsn to `a` to NULL everywhere. nextSendLsn serves as a mark + \* that elected proposer performed TruncateWal on the acceptor, which isn't + \* true anymore after state reset. In the impl local deletion is expected to + \* terminate all existing connections. 
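StartChange, FinishChange and AbortChange above all follow the same pattern on the configuration store: validate against the currently stored generation and install a successor configuration with generation + 1 via compare-and-swap, which is what stops a stale migration driver from overwriting a newer configuration. A minimal Rust sketch of that guarded CAS; the type and method names here are chosen for illustration and are not from the implementation:

    // Illustrative only: a generation-guarded CAS on a configuration store,
    // mirroring the StartChange/FinishChange/AbortChange transitions above.
    #[derive(Clone, Debug, PartialEq)]
    struct Configuration {
        generation: u64,
        members: Vec<u64>,
        new_members: Option<Vec<u64>>, // Some(_) while a joint configuration is in effect
    }

    struct ConfStore {
        current: Configuration,
    }

    impl ConfStore {
        /// Install `next` only if the caller still holds the latest generation.
        /// On failure, return the winning configuration so the caller can abandon its attempt.
        fn cas(&mut self, expected_generation: u64, next: Configuration) -> Result<(), Configuration> {
            if self.current.generation != expected_generation {
                return Err(self.current.clone());
            }
            debug_assert_eq!(next.generation, expected_generation + 1);
            self.current = next;
            Ok(())
        }
    }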
+ /\ prop_state' = [p \in proposers |-> [prop_state[p] EXCEPT !.nextSendLsn[a] = NULL]] + /\ UNCHANGED <> + +\******************************************************************************* +\* Final spec +\******************************************************************************* + +Next == + \/ \E p \in proposers: RestartProposer(p) + \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) + \/ \E p \in proposers: BecomeLeader(p) + \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) + \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a) + \/ \E p \in proposers: NewEntry(p) + \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a) + \/ \E p \in proposers: CommitEntries(p) + \/ \E new_members \in SUBSET acceptors: StartChange(new_members) + \/ FinishChange + \/ AbortChange + \/ \E p \in proposers: ProposerSwitchConf(p) + \/ \E a \in acceptors: AccSwitchConf(a) + \/ \E a \in acceptors: AccReset(a) + +Spec == Init /\ [][Next]_<> + +\******************************************************************************** +\* Invariants +\******************************************************************************** + +AllConfs == + {conf_store} \union {prop_conf[p]: p \in proposers} \union {acc_conf[a]: a \in acceptors} + +\* Fairly trivial (given the conf store) invariant that different configurations +\* with the same generation are never issued. +ConfigSafety == + \A c1, c2 \in AllConfs: + (c1.generation = c2.generation) => (c1 = c2) + +ElectionSafety == PAS!ElectionSafety + +ElectionSafetyFull == PAS!ElectionSafetyFull + +LogIsMonotonic == PAS!LogIsMonotonic + +LogSafety == PAS!LogSafety + +\******************************************************************************** +\* Invariants which don't need to hold, but useful for playing/debugging. +\******************************************************************************** + +\* Check that we ever switch into non joint conf. +MaxAccConf == ~ \E a \in acceptors: + /\ acc_conf[a].generation = 3 + /\ acc_conf[a].newMembers /= NULL + +CommittedNotTruncated == PAS!CommittedNotTruncated + +MaxTerm == PAS!MaxTerm + +MaxStoreConf == conf_store.generation <= 1 + +MaxAccWalLen == PAS!MaxAccWalLen + +MaxCommitLsn == PAS!MaxCommitLsn + +==== diff --git a/safekeeper/spec/ProposerAcceptorStatic.tla b/safekeeper/spec/ProposerAcceptorStatic.tla index b2d2f005db..fab085bc2e 100644 --- a/safekeeper/spec/ProposerAcceptorStatic.tla +++ b/safekeeper/spec/ProposerAcceptorStatic.tla @@ -18,7 +18,7 @@ \* - old WAL is immediately copied to proposer on its election, without on-demand fetch later. \* Some ideas how to break it to play around to get a feeling: -\* - replace Quorums with BadQuorums. +\* - replace Quorum with BadQuorum. \* - remove 'don't commit entries from previous terms separately' rule in \* CommitEntries and observe figure 8 from the raft paper. \* With p2a3t4l4 32 steps error was found in 1h on 80 cores. @@ -69,16 +69,26 @@ Upsert(f, k, v, l(_)) == \***************** -NumAccs == Cardinality(acceptors) +\* Does set of acceptors `acc_set` form the quorum in the member set `members`? +\* Acceptors not from `members` are excluded (matters only for reconfig). +FormsQuorum(acc_set, members) == + Cardinality(acc_set \intersect members) >= (Cardinality(members) \div 2 + 1) -\* does acc_set form the quorum? -Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1) -\* all quorums of acceptors -Quorums == {subset \in SUBSET acceptors: Quorum(subset)} +\* Like FormsQuorum, but for minimal quorum. 
+FormsMinQuorum(acc_set, members) == + Cardinality(acc_set \intersect members) = (Cardinality(members) \div 2 + 1) -\* For substituting Quorums and seeing what happens. -BadQuorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2) -BadQuorums == {subset \in SUBSET acceptors: BadQuorum(subset)} +\* All sets of acceptors forming minimal quorums in the member set `members`. +AllQuorums(members) == {subset \in SUBSET members: FormsQuorum(subset, members)} +AllMinQuorums(members) == {subset \in SUBSET acceptors: FormsMinQuorum(subset, members)} + +\* For substituting Quorum and seeing what happens. +FormsBadQuorum(acc_set, members) == + Cardinality(acc_set \intersect members) >= (Cardinality(members) \div 2) +FormsMinBadQuorum(acc_set, members) == + Cardinality(acc_set \intersect members) = (Cardinality(members) \div 2) +AllBadQuorums(members) == {subset \in SUBSET acceptors: FormsBadQuorum(subset, members)} +AllMinBadQuorums(members) == {subset \in SUBSET acceptors: FormsMinBadQuorum(subset, members)} \* flushLsn (end of WAL, i.e. index of next entry) of acceptor a. FlushLsn(a) == Len(acc_state[a].wal) + 1 @@ -135,10 +145,11 @@ TypeOk == /\ IsWal(prop_state[p].wal) \* Map of acceptor -> next lsn to send. It is set when truncate_wal is \* done so sending entries is allowed only after that. In the impl TCP - \* ensures this ordering. + \* ensures this ordering. We use NULL instead of missing value to use + \* EXCEPT in AccReset. /\ \A a \in DOMAIN prop_state[p].nextSendLsn: /\ a \in acceptors - /\ prop_state[p].nextSendLsn[a] \in Lsns + /\ prop_state[p].nextSendLsn[a] \in Lsns \union {NULL} /\ \A a \in acceptors: /\ DOMAIN acc_state[a] = {"term", "termHistory", "wal"} /\ acc_state[a].term \in Terms @@ -167,6 +178,19 @@ TypeOk == \* Initial \******************************************************************************** +InitAcc == + [ + \* There will be no leader in zero term, 1 is the first + \* real. + term |-> 0, + \* Again, leader in term 0 doesn't exist, but we initialize + \* term histories with it to always have common point in + \* them. Lsn is 1 because TLA+ sequences are indexed from 1 + \* (we don't want to truncate WAL out of range). + termHistory |-> << [term |-> 0, lsn |-> 1] >>, + wal |-> << >> + ] + Init == /\ prop_state = [p \in proposers |-> [ state |-> "campaign", @@ -174,19 +198,9 @@ Init == votes |-> EmptyF, termHistory |-> << >>, wal |-> << >>, - nextSendLsn |-> EmptyF + nextSendLsn |-> [a \in acceptors |-> NULL] ]] - /\ acc_state = [a \in acceptors |-> [ - \* There will be no leader in zero term, 1 is the first - \* real. - term |-> 0, - \* Again, leader in term 0 doesn't exist, but we initialize - \* term histories with it to always have common point in - \* them. Lsn is 1 because TLA+ sequences are indexed from 1 - \* (we don't want to truncate WAL out of range). - termHistory |-> << [term |-> 0, lsn |-> 1] >>, - wal |-> << >> - ]] + /\ acc_state = [a \in acceptors |-> InitAcc] /\ committed = {} /\ elected_history = EmptyF @@ -195,23 +209,35 @@ Init == \* Actions \******************************************************************************** -\* Proposer loses all state. +RestartProposerWithTerm(p, new_term) == + /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign", + ![p].term = new_term, + ![p].votes = EmptyF, + ![p].termHistory = << >>, + ![p].wal = << >>, + ![p].nextSendLsn = [a \in acceptors |-> NULL]] + /\ UNCHANGED <> + +\* Proposer p loses all state, restarting. 
\* For simplicity (and to reduct state space), we assume it immediately gets \* current state from quorum q of acceptors determining the term he will request \* to vote for. -RestartProposer(p, q) == - /\ Quorum(q) - /\ LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN - /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign", - ![p].term = new_term, - ![p].votes = EmptyF, - ![p].termHistory = << >>, - ![p].wal = << >>, - ![p].nextSendLsn = EmptyF] - /\ UNCHANGED <> +RestartProposer(p) == + \E q \in AllQuorums(acceptors): + LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN + RestartProposerWithTerm(p, new_term) \* Term history of acceptor a's WAL: the one saved truncated to contain only <= -\* local FlushLsn entries. +\* local FlushLsn entries. Note that FlushLsn is the end LSN of the last entry +\* (and begin LSN of the next). The mental model for non strict comparison is +\* that once proposer is elected it immediately writes log record with zero +\* length. This allows leader to commit existing log without writing any new +\* entries. For example, assume acceptor has WAL +\* 1.1, 1.2 +\* written by prop with term 1; its current +\* is <1, 3>. Now prop with term 2 and max vote from this acc is elected. +\* Once TruncateWAL is done, becomes <2, 3> +\* without any new records explicitly written. AcceptorTermHistory(a) == SelectSeq(acc_state[a].termHistory, LAMBDA th_entry: th_entry.lsn <= FlushLsn(a)) @@ -230,35 +256,52 @@ Vote(p, a) == \* Get lastLogTerm from term history th. LastLogTerm(th) == th[Len(th)].term +\* Compares pairs: returns true if tl1 >= tl2. +TermLsnGE(tl1, tl2) == + /\ tl1.term >= tl2.term + /\ (tl1.term = tl2.term => tl1.lsn >= tl2.lsn) + +\* Choose max pair in the non empty set of them. +MaxTermLsn(term_lsn_set) == + CHOOSE max_tl \in term_lsn_set: \A tl \in term_lsn_set: TermLsnGE(max_tl, tl) + +\* Find acceptor with the highest vote in proposer p's votes. +MaxVoteAcc(p) == + CHOOSE a \in DOMAIN prop_state[p].votes: + LET a_vote == prop_state[p].votes[a] + a_vote_term_lsn == [term |-> LastLogTerm(a_vote.termHistory), lsn |-> a_vote.flushLsn] + vote_term_lsns == {[term |-> LastLogTerm(v.termHistory), lsn |-> v.flushLsn]: v \in Range(prop_state[p].votes)} + IN + a_vote_term_lsn = MaxTermLsn(vote_term_lsns) + +\* Workhorse for BecomeLeader. +\* Assumes the check prop_state[p] votes is quorum has been done *outside*. +DoBecomeLeader(p) == + LET + \* Find acceptor with the highest vote. + max_vote_acc == MaxVoteAcc(p) + max_vote == prop_state[p].votes[max_vote_acc] + prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn]) + IN + \* We copy all log preceding proposer's term from the max vote node so + \* make sure it is still on one term with us. This is a model + \* simplification which can be removed, in impl we fetch WAL on demand + \* from safekeeper which has it later. Note though that in case of on + \* demand fetch we must check on donor not only term match, but that + \* truncate_wal had already been done (if it is not max_vote_acc). + /\ acc_state[max_vote_acc].term = prop_state[p].term + /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", + ![p].termHistory = prop_th, + ![p].wal = acc_state[max_vote_acc].wal + ] + /\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1) + /\ UNCHANGED <> + \* Proposer p gets elected. BecomeLeader(p) == /\ prop_state[p].state = "campaign" - /\ Quorum(DOMAIN prop_state[p].votes) - /\ LET - \* Find acceptor with the highest vote. 
- max_vote_acc == - CHOOSE a \in DOMAIN prop_state[p].votes: - LET v == prop_state[p].votes[a] - IN \A v2 \in Range(prop_state[p].votes): - /\ LastLogTerm(v.termHistory) >= LastLogTerm(v2.termHistory) - /\ (LastLogTerm(v.termHistory) = LastLogTerm(v2.termHistory) => v.flushLsn >= v2.flushLsn) - max_vote == prop_state[p].votes[max_vote_acc] - prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn]) - IN - \* We copy all log preceding proposer's term from the max vote node so - \* make sure it is still on one term with us. This is a model - \* simplification which can be removed, in impl we fetch WAL on demand - \* from safekeeper which has it later. Note though that in case of on - \* demand fetch we must check on donor not only term match, but that - \* truncate_wal had already been done (if it is not max_vote_acc). - /\ acc_state[max_vote_acc].term = prop_state[p].term - /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", - ![p].termHistory = prop_th, - ![p].wal = acc_state[max_vote_acc].wal - ] - /\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1) - /\ UNCHANGED <> - + /\ FormsQuorum(DOMAIN prop_state[p].votes, acceptors) + /\ DoBecomeLeader(p) \* Acceptor a learns about elected proposer p's term. In impl it matches to \* VoteRequest/VoteResponse exchange when leader is already elected and is not @@ -287,10 +330,11 @@ FindHighestCommonPoint(prop_th, acc_th, acc_flush_lsn) == IN [term |-> last_common_term, lsn |-> Min(acc_common_term_end, prop_common_term_end)] -\* Elected proposer p immediately truncates WAL (and term history) of acceptor a -\* before starting streaming. Establishes nextSendLsn for a. +\* Elected proposer p immediately truncates WAL (and sets term history) of +\* acceptor a before starting streaming. Establishes nextSendLsn for a. \* -\* In impl this happens at each reconnection, here we also allow to do it multiple times. +\* In impl this happens at each reconnection, here we also allow to do it +\* multiple times. TruncateWal(p, a) == /\ prop_state[p].state = "leader" /\ acc_state[a].term = prop_state[p].term @@ -321,8 +365,8 @@ NewEntry(p) == AppendEntry(p, a) == /\ prop_state[p].state = "leader" /\ acc_state[a].term = prop_state[p].term - /\ a \in DOMAIN prop_state[p].nextSendLsn \* did TruncateWal - /\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have smth to send + /\ prop_state[p].nextSendLsn[a] /= NULL \* did TruncateWal + /\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have smth to send /\ LET send_lsn == prop_state[p].nextSendLsn[a] entry == prop_state[p].wal[send_lsn] @@ -337,41 +381,65 @@ AppendEntry(p, a) == PropStartLsn(p) == IF prop_state[p].state = "leader" THEN prop_state[p].termHistory[Len(prop_state[p].termHistory)].lsn ELSE NULL -\* Proposer p commits all entries it can using quorum q. Note that unlike -\* will62794/logless-reconfig this allows to commit entries from previous terms -\* (when conditions for that are met). -CommitEntries(p, q) == - /\ prop_state[p].state = "leader" - /\ \A a \in q: +\* LSN which can be committed by proposer p using min quorum q (check that q +\* forms quorum must have been done outside). NULL if there is none. +QuorumCommitLsn(p, q) == + IF + /\ prop_state[p].state = "leader" + /\ \A a \in q: + \* Without explicit responses to appends this ensures that append + \* up to FlushLsn has been accepted. 
/\ acc_state[a].term = prop_state[p].term \* nextSendLsn existence means TruncateWal has happened, it ensures \* acceptor's WAL (and FlushLsn) are from proper proposer's history. \* Alternatively we could compare LastLogTerm here, but that's closer to \* what we do in the impl (we check flushLsn in AppendResponse, but \* AppendRequest is processed only if HandleElected handling was good). - /\ a \in DOMAIN prop_state[p].nextSendLsn - \* Now find the LSN present on all the quorum. - /\ LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN - \* This is the basic Raft rule of not committing entries from previous - \* terms except along with current term entry (commit them only when - \* quorum recovers, i.e. last_log_term on it reaches leader's term). - /\ quorum_lsn >= PropStartLsn(p) - /\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(quorum_lsn - 1)} - /\ UNCHANGED <> + /\ prop_state[p].nextSendLsn[a] /= NULL + THEN + \* Now find the LSN present on all the quorum. + LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN + \* This is the basic Raft rule of not committing entries from previous + \* terms except along with current term entry (commit them only when + \* quorum recovers, i.e. last_log_term on it reaches leader's term). + IF quorum_lsn >= PropStartLsn(p) THEN + quorum_lsn + ELSE + NULL + ELSE + NULL + +\* Commit all entries on proposer p with record lsn < commit_lsn. +DoCommitEntries(p, commit_lsn) == + /\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(commit_lsn - 1)} + /\ UNCHANGED <> + +\* Proposer p commits all entries it can using some quorum. Note that unlike +\* will62794/logless-reconfig this allows to commit entries from previous terms +\* (when conditions for that are met). +CommitEntries(p) == + /\ prop_state[p].state = "leader" + \* Using min quorums here is better because 1) QuorumCommitLsn for + \* simplicity checks min across all accs in q. 2) it probably makes + \* evaluation faster. + /\ \E q \in AllMinQuorums(acceptors): + LET commit_lsn == QuorumCommitLsn(p, q) IN + /\ commit_lsn /= NULL + /\ DoCommitEntries(p, commit_lsn) \******************************************************************************* \* Final spec \******************************************************************************* Next == - \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q) + \/ \E p \in proposers: RestartProposer(p) \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) \/ \E p \in proposers: BecomeLeader(p) \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a) \/ \E p \in proposers: NewEntry(p) \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a) - \/ \E q \in Quorums: \E p \in proposers: CommitEntries(p, q) + \/ \E p \in proposers: CommitEntries(p) Spec == Init /\ [][Next]_<> diff --git a/safekeeper/spec/modelcheck.sh b/safekeeper/spec/modelcheck.sh index 21ead7dad8..0084a8c638 100755 --- a/safekeeper/spec/modelcheck.sh +++ b/safekeeper/spec/modelcheck.sh @@ -2,6 +2,7 @@ # Usage: ./modelcheck.sh , e.g. 
# ./modelcheck.sh models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg MCProposerAcceptorStatic.tla +# ./modelcheck.sh models/MCProposerAcceptorReconfig_p2_a3_t3_l3_c3.cfg MCProposerAcceptorReconfig.tla CONFIG=$1 SPEC=$2 @@ -12,6 +13,7 @@ mkdir -p "tlc-results" CONFIG_FILE=$(basename -- "$CONFIG") outfilename="$SPEC-${CONFIG_FILE}-$(date --utc +%Y-%m-%d--%H-%M-%S)".log outfile="tlc-results/$outfilename" +echo "saving results to $outfile" touch $outfile # Save some info about the run. @@ -45,5 +47,6 @@ echo "" >> $outfile # https://docs.tlapl.us/codebase:architecture#fingerprint_sets_fpsets # # Add -simulate to run in infinite simulation mode. +# -coverage 1 is useful for profiling (check how many times actions are taken). java -Xmx$MEM -XX:MaxDirectMemorySize=$MEM -XX:+UseParallelGC -Dtlc2.tool.fp.FPSet.impl=tlc2.tool.fp.OffHeapDiskFPSet \ -cp "${TOOLSPATH}" tlc2.TLC $SPEC -config $CONFIG -workers auto -gzip | tee -a $outfile diff --git a/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg new file mode 100644 index 0000000000..8d34751083 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg @@ -0,0 +1,21 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2} +max_term = 2 +max_entries = 2 +max_generation = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ConfigSafety +ElectionSafetyFull +LogIsMonotonic +LogSafety +\* As its comment explains generally it is not expected to hold, but +\* in such small model it is true. +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg new file mode 100644 index 0000000000..eb7e0768ff --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg @@ -0,0 +1,19 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2} +max_term = 2 +max_entries = 2 +max_generation = 5 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ConfigSafety +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a3_t2_l2_c3.cfg b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a3_t2_l2_c3.cfg new file mode 100644 index 0000000000..b5fae13880 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a3_t2_l2_c3.cfg @@ -0,0 +1,20 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 2 +max_entries = 2 +max_generation = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ConfigSafety +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a4_t2_l2_c3.cfg b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a4_t2_l2_c3.cfg new file mode 100644 index 0000000000..71af9fa367 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorReconfig_p2_a4_t2_l2_c3.cfg @@ -0,0 +1,19 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4} +max_term = 2 +max_entries = 2 +max_generation = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety 
+CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/remove_interm_progress.awk b/safekeeper/spec/remove_interm_progress.awk new file mode 100644 index 0000000000..6203f6fa4f --- /dev/null +++ b/safekeeper/spec/remove_interm_progress.awk @@ -0,0 +1,25 @@ +# Print all lines, but thin out lines starting with Progress: +# leave only first and last 5 ones in the beginning, and only 1 of 1440 +# of others (once a day). +# Also remove checkpointing logs. +{ + lines[NR] = $0 +} +$0 ~ /^Progress/ { + ++pcount +} +END { + progress_idx = 0 + for (i = 1; i <= NR; i++) { + if (lines[i] ~ /^Progress/) { + if (progress_idx < 5 || progress_idx >= pcount - 5 || progress_idx % 1440 == 0) { + print lines[i] + } + progress_idx++ + } + else if (lines[i] ~ /^Checkpointing/) {} + else { + print lines[i] + } + } +} \ No newline at end of file diff --git a/safekeeper/spec/remove_interm_progress.sh b/safekeeper/spec/remove_interm_progress.sh new file mode 100755 index 0000000000..a8724a2b92 --- /dev/null +++ b/safekeeper/spec/remove_interm_progress.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +awk -f remove_interm_progress.awk $1 > $1.thin \ No newline at end of file diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg-2024-12-11--04-24-12.log b/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg-2024-12-11--04-24-12.log new file mode 100644 index 0000000000..8aac9eb58c --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg-2024-12-11--04-24-12.log @@ -0,0 +1,65 @@ +git revision: 9e386917a +Platform: Linux neon-dev-arm64-1 6.8.0-49-generic #49-Ubuntu SMP PREEMPT_DYNAMIC Sun Nov 3 21:21:58 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorReconfig.tla +Config: models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c3.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2} +max_term = 2 +max_entries = 2 +max_generation = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +\* CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 99 and seed -9189733667206762985 with 35 workers on 80 cores with 27307MB heap and 30720MB offheap memory [pid: 391272] (Linux 6.8.0-49-generic aarch64, Ubuntu 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
+Parsing file /home/arseny/neon2/safekeeper/spec/MCProposerAcceptorReconfig.tla +Parsing file /tmp/tlc-3211535543066978921/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorReconfig.tla +Parsing file /tmp/tlc-3211535543066978921/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-3211535543066978921/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-3211535543066978921/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-3211535543066978921/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-3211535543066978921/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-3211535543066978921/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module ProposerAcceptorReconfig +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorReconfig +Starting... (2024-12-11 04:24:13) +Computing initial states... +Finished computing initial states: 2 states generated, with 1 of them distinct at 2024-12-11 04:24:15. +Progress(16) at 2024-12-11 04:24:18: 1,427,589 states generated (1,427,589 s/min), 142,472 distinct states found (142,472 ds/min), 47,162 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 1.0E-6 + based on the actual fingerprints: val = 4.2E-8 +17746857 states generated, 1121659 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 37. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 9 and the 95th percentile is 3). 
+Finished in 33s at (2024-12-11 04:24:46) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg-2024-12-11--04-26-11.log b/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg-2024-12-11--04-26-11.log new file mode 100644 index 0000000000..40e7611ae3 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorReconfig.tla-MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg-2024-12-11--04-26-11.log @@ -0,0 +1,64 @@ +git revision: 9e386917a +Platform: Linux neon-dev-arm64-1 6.8.0-49-generic #49-Ubuntu SMP PREEMPT_DYNAMIC Sun Nov 3 21:21:58 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorReconfig.tla +Config: models/MCProposerAcceptorReconfig_p2_a2_t2_l2_c5.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2} +max_term = 2 +max_entries = 2 +max_generation = 5 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +\* CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 114 and seed -8099467489737745861 with 35 workers on 80 cores with 27307MB heap and 30720MB offheap memory [pid: 392020] (Linux 6.8.0-49-generic aarch64, Ubuntu 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon2/safekeeper/spec/MCProposerAcceptorReconfig.tla +Parsing file /tmp/tlc-11757875725969857497/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorReconfig.tla +Parsing file /tmp/tlc-11757875725969857497/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-11757875725969857497/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-11757875725969857497/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-11757875725969857497/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-11757875725969857497/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /home/arseny/neon2/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-11757875725969857497/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module ProposerAcceptorReconfig +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorReconfig +Starting... (2024-12-11 04:26:12) +Computing initial states... +Finished computing initial states: 2 states generated, with 1 of them distinct at 2024-12-11 04:26:14. +Progress(14) at 2024-12-11 04:26:17: 1,519,385 states generated (1,519,385 s/min), 231,263 distinct states found (231,263 ds/min), 121,410 states left on queue. 
+Progress(20) at 2024-12-11 04:27:17: 42,757,204 states generated (41,237,819 s/min), 4,198,386 distinct states found (3,967,123 ds/min), 1,308,109 states left on queue. +Progress(22) at 2024-12-11 04:28:17: 83,613,929 states generated (40,856,725 s/min), 7,499,873 distinct states found (3,301,487 ds/min), 1,929,464 states left on queue. +Progress(23) at 2024-12-11 04:29:17: 124,086,758 states generated (40,472,829 s/min), 10,569,712 distinct states found (3,069,839 ds/min), 2,386,988 states left on queue. +Progress(24) at 2024-12-11 04:30:17: 163,412,538 states generated (39,325,780 s/min), 13,314,303 distinct states found (2,744,591 ds/min), 2,610,637 states left on queue. +Progress(25) at 2024-12-11 04:31:17: 202,643,708 states generated (39,231,170 s/min), 15,960,583 distinct states found (2,646,280 ds/min), 2,759,681 states left on queue. +Progress(26) at 2024-12-11 04:32:17: 240,681,633 states generated (38,037,925 s/min), 18,443,440 distinct states found (2,482,857 ds/min), 2,852,177 states left on queue. +Progress(27) at 2024-12-11 04:33:17: 278,559,134 states generated (37,877,501 s/min), 20,878,067 distinct states found (2,434,627 ds/min), 2,904,400 states left on queue. +Progress(28) at 2024-12-11 04:34:17: 316,699,911 states generated (38,140,777 s/min), 23,212,229 distinct states found (2,334,162 ds/min), 2,864,969 states left on queue. diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 13f6e34575..10fc4a4b59 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -1,60 +1,51 @@ // // Main entry point for the safekeeper executable // -use anyhow::{bail, Context, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use clap::{ArgAction, Parser}; -use futures::future::BoxFuture; -use futures::stream::FuturesUnordered; -use futures::{FutureExt, StreamExt}; -use remote_storage::RemoteStorageConfig; -use sd_notify::NotifyState; -use tokio::runtime::Handle; -use tokio::signal::unix::{signal, SignalKind}; -use tokio::task::JoinError; -use utils::logging::SecretString; - -use std::env::{var, VarError}; +use std::env::{VarError, var}; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; -use storage_broker::Uri; - -use tracing::*; -use utils::pid_file; +use anyhow::{Context, Result, bail}; +use camino::{Utf8Path, Utf8PathBuf}; +use clap::{ArgAction, Parser}; +use futures::future::BoxFuture; +use futures::stream::FuturesUnordered; +use futures::{FutureExt, StreamExt}; use metrics::set_build_info_metric; +use remote_storage::RemoteStorageConfig; use safekeeper::defaults::{ DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; -use safekeeper::http; -use safekeeper::wal_service; -use safekeeper::GlobalTimelines; -use safekeeper::SafeKeeperConf; -use safekeeper::{broker, WAL_SERVICE_RUNTIME}; -use safekeeper::{control_file, BROKER_RUNTIME}; -use safekeeper::{wal_backup, HTTP_RUNTIME}; -use storage_broker::DEFAULT_ENDPOINT; -use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; -use utils::{ - id::NodeId, - logging::{self, LogFormat}, - project_build_tag, project_git_version, - sentry_init::init_sentry, - tcp_listener, +use safekeeper::{ + BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, WAL_SERVICE_RUNTIME, broker, + control_file, http, 
wal_backup, wal_service, }; +use sd_notify::NotifyState; +use storage_broker::{DEFAULT_ENDPOINT, Uri}; +use tokio::runtime::Handle; +use tokio::signal::unix::{SignalKind, signal}; +use tokio::task::JoinError; +use tracing::*; +use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; +use utils::id::NodeId; +use utils::logging::{self, LogFormat, SecretString}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version, tcp_listener}; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. #[allow(non_upper_case_globals)] -#[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +#[unsafe(export_name = "malloc_conf")] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; @@ -205,6 +196,13 @@ struct Args { /// Also defines interval for eviction retries. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)] eviction_min_resident: Duration, + /// Enable fanning out WAL to different shards from the same reader + #[arg(long)] + wal_reader_fanout: bool, + /// Only fan out the WAL reader if the absoulte delta between the new requested position + /// and the current position of the reader is smaller than this value. + #[arg(long)] + max_delta_for_fanout: Option, } // Like PathBufValueParser, but allows empty string. @@ -368,6 +366,8 @@ async fn main() -> anyhow::Result<()> { control_file_save_interval: args.control_file_save_interval, partial_backup_concurrency: args.partial_backup_concurrency, eviction_min_resident: args.eviction_min_resident, + wal_reader_fanout: args.wal_reader_fanout, + max_delta_for_fanout: args.max_delta_for_fanout, }); // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 4b091e2c29..de6e275124 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,39 +1,25 @@ //! Communication with the broker, providing safekeeper peers and pageserver coordination. 
-use anyhow::anyhow; -use anyhow::bail; -use anyhow::Context; - -use anyhow::Error; -use anyhow::Result; - -use storage_broker::parse_proto_ttid; - -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; -use storage_broker::proto::FilterTenantTimelineId; -use storage_broker::proto::MessageType; -use storage_broker::proto::SafekeeperDiscoveryResponse; -use storage_broker::proto::SubscribeByFilterRequest; -use storage_broker::proto::SubscribeSafekeeperInfoRequest; -use storage_broker::proto::TypeSubscription; -use storage_broker::proto::TypedMessage; -use storage_broker::Request; - -use std::sync::atomic::AtomicU64; use std::sync::Arc; -use std::time::Duration; -use std::time::Instant; -use std::time::UNIX_EPOCH; +use std::sync::atomic::AtomicU64; +use std::time::{Duration, Instant, UNIX_EPOCH}; + +use anyhow::{Context, Error, Result, anyhow, bail}; +use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryResponse, SubscribeByFilterRequest, + SubscribeSafekeeperInfoRequest, TypeSubscription, TypedMessage, +}; +use storage_broker::{Request, parse_proto_ttid}; use tokio::task::JoinHandle; use tokio::time::sleep; use tracing::*; -use crate::metrics::BROKER_ITERATION_TIMELINES; -use crate::metrics::BROKER_PULLED_UPDATES; -use crate::metrics::BROKER_PUSHED_UPDATES; -use crate::metrics::BROKER_PUSH_ALL_UPDATES_SECONDS; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; +use crate::metrics::{ + BROKER_ITERATION_TIMELINES, BROKER_PULLED_UPDATES, BROKER_PUSH_ALL_UPDATES_SECONDS, + BROKER_PUSHED_UPDATES, +}; +use crate::{GlobalTimelines, SafeKeeperConf}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 06e5afbf74..1bf3e4cac1 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -1,26 +1,26 @@ //! Control file serialization, deserialization and persistence. 
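The control_file.rs hunks that follow bump SK_FORMAT_VERSION from 9 to 10 and, as a forward-compatibility hack, keep writing the previous v9 layout while the membership configuration is still unset (generation == INVALID_GENERATION). For orientation, the serialized control file starts with a fixed header ahead of the state: the SK_MAGIC constant followed by the format version, written as u32 values (little-endian, inferred from the byteorder imports in the hunk). A minimal sketch of just that header, illustrative only:

    // Illustrative only: the magic + format-version header written ahead of the
    // serialized state. Constants match the hunk below; little-endian is an
    // inference from the byteorder imports, not something the hunk spells out.
    const SK_MAGIC: u32 = 0xcafeceef;
    const SK_FORMAT_VERSION: u32 = 10;

    fn write_header(buf: &mut Vec<u8>, format_version: u32) {
        buf.extend_from_slice(&SK_MAGIC.to_le_bytes());
        buf.extend_from_slice(&format_version.to_le_bytes());
    }

    fn main() {
        let mut buf = Vec::new();
        write_header(&mut buf, SK_FORMAT_VERSION);
        assert_eq!(buf.len(), 8); // two u32 fields
    }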
-use anyhow::{bail, ensure, Context, Result}; -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use camino::{Utf8Path, Utf8PathBuf}; -use tokio::fs::File; -use tokio::io::AsyncWriteExt; -use utils::crashsafe::durable_rename; - use std::future::Future; use std::io::Read; use std::ops::Deref; use std::path::Path; use std::time::Instant; -use crate::control_file_upgrade::downgrade_v9_to_v8; -use crate::control_file_upgrade::upgrade_control_file; +use anyhow::{Context, Result, bail, ensure}; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use camino::{Utf8Path, Utf8PathBuf}; +use safekeeper_api::membership::INVALID_GENERATION; +use tokio::fs::File; +use tokio::io::AsyncWriteExt; +use utils::bin_ser::LeSer; +use utils::crashsafe::durable_rename; + +use crate::control_file_upgrade::{downgrade_v10_to_v9, upgrade_control_file}; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::state::{EvictionState, TimelinePersistentState}; -use utils::bin_ser::LeSer; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 9; +pub const SK_FORMAT_VERSION: u32 = 10; // contains persistent metadata for safekeeper pub const CONTROL_FILE_NAME: &str = "safekeeper.control"; @@ -169,10 +169,11 @@ impl TimelinePersistentState { let mut buf: Vec = Vec::new(); WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; - if self.eviction_state == EvictionState::Present { - // temp hack for forward compatibility - const PREV_FORMAT_VERSION: u32 = 8; - let prev = downgrade_v9_to_v8(self); + if self.mconf.generation == INVALID_GENERATION { + // Temp hack for forward compatibility test: in case of none + // configuration save cfile in previous v9 format. + const PREV_FORMAT_VERSION: u32 = 9; + let prev = downgrade_v10_to_v9(self); WriteBytesExt::write_u32::(&mut buf, PREV_FORMAT_VERSION)?; prev.ser_into(&mut buf)?; } else { @@ -232,16 +233,23 @@ impl Storage for FileStorage { #[cfg(test)] mod test { - use super::*; + use safekeeper_api::membership::{Configuration, MemberSet, SafekeeperGeneration}; use tokio::fs; use utils::lsn::Lsn; + use super::*; + const NO_SYNC: bool = true; #[tokio::test] async fn test_read_write_safekeeper_state() -> anyhow::Result<()> { let tempdir = camino_tempfile::tempdir()?; let mut state = TimelinePersistentState::empty(); + state.mconf = Configuration { + generation: SafekeeperGeneration::new(42), + members: MemberSet::empty(), + new_members: None, + }; let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?; // Make a change. diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index a4b4670e42..1ad9e62f9b 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,18 +1,19 @@ //! 
Code to deal with safekeeper control file upgrades -use crate::{ - safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn}, - state::{EvictionState, PersistedPeers, TimelinePersistentState}, - wal_backup_partial, -}; -use anyhow::{bail, Result}; +use std::vec; + +use anyhow::{Result, bail}; use pq_proto::SystemId; +use safekeeper_api::membership::{Configuration, INVALID_GENERATION}; +use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::*; -use utils::{ - bin_ser::LeSer, - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::bin_ser::LeSer; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; + +use crate::safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}; +use crate::state::{EvictionState, TimelinePersistentState}; +use crate::wal_backup_partial; /// Persistent consensus state of the acceptor. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -232,6 +233,90 @@ pub struct SafeKeeperStateV8 { pub partial_backup: wal_backup_partial::State, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PersistedPeerInfo { + /// LSN up to which safekeeper offloaded WAL to s3. + pub backup_lsn: Lsn, + /// Term of the last entry. + pub term: Term, + /// LSN of the last record. + pub flush_lsn: Lsn, + /// Up to which LSN safekeeper regards its WAL as committed. + pub commit_lsn: Lsn, +} + +impl PersistedPeerInfo { + pub fn new() -> Self { + Self { + backup_lsn: Lsn::INVALID, + term: safekeeper_api::INITIAL_TERM, + flush_lsn: Lsn(0), + commit_lsn: Lsn(0), + } + } +} + +// make clippy happy +impl Default for PersistedPeerInfo { + fn default() -> Self { + Self::new() + } +} + +/// Note: SafekeeperStateVn is old name for TimelinePersistentStateVn. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct TimelinePersistentStateV9 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. 
Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + /// Peers and their state as we remember it. Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. (Currently not saved at all, but let's provision + /// place to have less file version upgrades). + pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. + pub partial_backup: wal_backup_partial::State, + /// Eviction state of the timeline. If it's Offloaded, we should download + /// WAL files from remote storage to serve the timeline. + pub eviction_state: EvictionState, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -247,6 +332,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result Result Result Result Result Result Result Result Result Result Result Result Result SafeKeeperStateV8 { - assert!(state.eviction_state == EvictionState::Present); - SafeKeeperStateV8 { +// Used as a temp hack to make forward compatibility test work. Should be +// removed after PR adding v10 is merged. +pub fn downgrade_v10_to_v9(state: &TimelinePersistentState) -> TimelinePersistentStateV9 { + assert!(state.mconf.generation == INVALID_GENERATION); + TimelinePersistentStateV9 { tenant_id: state.tenant_id, timeline_id: state.timeline_id, acceptor_state: state.acceptor_state.clone(), @@ -425,8 +537,9 @@ pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 backup_lsn: state.backup_lsn, peer_horizon_lsn: state.peer_horizon_lsn, remote_consistent_lsn: state.remote_consistent_lsn, - peers: state.peers.clone(), + peers: PersistedPeers(vec![]), partial_backup: state.partial_backup.clone(), + eviction_state: state.eviction_state, } } @@ -434,11 +547,11 @@ pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 mod tests { use std::str::FromStr; - use utils::{id::NodeId, Hex}; - - use crate::safekeeper::PersistedPeerInfo; + use utils::Hex; + use utils::id::NodeId; use super::*; + use crate::control_file_upgrade::PersistedPeerInfo; #[test] fn roundtrip_v1() { diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 28ef2b1d23..11daff22cb 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -1,23 +1,22 @@ -use anyhow::{bail, Result}; +use std::sync::Arc; + +use anyhow::{Result, bail}; use camino::Utf8PathBuf; use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; -use std::sync::Arc; -use tokio::{ - fs::OpenOptions, - io::{AsyncSeekExt, AsyncWriteExt}, -}; +use safekeeper_api::membership::Configuration; +use tokio::fs::OpenOptions; +use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tracing::{info, warn}; -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; -use crate::{ - control_file::FileStorage, - state::TimelinePersistentState, - timeline::{TimelineError, WalResidentTimeline}, - timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, - wal_backup::copy_s3_segments, - wal_storage::{wal_file_paths, WalReader}, - GlobalTimelines, -}; +use crate::GlobalTimelines; +use crate::control_file::FileStorage; +use crate::state::TimelinePersistentState; +use crate::timeline::{TimelineError, WalResidentTimeline}; +use 
crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}; +use crate::wal_backup::copy_s3_segments; +use crate::wal_storage::{WalReader, wal_file_paths}; // we don't want to have more than 10 segments on disk after copy, because they take space const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64; @@ -147,10 +146,10 @@ pub async fn handle_request( let mut new_state = TimelinePersistentState::new( &request.destination_ttid, + Configuration::empty(), state.server.clone(), - vec![], - request.until_lsn, start_lsn, + request.until_lsn, )?; new_state.timeline_start_lsn = start_lsn; new_state.peer_horizon_lsn = request.until_lsn; diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 93011eddec..68a38e1498 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -2,37 +2,25 @@ use std::fs; use std::fs::DirEntry; -use std::io::BufReader; -use std::io::Read; +use std::io::{BufReader, Read}; use std::path::PathBuf; use std::sync::Arc; -use anyhow::bail; -use anyhow::Result; -use camino::Utf8Path; -use camino::Utf8PathBuf; +use anyhow::{Result, bail}; +use camino::{Utf8Path, Utf8PathBuf}; use chrono::{DateTime, Utc}; -use postgres_ffi::XLogSegNo; -use postgres_ffi::MAX_SEND_SIZE; -use serde::Deserialize; -use serde::Serialize; - use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName}; +use postgres_ffi::{MAX_SEND_SIZE, XLogSegNo}; +use safekeeper_api::models::WalSenderState; +use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; -use utils::id::NodeId; -use utils::id::TenantTimelineId; -use utils::id::{TenantId, TimelineId}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; use crate::safekeeper::TermHistory; -use crate::send_wal::WalSenderState; -use crate::state::TimelineMemState; -use crate::state::TimelinePersistentState; -use crate::timeline::get_timeline_dir; -use crate::timeline::WalResidentTimeline; -use crate::timeline_manager; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; +use crate::state::{TimelineMemState, TimelinePersistentState}; +use crate::timeline::{WalResidentTimeline, get_timeline_dir}; +use crate::{GlobalTimelines, SafeKeeperConf, timeline_manager}; /// Various filters that influence the resulting JSON output. #[derive(Debug, Serialize, Deserialize, Clone)] diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 2ca6333ba8..dd7008c87d 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -1,35 +1,32 @@ //! Part of Safekeeper pretending to be Postgres, i.e. handling Postgres //! protocol commands. 
+use std::future::Future; +use std::str::{self, FromStr}; +use std::sync::Arc; + use anyhow::Context; use pageserver_api::models::ShardParameters; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; -use std::future::Future; -use std::str::{self, FromStr}; -use std::sync::Arc; +use postgres_backend::{PostgresBackend, QueryError}; +use postgres_ffi::PG_TLI; +use pq_proto::{BeMessage, FeStartupPacket, INT4_OID, RowDescriptor, TEXT_OID}; +use regex::Regex; +use safekeeper_api::Term; +use safekeeper_api::models::ConnectionId; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{debug, info, info_span, Instrument}; +use tracing::{Instrument, debug, info, info_span}; +use utils::auth::{Claims, JwtAuth, Scope}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use utils::postgres_client::PostgresClientProtocol; use utils::shard::{ShardCount, ShardNumber}; use crate::auth::check_permission; -use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; - -use crate::metrics::{TrafficMetrics, PG_QUERIES_GAUGE}; -use crate::safekeeper::Term; +use crate::json_ctrl::{AppendLogicalMessage, handle_json_ctrl}; +use crate::metrics::{PG_QUERIES_GAUGE, TrafficMetrics}; use crate::timeline::TimelineError; -use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; -use postgres_backend::PostgresBackend; -use postgres_backend::QueryError; -use postgres_ffi::PG_TLI; -use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; -use regex::Regex; -use utils::auth::{Claims, JwtAuth, Scope}; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; /// Safekeeper handler of postgres commands pub struct SafekeeperPostgresHandler { @@ -52,16 +49,70 @@ pub struct SafekeeperPostgresHandler { /// Parsed Postgres command. enum SafekeeperPostgresCommand { - StartWalPush, - StartReplication { start_lsn: Lsn, term: Option }, + StartWalPush { + proto_version: u32, + // Eventually timelines will be always created explicitly by storcon. + // This option allows legacy behaviour for compute to do that until we + // fully migrate. + allow_timeline_creation: bool, + }, + StartReplication { + start_lsn: Lsn, + term: Option, + }, IdentifySystem, TimelineStatus, - JSONCtrl { cmd: AppendLogicalMessage }, + JSONCtrl { + cmd: AppendLogicalMessage, + }, } fn parse_cmd(cmd: &str) -> anyhow::Result { if cmd.starts_with("START_WAL_PUSH") { - Ok(SafekeeperPostgresCommand::StartWalPush) + // Allow additional options in postgres START_REPLICATION style like + // START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false'). + // Parsing here is very naive and breaks in case of commas or + // whitespaces in values, but enough for our purposes. 
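// Illustrative sketch (not part of this patch): the parser below accepts either a bare
// "START_WAL_PUSH" (defaults: proto_version 2, allow_timeline_creation true) or a
// parenthesised, comma-separated option list with single-quoted values. A hypothetical
// sender-side helper, shown only to document the accepted syntax:
fn build_start_wal_push_cmd(proto_version: u32, allow_timeline_creation: bool) -> String {
    format!(
        "START_WAL_PUSH (proto_version '{}', allow_timeline_creation '{}')",
        proto_version, allow_timeline_creation
    )
}
// build_start_wal_push_cmd(3, false) yields
// "START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false')", which the parsing
// code below reads back as proto_version = 3, allow_timeline_creation = false.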
+ let re = Regex::new(r"START_WAL_PUSH(\s+?\((.*)\))?").unwrap(); + let caps = re + .captures(cmd) + .context(format!("failed to parse START_WAL_PUSH command {}", cmd))?; + // capture () content + let options = caps.get(2).map(|m| m.as_str()).unwrap_or(""); + // default values + let mut proto_version = 2; + let mut allow_timeline_creation = true; + for kvstr in options.split(",") { + if kvstr.is_empty() { + continue; + } + let mut kvit = kvstr.split_whitespace(); + let key = kvit.next().context(format!( + "failed to parse key in kv {} in command {}", + kvstr, cmd + ))?; + let value = kvit.next().context(format!( + "failed to parse value in kv {} in command {}", + kvstr, cmd + ))?; + let value_trimmed = value.trim_matches('\''); + if key == "proto_version" { + proto_version = value_trimmed.parse::().context(format!( + "failed to parse proto_version value {} in command {}", + value, cmd + ))?; + } + if key == "allow_timeline_creation" { + allow_timeline_creation = value_trimmed.parse::().context(format!( + "failed to parse allow_timeline_creation value {} in command {}", + value, cmd + ))?; + } + } + Ok(SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + }) } else if cmd.starts_with("START_REPLICATION") { let re = Regex::new( // We follow postgres START_REPLICATION LOGICAL options to pass term. @@ -95,7 +146,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { match cmd { - SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH", + SafekeeperPostgresCommand::StartWalPush { .. } => "START_WAL_PUSH", SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION", SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS", SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", @@ -293,8 +344,11 @@ impl postgres_backend::Handler self.ttid = TenantTimelineId::new(tenant_id, timeline_id); match cmd { - SafekeeperPostgresCommand::StartWalPush => { - self.handle_start_wal_push(pgb) + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + self.handle_start_wal_push(pgb, proto_version, allow_timeline_creation) .instrument(info_span!("WAL receiver")) .await } @@ -467,3 +521,39 @@ impl SafekeeperPostgresHandler { } } } + +#[cfg(test)] +mod tests { + use super::SafekeeperPostgresCommand; + + /// Test parsing of START_WAL_PUSH command + #[test] + fn test_start_wal_push_parse() { + let cmd = "START_WAL_PUSH"; + let parsed = super::parse_cmd(cmd).expect("failed to parse"); + match parsed { + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + assert_eq!(proto_version, 2); + assert!(allow_timeline_creation); + } + _ => panic!("unexpected command"), + } + + let cmd = + "START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false', unknown 'hoho')"; + let parsed = super::parse_cmd(cmd).expect("failed to parse"); + match parsed { + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + assert_eq!(proto_version, 3); + assert!(!allow_timeline_creation); + } + _ => panic!("unexpected command"), + } + } +} diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 7229ccb739..f162985ef7 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,10 +1,9 @@ -pub mod client; pub mod routes; -pub use routes::make_router; - -pub use safekeeper_api::models; use std::sync::Arc; +pub use routes::make_router; +pub use 
safekeeper_api::models; + use crate::{GlobalTimelines, SafeKeeperConf}; pub async fn task_main( @@ -15,7 +14,7 @@ pub async fn task_main( let router = make_router(conf, global_timelines) .build() .map_err(|err| anyhow::anyhow!(err))?; - let service = utils::http::RouterService::new(router).unwrap(); + let service = http_utils::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)?; server.serve(service).await?; Ok(()) // unreachable diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 71c36f1d46..3b3bc71ac4 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,55 +1,41 @@ -use hyper::{Body, Request, Response, StatusCode}; -use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::fmt; use std::io::Write as _; use std::str::FromStr; use std::sync::Arc; -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; + +use http_utils::endpoint::{ + self, ChannelWriter, auth_middleware, check_permission_with, profile_cpu_handler, + profile_heap_handler, prometheus_metrics_handler, request_span, +}; +use http_utils::error::ApiError; +use http_utils::failpoints::failpoints_handler; +use http_utils::json::{json_request, json_response}; +use http_utils::request::{ensure_no_body, parse_query_param, parse_request_param}; +use http_utils::{RequestExt, RouterBuilder}; +use hyper::{Body, Request, Response, StatusCode}; +use postgres_ffi::WAL_SEGMENT_SIZE; +use safekeeper_api::models::{ + AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TermSwitchApiEntry, + TimelineCopyRequest, TimelineCreateRequest, TimelineStatus, TimelineTermBumpRequest, +}; +use safekeeper_api::{ServerInfo, models}; +use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; use tokio::sync::mpsc; use tokio::task; use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; -use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{ - profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, - ChannelWriter, -}; -use utils::http::request::parse_query_param; - -use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; -use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest}; -use utils::{ - auth::SwappableJwtAuth, - http::{ - endpoint::{self, auth_middleware, check_permission_with}, - error::ApiError, - json::{json_request, json_response}, - request::{ensure_no_body, parse_request_param}, - RequestExt, RouterBuilder, - }, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use tracing::{Instrument, info_span}; +use utils::auth::SwappableJwtAuth; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use crate::debug_dump::TimelineDigestRequest; -use crate::receive_wal::WalReceiverState; -use crate::safekeeper::Term; -use crate::safekeeper::{ServerInfo, TermLsn}; -use crate::send_wal::WalSenderState; -use crate::timeline::PeerInfo; +use crate::safekeeper::TermLsn; use crate::timelines_global_map::TimelineDeleteForceResult; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; -use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; - -#[derive(Debug, Serialize)] -struct SafekeeperStatus { - id: NodeId, -} +use crate::{ + GlobalTimelines, SafeKeeperConf, 
copy_timeline, debug_dump, patch_control_file, pull_timeline, +}; /// Healthcheck handler. async fn status_handler(request: Request) -> Result, ApiError> { @@ -73,50 +59,6 @@ fn get_global_timelines(request: &Request) -> Arc { .clone() } -/// Same as TermLsn, but serializes LSN using display serializer -/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct TermSwitchApiEntry { - pub term: Term, - pub lsn: Lsn, -} - -impl From for TermLsn { - fn from(api_val: TermSwitchApiEntry) -> Self { - TermLsn { - term: api_val.term, - lsn: api_val.lsn, - } - } -} - -/// Augment AcceptorState with last_log_term for convenience -#[derive(Debug, Serialize, Deserialize)] -pub struct AcceptorStateStatus { - pub term: Term, - pub epoch: Term, // aka last_log_term - pub term_history: Vec, -} - -/// Info about timeline on safekeeper ready for reporting. -#[derive(Debug, Serialize, Deserialize)] -pub struct TimelineStatus { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub acceptor_state: AcceptorStateStatus, - pub pg_info: ServerInfo, - pub flush_lsn: Lsn, - pub timeline_start_lsn: Lsn, - pub local_start_lsn: Lsn, - pub commit_lsn: Lsn, - pub backup_lsn: Lsn, - pub peer_horizon_lsn: Lsn, - pub remote_consistent_lsn: Lsn, - pub peers: Vec, - pub walsenders: Vec, - pub walreceivers: Vec, -} - fn check_permission(request: &Request, tenant_id: Option) -> Result<(), ApiError> { check_permission_with(request, |claims| { crate::auth::check_permission(claims, tenant_id) @@ -160,20 +102,28 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { + check_permission(&request, None)?; + let global_timelines = get_global_timelines(&request); + let utilization = global_timelines.get_timeline_counts(); + json_response(StatusCode::OK, utilization) +} + /// List all (not deleted) timelines. /// Note: it is possible to do the same with debug_dump. async fn timeline_list_handler(request: Request) -> Result, ApiError> { @@ -187,6 +137,15 @@ async fn timeline_list_handler(request: Request) -> Result, json_response(StatusCode::OK, res) } +impl From for TermLsn { + fn from(api_val: TermSwitchApiEntry) -> Self { + TermLsn { + term: api_val.term, + lsn: api_val.lsn, + } + } +} + /// Report info about timeline. 
async fn timeline_status_handler(request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( @@ -222,6 +181,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result) -> Result, ApiError> { check_permission(&request, None)?; - let data: pull_timeline::Request = json_request(&mut request).await?; + let data: PullTimelineRequest = json_request(&mut request).await?; let conf = get_conf(&request); let global_timelines = get_global_timelines(&request); @@ -307,6 +267,28 @@ async fn timeline_snapshot_handler(request: Request) -> Result, +) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let global_timelines = get_global_timelines(&request); + let tli = global_timelines.get(ttid).map_err(ApiError::from)?; + + let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?; + let response = tli + .membership_switch(data.mconf) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + async fn timeline_copy_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -604,7 +586,7 @@ pub fn make_router( if conf.http_auth.is_some() { router = router.middleware(auth_middleware(|request| { const ALLOWLIST_ROUTES: &[&str] = - &["/v1/status", "/metrics", "/profile/cpu", "profile/heap"]; + &["/v1/status", "/metrics", "/profile/cpu", "/profile/heap"]; if ALLOWLIST_ROUTES.contains(&request.uri().path()) { None } else { @@ -635,6 +617,7 @@ pub fn make_router( failpoints_handler(r, cancel).await }) }) + .get("/v1/utilization", |r| request_span(r, utilization_handler)) .delete("/v1/tenant/:tenant_id", |r| { request_span(r, tenant_delete_handler) }) @@ -658,6 +641,10 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", |r| request_span(r, timeline_snapshot_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/membership", + |r| request_span(r, timeline_membership_handler), + ) .post( "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", |r| request_span(r, timeline_copy_handler), diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index dc4ad3706e..793ea9c3e9 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -7,24 +7,23 @@ //! 
use anyhow::Context; -use postgres_backend::QueryError; +use postgres_backend::{PostgresBackend, QueryError}; +use postgres_ffi::{WAL_SEGMENT_SIZE, encode_logical_message}; +use pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; +use safekeeper_api::membership::{Configuration, INVALID_GENERATION}; +use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::*; +use utils::lsn::Lsn; use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; use crate::safekeeper::{ - AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, + AcceptorProposerMessage, AppendRequest, AppendRequestHeader, AppendResponse, + ProposerAcceptorMessage, ProposerElected, TermHistory, TermLsn, }; -use crate::safekeeper::{Term, TermHistory, TermLsn}; use crate::state::TimelinePersistentState; use crate::timeline::WalResidentTimeline; -use postgres_backend::PostgresBackend; -use postgres_ffi::encode_logical_message; -use postgres_ffi::WAL_SEGMENT_SIZE; -use pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; -use utils::lsn::Lsn; #[derive(Serialize, Deserialize, Debug)] pub struct AppendLogicalMessage { @@ -104,6 +103,7 @@ async fn prepare_safekeeper( .global_timelines .create( spg.ttid, + Configuration::empty(), ServerInfo { pg_version, wal_seg_size: WAL_SEGMENT_SIZE as u32, @@ -130,10 +130,10 @@ async fn send_proposer_elected( let history = TermHistory(history_entries); let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected { + generation: INVALID_GENERATION, term, start_streaming_at: lsn, term_history: history, - timeline_start_lsn: lsn, }); tli.process_msg(&proposer_elected_request).await?; @@ -167,13 +167,12 @@ pub async fn append_logical_message( let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest { h: AppendRequestHeader { + generation: INVALID_GENERATION, term: msg.term, - term_start_lsn: begin_lsn, begin_lsn, end_lsn, commit_lsn, truncate_lsn: msg.truncate_lsn, - proposer_uuid: [0u8; 16], }, wal_data, }); diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index abe6e00a66..c52b097066 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -2,15 +2,16 @@ extern crate hyper0 as hyper; +use std::time::Duration; + use camino::Utf8PathBuf; use once_cell::sync::Lazy; use remote_storage::RemoteStorageConfig; -use tokio::runtime::Runtime; - -use std::time::Duration; use storage_broker::Uri; - -use utils::{auth::SwappableJwtAuth, id::NodeId, logging::SecretString}; +use tokio::runtime::Runtime; +use utils::auth::SwappableJwtAuth; +use utils::id::NodeId; +use utils::logging::SecretString; mod auth; pub mod broker; @@ -43,8 +44,12 @@ pub mod wal_reader_stream; pub mod wal_service; pub mod wal_storage; +#[cfg(any(test, feature = "benchmarking"))] +pub mod test_utils; + mod timelines_global_map; use std::sync::Arc; + pub use timelines_global_map::GlobalTimelines; use utils::auth::JwtAuth; @@ -105,6 +110,8 @@ pub struct SafeKeeperConf { pub control_file_save_interval: Duration, pub partial_backup_concurrency: usize, pub eviction_min_resident: Duration, + pub wal_reader_fanout: bool, + pub max_delta_for_fanout: Option, } impl SafeKeeperConf { @@ -147,6 +154,8 @@ impl SafeKeeperConf { control_file_save_interval: Duration::from_secs(1), partial_backup_concurrency: 1, eviction_min_resident: Duration::ZERO, + wal_reader_fanout: false, + max_delta_for_fanout: None, } } } diff --git a/safekeeper/src/metrics.rs 
b/safekeeper/src/metrics.rs index 5883f402c7..cb21a5f6d2 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -1,30 +1,28 @@ //! Global safekeeper mertics and per-timeline safekeeper metrics. -use std::{ - sync::{Arc, RwLock}, - time::{Instant, SystemTime}, -}; +use std::sync::{Arc, RwLock}; +use std::time::{Instant, SystemTime}; use anyhow::Result; use futures::Future; +use metrics::core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}; +use metrics::proto::MetricFamily; use metrics::{ - core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, - pow2_buckets, - proto::MetricFamily, + DISK_FSYNC_SECONDS_BUCKETS, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, + IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, pow2_buckets, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge, GaugeVec, - Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, - IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS, + register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, + register_int_gauge_vec, }; use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; -use utils::{id::TenantTimelineId, lsn::Lsn, pageserver_feedback::PageserverFeedback}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; -use crate::{ - receive_wal::MSG_QUEUE_SIZE, - state::{TimelineMemState, TimelinePersistentState}, - GlobalTimelines, -}; +use crate::GlobalTimelines; +use crate::receive_wal::MSG_QUEUE_SIZE; +use crate::state::{TimelineMemState, TimelinePersistentState}; // Global metrics across all timelines. pub static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { @@ -211,6 +209,14 @@ pub static WAL_RECEIVERS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_wal_receivers") }); +pub static WAL_READERS: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "safekeeper_wal_readers", + "Number of active WAL readers (may serve pageservers or other safekeepers)", + &["kind", "target"] + ) + .expect("Failed to register safekeeper_wal_receivers") +}); pub static WAL_RECEIVER_QUEUE_DEPTH: Lazy = Lazy::new(|| { // Use powers of two buckets, but add a bucket at 0 and the max queue size to track empty and // full queues respectively. 
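// Illustrative sketch (not part of this patch): the new safekeeper_wal_readers metric above
// is an IntGaugeVec keyed by ["kind", "target"], so call sites are expected to increment it
// with explicit label values and decrement on teardown. The metric name, label values and
// helper below are assumptions used only to show the pattern; they do not appear in the patch.
use metrics::{register_int_gauge_vec, IntGaugeVec};
use once_cell::sync::Lazy;

static WAL_READERS_EXAMPLE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "example_wal_readers",
        "Number of active WAL readers, grouped by kind and target",
        &["kind", "target"]
    )
    .expect("Failed to register example_wal_readers")
});

fn serve_wal_example() {
    // Hold the gauge up while a reader is active so the time series reflects the number of
    // concurrent readers per (kind, target) pair.
    let gauge = WAL_READERS_EXAMPLE.with_label_values(&["task", "pageserver"]);
    gauge.inc();
    // ... stream WAL to the consumer ...
    gauge.dec();
}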
@@ -443,6 +449,7 @@ pub struct FullTimelineInfo { pub timeline_is_active: bool, pub num_computes: u32, pub last_removed_segno: XLogSegNo, + pub interpreted_wal_reader_tasks: usize, pub epoch_start_lsn: Lsn, pub mem_state: TimelineMemState, @@ -472,6 +479,7 @@ pub struct TimelineCollector { disk_usage: GenericGaugeVec, acceptor_term: GenericGaugeVec, written_wal_bytes: GenericGaugeVec, + interpreted_wal_reader_tasks: GenericGaugeVec, written_wal_seconds: GaugeVec, flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, @@ -670,6 +678,16 @@ impl TimelineCollector { .unwrap(); descs.extend(active_timelines_count.desc().into_iter().cloned()); + let interpreted_wal_reader_tasks = GenericGaugeVec::new( + Opts::new( + "safekeeper_interpreted_wal_reader_tasks", + "Number of active interpreted wal reader tasks, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(interpreted_wal_reader_tasks.desc().into_iter().cloned()); + TimelineCollector { global_timelines, descs, @@ -693,6 +711,7 @@ impl TimelineCollector { collect_timeline_metrics, timelines_count, active_timelines_count, + interpreted_wal_reader_tasks, } } } @@ -721,6 +740,7 @@ impl Collector for TimelineCollector { self.disk_usage.reset(); self.acceptor_term.reset(); self.written_wal_bytes.reset(); + self.interpreted_wal_reader_tasks.reset(); self.written_wal_seconds.reset(); self.flushed_wal_seconds.reset(); @@ -782,6 +802,9 @@ impl Collector for TimelineCollector { self.written_wal_bytes .with_label_values(labels) .set(tli.wal_storage.write_wal_bytes); + self.interpreted_wal_reader_tasks + .with_label_values(labels) + .set(tli.interpreted_wal_reader_tasks as u64); self.written_wal_seconds .with_label_values(labels) .set(tli.wal_storage.write_wal_seconds); @@ -834,6 +857,7 @@ impl Collector for TimelineCollector { mfs.extend(self.disk_usage.collect()); mfs.extend(self.acceptor_term.collect()); mfs.extend(self.written_wal_bytes.collect()); + mfs.extend(self.interpreted_wal_reader_tasks.collect()); mfs.extend(self.written_wal_seconds.collect()); mfs.extend(self.flushed_wal_seconds.collect()); diff --git a/safekeeper/src/patch_control_file.rs b/safekeeper/src/patch_control_file.rs index 2136d1b5f7..efdbd9b3d7 100644 --- a/safekeeper/src/patch_control_file.rs +++ b/safekeeper/src/patch_control_file.rs @@ -4,7 +4,8 @@ use serde::{Deserialize, Serialize}; use serde_json::Value; use tracing::info; -use crate::{state::TimelinePersistentState, timeline::Timeline}; +use crate::state::TimelinePersistentState; +use crate::timeline::Timeline; #[derive(Deserialize, Debug, Clone)] pub struct Request { diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index f58a9dca1d..fc58b8509a 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,45 +1,38 @@ -use anyhow::{anyhow, bail, Context, Result}; +use std::cmp::min; +use std::io::{self, ErrorKind}; +use std::sync::Arc; + +use anyhow::{Context, Result, anyhow, bail}; use bytes::Bytes; use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; -use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; -use serde::{Deserialize, Serialize}; -use std::{ - cmp::min, - io::{self, ErrorKind}, - sync::Arc, -}; -use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; +use safekeeper_api::Term; +use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}; +use safekeeper_client::mgmt_api; 
+use safekeeper_client::mgmt_api::Client; +use serde::Deserialize; +use tokio::fs::OpenOptions; +use tokio::io::AsyncWrite; +use tokio::sync::mpsc; +use tokio::task; use tokio_tar::{Archive, Builder, Header}; -use tokio_util::{ - io::{CopyToBytes, SinkWriter}, - sync::PollSender, -}; +use tokio_util::io::{CopyToBytes, SinkWriter}; +use tokio_util::sync::PollSender; use tracing::{error, info, instrument}; +use utils::crashsafe::fsync_async_opt; +use utils::id::{NodeId, TenantTimelineId}; +use utils::logging::SecretString; +use utils::lsn::Lsn; +use utils::pausable_failpoint; -use crate::{ - control_file::CONTROL_FILE_NAME, - debug_dump, - http::{ - client::{self, Client}, - routes::TimelineStatus, - }, - safekeeper::Term, - state::{EvictionState, TimelinePersistentState}, - timeline::{Timeline, WalResidentTimeline}, - timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, - wal_backup, - wal_storage::open_wal_file, - GlobalTimelines, -}; -use utils::{ - crashsafe::fsync_async_opt, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - logging::SecretString, - lsn::Lsn, - pausable_failpoint, -}; +use crate::control_file::CONTROL_FILE_NAME; +use crate::state::{EvictionState, TimelinePersistentState}; +use crate::timeline::{Timeline, WalResidentTimeline}; +use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}; +use crate::wal_storage::open_wal_file; +use crate::{GlobalTimelines, debug_dump, wal_backup}; /// Stream tar archive of timeline to tx. #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] @@ -373,28 +366,18 @@ impl WalResidentTimeline { // change, but as long as older history is strictly part of new that's // fine), but there is no need to do it. if bctx.term != term || bctx.last_log_term != last_log_term { - bail!("term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}", - bctx.term, bctx.last_log_term, term, last_log_term); + bail!( + "term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}", + bctx.term, + bctx.last_log_term, + term, + last_log_term + ); } Ok(()) } } -/// pull_timeline request body. -#[derive(Debug, Deserialize)] -pub struct Request { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub http_hosts: Vec, -} - -#[derive(Debug, Serialize)] -pub struct Response { - // Donor safekeeper host - pub safekeeper_host: String, - // TODO: add more fields? -} - /// Response for debug dump request. #[derive(Debug, Deserialize)] pub struct DebugDumpResponse { @@ -407,10 +390,10 @@ pub struct DebugDumpResponse { /// Find the most advanced safekeeper and pull timeline from it. pub async fn handle_request( - request: Request, + request: PullTimelineRequest, sk_auth_token: Option, global_timelines: Arc, -) -> Result { +) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( request.tenant_id, request.timeline_id, @@ -422,7 +405,7 @@ pub async fn handle_request( let http_hosts = request.http_hosts.clone(); // Figure out statuses of potential donors. 
- let responses: Vec> = + let responses: Vec> = futures::future::join_all(http_hosts.iter().map(|url| async { let cclient = Client::new(url.clone(), sk_auth_token.clone()); let info = cclient @@ -462,7 +445,7 @@ async fn pull_timeline( host: String, sk_auth_token: Option, global_timelines: Arc, -) -> Result { +) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", @@ -537,7 +520,7 @@ async fn pull_timeline( .load_temp_timeline(ttid, &tli_dir_path, false) .await?; - Ok(Response { + Ok(PullTimelineResponse { safekeeper_host: host, }) } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 2a49890d61..7967acde3f 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -2,36 +2,21 @@ //! Gets messages from the network, passes them down to consensus module and //! sends replies back. -use crate::handler::SafekeeperPostgresHandler; -use crate::metrics::{ - WAL_RECEIVERS, WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL, - WAL_RECEIVER_QUEUE_SIZE_TOTAL, -}; -use crate::safekeeper::AcceptorProposerMessage; -use crate::safekeeper::ProposerAcceptorMessage; -use crate::safekeeper::ServerInfo; -use crate::timeline::WalResidentTimeline; -use crate::wal_service::ConnectionId; -use crate::GlobalTimelines; -use anyhow::{anyhow, Context}; -use bytes::BytesMut; -use parking_lot::MappedMutexGuard; -use parking_lot::Mutex; -use parking_lot::MutexGuard; -use postgres_backend::CopyStreamHandlerEnd; -use postgres_backend::PostgresBackend; -use postgres_backend::PostgresBackendReader; -use postgres_backend::QueryError; -use pq_proto::BeMessage; -use serde::Deserialize; -use serde::Serialize; use std::future; use std::net::SocketAddr; use std::sync::Arc; -use tokio::io::AsyncRead; -use tokio::io::AsyncWrite; + +use anyhow::{Context, anyhow}; +use bytes::BytesMut; +use parking_lot::{MappedMutexGuard, Mutex, MutexGuard}; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError}; +use pq_proto::BeMessage; +use safekeeper_api::ServerInfo; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus}; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendTimeoutError; -use tokio::sync::mpsc::{channel, Receiver, Sender}; +use tokio::sync::mpsc::{Receiver, Sender, channel}; use tokio::task; use tokio::task::JoinHandle; use tokio::time::{Duration, Instant, MissedTickBehavior}; @@ -40,6 +25,15 @@ use utils::id::TenantTimelineId; use utils::lsn::Lsn; use utils::pageserver_feedback::PageserverFeedback; +use crate::GlobalTimelines; +use crate::handler::SafekeeperPostgresHandler; +use crate::metrics::{ + WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL, WAL_RECEIVER_QUEUE_SIZE_TOTAL, + WAL_RECEIVERS, +}; +use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; +use crate::timeline::WalResidentTimeline; + const DEFAULT_FEEDBACK_CAPACITY: usize = 8; /// Registry of WalReceivers (compute connections). Timeline holds it (wrapped @@ -171,21 +165,6 @@ impl WalReceiversShared { } } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct WalReceiverState { - /// None means it is recovery initiated by us (this safekeeper). - pub conn_id: Option, - pub status: WalReceiverStatus, -} - -/// Walreceiver status. 
Currently only whether it passed voting stage and -/// started receiving the stream, but it is easy to add more if needed. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub enum WalReceiverStatus { - Voting, - Streaming, -} - /// Scope guard to access slot in WalReceivers registry and unregister from /// it in Drop. pub struct WalReceiverGuard { @@ -216,9 +195,14 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push( &mut self, pgb: &mut PostgresBackend, + proto_version: u32, + allow_timeline_creation: bool, ) -> Result<(), QueryError> { let mut tli: Option = None; - if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { + if let Err(end) = self + .handle_start_wal_push_guts(pgb, &mut tli, proto_version, allow_timeline_creation) + .await + { // Log the result and probably send it to the client, closing the stream. let handle_end_fut = pgb.handle_copy_stream_end(end); // If we managed to create the timeline, augment logging with current LSNs etc. @@ -238,6 +222,8 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, tli: &mut Option, + proto_version: u32, + allow_timeline_creation: bool, ) -> Result<(), CopyStreamHandlerEnd> { // The `tli` parameter is only used for passing _out_ a timeline, one should // not have been passed in. @@ -266,12 +252,17 @@ impl SafekeeperPostgresHandler { conn_id: self.conn_id, pgb_reader: &mut pgb_reader, peer_addr, + proto_version, acceptor_handle: &mut acceptor_handle, global_timelines: self.global_timelines.clone(), }; - // Read first message and create timeline if needed. - let res = network_reader.read_first_message().await; + // Read first message and create timeline if needed and allowed. This + // won't be when timelines will be always created by storcon and + // allow_timeline_creation becomes false. + let res = network_reader + .read_first_message(allow_timeline_creation) + .await; let network_res = if let Ok((timeline, next_msg)) = res { let pageserver_feedback_rx: tokio::sync::broadcast::Receiver = @@ -285,7 +276,7 @@ impl SafekeeperPostgresHandler { tokio::select! { // todo: add read|write .context to these errors r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r, - r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, + r = network_write(pgb, reply_rx, pageserver_feedback_rx, proto_version) => r, _ = timeline_cancel.cancelled() => { return Err(CopyStreamHandlerEnd::Cancelled); } @@ -329,40 +320,53 @@ struct NetworkReader<'a, IO> { conn_id: ConnectionId, pgb_reader: &'a mut PostgresBackendReader, peer_addr: SocketAddr, + proto_version: u32, // WalAcceptor is spawned when we learn server info from walproposer and // create timeline; handle is put here. acceptor_handle: &'a mut Option>>, global_timelines: Arc, } -impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { +impl NetworkReader<'_, IO> { async fn read_first_message( &mut self, + allow_timeline_creation: bool, ) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. 
- let next_msg = read_message(self.pgb_reader).await?; + let next_msg = read_message(self.pgb_reader, self.proto_version).await?; let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( - "start handshake with walproposer {} sysid {} timeline {}", - self.peer_addr, greeting.system_id, greeting.tli, + "start handshake with walproposer {} sysid {}", + self.peer_addr, greeting.system_id, ); let server_info = ServerInfo { pg_version: greeting.pg_version, system_id: greeting.system_id, wal_seg_size: greeting.wal_seg_size, }; - let tli = self - .global_timelines - .create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) - .await - .context("create timeline")?; + let tli = if allow_timeline_creation { + self.global_timelines + .create( + self.ttid, + Configuration::empty(), + server_info, + Lsn::INVALID, + Lsn::INVALID, + ) + .await + .context("create timeline")? + } else { + self.global_timelines + .get(self.ttid) + .context("get timeline")? + }; tli.wal_residence_guard().await? } _ => { return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( "unexpected message {next_msg:?} instead of greeting" - ))) + ))); } }; Ok((tli, next_msg)) @@ -385,7 +389,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { )); // Forward all messages to WalAcceptor - read_network_loop(self.pgb_reader, msg_tx, next_msg).await + read_network_loop(self.pgb_reader, msg_tx, next_msg, self.proto_version).await } } @@ -393,9 +397,10 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { /// TODO: Return Ok(None) on graceful termination. async fn read_message( pgb_reader: &mut PostgresBackendReader, + proto_version: u32, ) -> Result { let copy_data = pgb_reader.read_copy_message().await?; - let msg = ProposerAcceptorMessage::parse(copy_data)?; + let msg = ProposerAcceptorMessage::parse(copy_data, proto_version)?; Ok(msg) } @@ -403,6 +408,7 @@ async fn read_network_loop( pgb_reader: &mut PostgresBackendReader, msg_tx: Sender, mut next_msg: ProposerAcceptorMessage, + proto_version: u32, ) -> Result<(), CopyStreamHandlerEnd> { /// Threshold for logging slow WalAcceptor sends. const SLOW_THRESHOLD: Duration = Duration::from_secs(5); @@ -435,7 +441,7 @@ async fn read_network_loop( WAL_RECEIVER_QUEUE_DEPTH_TOTAL.inc(); WAL_RECEIVER_QUEUE_SIZE_TOTAL.add(size as i64); - next_msg = read_message(pgb_reader).await?; + next_msg = read_message(pgb_reader, proto_version).await?; } } @@ -448,6 +454,7 @@ async fn network_write( pgb_writer: &mut PostgresBackend, mut reply_rx: Receiver, mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver, + proto_version: u32, ) -> Result<(), CopyStreamHandlerEnd> { let mut buf = BytesMut::with_capacity(128); @@ -485,7 +492,7 @@ async fn network_write( }; buf.clear(); - msg.serialize(&mut buf)?; + msg.serialize(&mut buf, proto_version)?; pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; } } diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 7b87166aa0..c2760792b8 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -1,39 +1,36 @@ //! This module implements pulling WAL from peer safekeepers if compute can't //! provide it, i.e. safekeeper lags too much. 
+use std::fmt; +use std::pin::pin; use std::time::SystemTime; -use std::{fmt, pin::pin}; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use futures::StreamExt; use postgres_protocol::message::backend::ReplicationMessage; -use tokio::sync::mpsc::{channel, Receiver, Sender}; -use tokio::time::timeout; -use tokio::{ - select, - time::sleep, - time::{self, Duration}, -}; +use safekeeper_api::Term; +use safekeeper_api::membership::INVALID_GENERATION; +use safekeeper_api::models::{PeerInfo, TimelineStatus}; +use tokio::select; +use tokio::sync::mpsc::{Receiver, Sender, channel}; +use tokio::time::{self, Duration, sleep, timeout}; use tokio_postgres::replication::ReplicationStream; use tokio_postgres::types::PgLsn; use tracing::*; -use utils::postgres_client::{ConnectionConfigArgs, PostgresClientProtocol}; -use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}; - -use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; -use crate::safekeeper::{AppendRequest, AppendRequestHeader}; -use crate::timeline::WalResidentTimeline; -use crate::{ - http::routes::TimelineStatus, - receive_wal::MSG_QUEUE_SIZE, - safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory, - TermLsn, VoteRequest, - }, - timeline::PeerInfo, - SafeKeeperConf, +use utils::id::NodeId; +use utils::lsn::Lsn; +use utils::postgres_client::{ + ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config, }; +use crate::SafeKeeperConf; +use crate::receive_wal::{MSG_QUEUE_SIZE, REPLY_QUEUE_SIZE, WalAcceptor}; +use crate::safekeeper::{ + AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, + ProposerElected, TermHistory, TermLsn, VoteRequest, +}; +use crate::timeline::WalResidentTimeline; + /// Entrypoint for per timeline task which always runs, checking whether /// recovery for this safekeeper is needed and starting it if so. #[instrument(name = "recovery", skip_all, fields(ttid = %tli.ttid))] @@ -267,7 +264,10 @@ async fn recover( ); // Now understand our term history. - let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term }); + let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { + generation: INVALID_GENERATION, + term: donor.term, + }); let vote_response = match tli .process_msg(&vote_request) .await @@ -302,10 +302,10 @@ async fn recover( // truncate WAL locally let pe = ProposerAcceptorMessage::Elected(ProposerElected { + generation: INVALID_GENERATION, term: donor.term, start_streaming_at: last_common_point.lsn, term_history: donor_th, - timeline_start_lsn: Lsn::INVALID, }); // Successful ProposerElected handling always returns None. If term changed, // we'll find out that during the streaming. 
Note: it is expected to get @@ -343,12 +343,17 @@ async fn recovery_stream( cfg.replication_mode(tokio_postgres::config::ReplicationMode::Physical); let connect_timeout = Duration::from_millis(10000); - let (client, connection) = match time::timeout(connect_timeout, cfg.connect(postgres::NoTls)) - .await + let (client, connection) = match time::timeout( + connect_timeout, + cfg.connect(tokio_postgres::NoTls), + ) + .await { Ok(client_and_conn) => client_and_conn?, Err(_elapsed) => { - bail!("timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open"); + bail!( + "timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open" + ); } }; trace!("connected to {:?}", donor); @@ -434,13 +439,12 @@ async fn network_io( match msg { ReplicationMessage::XLogData(xlog_data) => { let ar_hdr = AppendRequestHeader { + generation: INVALID_GENERATION, term: donor.term, - term_start_lsn: Lsn::INVALID, // unused begin_lsn: Lsn(xlog_data.wal_start()), end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64, commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it truncate_lsn: Lsn::INVALID, // do not attempt to advance - proposer_uuid: [0; 16], }; let ar = AppendRequest { h: ar_hdr, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 6eb69f0b7c..0edac04b97 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -1,40 +1,36 @@ //! Acceptor part of proposer-acceptor consensus algorithm. -use anyhow::{bail, Context, Result}; -use byteorder::{LittleEndian, ReadBytesExt}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; - -use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; -use serde::{Deserialize, Serialize}; -use std::cmp::max; -use std::cmp::min; +use std::cmp::{max, min}; use std::fmt; use std::io::Read; -use storage_broker::proto::SafekeeperTimelineInfo; +use std::str::FromStr; -use tracing::*; - -use crate::control_file; -use crate::metrics::MISC_OPERATION_SECONDS; -use crate::send_wal::HotStandbyFeedback; - -use crate::state::TimelineState; -use crate::wal_storage; +use anyhow::{Context, Result, bail}; +use byteorder::{LittleEndian, ReadBytesExt}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use postgres_ffi::{MAX_SEND_SIZE, TimeLineID}; use pq_proto::SystemId; -use utils::pageserver_feedback::PageserverFeedback; -use utils::{ - bin_ser::LeSer, - id::{NodeId, TenantId, TimelineId}, - lsn::Lsn, +use safekeeper_api::membership::{ + INVALID_GENERATION, MemberSet, SafekeeperGeneration as Generation, SafekeeperId, }; +use safekeeper_api::models::HotStandbyFeedback; +use safekeeper_api::{Term, membership}; +use serde::{Deserialize, Serialize}; +use storage_broker::proto::SafekeeperTimelineInfo; +use tracing::*; +use utils::bin_ser::LeSer; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; -const SK_PROTOCOL_VERSION: u32 = 2; +use crate::metrics::MISC_OPERATION_SECONDS; +use crate::state::TimelineState; +use crate::{control_file, wal_storage}; + +pub const SK_PROTO_VERSION_2: u32 = 2; +pub const SK_PROTO_VERSION_3: u32 = 3; pub const UNKNOWN_SERVER_VERSION: u32 = 0; -/// Consensus logical timestamp. 
-pub type Term = u64; -pub const INVALID_TERM: Term = 0; - #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] pub struct TermLsn { pub term: Term, @@ -59,8 +55,28 @@ impl TermHistory { TermHistory(Vec::new()) } - // Parse TermHistory as n_entries followed by TermLsn pairs + // Parse TermHistory as n_entries followed by TermLsn pairs in network order. pub fn from_bytes(bytes: &mut Bytes) -> Result { + let n_entries = bytes + .get_u32_f() + .with_context(|| "TermHistory misses len")?; + let mut res = Vec::with_capacity(n_entries as usize); + for i in 0..n_entries { + let term = bytes + .get_u64_f() + .with_context(|| format!("TermHistory pos {} misses term", i))?; + let lsn = bytes + .get_u64_f() + .with_context(|| format!("TermHistory pos {} misses lsn", i))? + .into(); + res.push(TermLsn { term, lsn }) + } + Ok(TermHistory(res)) + } + + // Parse TermHistory as n_entries followed by TermLsn pairs in LE order. + // TODO remove once v2 protocol is fully dropped. + pub fn from_bytes_le(bytes: &mut Bytes) -> Result { if bytes.remaining() < 4 { bail!("TermHistory misses len"); } @@ -127,10 +143,7 @@ impl TermHistory { ); last_common_idx = Some(i); } - let last_common_idx = match last_common_idx { - None => return None, // no common point - Some(lci) => lci, - }; + let last_common_idx = last_common_idx?; // Now find where it ends at both prop and sk and take min. End of // (common) term is the start of the next except it is the last one; // there it is flush_lsn in case of safekeeper or, in case of proposer @@ -198,51 +211,23 @@ impl AcceptorState { } } -/// Information about Postgres. Safekeeper gets it once and then verifies -/// all further connections from computes match. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct ServerInfo { +// protocol messages + +/// Initial Proposer -> Acceptor message +#[derive(Debug, Deserialize)] +pub struct ProposerGreeting { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub mconf: membership::Configuration, /// Postgres server version pub pg_version: u32, pub system_id: SystemId, pub wal_seg_size: u32, } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct PersistedPeerInfo { - /// LSN up to which safekeeper offloaded WAL to s3. - pub backup_lsn: Lsn, - /// Term of the last entry. - pub term: Term, - /// LSN of the last record. - pub flush_lsn: Lsn, - /// Up to which LSN safekeeper regards its WAL as committed. - pub commit_lsn: Lsn, -} - -impl PersistedPeerInfo { - pub fn new() -> Self { - Self { - backup_lsn: Lsn::INVALID, - term: INVALID_TERM, - flush_lsn: Lsn(0), - commit_lsn: Lsn(0), - } - } -} - -// make clippy happy -impl Default for PersistedPeerInfo { - fn default() -> Self { - Self::new() - } -} - -// protocol messages - -/// Initial Proposer -> Acceptor message +/// V2 of the message; exists as a struct because we (de)serialized it as is. #[derive(Debug, Deserialize)] -pub struct ProposerGreeting { +pub struct ProposerGreetingV2 { /// proposer-acceptor protocol version pub protocol_version: u32, /// Postgres server version @@ -259,27 +244,35 @@ pub struct ProposerGreeting { /// (acceptor voted for). 
#[derive(Debug, Serialize)] pub struct AcceptorGreeting { - term: u64, node_id: NodeId, + mconf: membership::Configuration, + term: u64, } /// Vote request sent from proposer to safekeepers -#[derive(Debug, Deserialize)] +#[derive(Debug)] pub struct VoteRequest { + pub generation: Generation, + pub term: Term, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. +#[derive(Debug, Deserialize)] +pub struct VoteRequestV2 { pub term: Term, } /// Vote itself, sent from safekeeper to proposer #[derive(Debug, Serialize)] pub struct VoteResponse { + generation: Generation, // membership conf generation pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date. - vote_given: u64, // fixme u64 due to padding + vote_given: bool, // Safekeeper flush_lsn (end of WAL) + history of term switches allow // proposer to choose the most advanced one. pub flush_lsn: Lsn, truncate_lsn: Lsn, pub term_history: TermHistory, - timeline_start_lsn: Lsn, } /* @@ -288,10 +281,10 @@ pub struct VoteResponse { */ #[derive(Debug)] pub struct ProposerElected { + pub generation: Generation, // membership conf generation pub term: Term, pub start_streaming_at: Lsn, pub term_history: TermHistory, - pub timeline_start_lsn: Lsn, } /// Request with WAL message sent from proposer to safekeeper. Along the way it @@ -303,6 +296,22 @@ pub struct AppendRequest { } #[derive(Debug, Clone, Deserialize)] pub struct AppendRequestHeader { + pub generation: Generation, // membership conf generation + // safekeeper's current term; if it is higher than proposer's, the compute is out of date. + pub term: Term, + /// start position of message in WAL + pub begin_lsn: Lsn, + /// end position of message in WAL + pub end_lsn: Lsn, + /// LSN committed by quorum of safekeepers + pub commit_lsn: Lsn, + /// minimal LSN which may be needed by proposer to perform recovery of some safekeeper + pub truncate_lsn: Lsn, +} + +/// V2 of the message; exists as a struct because we (de)serialized it as is. +#[derive(Debug, Clone, Deserialize)] +pub struct AppendRequestHeaderV2 { // safekeeper's current term; if it is higher than proposer's, the compute is out of date. pub term: Term, // TODO: remove this field from the protocol, it in unused -- LSN of term @@ -323,6 +332,9 @@ pub struct AppendRequestHeader { /// Report safekeeper state to proposer #[derive(Debug, Serialize, Clone)] pub struct AppendResponse { + // Membership conf generation. Not strictly required because on mismatch + // connection is reset, but let's sanity check it. + generation: Generation, // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. pub term: Term, @@ -339,8 +351,9 @@ pub struct AppendResponse { } impl AppendResponse { - fn term_only(term: Term) -> AppendResponse { + fn term_only(generation: Generation, term: Term) -> AppendResponse { AppendResponse { + generation, term, flush_lsn: Lsn(0), commit_lsn: Lsn(0), @@ -361,65 +374,322 @@ pub enum ProposerAcceptorMessage { FlushWAL, } +/// Augment Bytes with fallible get_uN where N is number of bytes methods. +/// All reads are in network (big endian) order. 
+trait BytesF { + fn get_u8_f(&mut self) -> Result; + fn get_u16_f(&mut self) -> Result; + fn get_u32_f(&mut self) -> Result; + fn get_u64_f(&mut self) -> Result; +} + +impl BytesF for Bytes { + fn get_u8_f(&mut self) -> Result { + if self.is_empty() { + bail!("no bytes left, expected 1"); + } + Ok(self.get_u8()) + } + fn get_u16_f(&mut self) -> Result { + if self.remaining() < 2 { + bail!("no bytes left, expected 2"); + } + Ok(self.get_u16()) + } + fn get_u32_f(&mut self) -> Result { + if self.remaining() < 4 { + bail!("only {} bytes left, expected 4", self.remaining()); + } + Ok(self.get_u32()) + } + fn get_u64_f(&mut self) -> Result { + if self.remaining() < 8 { + bail!("only {} bytes left, expected 8", self.remaining()); + } + Ok(self.get_u64()) + } +} + impl ProposerAcceptorMessage { - /// Parse proposer message. - pub fn parse(msg_bytes: Bytes) -> Result { - // xxx using Reader is inefficient but easy to work with bincode - let mut stream = msg_bytes.reader(); - // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is - let tag = stream.read_u64::()? as u8 as char; - match tag { - 'g' => { - let msg = ProposerGreeting::des_from(&mut stream)?; - Ok(ProposerAcceptorMessage::Greeting(msg)) - } - 'v' => { - let msg = VoteRequest::des_from(&mut stream)?; - Ok(ProposerAcceptorMessage::VoteRequest(msg)) - } - 'e' => { - let mut msg_bytes = stream.into_inner(); - if msg_bytes.remaining() < 16 { - bail!("ProposerElected message is not complete"); - } - let term = msg_bytes.get_u64_le(); - let start_streaming_at = msg_bytes.get_u64_le().into(); - let term_history = TermHistory::from_bytes(&mut msg_bytes)?; - if msg_bytes.remaining() < 8 { - bail!("ProposerElected message is not complete"); - } - let timeline_start_lsn = msg_bytes.get_u64_le().into(); - let msg = ProposerElected { - term, - start_streaming_at, - timeline_start_lsn, - term_history, + /// Read cstring from Bytes. + fn get_cstr(buf: &mut Bytes) -> Result { + let pos = buf + .iter() + .position(|x| *x == 0) + .ok_or_else(|| anyhow::anyhow!("missing cstring terminator"))?; + let result = buf.split_to(pos); + buf.advance(1); // drop the null terminator + match std::str::from_utf8(&result) { + Ok(s) => Ok(s.to_string()), + Err(e) => bail!("invalid utf8 in cstring: {}", e), + } + } + + /// Read membership::Configuration from Bytes. + fn get_mconf(buf: &mut Bytes) -> Result { + let generation = Generation::new(buf.get_u32_f().with_context(|| "reading generation")?); + let members_len = buf.get_u32_f().with_context(|| "reading members_len")?; + // Main member set must have at least someone in valid configuration. + // Empty conf is allowed until we fully migrate. + if generation != INVALID_GENERATION && members_len == 0 { + bail!("empty members_len"); + } + let mut members = MemberSet::empty(); + for i in 0..members_len { + let id = buf + .get_u64_f() + .with_context(|| format!("reading member {} node_id", i))?; + let host = Self::get_cstr(buf).with_context(|| format!("reading member {} host", i))?; + let pg_port = buf + .get_u16_f() + .with_context(|| format!("reading member {} port", i))?; + let sk = SafekeeperId { + id: NodeId(id), + host, + pg_port, + }; + members.add(sk)?; + } + let new_members_len = buf.get_u32_f().with_context(|| "reading new_members_len")?; + // Non joint conf. 
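Note (illustrative, not part of the patch): the fallible getters and `get_cstr` above follow a common pattern: check `remaining()` (or locate the NUL terminator) before consuming, so a truncated frame produces an error instead of a panic. A self-contained sketch of the same pattern with illustrative names, not the crate's API:

use anyhow::{bail, Result};
use bytes::{Buf, Bytes};

// Read a big-endian u32, failing instead of panicking on a short buffer.
fn get_u32_checked(buf: &mut Bytes) -> Result<u32> {
    if buf.remaining() < 4 {
        bail!("only {} bytes left, expected 4", buf.remaining());
    }
    Ok(buf.get_u32())
}

// Read a NUL-terminated UTF-8 string and drop the terminator.
fn get_cstr_checked(buf: &mut Bytes) -> Result<String> {
    let pos = match buf.iter().position(|b| *b == 0) {
        Some(p) => p,
        None => bail!("missing cstring terminator"),
    };
    let raw = buf.split_to(pos);
    buf.advance(1); // consume the NUL
    Ok(std::str::from_utf8(&raw)?.to_string())
}

fn main() -> Result<()> {
    let mut wire = Bytes::from(vec![0, 0, 0, 42, b'h', b'o', b's', b't', 0]);
    assert_eq!(get_u32_checked(&mut wire)?, 42);
    assert_eq!(get_cstr_checked(&mut wire)?, "host");
    Ok(())
}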
+ if new_members_len == 0 { + Ok(membership::Configuration { + generation, + members, + new_members: None, + }) + } else { + let mut new_members = MemberSet::empty(); + for i in 0..new_members_len { + let id = buf + .get_u64_f() + .with_context(|| format!("reading new member {} node_id", i))?; + let host = Self::get_cstr(buf) + .with_context(|| format!("reading new member {} host", i))?; + let pg_port = buf + .get_u16_f() + .with_context(|| format!("reading new member {} port", i))?; + let sk = SafekeeperId { + id: NodeId(id), + host, + pg_port, }; - Ok(ProposerAcceptorMessage::Elected(msg)) + new_members.add(sk)?; } - 'a' => { - // read header followed by wal data - let hdr = AppendRequestHeader::des_from(&mut stream)?; - let rec_size = hdr - .end_lsn - .checked_sub(hdr.begin_lsn) - .context("begin_lsn > end_lsn in AppendRequest")? - .0 as usize; - if rec_size > MAX_SEND_SIZE { - bail!( - "AppendRequest is longer than MAX_SEND_SIZE ({})", - MAX_SEND_SIZE - ); + Ok(membership::Configuration { + generation, + members, + new_members: Some(new_members), + }) + } + } + + /// Parse proposer message. + pub fn parse(mut msg_bytes: Bytes, proto_version: u32) -> Result { + if proto_version == SK_PROTO_VERSION_3 { + if msg_bytes.is_empty() { + bail!("ProposerAcceptorMessage is not complete: missing tag"); + } + let tag = msg_bytes.get_u8_f().with_context(|| { + "ProposerAcceptorMessage is not complete: missing tag".to_string() + })? as char; + match tag { + 'g' => { + let tenant_id_str = + Self::get_cstr(&mut msg_bytes).with_context(|| "reading tenant_id")?; + let tenant_id = TenantId::from_str(&tenant_id_str)?; + let timeline_id_str = + Self::get_cstr(&mut msg_bytes).with_context(|| "reading timeline_id")?; + let timeline_id = TimelineId::from_str(&timeline_id_str)?; + let mconf = Self::get_mconf(&mut msg_bytes)?; + let pg_version = msg_bytes + .get_u32_f() + .with_context(|| "reading pg_version")?; + let system_id = msg_bytes.get_u64_f().with_context(|| "reading system_id")?; + let wal_seg_size = msg_bytes + .get_u32_f() + .with_context(|| "reading wal_seg_size")?; + let g = ProposerGreeting { + tenant_id, + timeline_id, + mconf, + pg_version, + system_id, + wal_seg_size, + }; + Ok(ProposerAcceptorMessage::Greeting(g)) } + 'v' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let v = VoteRequest { generation, term }; + Ok(ProposerAcceptorMessage::VoteRequest(v)) + } + 'e' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let start_streaming_at: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading start_streaming_at")? + .into(); + let term_history = TermHistory::from_bytes(&mut msg_bytes)?; + let msg = ProposerElected { + generation, + term, + start_streaming_at, + term_history, + }; + Ok(ProposerAcceptorMessage::Elected(msg)) + } + 'a' => { + let generation = Generation::new( + msg_bytes + .get_u32_f() + .with_context(|| "reading generation")?, + ); + let term = msg_bytes.get_u64_f().with_context(|| "reading term")?; + let begin_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading begin_lsn")? + .into(); + let end_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading end_lsn")? + .into(); + let commit_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading commit_lsn")? 
+ .into(); + let truncate_lsn: Lsn = msg_bytes + .get_u64_f() + .with_context(|| "reading truncate_lsn")? + .into(); + let hdr = AppendRequestHeader { + generation, + term, + begin_lsn, + end_lsn, + commit_lsn, + truncate_lsn, + }; + let rec_size = hdr + .end_lsn + .checked_sub(hdr.begin_lsn) + .context("begin_lsn > end_lsn in AppendRequest")? + .0 as usize; + if rec_size > MAX_SEND_SIZE { + bail!( + "AppendRequest is longer than MAX_SEND_SIZE ({})", + MAX_SEND_SIZE + ); + } + if msg_bytes.remaining() < rec_size { + bail!( + "reading WAL: only {} bytes left, wanted {}", + msg_bytes.remaining(), + rec_size + ); + } + let wal_data = msg_bytes.copy_to_bytes(rec_size); + let msg = AppendRequest { h: hdr, wal_data }; - let mut wal_data_vec: Vec = vec![0; rec_size]; - stream.read_exact(&mut wal_data_vec)?; - let wal_data = Bytes::from(wal_data_vec); - let msg = AppendRequest { h: hdr, wal_data }; - - Ok(ProposerAcceptorMessage::AppendRequest(msg)) + Ok(ProposerAcceptorMessage::AppendRequest(msg)) + } + _ => bail!("unknown proposer-acceptor message tag: {}", tag), } - _ => bail!("unknown proposer-acceptor message tag: {}", tag), + } else if proto_version == SK_PROTO_VERSION_2 { + // xxx using Reader is inefficient but easy to work with bincode + let mut stream = msg_bytes.reader(); + // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is + let tag = stream.read_u64::()? as u8 as char; + match tag { + 'g' => { + let msgv2 = ProposerGreetingV2::des_from(&mut stream)?; + let g = ProposerGreeting { + tenant_id: msgv2.tenant_id, + timeline_id: msgv2.timeline_id, + mconf: membership::Configuration { + generation: INVALID_GENERATION, + members: MemberSet::empty(), + new_members: None, + }, + pg_version: msgv2.pg_version, + system_id: msgv2.system_id, + wal_seg_size: msgv2.wal_seg_size, + }; + Ok(ProposerAcceptorMessage::Greeting(g)) + } + 'v' => { + let msg = VoteRequestV2::des_from(&mut stream)?; + let v = VoteRequest { + generation: INVALID_GENERATION, + term: msg.term, + }; + Ok(ProposerAcceptorMessage::VoteRequest(v)) + } + 'e' => { + let mut msg_bytes = stream.into_inner(); + if msg_bytes.remaining() < 16 { + bail!("ProposerElected message is not complete"); + } + let term = msg_bytes.get_u64_le(); + let start_streaming_at = msg_bytes.get_u64_le().into(); + let term_history = TermHistory::from_bytes_le(&mut msg_bytes)?; + if msg_bytes.remaining() < 8 { + bail!("ProposerElected message is not complete"); + } + let _timeline_start_lsn = msg_bytes.get_u64_le(); + let msg = ProposerElected { + generation: INVALID_GENERATION, + term, + start_streaming_at, + term_history, + }; + Ok(ProposerAcceptorMessage::Elected(msg)) + } + 'a' => { + // read header followed by wal data + let hdrv2 = AppendRequestHeaderV2::des_from(&mut stream)?; + let hdr = AppendRequestHeader { + generation: INVALID_GENERATION, + term: hdrv2.term, + begin_lsn: hdrv2.begin_lsn, + end_lsn: hdrv2.end_lsn, + commit_lsn: hdrv2.commit_lsn, + truncate_lsn: hdrv2.truncate_lsn, + }; + let rec_size = hdr + .end_lsn + .checked_sub(hdr.begin_lsn) + .context("begin_lsn > end_lsn in AppendRequest")? 
+ .0 as usize; + if rec_size > MAX_SEND_SIZE { + bail!( + "AppendRequest is longer than MAX_SEND_SIZE ({})", + MAX_SEND_SIZE + ); + } + + let mut wal_data_vec: Vec = vec![0; rec_size]; + stream.read_exact(&mut wal_data_vec)?; + let wal_data = Bytes::from(wal_data_vec); + + let msg = AppendRequest { h: hdr, wal_data }; + + Ok(ProposerAcceptorMessage::AppendRequest(msg)) + } + _ => bail!("unknown proposer-acceptor message tag: {}", tag), + } + } else { + bail!("unsupported protocol version {}", proto_version); } } @@ -433,36 +703,21 @@ impl ProposerAcceptorMessage { // We explicitly list all fields, to draw attention here when new fields are added. let mut size = BASE_SIZE; size += match self { - Self::Greeting(ProposerGreeting { - protocol_version: _, - pg_version: _, - proposer_id: _, - system_id: _, - timeline_id: _, - tenant_id: _, - tli: _, - wal_seg_size: _, - }) => 0, + Self::Greeting(_) => 0, - Self::VoteRequest(VoteRequest { term: _ }) => 0, + Self::VoteRequest(_) => 0, - Self::Elected(ProposerElected { - term: _, - start_streaming_at: _, - term_history: _, - timeline_start_lsn: _, - }) => 0, + Self::Elected(_) => 0, Self::AppendRequest(AppendRequest { h: AppendRequestHeader { + generation: _, term: _, - term_start_lsn: _, begin_lsn: _, end_lsn: _, commit_lsn: _, truncate_lsn: _, - proposer_uuid: _, }, wal_data, }) => wal_data.len(), @@ -470,13 +725,12 @@ impl ProposerAcceptorMessage { Self::NoFlushAppendRequest(AppendRequest { h: AppendRequestHeader { + generation: _, term: _, - term_start_lsn: _, begin_lsn: _, end_lsn: _, commit_lsn: _, truncate_lsn: _, - proposer_uuid: _, }, wal_data, }) => wal_data.len(), @@ -497,45 +751,118 @@ pub enum AcceptorProposerMessage { } impl AcceptorProposerMessage { - /// Serialize acceptor -> proposer message. - pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> { - match self { - AcceptorProposerMessage::Greeting(msg) => { - buf.put_u64_le('g' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.node_id.0); - } - AcceptorProposerMessage::VoteResponse(msg) => { - buf.put_u64_le('v' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.vote_given); - buf.put_u64_le(msg.flush_lsn.into()); - buf.put_u64_le(msg.truncate_lsn.into()); - buf.put_u32_le(msg.term_history.0.len() as u32); - for e in &msg.term_history.0 { - buf.put_u64_le(e.term); - buf.put_u64_le(e.lsn.into()); - } - buf.put_u64_le(msg.timeline_start_lsn.into()); - } - AcceptorProposerMessage::AppendResponse(msg) => { - buf.put_u64_le('a' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.flush_lsn.into()); - buf.put_u64_le(msg.commit_lsn.into()); - buf.put_i64_le(msg.hs_feedback.ts); - buf.put_u64_le(msg.hs_feedback.xmin); - buf.put_u64_le(msg.hs_feedback.catalog_xmin); + fn put_cstr(buf: &mut BytesMut, s: &str) { + buf.put_slice(s.as_bytes()); + buf.put_u8(0); // null terminator + } - // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback - // if it is not present. - if let Some(ref msg) = msg.pageserver_feedback { - msg.serialize(buf); - } - } + /// Serialize membership::Configuration into buf. 
+ fn serialize_mconf(buf: &mut BytesMut, mconf: &membership::Configuration) { + buf.put_u32(mconf.generation.into_inner()); + buf.put_u32(mconf.members.m.len() as u32); + for sk in &mconf.members.m { + buf.put_u64(sk.id.0); + Self::put_cstr(buf, &sk.host); + buf.put_u16(sk.pg_port); } + if let Some(ref new_members) = mconf.new_members { + buf.put_u32(new_members.m.len() as u32); + for sk in &new_members.m { + buf.put_u64(sk.id.0); + Self::put_cstr(buf, &sk.host); + buf.put_u16(sk.pg_port); + } + } else { + buf.put_u32(0); + } + } - Ok(()) + /// Serialize acceptor -> proposer message. + pub fn serialize(&self, buf: &mut BytesMut, proto_version: u32) -> Result<()> { + if proto_version == SK_PROTO_VERSION_3 { + match self { + AcceptorProposerMessage::Greeting(msg) => { + buf.put_u8(b'g'); + buf.put_u64(msg.node_id.0); + Self::serialize_mconf(buf, &msg.mconf); + buf.put_u64(msg.term) + } + AcceptorProposerMessage::VoteResponse(msg) => { + buf.put_u8(b'v'); + buf.put_u32(msg.generation.into_inner()); + buf.put_u64(msg.term); + buf.put_u8(msg.vote_given as u8); + buf.put_u64(msg.flush_lsn.into()); + buf.put_u64(msg.truncate_lsn.into()); + buf.put_u32(msg.term_history.0.len() as u32); + for e in &msg.term_history.0 { + buf.put_u64(e.term); + buf.put_u64(e.lsn.into()); + } + } + AcceptorProposerMessage::AppendResponse(msg) => { + buf.put_u8(b'a'); + buf.put_u32(msg.generation.into_inner()); + buf.put_u64(msg.term); + buf.put_u64(msg.flush_lsn.into()); + buf.put_u64(msg.commit_lsn.into()); + buf.put_i64(msg.hs_feedback.ts); + buf.put_u64(msg.hs_feedback.xmin); + buf.put_u64(msg.hs_feedback.catalog_xmin); + + // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback + // if it is not present. + if let Some(ref msg) = msg.pageserver_feedback { + msg.serialize(buf); + } + } + } + Ok(()) + // TODO remove 3 after converting all msgs + } else if proto_version == SK_PROTO_VERSION_2 { + match self { + AcceptorProposerMessage::Greeting(msg) => { + buf.put_u64_le('g' as u64); + // v2 didn't have mconf and fields were reordered + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.node_id.0); + } + AcceptorProposerMessage::VoteResponse(msg) => { + // v2 didn't have generation, had u64 vote_given and timeline_start_lsn + buf.put_u64_le('v' as u64); + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.vote_given as u64); + buf.put_u64_le(msg.flush_lsn.into()); + buf.put_u64_le(msg.truncate_lsn.into()); + buf.put_u32_le(msg.term_history.0.len() as u32); + for e in &msg.term_history.0 { + buf.put_u64_le(e.term); + buf.put_u64_le(e.lsn.into()); + } + // removed timeline_start_lsn + buf.put_u64_le(0); + } + AcceptorProposerMessage::AppendResponse(msg) => { + // v2 didn't have generation + buf.put_u64_le('a' as u64); + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.flush_lsn.into()); + buf.put_u64_le(msg.commit_lsn.into()); + buf.put_i64_le(msg.hs_feedback.ts); + buf.put_u64_le(msg.hs_feedback.xmin); + buf.put_u64_le(msg.hs_feedback.catalog_xmin); + + // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback + // if it is not present. 
+ if let Some(ref msg) = msg.pageserver_feedback { + msg.serialize(buf); + } + } + } + Ok(()) + } else { + bail!("unsupported protocol version {}", proto_version); + } } } @@ -632,14 +959,6 @@ where &mut self, msg: &ProposerGreeting, ) -> Result> { - // Check protocol compatibility - if msg.protocol_version != SK_PROTOCOL_VERSION { - bail!( - "incompatible protocol version {}, expected {}", - msg.protocol_version, - SK_PROTOCOL_VERSION - ); - } /* Postgres major version mismatch is treated as fatal error * because safekeepers parse WAL headers and the format * may change between versions. @@ -694,15 +1013,16 @@ where self.state.finish_change(&state).await?; } - info!( - "processed greeting from walproposer {}, sending term {:?}", - msg.proposer_id.map(|b| format!("{:X}", b)).join(""), - self.state.acceptor_state.term - ); - Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting { - term: self.state.acceptor_state.term, + let apg = AcceptorGreeting { node_id: self.node_id, - }))) + mconf: self.state.mconf.clone(), + term: self.state.acceptor_state.term, + }; + info!( + "processed greeting {:?} from walproposer, sending {:?}", + msg, apg + ); + Ok(Some(AcceptorProposerMessage::Greeting(apg))) } /// Give vote for the given term, if we haven't done that previously. @@ -723,12 +1043,12 @@ where self.wal_store.flush_wal().await?; // initialize with refusal let mut resp = VoteResponse { + generation: self.state.mconf.generation, term: self.state.acceptor_state.term, - vote_given: false as u64, + vote_given: false, flush_lsn: self.flush_lsn(), truncate_lsn: self.state.inmem.peer_horizon_lsn, term_history: self.get_term_history(), - timeline_start_lsn: self.state.timeline_start_lsn, }; if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); @@ -737,15 +1057,16 @@ where self.state.finish_change(&state).await?; resp.term = self.state.acceptor_state.term; - resp.vote_given = true as u64; + resp.vote_given = true; } - info!("processed VoteRequest for term {}: {:?}", msg.term, &resp); + info!("processed {:?}: sending {:?}", msg, &resp); Ok(Some(AcceptorProposerMessage::VoteResponse(resp))) } /// Form AppendResponse from current state. fn append_response(&self) -> AppendResponse { let ar = AppendResponse { + generation: self.state.mconf.generation, term: self.state.acceptor_state.term, flush_lsn: self.flush_lsn(), commit_lsn: self.state.commit_lsn, @@ -808,9 +1129,14 @@ where // and walproposer recalculates the streaming point. OTOH repeating // error indicates a serious bug. 
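Note (illustrative, not part of the patch): the two serialize branches above differ mainly in framing: v3 writes a single tag byte followed by network-order fields, while v2 keeps the legacy little-endian u64 tag and little-endian fields. A hypothetical, much-reduced message illustrates the difference; this is not the actual VoteResponse layout:

use bytes::{BufMut, BytesMut};

// A toy message with just a term and a flush LSN.
struct ToyVote {
    term: u64,
    flush_lsn: u64,
}

fn serialize_v3(msg: &ToyVote, buf: &mut BytesMut) {
    buf.put_u8(b'v');           // one-byte tag
    buf.put_u64(msg.term);      // network (big-endian) order
    buf.put_u64(msg.flush_lsn);
}

fn serialize_v2(msg: &ToyVote, buf: &mut BytesMut) {
    buf.put_u64_le('v' as u64); // legacy u64 tag, little-endian
    buf.put_u64_le(msg.term);
    buf.put_u64_le(msg.flush_lsn);
}

fn main() {
    let msg = ToyVote { term: 7, flush_lsn: 0x1000 };
    let (mut v3, mut v2) = (BytesMut::new(), BytesMut::new());
    serialize_v3(&msg, &mut v3);
    serialize_v2(&msg, &mut v2);
    assert_eq!(v3.len(), 1 + 8 + 8);
    assert_eq!(v2.len(), 8 + 8 + 8);
}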
if last_common_point.lsn != msg.start_streaming_at { - bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", - last_common_point, msg.start_streaming_at, - self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, + bail!( + "refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", + last_common_point, + msg.start_streaming_at, + self.state.acceptor_state.term, + sk_th, + self.flush_lsn(), + msg.term_history, ); } @@ -818,8 +1144,12 @@ where assert!( msg.start_streaming_at >= self.state.inmem.commit_lsn, "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", - msg.start_streaming_at, self.state.inmem.commit_lsn, - self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, + msg.start_streaming_at, + self.state.inmem.commit_lsn, + self.state.acceptor_state.term, + sk_th, + self.flush_lsn(), + msg.term_history, ); // Before first WAL write initialize its segment. It makes first segment @@ -844,18 +1174,22 @@ where // Here we learn initial LSN for the first time, set fields // interested in that. - if state.timeline_start_lsn == Lsn(0) { - // Remember point where WAL begins globally. - state.timeline_start_lsn = msg.timeline_start_lsn; - info!( - "setting timeline_start_lsn to {:?}", - state.timeline_start_lsn - ); + if let Some(start_lsn) = msg.term_history.0.first() { + if state.timeline_start_lsn == Lsn(0) { + // Remember point where WAL begins globally. In the future it + // will be intialized immediately on timeline creation. + state.timeline_start_lsn = start_lsn.lsn; + info!( + "setting timeline_start_lsn to {:?}", + state.timeline_start_lsn + ); + } } + if state.peer_horizon_lsn == Lsn(0) { // Update peer_horizon_lsn as soon as we know where timeline starts. // It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn. - state.peer_horizon_lsn = msg.timeline_start_lsn; + state.peer_horizon_lsn = state.timeline_start_lsn; } if state.local_start_lsn == Lsn(0) { state.local_start_lsn = msg.start_streaming_at; @@ -935,7 +1269,10 @@ where // If our term is higher, immediately refuse the message. if self.state.acceptor_state.term > msg.h.term { - let resp = AppendResponse::term_only(self.state.acceptor_state.term); + let resp = AppendResponse::term_only( + self.state.mconf.generation, + self.state.acceptor_state.term, + ); return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); } @@ -963,10 +1300,8 @@ where ); } - // Now we know that we are in the same term as the proposer, - // processing the message. - - self.state.inmem.proposer_uuid = msg.h.proposer_uuid; + // Now we know that we are in the same term as the proposer, process the + // message. // do the job if !msg.wal_data.is_empty() { @@ -1025,7 +1360,7 @@ where /// Update commit_lsn from peer safekeeper data. pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { - if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) { + if Lsn(sk_info.commit_lsn) != Lsn::INVALID { // Note: the check is too restrictive, generally we can update local // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. 
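Note (illustrative, not part of the patch): the v3 `ProposerElected` no longer carries `timeline_start_lsn`; as the hunk above shows, the safekeeper now derives it from the first entry of the proposer's term history, and only if the persisted value is still unset. A minimal sketch of that derivation with simplified stand-in types:

// Simplified stand-ins for the crate's Lsn/TermLsn types.
#[derive(Clone, Copy, PartialEq, Debug)]
struct Lsn(u64);

struct TermLsn {
    term: u64,
    lsn: Lsn,
}

/// If the timeline start is still unknown (Lsn(0)), take it from the first
/// term-history entry; otherwise keep the persisted value.
fn derive_timeline_start(current: Lsn, term_history: &[TermLsn]) -> Lsn {
    match (current, term_history.first()) {
        (Lsn(0), Some(first)) => first.lsn,
        _ => current,
    }
}

fn main() {
    let history = vec![TermLsn { term: 1, lsn: Lsn(0x149F_D018) }];
    assert_eq!(derive_timeline_start(Lsn(0), &history), Lsn(0x149F_D018));
    // Already-initialized values are left untouched.
    assert_eq!(derive_timeline_start(Lsn(42), &history), Lsn(42));
}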
@@ -1039,12 +1374,19 @@ where #[cfg(test)] mod tests { + use std::ops::Deref; + use std::str::FromStr; + use std::time::{Instant, UNIX_EPOCH}; + use futures::future::BoxFuture; - use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; + use postgres_ffi::{WAL_SEGMENT_SIZE, XLogSegNo}; + use safekeeper_api::ServerInfo; + use safekeeper_api::membership::{ + Configuration, MemberSet, SafekeeperGeneration, SafekeeperId, + }; use super::*; - use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState}; - use std::{ops::Deref, str::FromStr, time::Instant}; + use crate::state::{EvictionState, TimelinePersistentState}; // fake storage for tests struct InMemoryState { @@ -1127,10 +1469,13 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok - let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); + let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { + generation: Generation::new(0), + term: 1, + }); let mut vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { - Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0), + Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given), r => panic!("unexpected response: {:?}", r), } @@ -1145,7 +1490,7 @@ mod tests { // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { - Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0), + Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(!resp.vote_given), r => panic!("unexpected response: {:?}", r), } } @@ -1160,13 +1505,12 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { + generation: Generation::new(0), term: 2, - term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }; let mut append_request = AppendRequest { h: ar_hdr.clone(), @@ -1174,6 +1518,7 @@ mod tests { }; let pem = ProposerElected { + generation: Generation::new(0), term: 2, start_streaming_at: Lsn(1), term_history: TermHistory(vec![ @@ -1186,7 +1531,6 @@ mod tests { lsn: Lsn(3), }, ]), - timeline_start_lsn: Lsn(1), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await @@ -1221,26 +1565,25 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let pem = ProposerElected { + generation: Generation::new(0), term: 1, start_streaming_at: Lsn(1), term_history: TermHistory(vec![TermLsn { term: 1, lsn: Lsn(1), }]), - timeline_start_lsn: Lsn(1), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await .unwrap(); let ar_hdr = AppendRequestHeader { + generation: Generation::new(0), term: 1, - term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], }; let append_request = AppendRequest { h: ar_hdr.clone(), @@ -1327,12 +1670,21 @@ mod tests { #[test] fn test_sk_state_bincode_serde_roundtrip() { - use utils::Hex; let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap(); let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap(); let state = TimelinePersistentState { tenant_id, timeline_id, + mconf: Configuration { + generation: SafekeeperGeneration::new(42), + members: MemberSet::new(vec![SafekeeperId { + id: 
NodeId(1), + host: "hehe.org".to_owned(), + pg_port: 5432, + }]) + .expect("duplicate member"), + new_members: None, + }, acceptor_state: AcceptorState { term: 42, term_history: TermHistory(vec![TermLsn { @@ -1356,70 +1708,13 @@ mod tests { backup_lsn: Lsn(1234567300), peer_horizon_lsn: Lsn(9999999), remote_consistent_lsn: Lsn(1234560000), - peers: PersistedPeers(vec![( - NodeId(1), - PersistedPeerInfo { - backup_lsn: Lsn(1234567000), - term: 42, - flush_lsn: Lsn(1234567800 - 8), - commit_lsn: Lsn(1234567600), - }, - )]), partial_backup: crate::wal_backup_partial::State::default(), eviction_state: EvictionState::Present, + creation_ts: UNIX_EPOCH, }; let ser = state.ser().unwrap(); - #[rustfmt::skip] - let expected = [ - // tenant_id as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x63, 0x66, 0x30, 0x34, 0x38, 0x30, 0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37, 0x32, 0x33, 0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36, - // timeline_id as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x31, 0x31, 0x32, 0x64, 0x65, 0x64, 0x36, 0x36, 0x34, 0x32, 0x32, 0x61, 0x61, 0x35, 0x65, 0x39, 0x35, 0x33, 0x65, 0x35, 0x34, 0x34, 0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34, - // term - 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // length prefix - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // unsure why this order is swapped - 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // pg_version - 0x0e, 0x00, 0x00, 0x00, - // systemid - 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, - // wal_seg_size - 0x78, 0x56, 0x34, 0x12, - // pguuid as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x63, 0x34, 0x37, 0x61, 0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39, 0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31, - - // timeline_start_lsn - 0x00, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x7f, 0x96, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0xe4, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00, - // length prefix for persistentpeers - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // nodeid - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // backuplsn - 0x58, 0xff, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - // partial_backup - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // eviction_state - 0x00, 0x00, 0x00, 0x00, - ]; - - assert_eq!(Hex(&ser), Hex(&expected)); - let deser = TimelinePersistentState::des(&ser).unwrap(); assert_eq!(deser, state); diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 2589030422..be0c849a5f 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -1,113 +1,642 @@ +use std::collections::HashMap; +use std::fmt::Display; +use std::sync::Arc; use std::time::Duration; -use anyhow::Context; +use anyhow::{Context, anyhow}; use futures::StreamExt; +use futures::future::Either; use pageserver_api::shard::ShardIdentity; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend}; -use postgres_ffi::MAX_SEND_SIZE; -use 
postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder}; +use postgres_ffi::get_current_timestamp; +use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive}; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::sync::mpsc::error::SendError; +use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; +use tracing::{Instrument, error, info, info_span}; +use utils::critical; use utils::lsn::Lsn; -use utils::postgres_client::Compression; -use utils::postgres_client::InterpretedFormat; +use utils::postgres_client::{Compression, InterpretedFormat}; use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords}; use wal_decoder::wire_format::ToWireFormat; -use crate::send_wal::EndWatchView; -use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder}; +use crate::metrics::WAL_READERS; +use crate::send_wal::{EndWatchView, WalSenderGuard}; +use crate::timeline::WalResidentTimeline; +use crate::wal_reader_stream::{StreamingWalReader, WalBytes}; -/// Shard-aware interpreted record sender. -/// This is used for sending WAL to the pageserver. Said WAL -/// is pre-interpreted and filtered for the shard. -pub(crate) struct InterpretedWalSender<'a, IO> { - pub(crate) format: InterpretedFormat, - pub(crate) compression: Option, - pub(crate) pgb: &'a mut PostgresBackend, - pub(crate) wal_stream_builder: WalReaderStreamBuilder, - pub(crate) end_watch_view: EndWatchView, - pub(crate) shard: ShardIdentity, - pub(crate) pg_version: u32, - pub(crate) appname: Option, +/// Identifier used to differentiate between senders of the same +/// shard. +/// +/// In the steady state there's only one, but two pageservers may +/// temporarily have the same shard attached and attempt to ingest +/// WAL for it. See also [`ShardSenderId`]. +#[derive(Hash, Eq, PartialEq, Copy, Clone)] +struct SenderId(u8); + +impl SenderId { + fn first() -> Self { + SenderId(0) + } + + fn next(&self) -> Self { + SenderId(self.0.checked_add(1).expect("few senders")) + } } -struct Batch { +#[derive(Hash, Eq, PartialEq)] +struct ShardSenderId { + shard: ShardIdentity, + sender_id: SenderId, +} + +impl Display for ShardSenderId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}{}", self.sender_id.0, self.shard.shard_slug()) + } +} + +impl ShardSenderId { + fn new(shard: ShardIdentity, sender_id: SenderId) -> Self { + ShardSenderId { shard, sender_id } + } + + fn shard(&self) -> ShardIdentity { + self.shard + } +} + +/// Shard-aware fan-out interpreted record reader. +/// Reads WAL from disk, decodes it, intepretets it, and sends +/// it to any [`InterpretedWalSender`] connected to it. +/// Each [`InterpretedWalSender`] corresponds to one shard +/// and gets interpreted records concerning that shard only. +pub(crate) struct InterpretedWalReader { + wal_stream: StreamingWalReader, + shard_senders: HashMap>, + shard_notification_rx: Option>, + state: Arc>, + pg_version: u32, +} + +/// A handle for [`InterpretedWalReader`] which allows for interacting with it +/// when it runs as a separate tokio task. +#[derive(Debug)] +pub(crate) struct InterpretedWalReaderHandle { + join_handle: JoinHandle>, + state: Arc>, + shard_notification_tx: tokio::sync::mpsc::UnboundedSender, +} + +struct ShardSenderState { + sender_id: SenderId, + tx: tokio::sync::mpsc::Sender, + next_record_lsn: Lsn, +} + +/// State of [`InterpretedWalReader`] visible outside of the task running it. 
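Note (illustrative, not part of the patch): the reader/sender split introduced here is essentially a fan-out: one task decodes WAL once and pushes per-shard batches into per-sender channels, pruning senders whose receiving side has gone away. A deliberately simplified, self-contained sketch of that shape (toy types and tokio mpsc channels, no relation to the real `InterpretedWalReader` API):

use std::collections::HashMap;
use tokio::sync::mpsc;

type ShardId = u8;

// One decoded "record" routed to the interested shard sender.
#[derive(Clone, Debug)]
struct ToyRecord {
    lsn: u64,
    payload: String,
}

/// Fan a stream of records out to per-shard channels, dropping senders
/// whose receiver has been closed.
async fn fan_out(
    mut input: mpsc::Receiver<(ShardId, ToyRecord)>,
    mut senders: HashMap<ShardId, mpsc::Sender<ToyRecord>>,
) {
    while let Some((shard, record)) = input.recv().await {
        if let Some(tx) = senders.get(&shard) {
            if tx.send(record).await.is_err() {
                // Receiver dropped: forget this sender, keep serving others.
                senders.remove(&shard);
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let (in_tx, in_rx) = mpsc::channel(8);
    let (shard0_tx, mut shard0_rx) = mpsc::channel(8);
    let senders = HashMap::from([(0u8, shard0_tx)]);

    tokio::spawn(fan_out(in_rx, senders));

    in_tx
        .send((0, ToyRecord { lsn: 1, payload: "hello".into() }))
        .await
        .unwrap();
    let got = shard0_rx.recv().await.unwrap();
    assert_eq!(got.lsn, 1);
    assert_eq!(got.payload, "hello");
}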
+#[derive(Debug)] +pub(crate) enum InterpretedWalReaderState { + Running { + current_position: Lsn, + /// Tracks the start of the PG WAL LSN from which the current batch of + /// interpreted records originated. + current_batch_wal_start: Option, + }, + Done, +} + +pub(crate) struct Batch { wal_end_lsn: Lsn, available_wal_end_lsn: Lsn, records: InterpretedWalRecords, } -impl InterpretedWalSender<'_, IO> { - /// Send interpreted WAL to a receiver. - /// Stops when an error occurs or the receiver is caught up and there's no active compute. - /// - /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? - /// convenience. - pub(crate) async fn run(self) -> Result<(), CopyStreamHandlerEnd> { - let mut wal_position = self.wal_stream_builder.start_pos(); - let mut wal_decoder = - WalStreamDecoder::new(self.wal_stream_builder.start_pos(), self.pg_version); +#[derive(thiserror::Error, Debug)] +pub enum InterpretedWalReaderError { + /// Handler initiates the end of streaming. + #[error("decode error: {0}")] + Decode(#[from] WalDecodeError), + #[error("read or interpret error: {0}")] + ReadOrInterpret(#[from] anyhow::Error), + #[error("wal stream closed")] + WalStreamClosed, +} - let stream = self.wal_stream_builder.build(MAX_SEND_SIZE).await?; - let mut stream = std::pin::pin!(stream); +enum CurrentPositionUpdate { + Reset { from: Lsn, to: Lsn }, + NotReset(Lsn), +} - let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1)); - keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); - keepalive_ticker.reset(); +impl CurrentPositionUpdate { + fn current_position(&self) -> Lsn { + match self { + CurrentPositionUpdate::Reset { from: _, to } => *to, + CurrentPositionUpdate::NotReset(lsn) => *lsn, + } + } - let (tx, mut rx) = tokio::sync::mpsc::channel::(2); + fn previous_position(&self) -> Lsn { + match self { + CurrentPositionUpdate::Reset { from, to: _ } => *from, + CurrentPositionUpdate::NotReset(lsn) => *lsn, + } + } +} + +impl InterpretedWalReaderState { + fn current_position(&self) -> Option { + match self { + InterpretedWalReaderState::Running { + current_position, .. + } => Some(*current_position), + InterpretedWalReaderState::Done => None, + } + } + + #[cfg(test)] + fn current_batch_wal_start(&self) -> Option { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. + } => *current_batch_wal_start, + InterpretedWalReaderState::Done => None, + } + } + + // Reset the current position of the WAL reader if the requested starting position + // of the new shard is smaller than the current value. + fn maybe_reset(&mut self, new_shard_start_pos: Lsn) -> CurrentPositionUpdate { + match self { + InterpretedWalReaderState::Running { + current_position, + current_batch_wal_start, + } => { + if new_shard_start_pos < *current_position { + let from = *current_position; + *current_position = new_shard_start_pos; + *current_batch_wal_start = None; + CurrentPositionUpdate::Reset { + from, + to: *current_position, + } + } else { + CurrentPositionUpdate::NotReset(*current_position) + } + } + InterpretedWalReaderState::Done => { + panic!("maybe_reset called on finished reader") + } + } + } + + fn update_current_batch_wal_start(&mut self, lsn: Lsn) { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. 
+ } => { + if current_batch_wal_start.is_none() { + *current_batch_wal_start = Some(lsn); + } + } + InterpretedWalReaderState::Done => { + panic!("update_current_batch_wal_start called on finished reader") + } + } + } + + fn take_current_batch_wal_start(&mut self) -> Lsn { + match self { + InterpretedWalReaderState::Running { + current_batch_wal_start, + .. + } => current_batch_wal_start.take().unwrap(), + InterpretedWalReaderState::Done => { + panic!("take_current_batch_wal_start called on finished reader") + } + } + } + + fn update_current_position(&mut self, lsn: Lsn) { + match self { + InterpretedWalReaderState::Running { + current_position, .. + } => { + *current_position = lsn; + } + InterpretedWalReaderState::Done => { + panic!("update_current_position called on finished reader") + } + } + } +} + +pub(crate) struct AttachShardNotification { + shard_id: ShardIdentity, + sender: tokio::sync::mpsc::Sender, + start_pos: Lsn, +} + +impl InterpretedWalReader { + /// Spawn the reader in a separate tokio task and return a handle + pub(crate) fn spawn( + wal_stream: StreamingWalReader, + start_pos: Lsn, + tx: tokio::sync::mpsc::Sender, + shard: ShardIdentity, + pg_version: u32, + appname: &Option, + ) -> InterpretedWalReaderHandle { + let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { + current_position: start_pos, + current_batch_wal_start: None, + })); + + let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); + + let reader = InterpretedWalReader { + wal_stream, + shard_senders: HashMap::from([( + shard, + smallvec::smallvec![ShardSenderState { + sender_id: SenderId::first(), + tx, + next_record_lsn: start_pos, + }], + )]), + shard_notification_rx: Some(shard_notification_rx), + state: state.clone(), + pg_version, + }; + + let metric = WAL_READERS + .get_metric_with_label_values(&["task", appname.as_deref().unwrap_or("safekeeper")]) + .unwrap(); + + let join_handle = tokio::task::spawn( + async move { + metric.inc(); + scopeguard::defer! { + metric.dec(); + } + + reader + .run_impl(start_pos) + .await + .inspect_err(|err| critical!("failed to read WAL record: {err:?}")) + } + .instrument(info_span!("interpreted wal reader")), + ); + + InterpretedWalReaderHandle { + join_handle, + state, + shard_notification_tx, + } + } + + /// Construct the reader without spawning anything + /// Callers should drive the future returned by [`Self::run`]. + pub(crate) fn new( + wal_stream: StreamingWalReader, + start_pos: Lsn, + tx: tokio::sync::mpsc::Sender, + shard: ShardIdentity, + pg_version: u32, + shard_notification_rx: Option< + tokio::sync::mpsc::UnboundedReceiver, + >, + ) -> InterpretedWalReader { + let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { + current_position: start_pos, + current_batch_wal_start: None, + })); + + InterpretedWalReader { + wal_stream, + shard_senders: HashMap::from([( + shard, + smallvec::smallvec![ShardSenderState { + sender_id: SenderId::first(), + tx, + next_record_lsn: start_pos, + }], + )]), + shard_notification_rx, + state: state.clone(), + pg_version, + } + } + + /// Entry point for future (polling) based wal reader. + pub(crate) async fn run( + self, + start_pos: Lsn, + appname: &Option, + ) -> Result<(), CopyStreamHandlerEnd> { + let metric = WAL_READERS + .get_metric_with_label_values(&["future", appname.as_deref().unwrap_or("safekeeper")]) + .unwrap(); + + metric.inc(); + scopeguard::defer! 
{ + metric.dec(); + } + + if let Err(err) = self.run_impl(start_pos).await { + critical!("failed to read WAL record: {err:?}"); + } else { + info!("interpreted wal reader exiting"); + } + + Err(CopyStreamHandlerEnd::Other(anyhow!( + "interpreted wal reader finished" + ))) + } + + /// Send interpreted WAL to one or more [`InterpretedWalSender`]s + /// Stops when an error is encountered or when the [`InterpretedWalReaderHandle`] + /// goes out of scope. + async fn run_impl(mut self, start_pos: Lsn) -> Result<(), InterpretedWalReaderError> { + let defer_state = self.state.clone(); + scopeguard::defer! { + *defer_state.write().unwrap() = InterpretedWalReaderState::Done; + } + + let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); loop { tokio::select! { - // Get some WAL from the stream and then: decode, interpret and push it down the - // pipeline. - wal = stream.next(), if tx.capacity() > 0 => { - let WalBytes { wal, wal_start_lsn: _, wal_end_lsn, available_wal_end_lsn } = match wal { - Some(some) => some?, - None => { break; } + // Main branch for reading WAL and forwarding it + wal_or_reset = self.wal_stream.next() => { + let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below")); + let WalBytes { + wal, + wal_start_lsn, + wal_end_lsn, + available_wal_end_lsn, + } = match wal { + Some(some) => some.map_err(InterpretedWalReaderError::ReadOrInterpret)?, + None => { + // [`StreamingWalReader::next`] is an endless stream of WAL. + // It shouldn't ever finish unless it panicked or became internally + // inconsistent. + return Result::Err(InterpretedWalReaderError::WalStreamClosed); + } }; - wal_position = wal_end_lsn; + self.state.write().unwrap().update_current_batch_wal_start(wal_start_lsn); + wal_decoder.feed_bytes(&wal); - let mut records = Vec::new(); + // Deserialize and interpret WAL records from this batch of WAL. + // Interpreted records for each shard are collected separately. + let shard_ids = self.shard_senders.keys().copied().collect::>(); + let mut records_by_sender: HashMap> = HashMap::new(); let mut max_next_record_lsn = None; - while let Some((next_record_lsn, recdata)) = wal_decoder - .poll_decode() - .with_context(|| "Failed to decode WAL")? + while let Some((next_record_lsn, recdata)) = wal_decoder.poll_decode()? { assert!(next_record_lsn.is_aligned()); max_next_record_lsn = Some(next_record_lsn); - // Deserialize and interpret WAL record let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - &self.shard, + &shard_ids, next_record_lsn, self.pg_version, ) .with_context(|| "Failed to interpret WAL")?; - if !interpreted.is_empty() { - records.push(interpreted); + for (shard, record) in interpreted { + if record.is_empty() { + continue; + } + + let mut states_iter = self.shard_senders + .get(&shard) + .expect("keys collected above") + .iter() + .filter(|state| record.next_record_lsn > state.next_record_lsn) + .peekable(); + while let Some(state) = states_iter.next() { + let shard_sender_id = ShardSenderId::new(shard, state.sender_id); + + // The most commont case is one sender per shard. Peek and break to avoid the + // clone in that situation. 
+ if states_iter.peek().is_none() { + records_by_sender.entry(shard_sender_id).or_default().push(record); + break; + } else { + records_by_sender.entry(shard_sender_id).or_default().push(record.clone()); + } + } } } - let batch = InterpretedWalRecords { - records, - next_record_lsn: max_next_record_lsn + let max_next_record_lsn = match max_next_record_lsn { + Some(lsn) => lsn, + None => { + continue; + } }; - tx.send(Batch {wal_end_lsn, available_wal_end_lsn, records: batch}).await.unwrap(); + // Update the current position such that new receivers can decide + // whether to attach to us or spawn a new WAL reader. + let batch_wal_start_lsn = { + let mut guard = self.state.write().unwrap(); + guard.update_current_position(max_next_record_lsn); + guard.take_current_batch_wal_start() + }; + + // Send interpreted records downstream. Anything that has already been seen + // by a shard is filtered out. + let mut shard_senders_to_remove = Vec::new(); + for (shard, states) in &mut self.shard_senders { + for state in states { + let shard_sender_id = ShardSenderId::new(*shard, state.sender_id); + + let batch = if max_next_record_lsn > state.next_record_lsn { + // This batch contains at least one record that this shard has not + // seen yet. + let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); + + InterpretedWalRecords { + records, + next_record_lsn: max_next_record_lsn, + raw_wal_start_lsn: Some(batch_wal_start_lsn), + } + } else if wal_end_lsn > state.next_record_lsn { + // All the records in this batch were seen by the shard + // However, the batch maps to a chunk of WAL that the + // shard has not yet seen. Notify it of the start LSN + // of the PG WAL chunk such that it doesn't look like a gap. + InterpretedWalRecords { + records: Vec::default(), + next_record_lsn: state.next_record_lsn, + raw_wal_start_lsn: Some(batch_wal_start_lsn), + } + } else { + // The shard has seen this chunk of WAL before. Skip it. + continue; + }; + + let res = state.tx.send(Batch { + wal_end_lsn, + available_wal_end_lsn, + records: batch, + }).await; + + if res.is_err() { + shard_senders_to_remove.push(shard_sender_id); + } else { + state.next_record_lsn = std::cmp::max(state.next_record_lsn, max_next_record_lsn); + } + } + } + + // Clean up any shard senders that have dropped out. + // This is inefficient, but such events are rare (connection to PS termination) + // and the number of subscriptions on the same shards very small (only one + // for the steady state). + for to_remove in shard_senders_to_remove { + let shard_senders = self.shard_senders.get_mut(&to_remove.shard()).expect("saw it above"); + if let Some(idx) = shard_senders.iter().position(|s| s.sender_id == to_remove.sender_id) { + shard_senders.remove(idx); + tracing::info!("Removed shard sender {}", to_remove); + } + + if shard_senders.is_empty() { + self.shard_senders.remove(&to_remove.shard()); + } + } }, - // For a previously interpreted batch, serialize it and push it down the wire. - batch = rx.recv() => { + // Listen for new shards that want to attach to this reader. + // If the reader is not running as a task, then this is not supported + // (see the pending branch below). + notification = match self.shard_notification_rx.as_mut() { + Some(rx) => Either::Left(rx.recv()), + None => Either::Right(std::future::pending()) + } => { + if let Some(n) = notification { + let AttachShardNotification { shard_id, sender, start_pos } = n; + + // Update internal and external state, then reset the WAL stream + // if required. 
+ let senders = self.shard_senders.entry(shard_id).or_default(); + let new_sender_id = match senders.last() { + Some(sender) => sender.sender_id.next(), + None => SenderId::first() + }; + + senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos}); + + // If the shard is subscribing below the current position the we need + // to update the cursor that tracks where we are at in the WAL + // ([`Self::state`]) and reset the WAL stream itself + // (`[Self::wal_stream`]). This must be done atomically from the POV of + // anything outside the select statement. + let position_reset = self.state.write().unwrap().maybe_reset(start_pos); + match position_reset { + CurrentPositionUpdate::Reset { from: _, to } => { + self.wal_stream.reset(to).await; + wal_decoder = WalStreamDecoder::new(to, self.pg_version); + }, + CurrentPositionUpdate::NotReset(_) => {} + }; + + tracing::info!( + "Added shard sender {} with start_pos={} previous_pos={} current_pos={}", + ShardSenderId::new(shard_id, new_sender_id), + start_pos, + position_reset.previous_position(), + position_reset.current_position(), + ); + } + } + } + } + } + + #[cfg(test)] + fn state(&self) -> Arc> { + self.state.clone() + } +} + +impl InterpretedWalReaderHandle { + /// Fan-out the reader by attaching a new shard to it + pub(crate) fn fanout( + &self, + shard_id: ShardIdentity, + sender: tokio::sync::mpsc::Sender, + start_pos: Lsn, + ) -> Result<(), SendError> { + self.shard_notification_tx.send(AttachShardNotification { + shard_id, + sender, + start_pos, + }) + } + + /// Get the current WAL position of the reader + pub(crate) fn current_position(&self) -> Option { + self.state.read().unwrap().current_position() + } + + pub(crate) fn abort(&self) { + self.join_handle.abort() + } +} + +impl Drop for InterpretedWalReaderHandle { + fn drop(&mut self) { + tracing::info!("Aborting interpreted wal reader"); + self.abort() + } +} + +pub(crate) struct InterpretedWalSender<'a, IO> { + pub(crate) format: InterpretedFormat, + pub(crate) compression: Option, + pub(crate) appname: Option, + + pub(crate) tli: WalResidentTimeline, + pub(crate) start_lsn: Lsn, + + pub(crate) pgb: &'a mut PostgresBackend, + pub(crate) end_watch_view: EndWatchView, + pub(crate) wal_sender_guard: Arc, + pub(crate) rx: tokio::sync::mpsc::Receiver, +} + +impl InterpretedWalSender<'_, IO> { + /// Send interpreted WAL records over the network. + /// Also manages keep-alives if nothing was sent for a while. + pub(crate) async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> { + let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1)); + keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); + keepalive_ticker.reset(); + + let mut wal_position = self.start_lsn; + + loop { + tokio::select! { + batch = self.rx.recv() => { let batch = match batch { Some(b) => b, - None => { break; } + None => { + return Result::Err( + CopyStreamHandlerEnd::Other(anyhow!("Interpreted WAL reader exited early")) + ); + } }; + wal_position = batch.wal_end_lsn; + let buf = batch .records .to_wire(self.format, self.compression) @@ -127,7 +656,21 @@ impl InterpretedWalSender<'_, IO> { })).await?; } // Send a periodic keep alive when the connection has been idle for a while. + // Since we've been idle, also check if we can stop streaming. 
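Note (illustrative, not part of the patch): the attach path above only rewinds the WAL stream when the newcomer actually needs older WAL; otherwise the reader keeps its position and the new sender simply starts filtering from its own `next_record_lsn`. A small sketch of that decision, mirroring but not reproducing `maybe_reset`:

/// Outcome of attaching a new sender at `new_start` to a reader currently at `current`.
#[derive(Debug, PartialEq)]
enum PositionUpdate {
    /// The reader must rewind its WAL stream to serve the newcomer.
    Reset { from: u64, to: u64 },
    /// The newcomer starts at or ahead of the reader; nothing to do.
    NotReset(u64),
}

fn maybe_reset(current: &mut u64, new_start: u64) -> PositionUpdate {
    if new_start < *current {
        let from = *current;
        *current = new_start;
        PositionUpdate::Reset { from, to: new_start }
    } else {
        PositionUpdate::NotReset(*current)
    }
}

fn main() {
    let mut pos = 1_000;
    // Attaching behind the reader forces a rewind.
    assert_eq!(maybe_reset(&mut pos, 500), PositionUpdate::Reset { from: 1_000, to: 500 });
    // Attaching ahead leaves the reader alone.
    assert_eq!(maybe_reset(&mut pos, 800), PositionUpdate::NotReset(500));
}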
_ = keepalive_ticker.tick() => { + if let Some(remote_consistent_lsn) = self.wal_sender_guard + .walsenders() + .get_ws_remote_consistent_lsn(self.wal_sender_guard.id()) + { + if self.tli.should_walsender_stop(remote_consistent_lsn).await { + // Stop streaming if the receivers are caught up and + // there's no active compute. This causes the loop in + // [`crate::send_interpreted_wal::InterpretedWalSender::run`] + // to exit and terminate the WAL stream. + break; + } + } + self.pgb .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { wal_end: self.end_watch_view.get().0, @@ -135,14 +678,427 @@ impl InterpretedWalSender<'_, IO> { request_reply: true, })) .await?; - } + }, } } - // The loop above ends when the receiver is caught up and there's no more WAL to send. Err(CopyStreamHandlerEnd::ServerInitiated(format!( "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", self.appname, wal_position, ))) } } +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::str::FromStr; + use std::time::Duration; + + use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; + use postgres_ffi::MAX_SEND_SIZE; + use tokio::sync::mpsc::error::TryRecvError; + use utils::id::{NodeId, TenantTimelineId}; + use utils::lsn::Lsn; + use utils::shard::{ShardCount, ShardNumber}; + + use crate::send_interpreted_wal::{AttachShardNotification, Batch, InterpretedWalReader}; + use crate::test_utils::Env; + use crate::wal_reader_stream::StreamingWalReader; + + #[tokio::test] + async fn test_interpreted_wal_reader_fanout() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 200; + const PG_VERSION: u32 = 17; + const SHARD_COUNT: u8 = 2; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) + .await + .unwrap(); + let end_pos = end_watch.get(); + + tracing::info!("Doing first round of reads ..."); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let shard_0 = ShardIdentity::new( + ShardNumber(0), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let shard_1 = ShardIdentity::new( + ShardNumber(1), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let mut shards = HashMap::new(); + + for shard_number in 0..SHARD_COUNT { + let shard_id = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + shards.insert(shard_id, (Some(tx), Some(rx))); + } + + let shard_0_tx = shards.get_mut(&shard_0).unwrap().0.take().unwrap(); + let mut shard_0_rx = shards.get_mut(&shard_0).unwrap().1.take().unwrap(); + + let handle = InterpretedWalReader::spawn( + streaming_wal_reader, + start_lsn, + shard_0_tx, + shard_0, + PG_VERSION, + &Some("pageserver".to_string()), + ); + + tracing::info!("Reading all WAL with only shard 0 attached ..."); + + let mut shard_0_interpreted_records = Vec::new(); + while let Some(batch) = shard_0_rx.recv().await { + shard_0_interpreted_records.push(batch.records); + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } + } + + let shard_1_tx = 
shards.get_mut(&shard_1).unwrap().0.take().unwrap(); + let mut shard_1_rx = shards.get_mut(&shard_1).unwrap().1.take().unwrap(); + + tracing::info!("Attaching shard 1 to the reader at start of WAL"); + handle.fanout(shard_1, shard_1_tx, start_lsn).unwrap(); + + tracing::info!("Reading all WAL with shard 0 and shard 1 attached ..."); + + let mut shard_1_interpreted_records = Vec::new(); + while let Some(batch) = shard_1_rx.recv().await { + shard_1_interpreted_records.push(batch.records); + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } + } + + // This test uses logical messages. Those only go to shard 0. Check that the + // filtering worked and shard 1 did not get any. + assert!( + shard_1_interpreted_records + .iter() + .all(|recs| recs.records.is_empty()) + ); + + // Shard 0 should not receive anything more since the reader is + // going through wal that it has already processed. + let res = shard_0_rx.try_recv(); + if let Ok(ref ok) = res { + tracing::error!( + "Shard 0 received batch: wal_end_lsn={} available_wal_end_lsn={}", + ok.wal_end_lsn, + ok.available_wal_end_lsn + ); + } + assert!(matches!(res, Err(TryRecvError::Empty))); + + // Check that the next records lsns received by the two shards match up. + let shard_0_next_lsns = shard_0_interpreted_records + .iter() + .map(|recs| recs.next_record_lsn) + .collect::>(); + let shard_1_next_lsns = shard_1_interpreted_records + .iter() + .map(|recs| recs.next_record_lsn) + .collect::>(); + assert_eq!(shard_0_next_lsns, shard_1_next_lsns); + + handle.abort(); + let mut done = false; + for _ in 0..5 { + if handle.current_position().is_none() { + done = true; + break; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + + assert!(done); + } + + #[tokio::test] + async fn test_interpreted_wal_reader_same_shard_fanout() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 200; + const PG_VERSION: u32 = 17; + const SHARD_COUNT: u8 = 2; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let mut next_record_lsns = Vec::default(); + let end_watch = + Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns)) + .await + .unwrap(); + let end_pos = end_watch.get(); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let shard_0 = ShardIdentity::new( + ShardNumber(0), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + struct Sender { + tx: Option>, + rx: tokio::sync::mpsc::Receiver, + shard: ShardIdentity, + start_lsn: Lsn, + received_next_record_lsns: Vec, + } + + impl Sender { + fn new(start_lsn: Lsn, shard: ShardIdentity) -> Self { + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + Self { + tx: Some(tx), + rx, + shard, + start_lsn, + received_next_record_lsns: Vec::default(), + } + } + } + + assert!(next_record_lsns.len() > 7); + let start_lsns = vec![ + next_record_lsns[5], + next_record_lsns[1], + next_record_lsns[3], + ]; + let mut senders = start_lsns + .into_iter() + .map(|lsn| Sender::new(lsn, shard_0)) + .collect::>(); + + let first_sender = senders.first_mut().unwrap(); + let handle = InterpretedWalReader::spawn( + streaming_wal_reader, + first_sender.start_lsn, + 
first_sender.tx.take().unwrap(), + first_sender.shard, + PG_VERSION, + &Some("pageserver".to_string()), + ); + + for sender in senders.iter_mut().skip(1) { + handle + .fanout(sender.shard, sender.tx.take().unwrap(), sender.start_lsn) + .unwrap(); + } + + for sender in senders.iter_mut() { + loop { + let batch = sender.rx.recv().await.unwrap(); + tracing::info!( + "Sender with start_lsn={} received batch ending at {} with {} records", + sender.start_lsn, + batch.wal_end_lsn, + batch.records.records.len() + ); + + for rec in batch.records.records { + sender.received_next_record_lsns.push(rec.next_record_lsn); + } + + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } + } + } + + handle.abort(); + let mut done = false; + for _ in 0..5 { + if handle.current_position().is_none() { + done = true; + break; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + + assert!(done); + + for sender in senders { + tracing::info!( + "Validating records received by sender with start_lsn={}", + sender.start_lsn + ); + + assert!(sender.received_next_record_lsns.is_sorted()); + let expected = next_record_lsns + .iter() + .filter(|lsn| **lsn > sender.start_lsn) + .copied() + .collect::>(); + assert_eq!(sender.received_next_record_lsns, expected); + } + } + + #[tokio::test] + async fn test_batch_start_tracking_on_reset() { + // When the WAL stream is reset to an older LSN, + // the current batch start LSN should be invalidated. + // This test constructs such a scenario: + // 1. Shard 0 is reading somewhere ahead + // 2. Reader reads some WAL, but does not decode a full record (partial read) + // 3. Shard 1 attaches to the reader and resets it to an older LSN + // 4. Shard 1 should get the correct batch WAL start LSN + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 64 * 1024; + const MSG_COUNT: usize = 10; + const PG_VERSION: u32 = 17; + const SHARD_COUNT: u8 = 2; + const WAL_READER_BATCH_SIZE: usize = 8192; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let shard_0_start_lsn = Lsn::from_str("0/14AFE10").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) + .await + .unwrap(); + let end_pos = end_watch.get(); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + shard_0_start_lsn, + end_pos, + end_watch, + WAL_READER_BATCH_SIZE, + ); + + let shard_0 = ShardIdentity::new( + ShardNumber(0), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let shard_1 = ShardIdentity::new( + ShardNumber(1), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let mut shards = HashMap::new(); + + for shard_number in 0..SHARD_COUNT { + let shard_id = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + shards.insert(shard_id, (Some(tx), Some(rx))); + } + + let shard_0_tx = shards.get_mut(&shard_0).unwrap().0.take().unwrap(); + + let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); + + let reader = InterpretedWalReader::new( + streaming_wal_reader, + shard_0_start_lsn, + shard_0_tx, + shard_0, + PG_VERSION, + Some(shard_notification_rx), + ); + + let reader_state = reader.state(); 
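Note (illustrative, not part of the patch): the test continues below by driving the reader future by hand with `futures::poll!` instead of spawning it, so it can interleave assertions between polls. A self-contained sketch of that testing pattern with a toy future, assuming the `futures` crate and a tokio runtime with the `macros` and `time` features:

use std::time::Duration;

#[tokio::main]
async fn main() {
    // A future that isn't ready immediately; polling it by hand lets a
    // test assert on intermediate state between polls.
    let fut = tokio::time::sleep(Duration::from_millis(50));
    let mut fut = std::pin::pin!(fut);

    // First poll: the timer has not fired yet.
    assert!(futures::poll!(fut.as_mut()).is_pending());

    // Let time pass, then poll again and observe completion.
    tokio::time::sleep(Duration::from_millis(60)).await;
    assert!(futures::poll!(fut.as_mut()).is_ready());
}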
+ let mut reader_fut = std::pin::pin!(reader.run(start_lsn, &None)); + loop { + let poll = futures::poll!(reader_fut.as_mut()); + assert!(poll.is_pending()); + + let guard = reader_state.read().unwrap(); + if guard.current_batch_wal_start().is_some() { + break; + } + } + + shard_notification_tx + .send(AttachShardNotification { + shard_id: shard_1, + sender: shards.get_mut(&shard_1).unwrap().0.take().unwrap(), + start_pos: start_lsn, + }) + .unwrap(); + + let mut shard_1_rx = shards.get_mut(&shard_1).unwrap().1.take().unwrap(); + loop { + let poll = futures::poll!(reader_fut.as_mut()); + assert!(poll.is_pending()); + + let try_recv_res = shard_1_rx.try_recv(); + match try_recv_res { + Ok(batch) => { + assert_eq!(batch.records.raw_wal_start_lsn.unwrap(), start_lsn); + break; + } + Err(tokio::sync::mpsc::error::TryRecvError::Empty) => {} + Err(tokio::sync::mpsc::error::TryRecvError::Disconnected) => { + unreachable!(); + } + } + } + } +} diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 0887cf7264..33e3d0485c 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -1,40 +1,44 @@ //! This module implements the streaming side of replication protocol, starting //! with the "START_REPLICATION" message, and registry of walsenders. -use crate::handler::SafekeeperPostgresHandler; -use crate::metrics::RECEIVED_PS_FEEDBACKS; -use crate::receive_wal::WalReceivers; -use crate::safekeeper::{Term, TermLsn}; -use crate::send_interpreted_wal::InterpretedWalSender; -use crate::timeline::WalResidentTimeline; -use crate::wal_reader_stream::WalReaderStreamBuilder; -use crate::wal_service::ConnectionId; -use crate::wal_storage::WalReader; -use anyhow::{bail, Context as AnyhowContext}; -use bytes::Bytes; -use futures::future::Either; -use parking_lot::Mutex; -use postgres_backend::PostgresBackend; -use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; -use postgres_ffi::get_current_timestamp; -use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; -use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; -use serde::{Deserialize, Serialize}; -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::failpoint_support; -use utils::id::TenantTimelineId; -use utils::pageserver_feedback::PageserverFeedback; -use utils::postgres_client::PostgresClientProtocol; - use std::cmp::{max, min}; use std::net::SocketAddr; -use std::str; use std::sync::Arc; use std::time::Duration; + +use anyhow::{Context as AnyhowContext, bail}; +use bytes::Bytes; +use futures::FutureExt; +use itertools::Itertools; +use parking_lot::Mutex; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError}; +use postgres_ffi::{MAX_SEND_SIZE, TimestampTz, get_current_timestamp}; +use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; +use safekeeper_api::Term; +use safekeeper_api::models::{ + HotStandbyFeedback, INVALID_FULL_TRANSACTION_ID, ReplicationFeedback, StandbyFeedback, + StandbyReply, +}; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::watch::Receiver; use tokio::time::timeout; use tracing::*; -use utils::{bin_ser::BeSer, lsn::Lsn}; +use utils::bin_ser::BeSer; +use utils::failpoint_support; +use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; +use utils::postgres_client::PostgresClientProtocol; + +use crate::handler::SafekeeperPostgresHandler; +use crate::metrics::{RECEIVED_PS_FEEDBACKS, WAL_READERS}; +use crate::receive_wal::WalReceivers; +use crate::safekeeper::TermLsn; +use crate::send_interpreted_wal::{ + 
Batch, InterpretedWalReader, InterpretedWalReaderHandle, InterpretedWalSender, +}; +use crate::timeline::WalResidentTimeline; +use crate::wal_reader_stream::StreamingWalReader; +use crate::wal_storage::WalReader; // See: https://www.postgresql.org/docs/13/protocol-replication.html const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; @@ -42,71 +46,18 @@ const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r'; // neon extension of replication protocol const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; -type FullTransactionId = u64; - -/// Hot standby feedback received from replica -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct HotStandbyFeedback { - pub ts: TimestampTz, - pub xmin: FullTransactionId, - pub catalog_xmin: FullTransactionId, -} - -const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0; - -impl HotStandbyFeedback { - pub fn empty() -> HotStandbyFeedback { - HotStandbyFeedback { - ts: 0, - xmin: 0, - catalog_xmin: 0, - } - } -} - -/// Standby status update -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct StandbyReply { - pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby. - pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby. - pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby. - pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01. - pub reply_requested: bool, -} - -impl StandbyReply { - fn empty() -> Self { - StandbyReply { - write_lsn: Lsn::INVALID, - flush_lsn: Lsn::INVALID, - apply_lsn: Lsn::INVALID, - reply_ts: 0, - reply_requested: false, - } - } -} - -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct StandbyFeedback { - pub reply: StandbyReply, - pub hs_feedback: HotStandbyFeedback, -} - -impl StandbyFeedback { - pub fn empty() -> Self { - StandbyFeedback { - reply: StandbyReply::empty(), - hs_feedback: HotStandbyFeedback::empty(), - } - } -} - /// WalSenders registry. Timeline holds it (wrapped in Arc). pub struct WalSenders { mutex: Mutex, walreceivers: Arc, } +pub struct WalSendersTimelineMetricValues { + pub ps_feedback_counter: u64, + pub last_ps_feedback: PageserverFeedback, + pub interpreted_wal_reader_tasks: usize, +} + impl WalSenders { pub fn new(walreceivers: Arc) -> Arc { Arc::new(WalSenders { @@ -117,21 +68,8 @@ impl WalSenders { /// Register new walsender. Returned guard provides access to the slot and /// automatically deregisters in Drop. 
- fn register( - self: &Arc, - ttid: TenantTimelineId, - addr: SocketAddr, - conn_id: ConnectionId, - appname: Option, - ) -> WalSenderGuard { + fn register(self: &Arc, walsender_state: WalSenderState) -> WalSenderGuard { let slots = &mut self.mutex.lock().slots; - let walsender_state = WalSenderState { - ttid, - addr, - conn_id, - appname, - feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), - }; // find empty slot or create new one let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) { slots[pos] = Some(walsender_state); @@ -147,9 +85,79 @@ impl WalSenders { } } + fn create_or_update_interpreted_reader< + FUp: FnOnce(&Arc) -> anyhow::Result<()>, + FNew: FnOnce() -> InterpretedWalReaderHandle, + >( + self: &Arc, + id: WalSenderId, + start_pos: Lsn, + max_delta_for_fanout: Option, + update: FUp, + create: FNew, + ) -> anyhow::Result<()> { + let state = &mut self.mutex.lock(); + + let mut selected_interpreted_reader = None; + for slot in state.slots.iter().flatten() { + if let WalSenderState::Interpreted(slot_state) = slot { + if let Some(ref interpreted_reader) = slot_state.interpreted_wal_reader { + let select = match (interpreted_reader.current_position(), max_delta_for_fanout) + { + (Some(pos), Some(max_delta)) => { + let delta = pos.0.abs_diff(start_pos.0); + delta <= max_delta + } + // Reader is not active + (None, _) => false, + // Gating fanout by max delta is disabled. + // Attach to any active reader. + (_, None) => true, + }; + + if select { + selected_interpreted_reader = Some(interpreted_reader.clone()); + break; + } + } + } + } + + let slot = state.get_slot_mut(id); + let slot_state = match slot { + WalSenderState::Interpreted(s) => s, + WalSenderState::Vanilla(_) => unreachable!(), + }; + + let selected_or_new = match selected_interpreted_reader { + Some(selected) => { + update(&selected)?; + selected + } + None => Arc::new(create()), + }; + + slot_state.interpreted_wal_reader = Some(selected_or_new); + + Ok(()) + } + /// Get state of all walsenders. - pub fn get_all(self: &Arc) -> Vec { - self.mutex.lock().slots.iter().flatten().cloned().collect() + pub fn get_all_public(self: &Arc) -> Vec { + self.mutex + .lock() + .slots + .iter() + .flatten() + .map(|state| match state { + WalSenderState::Vanilla(s) => { + safekeeper_api::models::WalSenderState::Vanilla(s.clone()) + } + WalSenderState::Interpreted(s) => { + safekeeper_api::models::WalSenderState::Interpreted(s.public_state.clone()) + } + }) + .collect() } /// Get LSN of the most lagging pageserver receiver. Return None if there are no @@ -160,7 +168,7 @@ impl WalSenders { .slots .iter() .flatten() - .filter_map(|s| match s.feedback { + .filter_map(|s| match s.get_feedback() { ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn), ReplicationFeedback::Standby(_) => None, }) @@ -168,9 +176,25 @@ impl WalSenders { } /// Returns total counter of pageserver feedbacks received and last feedback. 
- pub fn get_ps_feedback_stats(self: &Arc) -> (u64, PageserverFeedback) {
+ pub fn info_for_metrics(self: &Arc) -> WalSendersTimelineMetricValues {
let shared = self.mutex.lock();
- (shared.ps_feedback_counter, shared.last_ps_feedback)
+
+ let interpreted_wal_reader_tasks = shared
+ .slots
+ .iter()
+ .filter_map(|ss| match ss {
+ Some(WalSenderState::Interpreted(int)) => int.interpreted_wal_reader.as_ref(),
+ Some(WalSenderState::Vanilla(_)) => None,
+ None => None,
+ })
+ .unique_by(|reader| Arc::as_ptr(reader))
+ .count();
+
+ WalSendersTimelineMetricValues {
+ ps_feedback_counter: shared.ps_feedback_counter,
+ last_ps_feedback: shared.last_ps_feedback,
+ interpreted_wal_reader_tasks,
+ }
}
/// Get aggregated hot standby feedback (we send it to compute).
@@ -181,7 +205,7 @@ impl WalSenders {
/// Record new pageserver feedback, update aggregated values.
fn record_ps_feedback(self: &Arc, id: WalSenderId, feedback: &PageserverFeedback) {
let mut shared = self.mutex.lock();
- shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback);
+ *shared.get_slot_mut(id).get_mut_feedback() = ReplicationFeedback::Pageserver(*feedback);
shared.last_ps_feedback = *feedback;
shared.ps_feedback_counter += 1;
drop(shared);
@@ -200,10 +224,10 @@ impl WalSenders {
"Record standby reply: ts={} apply_lsn={}",
reply.reply_ts, reply.apply_lsn
);
- match &mut slot.feedback {
+ match &mut slot.get_mut_feedback() {
ReplicationFeedback::Standby(sf) => sf.reply = *reply,
ReplicationFeedback::Pageserver(_) => {
- slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
+ *slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback {
reply: *reply,
hs_feedback: HotStandbyFeedback::empty(),
})
@@ -215,10 +239,10 @@ impl WalSenders {
fn record_hs_feedback(self: &Arc, id: WalSenderId, feedback: &HotStandbyFeedback) {
let mut shared = self.mutex.lock();
let slot = shared.get_slot_mut(id);
- match &mut slot.feedback {
+ match &mut slot.get_mut_feedback() {
ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback,
ReplicationFeedback::Pageserver(_) => {
- slot.feedback = ReplicationFeedback::Standby(StandbyFeedback {
+ *slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback {
reply: StandbyReply::empty(),
hs_feedback: *feedback,
})
@@ -232,7 +256,7 @@ impl WalSenders {
pub fn get_ws_remote_consistent_lsn(self: &Arc, id: WalSenderId) -> Option {
let shared = self.mutex.lock();
let slot = shared.get_slot(id);
- match slot.feedback {
+ match slot.get_feedback() {
ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn),
_ => None,
}
@@ -256,6 +280,47 @@ struct WalSendersShared {
slots: Vec>,
}
+/// Safekeeper internal definitions of wal sender state
+///
+/// As opposed to [`safekeeper_api::models::WalSenderState`] these structs may
+/// include state that we do not wish to expose to the public api. 
+#[derive(Debug, Clone)] +pub(crate) enum WalSenderState { + Vanilla(VanillaWalSenderInternalState), + Interpreted(InterpretedWalSenderInternalState), +} + +type VanillaWalSenderInternalState = safekeeper_api::models::VanillaWalSenderState; + +#[derive(Debug, Clone)] +pub(crate) struct InterpretedWalSenderInternalState { + public_state: safekeeper_api::models::InterpretedWalSenderState, + interpreted_wal_reader: Option>, +} + +impl WalSenderState { + fn get_addr(&self) -> &SocketAddr { + match self { + WalSenderState::Vanilla(state) => &state.addr, + WalSenderState::Interpreted(state) => &state.public_state.addr, + } + } + + fn get_feedback(&self) -> &ReplicationFeedback { + match self { + WalSenderState::Vanilla(state) => &state.feedback, + WalSenderState::Interpreted(state) => &state.public_state.feedback, + } + } + + fn get_mut_feedback(&mut self) -> &mut ReplicationFeedback { + match self { + WalSenderState::Vanilla(state) => &mut state.feedback, + WalSenderState::Interpreted(state) => &mut state.public_state.feedback, + } + } +} + impl WalSendersShared { fn new() -> Self { WalSendersShared { @@ -282,7 +347,7 @@ impl WalSendersShared { let mut agg = HotStandbyFeedback::empty(); let mut reply_agg = StandbyReply::empty(); for ws_state in self.slots.iter().flatten() { - if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback { + if let ReplicationFeedback::Standby(standby_feedback) = ws_state.get_feedback() { let hs_feedback = standby_feedback.hs_feedback; // doing Option math like op1.iter().chain(op2.iter()).min() // would be nicer, but we serialize/deserialize this struct @@ -341,25 +406,6 @@ impl WalSendersShared { } } -// Serialized is used only for pretty printing in json. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct WalSenderState { - ttid: TenantTimelineId, - addr: SocketAddr, - conn_id: ConnectionId, - // postgres application_name - appname: Option, - feedback: ReplicationFeedback, -} - -// Receiver is either pageserver or regular standby, which have different -// feedbacks. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -enum ReplicationFeedback { - Pageserver(PageserverFeedback), - Standby(StandbyFeedback), -} - // id of the occupied slot in WalSenders to access it (and save in the // WalSenderGuard). We could give Arc directly to the slot, but there is not // much sense in that as values aggregation which is performed on each feedback @@ -393,7 +439,7 @@ impl SafekeeperPostgresHandler { /// Wrapper around handle_start_replication_guts handling result. Error is /// handled here while we're still in walsender ttid span; with API /// extension, this can probably be moved into postgres_backend. - pub async fn handle_start_replication( + pub async fn handle_start_replication( &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, @@ -418,7 +464,7 @@ impl SafekeeperPostgresHandler { Ok(()) } - pub async fn handle_start_replication_guts( + pub async fn handle_start_replication_guts( &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, @@ -428,12 +474,30 @@ impl SafekeeperPostgresHandler { let appname = self.appname.clone(); // Use a guard object to remove our entry from the timeline when we are done. 
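+ // The registered slot state depends on the negotiated protocol: vanilla
+ // walsenders carry only the public state, while interpreted walsenders also
+ // hold an optional shared interpreted WAL reader handle used for fanout.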
- let ws_guard = Arc::new(tli.get_walsenders().register( - self.ttid, - *pgb.get_peer_addr(), - self.conn_id, - self.appname.clone(), - )); + let ws_guard = match self.protocol() { + PostgresClientProtocol::Vanilla => Arc::new(tli.get_walsenders().register( + WalSenderState::Vanilla(VanillaWalSenderInternalState { + ttid: self.ttid, + addr: *pgb.get_peer_addr(), + conn_id: self.conn_id, + appname: self.appname.clone(), + feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), + }), + )), + PostgresClientProtocol::Interpreted { .. } => Arc::new(tli.get_walsenders().register( + WalSenderState::Interpreted(InterpretedWalSenderInternalState { + public_state: safekeeper_api::models::InterpretedWalSenderState { + ttid: self.ttid, + shard: self.shard.unwrap(), + addr: *pgb.get_peer_addr(), + conn_id: self.conn_id, + appname: self.appname.clone(), + feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), + }, + interpreted_wal_reader: None, + }), + )), + }; // Walsender can operate in one of two modes which we select by // application_name: give only committed WAL (used by pageserver) or all @@ -479,7 +543,7 @@ impl SafekeeperPostgresHandler { pgb, // should succeed since we're already holding another guard tli: tli.wal_residence_guard().await?, - appname, + appname: appname.clone(), start_pos, end_pos, term, @@ -489,7 +553,7 @@ impl SafekeeperPostgresHandler { send_buf: vec![0u8; MAX_SEND_SIZE], }; - Either::Left(sender.run()) + FutureExt::boxed(sender.run()) } PostgresClientProtocol::Interpreted { format, @@ -497,27 +561,97 @@ impl SafekeeperPostgresHandler { } => { let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000; let end_watch_view = end_watch.view(); - let wal_stream_builder = WalReaderStreamBuilder { - tli: tli.wal_residence_guard().await?, - start_pos, - end_pos, - term, - end_watch, - wal_sender_guard: ws_guard.clone(), - }; + let wal_residence_guard = tli.wal_residence_guard().await?; + let (tx, rx) = tokio::sync::mpsc::channel::(2); + let shard = self.shard.unwrap(); - let sender = InterpretedWalSender { - format, - compression, - pgb, - wal_stream_builder, - end_watch_view, - shard: self.shard.unwrap(), - pg_version, - appname, - }; + if self.conf.wal_reader_fanout && !shard.is_unsharded() { + let ws_id = ws_guard.id(); + ws_guard.walsenders().create_or_update_interpreted_reader( + ws_id, + start_pos, + self.conf.max_delta_for_fanout, + { + let tx = tx.clone(); + |reader| { + tracing::info!( + "Fanning out interpreted wal reader at {}", + start_pos + ); + reader + .fanout(shard, tx, start_pos) + .with_context(|| "Failed to fan out reader") + } + }, + || { + tracing::info!("Spawning interpreted wal reader at {}", start_pos); - Either::Right(sender.run()) + let wal_stream = StreamingWalReader::new( + wal_residence_guard, + term, + start_pos, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + InterpretedWalReader::spawn( + wal_stream, start_pos, tx, shard, pg_version, &appname, + ) + }, + )?; + + let sender = InterpretedWalSender { + format, + compression, + appname, + tli: tli.wal_residence_guard().await?, + start_lsn: start_pos, + pgb, + end_watch_view, + wal_sender_guard: ws_guard.clone(), + rx, + }; + + FutureExt::boxed(sender.run()) + } else { + let wal_reader = StreamingWalReader::new( + wal_residence_guard, + term, + start_pos, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let reader = InterpretedWalReader::new( + wal_reader, start_pos, tx, shard, pg_version, None, + ); + + let sender = InterpretedWalSender { + format, + 
compression, + appname: appname.clone(), + tli: tli.wal_residence_guard().await?, + start_lsn: start_pos, + pgb, + end_watch_view, + wal_sender_guard: ws_guard.clone(), + rx, + }; + + FutureExt::boxed(async move { + // Sender returns an Err on all code paths. + // If the sender finishes first, we will drop the reader future. + // If the reader finishes first, the sender will finish too since + // the wal sender has dropped. + let res = tokio::try_join!(sender.run(), reader.run(start_pos, &appname)); + match res.map(|_| ()) { + Ok(_) => unreachable!("sender finishes with Err by convention"), + err_res => err_res, + } + }) + } } }; @@ -546,7 +680,8 @@ impl SafekeeperPostgresHandler { .clone(); info!( "finished streaming to {}, feedback={:?}", - ws_state.addr, ws_state.feedback, + ws_state.get_addr(), + ws_state.get_feedback(), ); // Join pg backend back. @@ -654,6 +789,18 @@ impl WalSender<'_, IO> { /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? /// convenience. async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> { + let metric = WAL_READERS + .get_metric_with_label_values(&[ + "future", + self.appname.as_deref().unwrap_or("safekeeper"), + ]) + .unwrap(); + + metric.inc(); + scopeguard::defer! { + metric.dec(); + } + loop { // Wait for the next portion if it is not there yet, or just // update our end of WAL available for sending value, we @@ -759,9 +906,9 @@ impl WalSender<'_, IO> { // pageserver to identify WalReceiverError::SuccessfulCompletion, // do not change this string without updating pageserver. return Err(CopyStreamHandlerEnd::ServerInitiated(format!( - "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", - self.appname, self.start_pos, - ))); + "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", + self.appname, self.start_pos, + ))); } } } @@ -888,7 +1035,8 @@ impl ReplyReader { #[cfg(test)] mod tests { - use utils::id::{TenantId, TimelineId}; + use safekeeper_api::models::FullTransactionId; + use utils::id::{TenantId, TenantTimelineId, TimelineId}; use super::*; @@ -905,13 +1053,13 @@ mod tests { // add to wss specified feedback setting other fields to dummy values fn push_feedback(wss: &mut WalSendersShared, feedback: ReplicationFeedback) { - let walsender_state = WalSenderState { + let walsender_state = WalSenderState::Vanilla(VanillaWalSenderInternalState { ttid: mock_ttid(), addr: mock_addr(), conn_id: 1, appname: None, feedback, - }; + }); wss.slots.push(Some(walsender_state)) } diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 941b7e67d0..e437e6d2cd 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -1,26 +1,24 @@ //! Defines per timeline data stored persistently (SafeKeeperPersistentState) //! and its wrapper with in memory layer (SafekeeperState). 
-use std::{cmp::max, ops::Deref}; +use std::cmp::max; +use std::ops::Deref; +use std::time::SystemTime; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::TimelineTermBumpResponse; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}; +use safekeeper_api::{INITIAL_TERM, ServerInfo, Term}; use serde::{Deserialize, Serialize}; -use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use tracing::info; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; -use crate::{ - control_file, - safekeeper::{ - AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory, - UNKNOWN_SERVER_VERSION, - }, - timeline::TimelineError, - wal_backup_partial::{self}, -}; +use crate::control_file; +use crate::safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION}; +use crate::timeline::TimelineError; +use crate::wal_backup_partial::{self}; /// Persistent information stored on safekeeper node about timeline. /// On disk data is prefixed by magic and format version and followed by checksum. @@ -30,6 +28,8 @@ pub struct TimelinePersistentState { pub tenant_id: TenantId, #[serde(with = "hex")] pub timeline_id: TimelineId, + /// Membership configuration. + pub mconf: Configuration, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -61,22 +61,15 @@ pub struct TimelinePersistentState { /// pushed to s3. We don't remove WAL beyond it. Persisted only for /// informational purposes, we receive it from pageserver (or broker). pub remote_consistent_lsn: Lsn, - /// Peers and their state as we remember it. Knowing peers themselves is - /// fundamental; but state is saved here only for informational purposes and - /// obviously can be stale. (Currently not saved at all, but let's provision - /// place to have less file version upgrades). - pub peers: PersistedPeers, /// Holds names of partial segments uploaded to remote storage. Used to /// clean up old objects without leaving garbage in remote storage. pub partial_backup: wal_backup_partial::State, /// Eviction state of the timeline. If it's Offloaded, we should download /// WAL files from remote storage to serve the timeline. pub eviction_state: EvictionState, + pub creation_ts: SystemTime, } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); - /// State of the local WAL files. Used to track current timeline state, /// that can be either WAL files are present on disk or last partial segment /// is offloaded to remote storage. 
@@ -90,12 +83,14 @@ pub enum EvictionState {
}
impl TimelinePersistentState {
+ /// commit_lsn is the same as start_lsn in the normal creation; see
+ /// `TimelineCreateRequest` comments.
pub fn new(
ttid: &TenantTimelineId,
+ mconf: Configuration,
server_info: ServerInfo,
- peers: Vec,
+ start_lsn: Lsn,
commit_lsn: Lsn,
- local_start_lsn: Lsn,
) -> anyhow::Result {
if server_info.wal_seg_size == 0 {
bail!(TimelineError::UninitializedWalSegSize(*ttid));
@@ -105,49 +100,59 @@ impl TimelinePersistentState {
bail!(TimelineError::UninitialinzedPgVersion(*ttid));
}
- if commit_lsn < local_start_lsn {
+ if commit_lsn < start_lsn {
bail!(
- "commit_lsn {} is smaller than local_start_lsn {}",
+ "commit_lsn {} is smaller than start_lsn {}",
commit_lsn,
- local_start_lsn
+ start_lsn
);
}
+ // If we are given an init LSN, initialize the term history with it. This
+ // ensures that the walproposer is always able to find a common point in
+ // histories; if it can't, something is corrupted. Not having an LSN here
+ // is left for the legacy case where the timeline is created by compute
+ // and the LSN is not yet known at creation time.
+ let term_history = if commit_lsn != Lsn::INVALID {
+ TermHistory(vec![TermLsn {
+ term: INITIAL_TERM,
+ lsn: start_lsn,
+ }])
+ } else {
+ TermHistory::empty()
+ };
+
Ok(TimelinePersistentState {
tenant_id: ttid.tenant_id,
timeline_id: ttid.timeline_id,
+ mconf,
acceptor_state: AcceptorState {
- term: 0,
- term_history: TermHistory::empty(),
+ term: INITIAL_TERM,
+ term_history,
},
server: server_info,
proposer_uuid: [0; 16],
- timeline_start_lsn: Lsn(0),
- local_start_lsn,
+ timeline_start_lsn: start_lsn,
+ local_start_lsn: start_lsn,
commit_lsn,
- backup_lsn: local_start_lsn,
- peer_horizon_lsn: local_start_lsn,
+ backup_lsn: start_lsn,
+ peer_horizon_lsn: start_lsn,
remote_consistent_lsn: Lsn(0),
- peers: PersistedPeers(
- peers
- .iter()
- .map(|p| (*p, PersistedPeerInfo::new()))
- .collect(),
- ),
partial_backup: wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
+ creation_ts: SystemTime::now(),
})
}
pub fn empty() -> Self {
TimelinePersistentState::new(
&TenantTimelineId::empty(),
+ Configuration::empty(),
ServerInfo {
pg_version: 170000, /* Postgres server version (major * 10000) */
system_id: 0, /* Postgres system identifier */
wal_seg_size: WAL_SEGMENT_SIZE as u32,
},
- vec![],
Lsn::INVALID,
Lsn::INVALID,
)
@@ -252,6 +257,31 @@ where
current_term: after,
})
}
+
+ /// Switch into membership configuration `to` if it is higher than the
+ /// current one.
+ pub async fn membership_switch(
+ &mut self,
+ to: Configuration,
+ ) -> Result {
+ let before = self.mconf.clone();
+ // Is switch allowed? 
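+ // Only switch to a configuration with a strictly higher generation; requests
+ // carrying an equal or lower generation are ignored and answered with the
+ // current configuration.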
+ if to.generation <= self.mconf.generation { + info!( + "ignoring request to switch membership conf to lower {}, current conf {}", + to, self.mconf + ); + } else { + let mut state = self.start_change(); + state.mconf = to.clone(); + self.finish_change(&state).await?; + info!("switched membership conf to {} from {}", to, before); + } + Ok(TimelineMembershipSwitchResponse { + previous_conf: before, + current_conf: self.mconf.clone(), + }) + } } impl Deref for TimelineState diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs new file mode 100644 index 0000000000..e6f74185c1 --- /dev/null +++ b/safekeeper/src/test_utils.rs @@ -0,0 +1,176 @@ +use std::sync::Arc; + +use camino_tempfile::Utf8TempDir; +use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; +use safekeeper_api::membership::SafekeeperGeneration as Generation; +use tokio::fs::create_dir_all; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; + +use crate::rate_limit::RateLimiter; +use crate::receive_wal::WalAcceptor; +use crate::safekeeper::{ + AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, + ProposerElected, SafeKeeper, TermHistory, +}; +use crate::send_wal::EndWatch; +use crate::state::{TimelinePersistentState, TimelineState}; +use crate::timeline::{SharedState, StateSK, Timeline, get_timeline_dir}; +use crate::timelines_set::TimelinesSet; +use crate::wal_backup::remote_timeline_path; +use crate::{SafeKeeperConf, control_file, receive_wal, wal_storage}; + +/// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop. +pub struct Env { + /// Whether to enable fsync. + pub fsync: bool, + /// Benchmark directory. Deleted when dropped. + pub tempdir: Utf8TempDir, +} + +impl Env { + /// Creates a new test or benchmarking environment in a temporary directory. fsync controls whether to + /// enable fsyncing. + pub fn new(fsync: bool) -> anyhow::Result { + let tempdir = camino_tempfile::tempdir()?; + Ok(Self { fsync, tempdir }) + } + + /// Constructs a Safekeeper config for the given node ID. + fn make_conf(&self, node_id: NodeId) -> SafeKeeperConf { + let mut conf = SafeKeeperConf::dummy(); + conf.my_id = node_id; + conf.no_sync = !self.fsync; + conf.workdir = self.tempdir.path().join(format!("safekeeper-{node_id}")); + conf + } + + /// Constructs a Safekeeper with the given node and tenant/timeline ID. + /// + /// TODO: we should support using in-memory storage, to measure non-IO costs. This would be + /// easier if SafeKeeper used trait objects for storage rather than generics. It's also not + /// currently possible to construct a timeline using non-file storage since StateSK only accepts + /// SafeKeeper. + pub async fn make_safekeeper( + &self, + node_id: NodeId, + ttid: TenantTimelineId, + start_lsn: Lsn, + ) -> anyhow::Result> { + let conf = self.make_conf(node_id); + + let timeline_dir = get_timeline_dir(&conf, &ttid); + create_dir_all(&timeline_dir).await?; + + let mut pstate = TimelinePersistentState::empty(); + pstate.tenant_id = ttid.tenant_id; + pstate.timeline_id = ttid.timeline_id; + + let wal = wal_storage::PhysicalStorage::new(&ttid, &timeline_dir, &pstate, conf.no_sync)?; + let ctrl = + control_file::FileStorage::create_new(&timeline_dir, pstate, conf.no_sync).await?; + let state = TimelineState::new(ctrl); + let mut safekeeper = SafeKeeper::new(state, wal, conf.my_id)?; + + // Emulate an initial election. 
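+ // Process a ProposerElected message for term 1 with a single-entry term
+ // history starting at start_lsn, so the test safekeeper will accept
+ // AppendRequests for that term.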
+ safekeeper + .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected { + generation: Generation::new(0), + term: 1, + start_streaming_at: start_lsn, + term_history: TermHistory(vec![(1, start_lsn).into()]), + })) + .await?; + + Ok(safekeeper) + } + + /// Constructs a timeline, including a new Safekeeper with the given node ID, and spawns its + /// manager task. + pub async fn make_timeline( + &self, + node_id: NodeId, + ttid: TenantTimelineId, + start_lsn: Lsn, + ) -> anyhow::Result> { + let conf = Arc::new(self.make_conf(node_id)); + let timeline_dir = get_timeline_dir(&conf, &ttid); + let remote_path = remote_timeline_path(&ttid)?; + + let safekeeper = self.make_safekeeper(node_id, ttid, start_lsn).await?; + let shared_state = SharedState::new(StateSK::Loaded(safekeeper)); + + let timeline = Timeline::new( + ttid, + &timeline_dir, + &remote_path, + shared_state, + conf.clone(), + ); + timeline.bootstrap( + &mut timeline.write_shared_state().await, + &conf, + Arc::new(TimelinesSet::default()), // ignored for now + RateLimiter::new(0, 0), + ); + Ok(timeline) + } + + // This will be dead code when building a non-benchmark target with the + // benchmarking feature enabled. + #[allow(dead_code)] + pub(crate) async fn write_wal( + tli: Arc, + start_lsn: Lsn, + msg_size: usize, + msg_count: usize, + mut next_record_lsns: Option<&mut Vec>, + ) -> anyhow::Result { + let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE); + let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE); + + let end_watch = EndWatch::Commit(tli.get_commit_lsn_watch_rx()); + + WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0)); + + let prefix = c"neon-file:"; + let prefixlen = prefix.to_bytes_with_nul().len(); + assert!(msg_size >= prefixlen); + let message = vec![0; msg_size - prefixlen]; + + let walgen = + &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn); + for _ in 0..msg_count { + let (lsn, record) = walgen.next().unwrap(); + if let Some(ref mut lsns) = next_record_lsns { + lsns.push(lsn); + } + + let req = AppendRequest { + h: AppendRequestHeader { + generation: Generation::new(0), + term: 1, + begin_lsn: lsn, + end_lsn: lsn + record.len() as u64, + commit_lsn: lsn, + truncate_lsn: Lsn(0), + }, + wal_data: record, + }; + + let end_lsn = req.h.end_lsn; + + let msg = ProposerAcceptorMessage::AppendRequest(req); + msg_tx.send(msg).await?; + while let Some(reply) = reply_rx.recv().await { + if let AcceptorProposerMessage::AppendResponse(resp) = reply { + if resp.flush_lsn >= end_lsn { + break; + } + } + } + } + + Ok(end_watch) + } +} diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 94d6ef1061..c140f16ced 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -1,86 +1,56 @@ //! This module implements Timeline lifecycle management and has all necessary code //! to glue together SafeKeeper and all other background services. 
-use anyhow::{anyhow, bail, Result}; -use camino::{Utf8Path, Utf8PathBuf}; -use remote_storage::RemotePath; -use safekeeper_api::models::TimelineTermBumpResponse; -use serde::{Deserialize, Serialize}; -use tokio::fs::{self}; -use tokio_util::sync::CancellationToken; -use utils::id::TenantId; -use utils::sync::gate::Gate; - use std::cmp::max; use std::ops::{Deref, DerefMut}; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::time::Duration; -use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; -use tokio::{sync::watch, time::Instant}; -use tracing::*; -use utils::http::error::ApiError; -use utils::{ - id::{NodeId, TenantTimelineId}, - lsn::Lsn, + +use anyhow::{Result, anyhow, bail}; +use camino::{Utf8Path, Utf8PathBuf}; +use http_utils::error::ApiError; +use remote_storage::RemotePath; +use safekeeper_api::Term; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{ + PeerInfo, TimelineMembershipSwitchResponse, TimelineTermBumpResponse, }; +use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; +use tokio::fs::{self}; +use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard, watch}; +use tokio::time::Instant; +use tokio_util::sync::CancellationToken; +use tracing::*; +use utils::id::{NodeId, TenantId, TenantTimelineId}; +use utils::lsn::Lsn; +use utils::sync::gate::Gate; -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; - -use crate::control_file; +use crate::metrics::{FullTimelineInfo, MISC_OPERATION_SECONDS, WalStorageMetrics}; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; -use crate::safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, Term, TermLsn, -}; -use crate::send_wal::WalSenders; +use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn}; +use crate::send_wal::{WalSenders, WalSendersTimelineMetricValues}; use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState}; use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; - -use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; -use crate::SafeKeeperConf; -use crate::{debug_dump, timeline_manager, wal_storage}; +use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage}; -/// Things safekeeper should know about timeline state on peers. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct PeerInfo { - pub sk_id: NodeId, - pub term: Term, - /// Term of the last entry. - pub last_log_term: Term, - /// LSN of the last record. - pub flush_lsn: Lsn, - pub commit_lsn: Lsn, - /// Since which LSN safekeeper has WAL. - pub local_start_lsn: Lsn, - /// When info was received. Serde annotations are not very useful but make - /// the code compile -- we don't rely on this field externally. 
- #[serde(skip)] - #[serde(default = "Instant::now")] - ts: Instant, - pub pg_connstr: String, - pub http_connstr: String, -} - -impl PeerInfo { - fn from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { - PeerInfo { - sk_id: NodeId(sk_info.safekeeper_id), - term: sk_info.term, - last_log_term: sk_info.last_log_term, - flush_lsn: Lsn(sk_info.flush_lsn), - commit_lsn: Lsn(sk_info.commit_lsn), - local_start_lsn: Lsn(sk_info.local_start_lsn), - pg_connstr: sk_info.safekeeper_connstr.clone(), - http_connstr: sk_info.http_connstr.clone(), - ts, - } +fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { + PeerInfo { + sk_id: NodeId(sk_info.safekeeper_id), + term: sk_info.term, + last_log_term: sk_info.last_log_term, + flush_lsn: Lsn(sk_info.flush_lsn), + commit_lsn: Lsn(sk_info.commit_lsn), + local_start_lsn: Lsn(sk_info.local_start_lsn), + pg_connstr: sk_info.safekeeper_connstr.clone(), + http_connstr: sk_info.http_connstr.clone(), + ts, } } @@ -190,7 +160,7 @@ impl StateSK { pub fn state(&self) -> &TimelineState { match self { StateSK::Loaded(sk) => &sk.state, - StateSK::Offloaded(ref s) => s, + StateSK::Offloaded(s) => s, StateSK::Empty => unreachable!(), } } @@ -198,7 +168,7 @@ impl StateSK { pub fn state_mut(&mut self) -> &mut TimelineState { match self { StateSK::Loaded(sk) => &mut sk.state, - StateSK::Offloaded(ref mut s) => s, + StateSK::Offloaded(s) => s, StateSK::Empty => unreachable!(), } } @@ -213,6 +183,13 @@ impl StateSK { self.state_mut().term_bump(to).await } + pub async fn membership_switch( + &mut self, + to: Configuration, + ) -> Result { + self.state_mut().membership_switch(to).await + } + /// Close open WAL files to release FDs. fn close_wal_store(&mut self) { if let StateSK::Loaded(sk) = self { @@ -607,6 +584,8 @@ impl Timeline { assert!(self.cancel.is_cancelled()); assert!(self.gate.close_complete()); + info!("deleting timeline {} from disk", self.ttid); + // Close associated FDs. Nobody will be able to touch timeline data once // it is cancelled, so WAL storage won't be opened again. 
shared_state.sk.close_wal_store(); @@ -697,7 +676,7 @@ impl Timeline { { let mut shared_state = self.write_shared_state().await; shared_state.sk.record_safekeeper_info(&sk_info).await?; - let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now()); + let peer_info = peer_info_from_sk_info(&sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); } Ok(()) @@ -727,16 +706,22 @@ impl Timeline { return None; } - let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats(); + let WalSendersTimelineMetricValues { + ps_feedback_counter, + last_ps_feedback, + interpreted_wal_reader_tasks, + } = self.walsenders.info_for_metrics(); + let state = self.read_shared_state().await; Some(FullTimelineInfo { ttid: self.ttid, - ps_feedback_count, + ps_feedback_count: ps_feedback_counter, last_ps_feedback, wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), timeline_is_active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), + interpreted_wal_reader_tasks, epoch_start_lsn: state.sk.term_start_lsn(), mem_state: state.sk.state().inmem.clone(), persisted_state: TimelinePersistentState::clone(state.sk.state()), @@ -755,7 +740,7 @@ impl Timeline { debug_dump::Memory { is_cancelled: self.is_cancelled(), peers_info_len: state.peers_info.0.len(), - walsenders: self.walsenders.get_all(), + walsenders: self.walsenders.get_all_public(), wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, @@ -793,6 +778,14 @@ impl Timeline { state.sk.term_bump(to).await } + pub async fn membership_switch( + self: &Arc, + to: Configuration, + ) -> Result { + let mut state = self.write_shared_state().await; + state.sk.membership_switch(to).await + } + /// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`] async fn do_wal_residence_guard( self: &Arc, diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 303421c837..06ccb32d03 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -7,23 +7,19 @@ use anyhow::Context; use camino::Utf8PathBuf; use remote_storage::RemotePath; -use tokio::{ - fs::File, - io::{AsyncRead, AsyncWriteExt}, -}; +use tokio::fs::File; +use tokio::io::{AsyncRead, AsyncWriteExt}; use tracing::{debug, info, instrument, warn}; use utils::crashsafe::durable_rename; -use crate::{ - metrics::{ - EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES, - }, - rate_limit::rand_duration, - timeline_manager::{Manager, StateSnapshot}, - wal_backup, - wal_backup_partial::{self, PartialRemoteSegment}, - wal_storage::wal_file_paths, +use crate::metrics::{ + EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, EvictionEvent, NUM_EVICTED_TIMELINES, }; +use crate::rate_limit::rand_duration; +use crate::timeline_manager::{Manager, StateSnapshot}; +use crate::wal_backup; +use crate::wal_backup_partial::{self, PartialRemoteSegment}; +use crate::wal_storage::wal_file_paths; impl Manager { /// Returns true if the timeline is ready for eviction. diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index c02fb904cf..71e99a4de7 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -7,41 +7,36 @@ //! 
Be aware that you need to be extra careful with manager code, because it is not respawned on panic. //! Also, if it will stuck in some branch, it will prevent any further progress in the timeline. -use std::{ - sync::{atomic::AtomicUsize, Arc}, - time::Duration, -}; +use std::sync::Arc; +use std::sync::atomic::AtomicUsize; +use std::time::Duration; use futures::channel::oneshot; use postgres_ffi::XLogSegNo; +use safekeeper_api::Term; +use safekeeper_api::models::PeerInfo; use serde::{Deserialize, Serialize}; -use tokio::{ - task::{JoinError, JoinHandle}, - time::Instant, -}; +use tokio::task::{JoinError, JoinHandle}; +use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, info_span, instrument, warn, Instrument}; +use tracing::{Instrument, debug, info, info_span, instrument, warn}; use utils::lsn::Lsn; -use crate::{ - control_file::{FileStorage, Storage}, - metrics::{ - MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, - NUM_EVICTED_TIMELINES, - }, - rate_limit::{rand_duration, RateLimiter}, - recovery::recovery_main, - remove_wal::calc_horizon_lsn, - safekeeper::Term, - send_wal::WalSenders, - state::TimelineState, - timeline::{ManagerTimeline, PeerInfo, ReadGuardSharedState, StateSK, WalResidentTimeline}, - timeline_guard::{AccessService, GuardId, ResidenceGuard}, - timelines_set::{TimelineSetGuard, TimelinesSet}, - wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}, - SafeKeeperConf, +use crate::SafeKeeperConf; +use crate::control_file::{FileStorage, Storage}; +use crate::metrics::{ + MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, NUM_EVICTED_TIMELINES, }; +use crate::rate_limit::{RateLimiter, rand_duration}; +use crate::recovery::recovery_main; +use crate::remove_wal::calc_horizon_lsn; +use crate::send_wal::WalSenders; +use crate::state::TimelineState; +use crate::timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}; +use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard}; +use crate::timelines_set::{TimelineSetGuard, TimelinesSet}; +use crate::wal_backup::{self, WalBackupTaskHandle}; +use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}; pub(crate) struct StateSnapshot { // inmem values diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index e1241ceb9b..1d29030711 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -2,29 +2,33 @@ //! All timelines should always be present in this map, this is done by loading them //! all from the disk on startup and keeping them in memory. 
-use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; -use crate::rate_limit::RateLimiter; -use crate::safekeeper::ServerInfo; -use crate::state::TimelinePersistentState; -use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; -use crate::timelines_set::TimelinesSet; -use crate::wal_storage::Storage; -use crate::{control_file, wal_storage, SafeKeeperConf}; -use anyhow::{bail, Context, Result}; -use camino::Utf8PathBuf; -use camino_tempfile::Utf8TempDir; -use serde::Serialize; use std::collections::HashMap; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; + +use anyhow::{Context, Result, bail}; +use camino::Utf8PathBuf; +use camino_tempfile::Utf8TempDir; +use safekeeper_api::ServerInfo; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::SafekeeperUtilization; +use serde::Serialize; use tokio::fs; use tracing::*; use utils::crashsafe::{durable_rename, fsync_async_opt}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; +use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; +use crate::rate_limit::RateLimiter; +use crate::state::TimelinePersistentState; +use crate::timeline::{Timeline, TimelineError, get_tenant_dir, get_timeline_dir}; +use crate::timelines_set::TimelinesSet; +use crate::wal_storage::Storage; +use crate::{SafeKeeperConf, control_file, wal_storage}; + // Timeline entry in the global map: either a ready timeline, or mark that it is // being created. #[derive(Clone)] @@ -214,9 +218,10 @@ impl GlobalTimelines { pub(crate) async fn create( &self, ttid: TenantTimelineId, + mconf: Configuration, server_info: ServerInfo, + start_lsn: Lsn, commit_lsn: Lsn, - local_start_lsn: Lsn, ) -> Result> { let (conf, _, _) = { let state = self.state.lock().unwrap(); @@ -239,8 +244,7 @@ impl GlobalTimelines { // TODO: currently we create only cfile. It would be reasonable to // immediately initialize first WAL segment as well. - let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; + let state = TimelinePersistentState::new(&ttid, mconf, server_info, start_lsn, commit_lsn)?; control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?; let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?; Ok(timeline) @@ -415,6 +419,20 @@ impl GlobalTimelines { .collect() } + /// Returns statistics about timeline counts + pub fn get_timeline_counts(&self) -> SafekeeperUtilization { + let global_lock = self.state.lock().unwrap(); + let timeline_count = global_lock + .timelines + .values() + .filter(|t| match t { + GlobalMapTimeline::CreationInProgress => false, + GlobalMapTimeline::Timeline(t) => !t.is_cancelled(), + }) + .count() as u64; + SafekeeperUtilization { timeline_count } + } + /// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant, /// and that's why it can return cancelled timelines, to retry deleting them. fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec> { @@ -459,6 +477,8 @@ impl GlobalTimelines { info!("deleting timeline {}, only_local={}", ttid, only_local); timeline.shutdown().await; + info!("timeline {ttid} shut down for deletion"); + // Take a lock and finish the deletion holding this mutex. 
let mut shared_state = timeline.write_shared_state().await; diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs index 096e348295..1d1abc530f 100644 --- a/safekeeper/src/timelines_set.rs +++ b/safekeeper/src/timelines_set.rs @@ -1,4 +1,5 @@ -use std::{collections::HashMap, sync::Arc}; +use std::collections::HashMap; +use std::sync::Arc; use utils::id::TenantTimelineId; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 34b5dbeaa1..6176e64698 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -1,36 +1,32 @@ -use anyhow::{Context, Result}; - -use camino::{Utf8Path, Utf8PathBuf}; -use futures::stream::FuturesOrdered; -use futures::StreamExt; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; -use utils::backoff; -use utils::id::NodeId; - use std::cmp::min; use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; use std::time::Duration; +use anyhow::{Context, Result}; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use futures::stream::FuturesOrdered; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; -use postgres_ffi::XLogFileName; -use postgres_ffi::{XLogSegNo, PG_TLI}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::{ DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata, }; +use safekeeper_api::models::PeerInfo; use tokio::fs::File; - use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::{watch, OnceCell}; +use tokio::sync::{OnceCell, watch}; +use tokio::task::JoinHandle; +use tokio_util::sync::CancellationToken; use tracing::*; - -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::backoff; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; -use crate::timeline::{PeerInfo, WalResidentTimeline}; +use crate::timeline::WalResidentTimeline; use crate::timeline_manager::{Manager, StateSnapshot}; use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; @@ -309,9 +305,12 @@ impl WalBackupTask { retry_attempt = 0; } Err(e) => { + // We might have managed to upload some segment even though + // some later in the range failed, so log backup_lsn + // separately. error!( - "failed while offloading range {}-{}: {:?}", - backup_lsn, commit_lsn, e + "failed while offloading range {}-{}, backup_lsn {}: {:?}", + backup_lsn, commit_lsn, backup_lsn, e ); retry_attempt = retry_attempt.saturating_add(1); @@ -337,6 +336,13 @@ async fn backup_lsn_range( let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); + info!( + "offloading segnos {:?} of range [{}-{})", + segments.iter().map(|&s| s.seg_no).collect::>(), + start_lsn, + end_lsn, + ); + // Pool of concurrent upload tasks. We use `FuturesOrdered` to // preserve order of uploads, and update `backup_lsn` only after // all previous uploads are finished. @@ -373,10 +379,10 @@ async fn backup_lsn_range( } info!( - "offloaded segnos {:?} up to {}, previous backup_lsn {}", + "offloaded segnos {:?} of range [{}-{})", segments.iter().map(|&s| s.seg_no).collect::>(), - end_lsn, start_lsn, + end_lsn, ); Ok(()) } diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index bddfca50e4..049852a048 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -20,23 +20,23 @@ //! This way control file stores information about all potentially existing //! 
remote partial segments and can clean them up after uploading a newer version. use camino::Utf8PathBuf; -use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; use remote_storage::RemotePath; +use safekeeper_api::Term; use serde::{Deserialize, Serialize}; - use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn}; -use utils::{id::NodeId, lsn::Lsn}; +use utils::id::NodeId; +use utils::lsn::Lsn; -use crate::{ - metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, - rate_limit::{rand_duration, RateLimiter}, - safekeeper::Term, - timeline::WalResidentTimeline, - timeline_manager::StateSnapshot, - wal_backup::{self}, - SafeKeeperConf, +use crate::SafeKeeperConf; +use crate::metrics::{ + MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS, }; +use crate::rate_limit::{RateLimiter, rand_duration}; +use crate::timeline::WalResidentTimeline; +use crate::timeline_manager::StateSnapshot; +use crate::wal_backup::{self}; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum UploadStatus { @@ -535,6 +535,10 @@ pub async fn main_task( // limit concurrent uploads let _upload_permit = tokio::select! { acq = limiter.acquire_partial_backup() => acq, + _ = backup.tli.cancel.cancelled() => { + info!("timeline canceled"); + return None; + } _ = cancel.cancelled() => { info!("task canceled"); return None; diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index f8c0c502cd..cc9d4e6e3b 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -1,34 +1,17 @@ -use std::sync::Arc; +use std::pin::Pin; +use std::task::{Context, Poll}; -use async_stream::try_stream; use bytes::Bytes; -use futures::Stream; -use postgres_backend::CopyStreamHandlerEnd; -use std::time::Duration; -use tokio::time::timeout; +use futures::stream::BoxStream; +use futures::{Stream, StreamExt}; +use safekeeper_api::Term; use utils::lsn::Lsn; -use crate::{ - safekeeper::Term, - send_wal::{EndWatch, WalSenderGuard}, - timeline::WalResidentTimeline, -}; - -pub(crate) struct WalReaderStreamBuilder { - pub(crate) tli: WalResidentTimeline, - pub(crate) start_pos: Lsn, - pub(crate) end_pos: Lsn, - pub(crate) term: Option, - pub(crate) end_watch: EndWatch, - pub(crate) wal_sender_guard: Arc, -} - -impl WalReaderStreamBuilder { - pub(crate) fn start_pos(&self) -> Lsn { - self.start_pos - } -} +use crate::send_wal::EndWatch; +use crate::timeline::WalResidentTimeline; +use crate::wal_storage::WalReader; +#[derive(PartialEq, Eq, Debug)] pub(crate) struct WalBytes { /// Raw PG WAL pub(crate) wal: Bytes, @@ -44,106 +27,269 @@ pub(crate) struct WalBytes { pub(crate) available_wal_end_lsn: Lsn, } -impl WalReaderStreamBuilder { - /// Builds a stream of Postgres WAL starting from [`Self::start_pos`]. - /// The stream terminates when the receiver (pageserver) is fully caught up - /// and there's no active computes. - pub(crate) async fn build( - self, - buffer_size: usize, - ) -> anyhow::Result>> { - // TODO(vlad): The code below duplicates functionality from [`crate::send_wal`]. - // We can make the raw WAL sender use this stream too and remove the duplication. 
- let Self { - tli, - mut start_pos, - mut end_pos, - term, - mut end_watch, - wal_sender_guard, - } = self; - let mut wal_reader = tli.get_walreader(start_pos).await?; - let mut buffer = vec![0; buffer_size]; +struct PositionedWalReader { + start: Lsn, + end: Lsn, + reader: Option, +} - const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); +/// A streaming WAL reader wrapper which can be reset while running +pub(crate) struct StreamingWalReader { + stream: BoxStream<'static, WalOrReset>, + start_changed_tx: tokio::sync::watch::Sender, +} - Ok(try_stream! { - loop { - let have_something_to_send = end_pos > start_pos; +pub(crate) enum WalOrReset { + Wal(anyhow::Result), + Reset(Lsn), +} - if !have_something_to_send { - // wait for lsn - let res = timeout(POLL_STATE_TIMEOUT, end_watch.wait_for_lsn(start_pos, term)).await; - match res { - Ok(ok) => { - end_pos = ok?; - }, - Err(_) => { - if let EndWatch::Commit(_) = end_watch { - if let Some(remote_consistent_lsn) = wal_sender_guard - .walsenders() - .get_ws_remote_consistent_lsn(wal_sender_guard.id()) - { - if tli.should_walsender_stop(remote_consistent_lsn).await { - // Stop streaming if the receivers are caught up and - // there's no active compute. This causes the loop in - // [`crate::send_interpreted_wal::InterpretedWalSender::run`] - // to exit and terminate the WAL stream. - return; - } - } - } - - continue; - } - } - } - - - assert!( - end_pos > start_pos, - "nothing to send after waiting for WAL" - ); - - // try to send as much as available, capped by the buffer size - let mut chunk_end_pos = start_pos + buffer_size as u64; - // if we went behind available WAL, back off - if chunk_end_pos >= end_pos { - chunk_end_pos = end_pos; - } else { - // If sending not up to end pos, round down to page boundary to - // avoid breaking WAL record not at page boundary, as protocol - // demands. See walsender.c (XLogSendPhysical). - chunk_end_pos = chunk_end_pos - .checked_sub(chunk_end_pos.block_offset()) - .unwrap(); - } - let send_size = (chunk_end_pos.0 - start_pos.0) as usize; - let buffer = &mut buffer[..send_size]; - let send_size: usize; - { - // If uncommitted part is being pulled, check that the term is - // still the expected one. - let _term_guard = if let Some(t) = term { - Some(tli.acquire_term(t).await?) - } else { - None - }; - // Read WAL into buffer. send_size can be additionally capped to - // segment boundary here. - send_size = wal_reader.read(buffer).await? - }; - let wal = Bytes::copy_from_slice(&buffer[..send_size]); - - yield WalBytes { - wal, - wal_start_lsn: start_pos, - wal_end_lsn: start_pos + send_size as u64, - available_wal_end_lsn: end_pos - }; - - start_pos += send_size as u64; - } - }) +impl WalOrReset { + pub(crate) fn get_wal(self) -> Option> { + match self { + WalOrReset::Wal(wal) => Some(wal), + WalOrReset::Reset(_) => None, + } + } +} + +impl StreamingWalReader { + pub(crate) fn new( + tli: WalResidentTimeline, + term: Option, + start: Lsn, + end: Lsn, + end_watch: EndWatch, + buffer_size: usize, + ) -> Self { + let (start_changed_tx, start_changed_rx) = tokio::sync::watch::channel(start); + + let state = WalReaderStreamState { + tli, + wal_reader: PositionedWalReader { + start, + end, + reader: None, + }, + term, + end_watch, + buffer: vec![0; buffer_size], + buffer_size, + }; + + // When a change notification is received while polling the internal + // reader, stop polling the read future and service the change. 
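+ // Each iteration of the unfolded stream races the WAL read against the
+ // start-position watch channel; on a reset notification the current WAL
+ // reader is dropped and reading restarts from the new start LSN.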
+ let stream = futures::stream::unfold( + (state, start_changed_rx), + |(mut state, mut rx)| async move { + let wal_or_reset = tokio::select! { + read_res = state.read() => { WalOrReset::Wal(read_res) }, + changed_res = rx.changed() => { + if changed_res.is_err() { + return None; + } + + let new_start_pos = rx.borrow_and_update(); + WalOrReset::Reset(*new_start_pos) + } + }; + + if let WalOrReset::Reset(lsn) = wal_or_reset { + state.wal_reader.start = lsn; + state.wal_reader.reader = None; + } + + Some((wal_or_reset, (state, rx))) + }, + ) + .boxed(); + + Self { + stream, + start_changed_tx, + } + } + + /// Reset the stream to a given position. + pub(crate) async fn reset(&mut self, start: Lsn) { + self.start_changed_tx.send(start).unwrap(); + while let Some(wal_or_reset) = self.stream.next().await { + match wal_or_reset { + WalOrReset::Reset(at) => { + // Stream confirmed the reset. + // There may only one ongoing reset at any given time, + // hence the assertion. + assert_eq!(at, start); + break; + } + WalOrReset::Wal(_) => { + // Ignore wal generated before reset was handled + } + } + } + } +} + +impl Stream for StreamingWalReader { + type Item = WalOrReset; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.stream).poll_next(cx) + } +} + +struct WalReaderStreamState { + tli: WalResidentTimeline, + wal_reader: PositionedWalReader, + term: Option, + end_watch: EndWatch, + buffer: Vec, + buffer_size: usize, +} + +impl WalReaderStreamState { + async fn read(&mut self) -> anyhow::Result { + // Create reader if needed + if self.wal_reader.reader.is_none() { + self.wal_reader.reader = Some(self.tli.get_walreader(self.wal_reader.start).await?); + } + + let have_something_to_send = self.wal_reader.end > self.wal_reader.start; + if !have_something_to_send { + tracing::debug!( + "Waiting for wal: start={}, end={}", + self.wal_reader.end, + self.wal_reader.start + ); + self.wal_reader.end = self + .end_watch + .wait_for_lsn(self.wal_reader.start, self.term) + .await?; + tracing::debug!( + "Done waiting for wal: start={}, end={}", + self.wal_reader.end, + self.wal_reader.start + ); + } + + assert!( + self.wal_reader.end > self.wal_reader.start, + "nothing to send after waiting for WAL" + ); + + // Calculate chunk size + let mut chunk_end_pos = self.wal_reader.start + self.buffer_size as u64; + if chunk_end_pos >= self.wal_reader.end { + chunk_end_pos = self.wal_reader.end; + } else { + chunk_end_pos = chunk_end_pos + .checked_sub(chunk_end_pos.block_offset()) + .unwrap(); + } + + let send_size = (chunk_end_pos.0 - self.wal_reader.start.0) as usize; + let buffer = &mut self.buffer[..send_size]; + + // Read WAL + let send_size = { + let _term_guard = if let Some(t) = self.term { + Some(self.tli.acquire_term(t).await?) + } else { + None + }; + self.wal_reader + .reader + .as_mut() + .unwrap() + .read(buffer) + .await? 
+ }; + + let wal = Bytes::copy_from_slice(&buffer[..send_size]); + let result = WalBytes { + wal, + wal_start_lsn: self.wal_reader.start, + wal_end_lsn: self.wal_reader.start + send_size as u64, + available_wal_end_lsn: self.wal_reader.end, + }; + + self.wal_reader.start += send_size as u64; + + Ok(result) + } +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use futures::StreamExt; + use postgres_ffi::MAX_SEND_SIZE; + use utils::id::{NodeId, TenantTimelineId}; + use utils::lsn::Lsn; + + use crate::test_utils::Env; + use crate::wal_reader_stream::StreamingWalReader; + + #[tokio::test] + async fn test_streaming_wal_reader_reset() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 200; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None) + .await + .unwrap(); + let end_pos = end_watch.get(); + + tracing::info!("Doing first round of reads ..."); + + let mut streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let mut before_reset = Vec::new(); + while let Some(wor) = streaming_wal_reader.next().await { + let wal = wor.get_wal().unwrap().unwrap(); + let stop = wal.available_wal_end_lsn == wal.wal_end_lsn; + before_reset.push(wal); + + if stop { + break; + } + } + + tracing::info!("Resetting the WAL stream ..."); + + streaming_wal_reader.reset(start_lsn).await; + + tracing::info!("Doing second round of reads ..."); + + let mut after_reset = Vec::new(); + while let Some(wor) = streaming_wal_reader.next().await { + let wal = wor.get_wal().unwrap().unwrap(); + let stop = wal.available_wal_end_lsn == wal.wal_end_lsn; + after_reset.push(wal); + + if stop { + break; + } + } + + assert_eq!(before_reset, after_reset); } } diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 1ff83918a7..045fa88cb0 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -2,20 +2,23 @@ //! WAL service listens for client connections and //! receive WAL from wal_proposer and send it to WAL receivers //! -use anyhow::{Context, Result}; -use postgres_backend::QueryError; +use std::os::fd::AsRawFd; use std::sync::Arc; use std::time::Duration; + +use anyhow::{Context, Result}; +use postgres_backend::{AuthType, PostgresBackend, QueryError}; +use safekeeper_api::models::ConnectionId; use tokio::net::TcpStream; use tokio_io_timeout::TimeoutReader; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{auth::Scope, measured_stream::MeasuredStream}; +use utils::auth::Scope; +use utils::measured_stream::MeasuredStream; +use crate::handler::SafekeeperPostgresHandler; use crate::metrics::TrafficMetrics; -use crate::SafeKeeperConf; -use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines}; -use postgres_backend::{AuthType, PostgresBackend}; +use crate::{GlobalTimelines, SafeKeeperConf}; /// Accept incoming TCP connections and spawn them into a background thread. /// @@ -61,6 +64,7 @@ async fn handle_socket( global_timelines: Arc, ) -> Result<(), QueryError> { socket.set_nodelay(true)?; + let socket_fd = socket.as_raw_fd(); let peer_addr = socket.peer_addr()?; // Set timeout on reading from the socket. 
It prevents hanged up connection @@ -106,7 +110,7 @@ async fn handle_socket( auth_pair, global_timelines, ); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. pgbackend @@ -114,8 +118,6 @@ async fn handle_socket( .await } -/// Unique WAL service connection ids are logged in spans for observability. -pub type ConnectionId = u32; pub type ConnectionCount = u32; pub fn issue_connection_id(count: &mut ConnectionCount) -> ConnectionId { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index e338d70731..ed197a3f83 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -7,32 +7,32 @@ //! //! Note that last file has `.partial` suffix, that's different from postgres. -use anyhow::{bail, Context, Result}; -use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; -use futures::future::BoxFuture; -use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; -use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI}; -use remote_storage::RemotePath; use std::cmp::{max, min}; use std::future::Future; use std::io::{self, SeekFrom}; use std::pin::Pin; -use tokio::fs::{self, remove_file, File, OpenOptions}; -use tokio::io::{AsyncRead, AsyncWriteExt}; -use tokio::io::{AsyncReadExt, AsyncSeekExt}; + +use anyhow::{Context, Result, bail}; +use bytes::Bytes; +use camino::{Utf8Path, Utf8PathBuf}; +use futures::future::BoxFuture; +use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion}; +use pq_proto::SystemId; +use remote_storage::RemotePath; +use tokio::fs::{self, File, OpenOptions, remove_file}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; use tracing::*; use utils::crashsafe::durable_rename; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; use crate::metrics::{ - time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, + REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure, }; use crate::state::TimelinePersistentState; use crate::wal_backup::{read_object, remote_timeline_path}; -use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::XLogFileName; -use pq_proto::SystemId; -use utils::{id::TenantTimelineId, lsn::Lsn}; pub trait Storage { // Last written LSN. 
@@ -200,7 +200,12 @@ impl PhysicalStorage { ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, ); if flush_lsn < state.commit_lsn { - bail!("timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", ttid.timeline_id, flush_lsn, state.commit_lsn); + bail!( + "timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", + ttid.timeline_id, + flush_lsn, + state.commit_lsn + ); } if flush_lsn < state.peer_horizon_lsn { warn!( diff --git a/safekeeper/tests/misc_test.rs b/safekeeper/tests/misc_test.rs index 8e5b17a143..8e54d2bb86 100644 --- a/safekeeper/tests/misc_test.rs +++ b/safekeeper/tests/misc_test.rs @@ -3,9 +3,9 @@ use std::sync::Arc; use tracing::{info, warn}; use utils::lsn::Lsn; -use crate::walproposer_sim::{ - log::{init_logger, init_tracing_logger}, - simulation::{generate_network_opts, generate_schedule, Schedule, TestAction, TestConfig}, +use crate::walproposer_sim::log::{init_logger, init_tracing_logger}; +use crate::walproposer_sim::simulation::{ + Schedule, TestAction, TestConfig, generate_network_opts, generate_schedule, }; pub mod walproposer_sim; diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs index 1a932ef699..e29b58836a 100644 --- a/safekeeper/tests/random_test.rs +++ b/safekeeper/tests/random_test.rs @@ -1,11 +1,9 @@ use rand::Rng; use tracing::{info, warn}; -use crate::walproposer_sim::{ - log::{init_logger, init_tracing_logger}, - simulation::{generate_network_opts, generate_schedule, TestConfig}, - simulation_logs::validate_events, -}; +use crate::walproposer_sim::log::{init_logger, init_tracing_logger}; +use crate::walproposer_sim::simulation::{TestConfig, generate_network_opts, generate_schedule}; +use crate::walproposer_sim::simulation_logs::validate_events; pub mod walproposer_sim; @@ -18,7 +16,7 @@ fn test_random_schedules() -> anyhow::Result<()> { let mut config = TestConfig::new(Some(clock)); for _ in 0..500 { - let seed: u64 = rand::thread_rng().gen(); + let seed: u64 = rand::thread_rng().r#gen(); config.network = generate_network_opts(seed); let test = config.start(seed); diff --git a/safekeeper/tests/simple_test.rs b/safekeeper/tests/simple_test.rs index 0be9d0deef..f7b266e39c 100644 --- a/safekeeper/tests/simple_test.rs +++ b/safekeeper/tests/simple_test.rs @@ -1,7 +1,8 @@ use tracing::info; use utils::lsn::Lsn; -use crate::walproposer_sim::{log::init_logger, simulation::TestConfig}; +use crate::walproposer_sim::log::init_logger; +use crate::walproposer_sim::simulation::TestConfig; pub mod walproposer_sim; diff --git a/safekeeper/tests/walproposer_sim/log.rs b/safekeeper/tests/walproposer_sim/log.rs index 870f30de4f..e2ba3282ca 100644 --- a/safekeeper/tests/walproposer_sim/log.rs +++ b/safekeeper/tests/walproposer_sim/log.rs @@ -1,9 +1,11 @@ -use std::{fmt, sync::Arc}; +use std::fmt; +use std::sync::Arc; use desim::time::Timing; use once_cell::sync::OnceCell; use parking_lot::Mutex; -use tracing_subscriber::fmt::{format::Writer, time::FormatTime}; +use tracing_subscriber::fmt::format::Writer; +use tracing_subscriber::fmt::time::FormatTime; /// SimClock can be plugged into tracing logger to print simulation time. #[derive(Clone)] diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 12aa025771..6ce1a9940e 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -2,30 +2,30 @@ //! 
Gets messages from the network, passes them down to consensus module and //! sends replies back. -use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; -use anyhow::{bail, Result}; +use anyhow::{Result, bail}; use bytes::{Bytes, BytesMut}; use camino::Utf8PathBuf; -use desim::{ - executor::{self, PollSome}, - network::TCP, - node_os::NodeOs, - proto::{AnyMessage, NetEvent, NodeEvent}, -}; +use desim::executor::{self, PollSome}; +use desim::network::TCP; +use desim::node_os::NodeOs; +use desim::proto::{AnyMessage, NetEvent, NodeEvent}; use http::Uri; -use safekeeper::{ - safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION}, - state::{TimelinePersistentState, TimelineState}, - timeline::TimelineError, - wal_storage::Storage, - SafeKeeperConf, +use safekeeper::SafeKeeperConf; +use safekeeper::safekeeper::{ + ProposerAcceptorMessage, SK_PROTO_VERSION_3, SafeKeeper, UNKNOWN_SERVER_VERSION, }; +use safekeeper::state::{TimelinePersistentState, TimelineState}; +use safekeeper::timeline::TimelineError; +use safekeeper::wal_storage::Storage; +use safekeeper_api::ServerInfo; +use safekeeper_api::membership::Configuration; use tracing::{debug, info_span, warn}; -use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; use super::safekeeper_disk::{DiskStateStorage, DiskWALStorage, SafekeeperDisk, TimelineDisk}; @@ -95,8 +95,13 @@ impl GlobalMap { let commit_lsn = Lsn::INVALID; let local_start_lsn = Lsn::INVALID; - let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; + let state = TimelinePersistentState::new( + &ttid, + Configuration::empty(), + server_info, + commit_lsn, + local_start_lsn, + )?; let disk_timeline = self.disk.put_state(&ttid, state); let control_store = DiskStateStorage::new(disk_timeline.clone()); @@ -172,6 +177,8 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { control_file_save_interval: Duration::from_secs(1), partial_backup_concurrency: 1, eviction_min_resident: Duration::ZERO, + wal_reader_fanout: false, + max_delta_for_fanout: None, }; let mut global = GlobalMap::new(disk, conf.clone())?; @@ -277,7 +284,7 @@ impl ConnState { bail!("finished processing START_REPLICATION") } - let msg = ProposerAcceptorMessage::parse(copy_data)?; + let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTO_VERSION_3)?; debug!("got msg: {:?}", msg); self.process(msg, global) } else { @@ -393,7 +400,7 @@ impl ConnState { // TODO: if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn let mut buf = BytesMut::with_capacity(128); - reply.serialize(&mut buf)?; + reply.serialize(&mut buf, SK_PROTO_VERSION_3)?; self.tcp.send(AnyMessage::Bytes(buf.into())); } diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index b854754ecf..94a849b5f0 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -1,22 +1,23 @@ use std::collections::HashMap; +use std::ops::Deref; use std::sync::Arc; - -use parking_lot::Mutex; -use safekeeper::state::TimelinePersistentState; -use utils::id::TenantTimelineId; - -use super::block_storage::BlockStorage; - -use std::{ops::Deref, time::Instant}; +use std::time::Instant; use anyhow::Result; use bytes::{Buf, BytesMut}; use 
futures::future::BoxFuture; -use postgres_ffi::{waldecoder::WalStreamDecoder, XLogSegNo}; -use safekeeper::{control_file, metrics::WalStorageMetrics, wal_storage}; +use parking_lot::Mutex; +use postgres_ffi::XLogSegNo; +use postgres_ffi::waldecoder::WalStreamDecoder; +use safekeeper::metrics::WalStorageMetrics; +use safekeeper::state::TimelinePersistentState; +use safekeeper::{control_file, wal_storage}; use tracing::{debug, info}; +use utils::id::TenantTimelineId; use utils::lsn::Lsn; +use super::block_storage::BlockStorage; + /// All safekeeper state that is usually saved to disk. pub struct SafekeeperDisk { pub timelines: Mutex>>, diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs index fabf450eef..f314143952 100644 --- a/safekeeper/tests/walproposer_sim/simulation.rs +++ b/safekeeper/tests/walproposer_sim/simulation.rs @@ -1,23 +1,24 @@ -use std::{cell::Cell, str::FromStr, sync::Arc}; +use std::cell::Cell; +use std::str::FromStr; +use std::sync::Arc; -use crate::walproposer_sim::{safekeeper::run_server, walproposer_api::SimulationApi}; -use desim::{ - executor::{self, ExternalHandle}, - node_os::NodeOs, - options::{Delay, NetworkOptions}, - proto::{AnyMessage, NodeEvent}, - world::Node, - world::World, -}; +use desim::executor::{self, ExternalHandle}; +use desim::node_os::NodeOs; +use desim::options::{Delay, NetworkOptions}; +use desim::proto::{AnyMessage, NodeEvent}; +use desim::world::{Node, World}; use rand::{Rng, SeedableRng}; use tracing::{debug, info_span, warn}; -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; use walproposer::walproposer::{Config, Wrapper}; -use super::{ - log::SimClock, safekeeper_disk::SafekeeperDisk, walproposer_api, - walproposer_disk::DiskWalProposer, -}; +use super::log::SimClock; +use super::safekeeper_disk::SafekeeperDisk; +use super::walproposer_api; +use super::walproposer_disk::DiskWalProposer; +use crate::walproposer_sim::safekeeper::run_server; +use crate::walproposer_sim::walproposer_api::SimulationApi; /// Simulated safekeeper node. 
pub struct SafekeeperNode { diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index 5578c94cf6..6451589e80 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -1,26 +1,20 @@ -use std::{ - cell::{RefCell, RefMut, UnsafeCell}, - ffi::CStr, - sync::Arc, -}; +use std::cell::{RefCell, RefMut, UnsafeCell}; +use std::ffi::CStr; +use std::sync::Arc; use bytes::Bytes; -use desim::{ - executor::{self, PollSome}, - network::TCP, - node_os::NodeOs, - proto::{AnyMessage, NetEvent, NodeEvent}, - world::NodeId, -}; +use desim::executor::{self, PollSome}; +use desim::network::TCP; +use desim::node_os::NodeOs; +use desim::proto::{AnyMessage, NetEvent, NodeEvent}; +use desim::world::NodeId; use tracing::debug; use utils::lsn::Lsn; -use walproposer::{ - api_bindings::Level, - bindings::{ - NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, - }, - walproposer::{ApiImpl, Config}, +use walproposer::api_bindings::Level; +use walproposer::bindings::{ + NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, }; +use walproposer::walproposer::{ApiImpl, Config}; use super::walproposer_disk::DiskWalProposer; @@ -578,7 +572,9 @@ impl ApiImpl for SimulationApi { let disk_lsn = disk.lock().flush_rec_ptr().0; debug!("start_streaming at {} (disk_lsn={})", startpos, disk_lsn); if startpos < disk_lsn { - debug!("startpos < disk_lsn, it means we wrote some transaction even before streaming started"); + debug!( + "startpos < disk_lsn, it means we wrote some transaction even before streaming started" + ); } assert!(startpos <= disk_lsn); let mut broadcasted = Lsn(startpos); diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index aefb3919a1..fe3eee8a5a 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -1,4 +1,5 @@ -use std::{ffi::CStr, sync::Arc}; +use std::ffi::CStr; +use std::sync::Arc; use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::v16::wal_generator::{LogicalMessageGenerator, WalGenerator}; @@ -18,7 +19,7 @@ impl DiskWalProposer { internal_available_lsn: Lsn(0), prev_lsn: Lsn(0), disk: BlockStorage::new(), - wal_generator: WalGenerator::new(LogicalMessageGenerator::new(c"", &[])), + wal_generator: WalGenerator::new(LogicalMessageGenerator::new(c"", &[]), Lsn(0)), }), }) } diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js index e8e0b3c23a..96a0ea3267 100755 --- a/scripts/comment-test-report.js +++ b/scripts/comment-test-report.js @@ -84,6 +84,12 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => { } else { arch = "unknown" } + let lfcState = "" + if (test.parameters.includes("'with-lfc'")) { + lfcState = "with-lfc" + } else { + lfcState = "without-lfc" + } // Removing build type and PostgreSQL version from the test name to make it shorter const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "") @@ -91,6 +97,7 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => { test.pgVersion = pgVersion test.buildType = buildType test.arch = arch + test.lfcState = lfcState if (test.status === "passed") { passedTests[pgVersion][testName].push(test) @@ -157,7 +164,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = 
`${reportUrl}#suites/${test.parentUid}/${test.uid}` - links.push(`[${test.buildType}-${test.arch}](${allureLink})`) + links.push(`[${test.buildType}-${test.arch}-${test.lfcState}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } @@ -188,7 +195,7 @@ const reportSummary = async (params) => { const links = [] for (const test of tests) { const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries` - links.push(`[${test.buildType}-${test.arch}](${allureLink})`) + links.push(`[${test.buildType}-${test.arch}-${test.lfcState}](${allureLink})`) } summary += `- \`${testName}\`: ${links.join(", ")}\n` } diff --git a/scripts/generate_image_maps.py b/scripts/generate_image_maps.py new file mode 100644 index 0000000000..39ece5b38f --- /dev/null +++ b/scripts/generate_image_maps.py @@ -0,0 +1,62 @@ +import itertools +import json +import os + +build_tag = os.environ["BUILD_TAG"] +branch = os.environ["BRANCH"] +dev_acr = os.environ["DEV_ACR"] +prod_acr = os.environ["PROD_ACR"] +dev_aws = os.environ["DEV_AWS"] +prod_aws = os.environ["PROD_AWS"] +aws_region = os.environ["AWS_REGION"] + +components = { + "neon": ["neon"], + "compute": [ + "compute-node-v14", + "compute-node-v15", + "compute-node-v16", + "compute-node-v17", + "vm-compute-node-v14", + "vm-compute-node-v15", + "vm-compute-node-v16", + "vm-compute-node-v17", + ], +} + +registries = { + "dev": [ + "docker.io/neondatabase", + "ghcr.io/neondatabase", + f"{dev_aws}.dkr.ecr.{aws_region}.amazonaws.com", + f"{dev_acr}.azurecr.io/neondatabase", + ], + "prod": [ + f"{prod_aws}.dkr.ecr.{aws_region}.amazonaws.com", + f"{prod_acr}.azurecr.io/neondatabase", + ], +} + +outputs: dict[str, dict[str, list[str]]] = {} + +target_tags = [build_tag, "latest"] if branch == "main" else [build_tag] +target_stages = ["dev", "prod"] if branch.startswith("release") else ["dev"] + +for component_name, component_images in components.items(): + for stage in target_stages: + outputs[f"{component_name}-{stage}"] = dict( + [ + ( + f"docker.io/neondatabase/{component_image}:{build_tag}", + [ + f"{combo[0]}/{component_image}:{combo[1]}" + for combo in itertools.product(registries[stage], target_tags) + ], + ) + for component_image in component_images + ] + ) + +with open(os.environ["GITHUB_OUTPUT"], "a") as f: + for key, value in outputs.items(): + f.write(f"{key}={json.dumps(value)}\n") diff --git a/scripts/ingest_regress_test_result-new-format.py b/scripts/ingest_regress_test_result-new-format.py index 064c516718..3a5cdf013a 100644 --- a/scripts/ingest_regress_test_result-new-format.py +++ b/scripts/ingest_regress_test_result-new-format.py @@ -32,6 +32,7 @@ CREATE TABLE IF NOT EXISTS results ( flaky BOOLEAN NOT NULL, arch arch DEFAULT 'X64', lfc BOOLEAN DEFAULT false NOT NULL, + sanitizers BOOLEAN DEFAULT false NOT NULL, build_type TEXT NOT NULL, pg_version INT NOT NULL, run_id BIGINT NOT NULL, @@ -39,7 +40,7 @@ CREATE TABLE IF NOT EXISTS results ( reference TEXT NOT NULL, revision CHAR(40) NOT NULL, raw JSONB COMPRESSION lz4 NOT NULL, - UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id) + UNIQUE (parent_suite, suite, name, arch, lfc, sanitizers, build_type, pg_version, started_at, stopped_at, run_id) ); """ @@ -56,6 +57,7 @@ class Row: flaky: bool arch: str lfc: bool + sanitizers: bool build_type: str pg_version: int run_id: int @@ -134,7 +136,8 @@ def ingest_test_result( if p["name"].startswith("__") } arch = parameters.get("arch", "UNKNOWN").strip("'") - lfc = parameters.get("lfc", 
"False") == "True" + lfc = parameters.get("lfc", "without-lfc").strip("'") == "with-lfc" + sanitizers = parameters.get("sanitizers", "disabled").strip("'") == "enabled" build_type, pg_version, unparametrized_name = parse_test_name(test["name"]) labels = {label["name"]: label["value"] for label in test["labels"]} @@ -149,6 +152,7 @@ def ingest_test_result( flaky=test["flaky"] or test["retriesStatusChange"], arch=arch, lfc=lfc, + sanitizers=sanitizers, build_type=build_type, pg_version=pg_version, run_id=run_id, diff --git a/scripts/push_with_image_map.py b/scripts/push_with_image_map.py new file mode 100644 index 0000000000..c68f6ad407 --- /dev/null +++ b/scripts/push_with_image_map.py @@ -0,0 +1,22 @@ +import json +import os +import subprocess + +image_map = os.getenv("IMAGE_MAP") +if not image_map: + raise ValueError("IMAGE_MAP environment variable is not set") + +try: + parsed_image_map: dict[str, list[str]] = json.loads(image_map) +except json.JSONDecodeError as e: + raise ValueError("Failed to parse IMAGE_MAP as JSON") from e + +for source, targets in parsed_image_map.items(): + for target in targets: + cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source] + print(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + if result.returncode != 0: + print(f"Error: {result.stdout}") + raise RuntimeError(f"Command failed: {' '.join(cmd)}") diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 17d4aed63b..e4db9a317d 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_broker" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [features] diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 1a6fb7fedf..86f2dd9a6c 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -1,18 +1,14 @@ -use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{Duration, Instant}; use clap::Parser; - -use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::{ - FilterTenantTimelineId, MessageType, SubscribeByFilterRequest, + FilterTenantTimelineId, MessageType, SafekeeperTimelineInfo, SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, TypedMessage, }; - use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT}; use tokio::time; - use tonic::Request; const ABOUT: &str = r#" diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 1fbb651656..cc33ec20ff 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -10,7 +10,14 @@ //! //! Only safekeeper message is supported, but it is not hard to add something //! else with generics. 
-use clap::{command, Parser}; +use std::collections::HashMap; +use std::convert::Infallible; +use std::net::SocketAddr; +use std::pin::Pin; +use std::sync::Arc; +use std::time::Duration; + +use clap::{Parser, command}; use futures_core::Stream; use futures_util::StreamExt; use http_body_util::Full; @@ -19,28 +26,10 @@ use hyper::header::CONTENT_TYPE; use hyper::service::service_fn; use hyper::{Method, StatusCode}; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use parking_lot::RwLock; -use std::collections::HashMap; -use std::convert::Infallible; -use std::net::SocketAddr; -use std::pin::Pin; -use std::sync::Arc; -use std::time::Duration; -use tokio::net::TcpListener; -use tokio::sync::broadcast; -use tokio::sync::broadcast::error::RecvError; -use tokio::time; -use tonic::body::{self, empty_body, BoxBody}; -use tonic::codegen::Service; -use tonic::transport::server::Connected; -use tonic::Code; -use tonic::{Request, Response, Status}; -use tracing::*; -use utils::signals::ShutdownSignals; - use metrics::{Encoder, TextEncoder}; +use parking_lot::RwLock; use storage_broker::metrics::{ - BROADCASTED_MESSAGES_TOTAL, BROADCAST_DROPPED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, + BROADCAST_DROPPED_MESSAGES_TOTAL, BROADCASTED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE, PROCESSED_MESSAGES_TOTAL, PUBLISHED_ONEOFF_MESSAGES_TOTAL, }; use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer}; @@ -49,10 +38,19 @@ use storage_broker::proto::{ FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, SafekeeperTimelineInfo, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypedMessage, }; -use storage_broker::{parse_proto_ttid, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR}; +use storage_broker::{DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR, parse_proto_ttid}; +use tokio::net::TcpListener; +use tokio::sync::broadcast; +use tokio::sync::broadcast::error::RecvError; +use tokio::time; +use tonic::body::{self, BoxBody, empty_body}; +use tonic::codegen::Service; +use tonic::{Code, Request, Response, Status}; +use tracing::*; use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::sentry_init::init_sentry; +use utils::signals::ShutdownSignals; use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); @@ -459,9 +457,10 @@ impl BrokerService for Broker { &self, request: Request>, ) -> Result, Status> { - let remote_addr = request - .remote_addr() - .expect("TCPConnectInfo inserted by handler"); + let &RemoteAddr(remote_addr) = request + .extensions() + .get() + .expect("RemoteAddr inserted by handler"); let mut publisher = self.registry.register_publisher(remote_addr); let mut stream = request.into_inner(); @@ -484,9 +483,10 @@ impl BrokerService for Broker { &self, request: Request, ) -> Result, Status> { - let remote_addr = request - .remote_addr() - .expect("TCPConnectInfo inserted by handler"); + let &RemoteAddr(remote_addr) = request + .extensions() + .get() + .expect("RemoteAddr inserted by handler"); let proto_key = request .into_inner() .subscription_key @@ -537,9 +537,10 @@ impl BrokerService for Broker { &self, request: Request, ) -> std::result::Result, Status> { - let remote_addr = request - .remote_addr() - .expect("TCPConnectInfo inserted by handler"); + let &RemoteAddr(remote_addr) = request + .extensions() + .get() + .expect("RemoteAddr inserted by handler"); let proto_filter = request.into_inner(); let ttid_filter = 
proto_filter.tenant_timeline_id.as_ref(); @@ -628,6 +629,9 @@ async fn http1_handler( Ok(resp) } +#[derive(Clone, Copy)] +struct RemoteAddr(SocketAddr); + #[tokio::main] async fn main() -> Result<(), Box> { let args = Args::parse(); @@ -687,13 +691,13 @@ async fn main() -> Result<(), Box> { .max_concurrent_streams(None); let storage_broker_server_cloned = storage_broker_server.clone(); - let connect_info = stream.connect_info(); + let remote_addr = RemoteAddr(addr); let service_fn_ = async move { service_fn(move |mut req| { // That's what tonic's MakeSvc.call does to pass conninfo to // the request handler (and where its request.remote_addr() // expects it to find). - req.extensions_mut().insert(connect_info.clone()); + req.extensions_mut().insert(remote_addr); // Technically this second clone is not needed, but consume // by async block is apparently unavoidable. BTW, error @@ -738,11 +742,12 @@ async fn main() -> Result<(), Box> { #[cfg(test)] mod tests { - use super::*; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::sync::broadcast::error::TryRecvError; use utils::id::{TenantId, TimelineId}; + use super::*; + fn msg(timeline_id: Vec) -> Message { Message::SafekeeperTimelineInfo(SafekeeperTimelineInfo { safekeeper_id: 1, diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 3ac40f6e14..55d411f607 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -1,12 +1,11 @@ use std::time::Duration; -use tonic::codegen::StdError; -use tonic::transport::{ClientTlsConfig, Endpoint}; -use tonic::{transport::Channel, Status}; -use utils::id::{TenantId, TenantTimelineId, TimelineId}; -use proto::{ - broker_service_client::BrokerServiceClient, TenantTimelineId as ProtoTenantTimelineId, -}; +use proto::TenantTimelineId as ProtoTenantTimelineId; +use proto::broker_service_client::BrokerServiceClient; +use tonic::Status; +use tonic::codegen::StdError; +use tonic::transport::{Channel, ClientTlsConfig, Endpoint}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; // Code generated by protobuf. pub mod proto { @@ -20,11 +19,8 @@ pub mod proto { pub mod metrics; // Re-exports to avoid direct tonic dependency in user crates. -pub use tonic::Code; -pub use tonic::Request; -pub use tonic::Streaming; - pub use hyper::Uri; +pub use tonic::{Code, Request, Streaming}; pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}"); diff --git a/storage_broker/src/metrics.rs b/storage_broker/src/metrics.rs index 1fd3dd5ad6..ecfb594eba 100644 --- a/storage_broker/src/metrics.rs +++ b/storage_broker/src/metrics.rs @@ -1,6 +1,6 @@ //! Broker metrics. 
-use metrics::{register_int_counter, register_int_gauge, IntCounter, IntGauge}; +use metrics::{IntCounter, IntGauge, register_int_counter, register_int_gauge}; use once_cell::sync::Lazy; pub static NUM_PUBS: Lazy = Lazy::new(|| { diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 2f5d266567..b63ba154da 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_controller" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [[bin]] @@ -18,12 +18,14 @@ anyhow.workspace = true bytes.workspace = true chrono.workspace = true clap.workspace = true +cron.workspace = true fail.workspace = true futures.workspace = true hex.workspace = true hyper0.workspace = true humantime.workspace = true itertools.workspace = true +json-structural-diff.workspace = true lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true @@ -32,6 +34,11 @@ postgres_connection.workspace = true rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true +safekeeper_api.workspace = true +safekeeper_client.workspace = true +tikv-jemallocator.workspace = true +regex.workspace = true +rustls-native-certs.workspace = true serde.workspace = true serde_json.workspace = true thiserror.workspace = true @@ -39,20 +46,23 @@ tokio.workspace = true tokio-util.workspace = true tracing.workspace = true measured.workspace = true +rustls.workspace = true scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true +tokio-postgres.workspace = true +tokio-postgres-rustls.workspace = true -diesel = { version = "2.1.4", features = [ +diesel = { version = "2.2.6", features = [ "serde_json", - "postgres", - "r2d2", "chrono", ] } -diesel_migrations = { version = "2.1.0" } -r2d2 = { version = "0.8.10" } +diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] } +diesel_migrations = { version = "2.2.0" } +scoped-futures = "0.1.4" +http-utils = { path = "../libs/http-utils/" } utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } control_plane = { path = "../control_plane" } -workspace_hack = { version = "0.1", path = "../workspace_hack" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } \ No newline at end of file diff --git a/storage_controller/client/src/control_api.rs b/storage_controller/client/src/control_api.rs index a981b5020e..f8a2790769 100644 --- a/storage_controller/client/src/control_api.rs +++ b/storage_controller/client/src/control_api.rs @@ -1,7 +1,6 @@ use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; use reqwest::{Method, Url}; use serde::{de::DeserializeOwned, Serialize}; -use std::str::FromStr; pub struct Client { base_url: Url, @@ -31,16 +30,11 @@ impl Client { RQ: Serialize + Sized, RS: DeserializeOwned + Sized, { - // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out - // for general purpose API access. 
- let url = Url::from_str(&format!( - "http://{}:{}/{path}", - self.base_url.host_str().unwrap(), - self.base_url.port().unwrap() - )) - .unwrap(); - - let mut builder = self.client.request(method, url); + let request_path = self + .base_url + .join(&path) + .expect("Failed to build request path"); + let mut builder = self.client.request(method, request_path); if let Some(body) = body { builder = builder.json(&body) } diff --git a/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/down.sql b/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/down.sql new file mode 100644 index 0000000000..e26bff798f --- /dev/null +++ b/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/down.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers DROP scheduling_policy; diff --git a/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/up.sql b/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/up.sql new file mode 100644 index 0000000000..d83cc6cc46 --- /dev/null +++ b/storage_controller/migrations/2024-12-12-212515_safekeepers_scheduling_policy/up.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers ADD scheduling_policy VARCHAR NOT NULL DEFAULT 'disabled'; diff --git a/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql new file mode 100644 index 0000000000..c2624f858b --- /dev/null +++ b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql @@ -0,0 +1,4 @@ +-- this sadly isn't a "true" revert of the migration, as the column is now at the end of the table. +-- But preserving order is not a trivial operation. +-- https://wiki.postgresql.org/wiki/Alter_column_position +ALTER TABLE safekeepers ADD active BOOLEAN NOT NULL DEFAULT false; diff --git a/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql new file mode 100644 index 0000000000..d76f044eda --- /dev/null +++ b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers DROP active; diff --git a/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql new file mode 100644 index 0000000000..3c7126e343 --- /dev/null +++ b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql @@ -0,0 +1,2 @@ +ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'disabled'; +UPDATE safekeepers SET scheduling_policy = 'disabled' WHERE scheduling_policy = 'pause'; diff --git a/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql new file mode 100644 index 0000000000..9ff75444f3 --- /dev/null +++ b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql @@ -0,0 +1,2 @@ +ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'pause'; +UPDATE safekeepers SET scheduling_policy = 'pause' WHERE scheduling_policy = 'disabled'; diff --git a/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql new file mode 100644 index 
0000000000..0f051d3ac3 --- /dev/null +++ b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes DROP listen_https_port; diff --git a/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql new file mode 100644 index 0000000000..172237d477 --- /dev/null +++ b/storage_controller/migrations/2025-02-11-144848_pageserver_use_https/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ADD listen_https_port INTEGER; diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs index 6f1355eb68..a630316f46 100644 --- a/storage_controller/src/background_node_operations.rs +++ b/storage_controller/src/background_node_operations.rs @@ -1,9 +1,10 @@ -use std::{borrow::Cow, fmt::Debug, fmt::Display}; +use std::borrow::Cow; +use std::fmt::{Debug, Display}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; -pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32; +pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 64; #[derive(Copy, Clone)] pub(crate) struct Drain { diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 2b2ece3f02..b602af362d 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,20 +1,21 @@ +use std::borrow::Cow; +use std::collections::HashMap; use std::error::Error as _; use std::sync::Arc; -use std::{collections::HashMap, time::Duration}; +use std::time::Duration; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use futures::StreamExt; use hyper::StatusCode; +use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; -use utils::{ - backoff::{self}, - id::{NodeId, TenantId}, -}; +use tracing::{Instrument, info_span}; +use utils::backoff::{self}; +use utils::id::{NodeId, TenantId}; use crate::service::Config; @@ -28,6 +29,9 @@ struct UnshardedComputeHookTenant { // Which node is this tenant attached to node_id: NodeId, + // The tenant's preferred AZ, so that we may pass this on to the control plane + preferred_az: Option, + // Must hold this lock to send a notification. send_lock: Arc>>, } @@ -36,6 +40,9 @@ struct ShardedComputeHookTenant { shard_count: ShardCount, shards: Vec<(ShardNumber, NodeId)>, + // The tenant's preferred AZ, so that we may pass this on to the control plane + preferred_az: Option, + // Must hold this lock to send a notification. The contents represent // the last successfully sent notification, and are used to coalesce multiple // updates by only sending when there is a chance since our last successful send. 
@@ -64,17 +71,24 @@ enum ComputeHookTenant { impl ComputeHookTenant { /// Construct with at least one shard's information - fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self { + fn new( + tenant_shard_id: TenantShardId, + stripe_size: ShardStripeSize, + preferred_az: Option, + node_id: NodeId, + ) -> Self { if tenant_shard_id.shard_count.count() > 1 { Self::Sharded(ShardedComputeHookTenant { shards: vec![(tenant_shard_id.shard_number, node_id)], stripe_size, shard_count: tenant_shard_id.shard_count, + preferred_az, send_lock: Arc::default(), }) } else { Self::Unsharded(UnshardedComputeHookTenant { node_id, + preferred_az, send_lock: Arc::default(), }) } @@ -109,7 +123,10 @@ impl ComputeHookTenant { if let Some(shard_idx) = shard_idx { sharded.shards.remove(shard_idx); } else { - tracing::warn!("Shard not found while handling detach") + // This is a valid but niche case, where the tenant was previously attached + // as a Secondary location and then detached, so has no previously notified + // state. + tracing::info!("Shard not found while handling detach") } } ComputeHookTenant::Unsharded(_) => { @@ -120,15 +137,20 @@ impl ComputeHookTenant { /// Set one shard's location. If stripe size or shard count have changed, Self is reset /// and drops existing content. - fn update( - &mut self, - tenant_shard_id: TenantShardId, - stripe_size: ShardStripeSize, - node_id: NodeId, - ) { + fn update(&mut self, shard_update: ShardUpdate) { + let tenant_shard_id = shard_update.tenant_shard_id; + let node_id = shard_update.node_id; + let stripe_size = shard_update.stripe_size; + let preferred_az = shard_update.preferred_az; + match self { Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => { - unsharded_tenant.node_id = node_id + unsharded_tenant.node_id = node_id; + if unsharded_tenant.preferred_az.as_ref() + != preferred_az.as_ref().map(|az| az.as_ref()) + { + unsharded_tenant.preferred_az = preferred_az.map(|az| az.as_ref().clone()); + } } Self::Sharded(sharded_tenant) if sharded_tenant.stripe_size == stripe_size @@ -146,10 +168,21 @@ impl ComputeHookTenant { .push((tenant_shard_id.shard_number, node_id)); sharded_tenant.shards.sort_by_key(|s| s.0) } + + if sharded_tenant.preferred_az.as_ref() + != preferred_az.as_ref().map(|az| az.as_ref()) + { + sharded_tenant.preferred_az = preferred_az.map(|az| az.as_ref().clone()); + } } _ => { // Shard count changed: reset struct. 
- *self = Self::new(tenant_shard_id, stripe_size, node_id); + *self = Self::new( + tenant_shard_id, + stripe_size, + preferred_az.map(|az| az.into_owned()), + node_id, + ); } } } @@ -165,6 +198,7 @@ struct ComputeHookNotifyRequestShard { #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] struct ComputeHookNotifyRequest { tenant_id: TenantId, + preferred_az: Option, stripe_size: Option, shards: Vec, } @@ -190,7 +224,7 @@ pub(crate) enum NotifyError { // We shutdown while sending #[error("Shutting down")] ShuttingDown, - // A response indicates we will never succeed, such as 400 or 404 + // A response indicates we will never succeed, such as 400 or 403 #[error("Non-retryable error {0}")] Fatal(StatusCode), @@ -238,6 +272,10 @@ impl ComputeHookTenant { node_id: unsharded_tenant.node_id, }], stripe_size: None, + preferred_az: unsharded_tenant + .preferred_az + .as_ref() + .map(|az| az.0.clone()), }), Self::Sharded(sharded_tenant) if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => @@ -253,6 +291,7 @@ impl ComputeHookTenant { }) .collect(), stripe_size: Some(sharded_tenant.stripe_size), + preferred_az: sharded_tenant.preferred_az.as_ref().map(|az| az.0.clone()), }) } Self::Sharded(sharded_tenant) => { @@ -313,6 +352,17 @@ pub(super) struct ComputeHook { client: reqwest::Client, } +/// Callers may give us a list of these when asking us to send a bulk batch +/// of notifications in the background. This is a 'notification' in the sense of +/// other code notifying us of a shard's status, rather than being the final notification +/// that we send upwards to the control plane for the whole tenant. +pub(crate) struct ShardUpdate<'a> { + pub(crate) tenant_shard_id: TenantShardId, + pub(crate) node_id: NodeId, + pub(crate) stripe_size: ShardStripeSize, + pub(crate) preferred_az: Option>, +} + impl ComputeHook { pub(super) fn new(config: Config) -> Self { let authorization_header = config @@ -363,6 +413,7 @@ impl ComputeHook { tenant_id, shards, stripe_size, + preferred_az: _preferred_az, } = reconfigure_request; let compute_pageservers = shards @@ -503,24 +554,30 @@ impl ComputeHook { } /// Synchronous phase: update the per-tenant state for the next intended notification - fn notify_prepare( - &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - stripe_size: ShardStripeSize, - ) -> MaybeSendResult { + fn notify_prepare(&self, shard_update: ShardUpdate) -> MaybeSendResult { let mut state_locked = self.state.lock().unwrap(); use std::collections::hash_map::Entry; + let tenant_shard_id = shard_update.tenant_shard_id; + let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { - Entry::Vacant(e) => e.insert(ComputeHookTenant::new( - tenant_shard_id, - stripe_size, - node_id, - )), + Entry::Vacant(e) => { + let ShardUpdate { + tenant_shard_id, + node_id, + stripe_size, + preferred_az, + } = shard_update; + e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + preferred_az.map(|az| az.into_owned()), + node_id, + )) + } Entry::Occupied(e) => { let tenant = e.into_mut(); - tenant.update(tenant_shard_id, stripe_size, node_id); + tenant.update(shard_update); tenant } }; @@ -608,13 +665,14 @@ impl ComputeHook { /// if something failed. 
pub(super) fn notify_background( self: &Arc, - notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, + notifications: Vec, result_tx: tokio::sync::mpsc::Sender>, cancel: &CancellationToken, ) { let mut maybe_sends = Vec::new(); - for (tenant_shard_id, node_id, stripe_size) in notifications { - let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + for shard_update in notifications { + let tenant_shard_id = shard_update.tenant_shard_id; + let maybe_send_result = self.notify_prepare(shard_update); maybe_sends.push((tenant_shard_id, maybe_send_result)) } @@ -678,15 +736,14 @@ impl ComputeHook { /// periods, but we don't retry forever. The **caller** is responsible for handling failures and /// ensuring that they eventually call again to ensure that the compute is eventually notified of /// the proper pageserver nodes for a tenant. - #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] - pub(super) async fn notify( + #[tracing::instrument(skip_all, fields(tenant_id=%shard_update.tenant_shard_id.tenant_id, shard_id=%shard_update.tenant_shard_id.shard_slug(), node_id))] + pub(super) async fn notify<'a>( &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - stripe_size: ShardStripeSize, + shard_update: ShardUpdate<'a>, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + let tenant_shard_id = shard_update.tenant_shard_id; + let maybe_send_result = self.notify_prepare(shard_update); self.notify_execute(maybe_send_result, tenant_shard_id, cancel) .await } @@ -706,7 +763,10 @@ impl ComputeHook { let mut state_locked = self.state.lock().unwrap(); match state_locked.entry(tenant_shard_id.tenant_id) { Entry::Vacant(_) => { - tracing::warn!("Compute hook tenant not found for detach"); + // This is a valid but niche case, where the tenant was previously attached + // as a Secondary location and then detached, so has no previously notified + // state. + tracing::info!("Compute hook tenant not found for detach"); } Entry::Occupied(mut e) => { let sharded = e.get().is_sharded(); @@ -739,6 +799,7 @@ pub(crate) mod tests { shard_number: ShardNumber(0), }, ShardStripeSize(12345), + None, NodeId(1), ); @@ -765,30 +826,32 @@ pub(crate) mod tests { // Writing the first shard of a multi-sharded situation (i.e. 
in a split) // resets the tenant state and puts it in an non-notifying state (need to // see all shards) - tenant_state.update( - TenantShardId { + tenant_state.update(ShardUpdate { + tenant_shard_id: TenantShardId { tenant_id, shard_count: ShardCount::new(2), shard_number: ShardNumber(1), }, - ShardStripeSize(32768), - NodeId(1), - ); + stripe_size: ShardStripeSize(32768), + preferred_az: None, + node_id: NodeId(1), + }); assert!(matches!( tenant_state.maybe_send(tenant_id, None), MaybeSendResult::Noop )); // Writing the second shard makes it ready to notify - tenant_state.update( - TenantShardId { + tenant_state.update(ShardUpdate { + tenant_shard_id: TenantShardId { tenant_id, shard_count: ShardCount::new(2), shard_number: ShardNumber(0), }, - ShardStripeSize(32768), - NodeId(1), - ); + stripe_size: ShardStripeSize(32768), + preferred_az: None, + node_id: NodeId(1), + }); let send_result = tenant_state.maybe_send(tenant_id, None); let MaybeSendResult::Transmit((request, mut guard)) = send_result else { diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs index 47f4276ff2..bd4b8ba38f 100644 --- a/storage_controller/src/drain_utils.rs +++ b/storage_controller/src/drain_utils.rs @@ -1,15 +1,14 @@ -use std::{ - collections::{BTreeMap, HashMap}, - sync::Arc, -}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; use pageserver_api::controller_api::{NodeSchedulingPolicy, ShardSchedulingPolicy}; -use utils::{id::NodeId, shard::TenantShardId}; +use utils::id::NodeId; +use utils::shard::TenantShardId; -use crate::{ - background_node_operations::OperationError, node::Node, scheduler::Scheduler, - tenant_shard::TenantShard, -}; +use crate::background_node_operations::OperationError; +use crate::node::Node; +use crate::scheduler::Scheduler; +use crate::tenant_shard::TenantShard; pub(crate) struct TenantShardIterator { tenants_accessor: F, @@ -112,7 +111,7 @@ impl TenantShardDrain { } } - match scheduler.node_preferred(tenant_shard.intent.get_secondary()) { + match tenant_shard.preferred_secondary(scheduler) { Some(node) => Some(node), None => { tracing::warn!( @@ -188,10 +187,8 @@ impl TenantShardDrain { mod tests { use std::sync::Arc; - use utils::{ - id::TenantId, - shard::{ShardCount, ShardNumber, TenantShardId}, - }; + use utils::id::TenantId; + use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use super::TenantShardIterator; diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs index b7e66d33eb..56a331becd 100644 --- a/storage_controller/src/heartbeater.rs +++ b/storage_controller/src/heartbeater.rs @@ -1,23 +1,28 @@ -use futures::{stream::FuturesUnordered, StreamExt}; -use std::{ - collections::HashMap, - sync::Arc, - time::{Duration, Instant}, -}; -use tokio_util::sync::CancellationToken; - -use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization}; +use std::collections::HashMap; +use std::fmt::Debug; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, Instant}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; +use pageserver_api::controller_api::{NodeAvailability, SkSchedulingPolicy}; +use pageserver_api::models::PageserverUtilization; +use safekeeper_api::models::SafekeeperUtilization; +use safekeeper_client::mgmt_api; use thiserror::Error; +use tokio_util::sync::CancellationToken; use utils::id::NodeId; +use utils::logging::SecretString; use crate::node::Node; +use crate::safekeeper::Safekeeper; -struct HeartbeaterTask { - 
receiver: tokio::sync::mpsc::UnboundedReceiver, +struct HeartbeaterTask { + receiver: tokio::sync::mpsc::UnboundedReceiver>, cancel: CancellationToken, - state: HashMap, + state: HashMap, max_offline_interval: Duration, max_warming_up_interval: Duration, @@ -36,8 +41,17 @@ pub(crate) enum PageserverState { Offline, } +#[derive(Debug, Clone)] +pub(crate) enum SafekeeperState { + Available { + last_seen_at: Instant, + utilization: SafekeeperUtilization, + }, + Offline, +} + #[derive(Debug)] -pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>); +pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, State)>); #[derive(Debug, Error)] pub(crate) enum HeartbeaterError { @@ -45,23 +59,28 @@ pub(crate) enum HeartbeaterError { Cancel, } -struct HeartbeatRequest { - pageservers: Arc>, - reply: tokio::sync::oneshot::Sender>, +struct HeartbeatRequest { + servers: Arc>, + reply: tokio::sync::oneshot::Sender, HeartbeaterError>>, } -pub(crate) struct Heartbeater { - sender: tokio::sync::mpsc::UnboundedSender, +pub(crate) struct Heartbeater { + sender: tokio::sync::mpsc::UnboundedSender>, } -impl Heartbeater { +#[allow(private_bounds)] +impl Heartbeater +where + HeartbeaterTask: HeartBeat, +{ pub(crate) fn new( jwt_token: Option, max_offline_interval: Duration, max_warming_up_interval: Duration, cancel: CancellationToken, ) -> Self { - let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::(); + let (sender, receiver) = + tokio::sync::mpsc::unbounded_channel::>(); let mut heartbeater = HeartbeaterTask::new( receiver, jwt_token, @@ -76,12 +95,12 @@ impl Heartbeater { pub(crate) async fn heartbeat( &self, - pageservers: Arc>, - ) -> Result { + servers: Arc>, + ) -> Result, HeartbeaterError> { let (sender, receiver) = tokio::sync::oneshot::channel(); self.sender .send(HeartbeatRequest { - pageservers, + servers, reply: sender, }) .map_err(|_| HeartbeaterError::Cancel)?; @@ -93,9 +112,12 @@ impl Heartbeater { } } -impl HeartbeaterTask { +impl HeartbeaterTask +where + HeartbeaterTask: HeartBeat, +{ fn new( - receiver: tokio::sync::mpsc::UnboundedReceiver, + receiver: tokio::sync::mpsc::UnboundedReceiver>, jwt_token: Option, max_offline_interval: Duration, max_warming_up_interval: Duration, @@ -110,15 +132,19 @@ impl HeartbeaterTask { jwt_token, } } - async fn run(&mut self) { loop { tokio::select! { request = self.receiver.recv() => { match request { Some(req) => { - let res = self.heartbeat(req.pageservers).await; - req.reply.send(res).unwrap(); + if req.reply.is_closed() { + // Prevent a possibly infinite buildup of the receiver channel, if requests arrive faster than we can handle them + continue; + } + let res = self.heartbeat(req.servers).await; + // Ignore the return value in order to not panic if the heartbeat function's future was cancelled + _ = req.reply.send(res); }, None => { return; } } @@ -127,11 +153,20 @@ impl HeartbeaterTask { } } } +} +pub(crate) trait HeartBeat { + fn heartbeat( + &mut self, + pageservers: Arc>, + ) -> impl Future, HeartbeaterError>> + Send; +} + +impl HeartBeat for HeartbeaterTask { async fn heartbeat( &mut self, pageservers: Arc>, - ) -> Result { + ) -> Result, HeartbeaterError> { let mut new_state = HashMap::new(); let mut heartbeat_futs = FuturesUnordered::new(); @@ -186,21 +221,21 @@ impl HeartbeaterTask { Some((*node_id, status)) } }); + } - loop { - let maybe_status = tokio::select! 
{ - next = heartbeat_futs.next() => { - match next { - Some(result) => result, - None => { break; } - } - }, - _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } - }; + loop { + let maybe_status = tokio::select! { + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; - if let Some((node_id, status)) = maybe_status { - new_state.insert(node_id, status); - } + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); } } @@ -272,3 +307,130 @@ impl HeartbeaterTask { Ok(AvailablityDeltas(deltas)) } } + +impl HeartBeat for HeartbeaterTask { + async fn heartbeat( + &mut self, + safekeepers: Arc>, + ) -> Result, HeartbeaterError> { + let mut new_state = HashMap::new(); + + let mut heartbeat_futs = FuturesUnordered::new(); + for (node_id, sk) in &*safekeepers { + if sk.scheduling_policy() == SkSchedulingPolicy::Decomissioned { + continue; + } + heartbeat_futs.push({ + let jwt_token = self + .jwt_token + .as_ref() + .map(|t| SecretString::from(t.to_owned())); + let cancel = self.cancel.clone(); + + async move { + let response = sk + .with_client_retries( + |client| async move { client.get_utilization().await }, + &jwt_token, + 3, + 3, + Duration::from_secs(1), + &cancel, + ) + .await; + + let status = match response { + Ok(utilization) => SafekeeperState::Available { + last_seen_at: Instant::now(), + utilization, + }, + Err(mgmt_api::Error::Cancelled) => { + // This indicates cancellation of the request. + // We ignore the node in this case. + return None; + } + Err(e) => { + tracing::info!( + "Marking safekeeper {} at as offline: {e}", + sk.base_url() + ); + SafekeeperState::Offline + } + }; + + Some((*node_id, status)) + } + }); + } + + loop { + let maybe_status = tokio::select! { + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; + + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); + } + } + + let mut offline = 0; + for state in new_state.values() { + match state { + SafekeeperState::Offline { .. } => offline += 1, + SafekeeperState::Available { .. } => {} + } + } + + tracing::info!( + "Heartbeat round complete for {} safekeepers, {} offline", + new_state.len(), + offline + ); + + let mut deltas = Vec::new(); + let now = Instant::now(); + for (node_id, sk_state) in new_state.iter_mut() { + use std::collections::hash_map::Entry::*; + let entry = self.state.entry(*node_id); + + let mut needs_update = false; + match entry { + Occupied(ref occ) => match (occ.get(), &sk_state) { + (SafekeeperState::Offline, SafekeeperState::Offline) => {} + (SafekeeperState::Available { last_seen_at, .. }, SafekeeperState::Offline) => { + if now - *last_seen_at >= self.max_offline_interval { + deltas.push((*node_id, sk_state.clone())); + needs_update = true; + } + } + _ => { + deltas.push((*node_id, sk_state.clone())); + needs_update = true; + } + }, + Vacant(_) => { + // This is a new node. Don't generate a delta for it. 
+ deltas.push((*node_id, sk_state.clone())); + } + } + + match entry { + Occupied(mut occ) if needs_update => { + (*occ.get_mut()) = sk_state.clone(); + } + Vacant(vac) => { + vac.insert(sk_state.clone()); + } + _ => {} + } + } + + Ok(AvailablityDeltas(deltas)) + } +} diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 39e078ba7c..5b5ae80eaf 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1,64 +1,56 @@ -use crate::http; -use crate::metrics::{ - HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, - METRICS_REGISTRY, -}; -use crate::persistence::SafekeeperPersistence; -use crate::reconciler::ReconcileError; -use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use anyhow::Context; +use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; use futures::Future; +use http_utils::endpoint::{ + self, auth_middleware, check_permission_with, profile_cpu_handler, profile_heap_handler, + request_span, +}; +use http_utils::error::ApiError; +use http_utils::failpoints::failpoints_handler; +use http_utils::json::{json_request, json_response}; +use http_utils::request::{must_get_query_param, parse_query_param, parse_request_param}; +use http_utils::{RequestExt, RouterBuilder}; use hyper::header::CONTENT_TYPE; -use hyper::{Body, Request, Response}; -use hyper::{StatusCode, Uri}; +use hyper::{Body, Request, Response, StatusCode, Uri}; use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::controller_api::{ MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, - ShardsPreferredAzsRequest, TenantCreateRequest, + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest, + ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, }; use pageserver_api::models::{ - TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, - TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, + TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, + TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest, + TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; -use pageserver_client::{mgmt_api, BlockUnblock}; -use std::str::FromStr; -use std::sync::Arc; -use std::time::{Duration, Instant}; +use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; +use pageserver_client::{BlockUnblock, mgmt_api}; +use routerify::Middleware; use tokio_util::sync::CancellationToken; use utils::auth::{Scope, SwappableJwtAuth}; -use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::{auth_middleware, check_permission_with, request_span}; -use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param}; -use utils::id::{TenantId, TimelineId}; +use utils::id::{NodeId, TenantId, TimelineId}; -use utils::{ - http::{ - endpoint::{self}, - error::ApiError, - json::{json_request, json_response}, - RequestExt, RouterBuilder, - }, - id::NodeId, +use crate::http; +use crate::metrics::{ + HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, METRICS_REGISTRY, + PageserverRequestLabelGroup, }; - -use pageserver_api::controller_api::{ - NodeAvailability, 
NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, - TenantShardMigrateRequest, -}; -use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; - -use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; - -use routerify::Middleware; +use crate::persistence::SafekeeperUpsert; +use crate::reconciler::ReconcileError; +use crate::service::{LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT, Service}; /// State available to HTTP request handlers pub struct HttpState { service: Arc, auth: Option>, neon_metrics: NeonMetrics, - allowlist_routes: Vec, + allowlist_routes: &'static [&'static str], } impl HttpState { @@ -67,15 +59,17 @@ impl HttpState { auth: Option>, build_info: BuildInfo, ) -> Self { - let allowlist_routes = ["/status", "/ready", "/metrics"] - .iter() - .map(|v| v.parse().unwrap()) - .collect::>(); Self { service, auth, neon_metrics: NeonMetrics::new(build_info), - allowlist_routes, + allowlist_routes: &[ + "/status", + "/ready", + "/metrics", + "/profile/cpu", + "/profile/heap", + ], } } } @@ -208,6 +202,27 @@ async fn handle_tenant_location_config( ) } +async fn handle_tenant_config_patch( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::PageServerApi)?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let config_req = json_request::(&mut req).await?; + + json_response( + StatusCode::OK, + service.tenant_config_patch(config_req).await?, + ) +} + async fn handle_tenant_config_set( service: Arc, req: Request, @@ -499,6 +514,35 @@ async fn handle_tenant_timeline_block_unblock_gc( json_response(StatusCode::OK, ()) } +async fn handle_tenant_timeline_download_heatmap_layers( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + let concurrency: Option = parse_query_param(&req, "concurrency")?; + + service + .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await?; + + json_response(StatusCode::OK, ()) +} + +// For metric labels where we would like to include the approximate path, but exclude high-cardinality fields like query parameters +// and tenant/timeline IDs. Since we are proxying to arbitrary paths, we don't have routing templates to +// compare to, so we can just filter out our well known ID format with regexes. +fn path_without_ids(path: &str) -> String { + static ID_REGEX: std::sync::OnceLock = std::sync::OnceLock::new(); + ID_REGEX + .get_or_init(|| regex::Regex::new(r"([0-9a-fA-F]{32}(-[0-9]{4})?|\?.*)").unwrap()) + .replace_all(path, "") + .to_string() +} + async fn handle_tenant_timeline_passthrough( service: Arc, req: Request, @@ -534,10 +578,7 @@ async fn handle_tenant_timeline_passthrough( .metrics_group .storage_controller_passthrough_request_latency; - // This is a bit awkward. We remove the param from the request - // and join the words by '_' to get a label for the request. 
- let just_path = path.replace(&tenant_shard_str, ""); - let path_label = just_path + let path_label = path_without_ids(&path) .split('/') .filter(|token| !token.is_empty()) .collect::>() @@ -550,7 +591,10 @@ async fn handle_tenant_timeline_passthrough( let _timer = latency.start_timer(labels.clone()); - let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref()); + let client = mgmt_api::Client::new( + node.base_url(), + service.get_config().pageserver_jwt_token.as_deref(), + ); let resp = client.get_raw(path).await.map_err(|e| // We return 503 here because if we can't successfully send a request to the pageserver, // either we aren't available or the pageserver is unavailable. @@ -631,6 +675,10 @@ async fn handle_tenant_list( ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let limit: Option = parse_query_param(&req, "limit")?; + let start_after: Option = parse_query_param(&req, "start_after")?; + tracing::info!("start_after: {:?}", start_after); + match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { return res; @@ -638,7 +686,7 @@ async fn handle_tenant_list( ForwardOutcome::NotForwarded(_req) => {} }; - json_response(StatusCode::OK, service.tenant_list()) + json_response(StatusCode::OK, service.tenant_list(limit, start_after)) } async fn handle_node_register(req: Request) -> Result, ApiError> { @@ -668,7 +716,8 @@ async fn handle_node_list(req: Request) -> Result, ApiError }; let state = get_state(&req); - let nodes = state.service.node_list().await?; + let mut nodes = state.service.node_list().await?; + nodes.sort_by_key(|n| n.get_id()); let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); json_response(StatusCode::OK, api_nodes) @@ -857,6 +906,21 @@ async fn handle_cancel_node_fill(req: Request) -> Result, A json_response(StatusCode::ACCEPTED, ()) } +async fn handle_safekeeper_list(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Infra)?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + let safekeepers = state.service.safekeepers_list().await?; + json_response(StatusCode::OK, safekeepers) +} + async fn handle_metadata_health_update(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Scrubber)?; @@ -968,6 +1032,29 @@ async fn handle_tenant_shard_migrate( ) } +async fn handle_tenant_shard_migrate_secondary( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + let migrate_req = json_request::(&mut req).await?; + json_response( + StatusCode::OK, + service + .tenant_shard_migrate_secondary(tenant_shard_id, migrate_req) + .await?, + ) +} + async fn handle_tenant_shard_cancel_reconcile( service: Arc, req: Request, @@ -1181,7 +1268,7 @@ impl From for ApiError { /// /// Not used by anything except manual testing. 
async fn handle_get_safekeeper(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let id = parse_request_param::(&req, "id")?; @@ -1199,7 +1286,7 @@ async fn handle_get_safekeeper(req: Request) -> Result, Api match res { Ok(b) => json_response(StatusCode::OK, b), Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => { - Err(ApiError::NotFound("unknown instance_id".into())) + Err(ApiError::NotFound("unknown instance id".into())) } Err(other) => Err(other.into()), } @@ -1212,7 +1299,7 @@ async fn handle_get_safekeeper(req: Request) -> Result, Api async fn handle_upsert_safekeeper(mut req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Infra)?; - let body = json_request::(&mut req).await?; + let body = json_request::(&mut req).await?; let id = parse_request_param::(&req, "id")?; if id != body.id { @@ -1240,6 +1327,32 @@ async fn handle_upsert_safekeeper(mut req: Request) -> Result, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let body = json_request::(&mut req).await?; + let id = parse_request_param::(&req, "id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + + state + .service + .set_safekeeper_scheduling_policy(id, body.scheduling_policy) + .await?; + + json_response(StatusCode::OK, ()) +} + /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only /// be allowed to run if Service has finished its initial reconciliation. async fn tenant_service_handler( @@ -1301,23 +1414,26 @@ pub fn prologue_leadership_status_check_middleware< let state = get_state(&req); let leadership_status = state.service.get_leadership_status(); - enum AllowedRoutes<'a> { + enum AllowedRoutes { All, - Some(Vec<&'a str>), + Some(&'static [&'static str]), } let allowed_routes = match leadership_status { LeadershipStatus::Leader => AllowedRoutes::All, LeadershipStatus::SteppedDown => AllowedRoutes::All, - LeadershipStatus::Candidate => { - AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) - } + LeadershipStatus::Candidate => AllowedRoutes::Some(&[ + "/ready", + "/status", + "/metrics", + "/profile/cpu", + "/profile/heap", + ]), }; - let uri = req.uri().to_string(); match allowed_routes { AllowedRoutes::All => Ok(req), - AllowedRoutes::Some(allowed) if allowed.contains(&uri.as_str()) => Ok(req), + AllowedRoutes::Some(allowed) if allowed.contains(&req.uri().path()) => Ok(req), _ => { tracing::info!( "Request {} not allowed due to current leadership state", @@ -1332,8 +1448,8 @@ pub fn prologue_leadership_status_check_middleware< }) } -fn prologue_metrics_middleware( -) -> Middleware { +fn prologue_metrics_middleware() +-> Middleware { Middleware::pre(move |req| async move { let meta = RequestMeta { method: req.method().clone(), @@ -1346,8 +1462,8 @@ fn prologue_metrics_middleware }) } -fn epilogue_metrics_middleware( -) -> Middleware { +fn epilogue_metrics_middleware() +-> Middleware { Middleware::post_with_info(move |resp, req_info| async move { let request_name = match req_info.context::() { Some(name) => name, @@ -1426,7 +1542,8 @@ enum ForwardOutcome { /// Potentially forward the request to the current storage controler leader. /// More specifically we forward when: -/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"] +/// 1. 
Request is not one of: +/// ["/control/v1/step_down", "/status", "/ready", "/metrics", "/profile/cpu", "/profile/heap"] /// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state /// 3. There is a leader in the database to forward to /// 4. Leader from step (3) is not the current instance @@ -1447,10 +1564,17 @@ enum ForwardOutcome { /// Hence, if we are in the edge case scenario the leader persisted in the database is the /// stepped down instance that received the request. Condition (4) above covers this scenario. async fn maybe_forward(req: Request) -> ForwardOutcome { - const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"]; + const NOT_FOR_FORWARD: &[&str] = &[ + "/control/v1/step_down", + "/status", + "/ready", + "/metrics", + "/profile/cpu", + "/profile/heap", + ]; - let uri = req.uri().to_string(); - let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str()); + let uri = req.uri(); + let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.path()); // Fast return before trying to take any Service locks, if we will never forward anyway if !uri_for_forward { @@ -1490,8 +1614,8 @@ async fn maybe_forward(req: Request) -> ForwardOutcome { Err(err) => { return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError( anyhow::anyhow!( - "Failed to parse leader uri for forwarding while in stepped down state: {err}" - ), + "Failed to parse leader uri for forwarding while in stepped down state: {err}" + ), ))); } }; @@ -1650,7 +1774,7 @@ pub fn make_router( if auth.is_some() { router = router.middleware(auth_middleware(|request| { let state = get_state(request); - if state.allowlist_routes.contains(request.uri()) { + if state.allowlist_routes.contains(&request.uri().path()) { None } else { state.auth.as_deref() @@ -1663,13 +1787,19 @@ pub fn make_router( .get("/metrics", |r| { named_request_span(r, measured_metrics_handler, RequestName("metrics")) }) - // Non-prefixed generic endpoints (status, metrics) + // Non-prefixed generic endpoints (status, metrics, profiling) .get("/status", |r| { named_request_span(r, handle_status, RequestName("status")) }) .get("/ready", |r| { named_request_span(r, handle_ready, RequestName("ready")) }) + .get("/profile/cpu", |r| { + named_request_span(r, profile_cpu_handler, RequestName("profile_cpu")) + }) + .get("/profile/heap", |r| { + named_request_span(r, profile_heap_handler, RequestName("profile_heap")) + }) // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix .post("/upcall/v1/re-attach", |r| { named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach")) @@ -1795,6 +1925,32 @@ pub fn make_router( RequestName("control_v1_metadata_health_list_outdated"), ) }) + // Safekeepers + .get("/control/v1/safekeeper", |r| { + named_request_span( + r, + handle_safekeeper_list, + RequestName("control_v1_safekeeper_list"), + ) + }) + .get("/control/v1/safekeeper/:id", |r| { + named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper")) + }) + .post("/control/v1/safekeeper/:id", |r| { + // id is in the body + named_request_span( + r, + handle_upsert_safekeeper, + RequestName("v1_safekeeper_post"), + ) + }) + .post("/control/v1/safekeeper/:id/scheduling_policy", |r| { + named_request_span( + r, + handle_safekeeper_scheduling_policy, + RequestName("v1_safekeeper_status"), + ) + }) // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { tenant_service_handler( @@ -1803,6 +1959,16 @@ pub fn make_router( 
RequestName("control_v1_tenant_migrate"), ) }) + .put( + "/control/v1/tenant/:tenant_shard_id/migrate_secondary", + |r| { + tenant_service_handler( + r, + handle_tenant_shard_migrate_secondary, + RequestName("control_v1_tenant_migrate_secondary"), + ) + }, + ) .put( "/control/v1/tenant/:tenant_shard_id/cancel_reconcile", |r| { @@ -1847,13 +2013,6 @@ pub fn make_router( .put("/control/v1/step_down", |r| { named_request_span(r, handle_step_down, RequestName("control_v1_step_down")) }) - .get("/control/v1/safekeeper/:id", |r| { - named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper")) - }) - .post("/control/v1/safekeeper/:id", |r| { - // id is in the body - named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper")) - }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. @@ -1863,6 +2022,13 @@ pub fn make_router( .delete("/v1/tenant/:tenant_id", |r| { tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant")) }) + .patch("/v1/tenant/config", |r| { + tenant_service_handler( + r, + handle_tenant_config_patch, + RequestName("v1_tenant_config"), + ) + }) .put("/v1/tenant/config", |r| { tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config")) }) @@ -1945,6 +2111,16 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_download_heatmap_layers, + RequestName("v1_tenant_timeline_download_heatmap_layers"), + ) + }, + ) // Tenant detail GET passthrough to shard zero: .get("/v1/tenant/:tenant_id", |r| { tenant_service_handler( @@ -1964,3 +2140,31 @@ pub fn make_router( ) }) } + +#[cfg(test)] +mod test { + + use super::path_without_ids; + + #[test] + fn test_path_without_ids() { + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788" + ), + "/v1/tenant//timeline/" + ); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788" + ), + "/v1/tenant//timeline/" + ); + assert_eq!( + path_without_ids( + "/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo" + ), + "/v1/tenant//timeline/" + ); + } +} diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs index fcd3eb57e2..6b0c16f0be 100644 --- a/storage_controller/src/id_lock_map.rs +++ b/storage_controller/src/id_lock_map.rs @@ -1,8 +1,7 @@ +use std::collections::HashMap; use std::fmt::Display; -use std::time::Instant; -use std::{collections::HashMap, sync::Arc}; - -use std::time::Duration; +use std::sync::Arc; +use std::time::{Duration, Instant}; use crate::service::RECONCILE_TIMEOUT; @@ -112,6 +111,14 @@ where } } + pub(crate) fn try_exclusive(&self, key: T, operation: I) -> Option> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default().clone(); + let mut guard = TracingExclusiveGuard::new(entry.try_write_owned().ok()?); + *guard.guard = Some(operation); + Some(guard) + } + /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do /// periodic housekeeping to avoid the map growing indefinitely pub(crate) fn housekeeping(&self) { diff --git a/storage_controller/src/leadership.rs 
b/storage_controller/src/leadership.rs index 5fae8991ec..5e1d6f3ec9 100644 --- a/storage_controller/src/leadership.rs +++ b/storage_controller/src/leadership.rs @@ -3,11 +3,9 @@ use std::sync::Arc; use hyper::Uri; use tokio_util::sync::CancellationToken; -use crate::{ - peer_client::{GlobalObservedState, PeerClient}, - persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence}, - service::Config, -}; +use crate::peer_client::{GlobalObservedState, PeerClient}; +use crate::persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence}; +use crate::service::Config; /// Helper for storage controller leadership acquisition pub(crate) struct Leadership { @@ -91,7 +89,9 @@ impl Leadership { // Special case: if this is a brand new storage controller, migrations will not // have run at this point yet, and, hence, the controllers table does not exist. // Detect this case via the error string (diesel doesn't type it) and allow it. - tracing::info!("Detected first storage controller start-up. Allowing missing controllers table ..."); + tracing::info!( + "Detected first storage controller start-up. Allowing missing controllers table ..." + ); return Ok(None); } } diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index f5823935e1..5f2c081927 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -17,6 +17,8 @@ mod pageserver_client; mod peer_client; pub mod persistence; mod reconciler; +mod safekeeper; +mod safekeeper_client; mod scheduler; mod schema; pub mod service; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 801409d612..04dd3bb3f6 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,31 +1,42 @@ -use anyhow::{anyhow, Context}; -use clap::Parser; -use hyper0::Uri; -use metrics::launch_timestamp::LaunchTimestamp; -use metrics::BuildInfo; use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; + +use anyhow::{Context, anyhow}; +use clap::Parser; +use hyper0::Uri; +use metrics::BuildInfo; +use metrics::launch_timestamp::LaunchTimestamp; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ - Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, - MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, + MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, + PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, Service, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; - use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. 
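As a rough aside on the profiling knobs used in the `malloc_conf` static below: jemalloc's `lg_prof_sample` is the base-2 logarithm of the average number of bytes allocated between two sampled stack traces, so the value 21 gives the ~2 MiB interval mentioned in the comment above. The following standalone program is a hypothetical illustration only (none of these names exist in the patch):

```rust
// Hypothetical illustration of how the jemalloc options below fit together.
// `lg_prof_sample` is log2 of the average allocation volume (in bytes)
// between two sampled stack traces.
fn main() {
    let lg_prof_sample: u32 = 21;
    let sample_interval_bytes: u64 = 1 << lg_prof_sample;
    assert_eq!(sample_interval_bytes, 2 * 1024 * 1024); // ~2 MiB, as stated above

    // jemalloc parses the option string as a C string, hence the trailing NUL
    // byte on the real `malloc_conf` static.
    let conf = format!("prof:true,prof_active:true,lg_prof_sample:{lg_prof_sample}\0");
    println!("{sample_interval_bytes} bytes between samples, conf = {conf:?}");
}
```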
+#[allow(non_upper_case_globals)] +#[unsafe(export_name = "malloc_conf")] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; + #[derive(Parser)] #[command(author, version, about, long_about = None)] #[command(arg_required_else_help(true))] @@ -42,6 +53,10 @@ struct Cli { #[arg(long)] jwt_token: Option, + /// Token for authenticating this service with the safekeepers it controls + #[arg(long)] + safekeeper_jwt_token: Option, + /// Token for authenticating this service with the control plane, when calling /// the compute notification endpoint #[arg(long)] @@ -75,10 +90,14 @@ struct Cli { #[arg(long)] split_threshold: Option, - /// Maximum number of reconcilers that may run in parallel + /// Maximum number of normal-priority reconcilers that may run in parallel #[arg(long)] reconciler_concurrency: Option, + /// Maximum number of high-priority reconcilers that may run in parallel + #[arg(long)] + priority_reconciler_concurrency: Option, + /// How long to wait for the initial database connection to be available. #[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, @@ -96,10 +115,14 @@ struct Cli { #[arg(long)] neon_local_repo_dir: Option, - /// Chaos testing + /// Chaos testing: exercise tenant migrations #[arg(long)] chaos_interval: Option, + /// Chaos testing: exercise an immediate exit + #[arg(long)] + chaos_exit_crontab: Option, + // Maximum acceptable lag for the secondary location while draining // a pageserver #[arg(long)] @@ -111,6 +134,10 @@ struct Cli { #[arg(long)] long_reconcile_threshold: Option, + + // Flag to use https for requests to pageserver API. + #[arg(long, default_value = "false")] + use_https_pageserver_api: bool, } enum StrictMode { @@ -134,7 +161,8 @@ impl Default for StrictMode { struct Secrets { database_url: String, public_key: Option, - jwt_token: Option, + pageserver_jwt_token: Option, + safekeeper_jwt_token: Option, control_plane_jwt_token: Option, peer_jwt_token: Option, } @@ -142,6 +170,7 @@ struct Secrets { impl Secrets { const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; + const SAFEKEEPER_JWT_TOKEN_ENV: &'static str = "SAFEKEEPER_JWT_TOKEN"; const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN"; const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY"; @@ -165,7 +194,14 @@ impl Secrets { let this = Self { database_url, public_key, - jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV), + pageserver_jwt_token: Self::load_secret( + &args.jwt_token, + Self::PAGESERVER_JWT_TOKEN_ENV, + ), + safekeeper_jwt_token: Self::load_secret( + &args.safekeeper_jwt_token, + Self::SAFEKEEPER_JWT_TOKEN_ENV, + ), control_plane_jwt_token: Self::load_secret( &args.control_plane_jwt_token, Self::CONTROL_PLANE_JWT_TOKEN_ENV, @@ -245,18 +281,24 @@ async fn async_main() -> anyhow::Result<()> { let secrets = Secrets::load(&args).await?; + // TODO: once we've rolled out the safekeeper JWT token everywhere, put it into the validation code below + tracing::info!( + "safekeeper_jwt_token set: {:?}", + secrets.safekeeper_jwt_token.is_some() + ); + // Validate required secrets and arguments are provided in strict mode match strict_mode { StrictMode::Strict if (secrets.public_key.is_none() - || secrets.jwt_token.is_none() + || secrets.pageserver_jwt_token.is_none() || secrets.control_plane_jwt_token.is_none()) => { // Production systems should always have secrets 
configured: if public_key was not set // then we would implicitly disable auth. anyhow::bail!( - "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" - ); + "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" + ); } StrictMode::Strict if args.compute_hook_url.is_none() => { // Production systems should always have a compute hook set, to prevent falling @@ -274,7 +316,8 @@ async fn async_main() -> anyhow::Result<()> { } let config = Config { - jwt_token: secrets.jwt_token, + pageserver_jwt_token: secrets.pageserver_jwt_token, + safekeeper_jwt_token: secrets.safekeeper_jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, peer_jwt_token: secrets.peer_jwt_token, compute_hook_url: args.compute_hook_url, @@ -289,6 +332,9 @@ async fn async_main() -> anyhow::Result<()> { reconciler_concurrency: args .reconciler_concurrency .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), + priority_reconciler_concurrency: args + .priority_reconciler_concurrency + .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, @@ -303,12 +349,13 @@ async fn async_main() -> anyhow::Result<()> { address_for_peers: args.address_for_peers, start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, + use_https_pageserver_api: args.use_https_pageserver_api, }; // Validate that we can connect to the database Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; - let persistence = Arc::new(Persistence::new(secrets.database_url)); + let persistence = Arc::new(Persistence::new(secrets.database_url).await); let service = Service::spawn(config, persistence.clone()).await?; @@ -320,7 +367,7 @@ async fn async_main() -> anyhow::Result<()> { let router = make_router(service.clone(), auth, build_info) .build() .map_err(|err| anyhow!(err))?; - let router_service = utils::http::RouterService::new(router).unwrap(); + let router_service = http_utils::RouterService::new(router).unwrap(); // Start HTTP server let server_shutdown = CancellationToken::new(); @@ -339,10 +386,12 @@ async fn async_main() -> anyhow::Result<()> { let service = service.clone(); let cancel = CancellationToken::new(); let cancel_bg = cancel.clone(); + let chaos_exit_crontab = args.chaos_exit_crontab; ( tokio::task::spawn( async move { - let mut chaos_injector = ChaosInjector::new(service, interval.into()); + let mut chaos_injector = + ChaosInjector::new(service, interval.into(), chaos_exit_crontab); chaos_injector.run(cancel_bg).await } .instrument(tracing::info_span!("chaos_injector")), diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 6d5885eba6..f490edb68f 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -7,17 +7,18 @@ //! //! The rest of the code defines label group types and deals with converting outer types to labels. //! 
+use std::sync::Mutex; + use bytes::Bytes; -use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use measured::label::LabelValue; +use measured::metric::histogram; +use measured::{FixedCardinalityLabel, MetricGroup}; use metrics::NeonMetrics; use once_cell::sync::Lazy; -use std::sync::Mutex; use strum::IntoEnumIterator; -use crate::{ - persistence::{DatabaseError, DatabaseOperation}, - service::LeadershipStatus, -}; +use crate::persistence::{DatabaseError, DatabaseOperation}; +use crate::service::LeadershipStatus; pub(crate) static METRICS_REGISTRY: Lazy = Lazy::new(StorageControllerMetrics::default); @@ -53,6 +54,16 @@ pub(crate) struct StorageControllerMetricGroup { /// How many shards are not scheduled into their preferred AZ pub(crate) storage_controller_schedule_az_violation: measured::Gauge, + /// How many shard locations (secondary or attached) on each node + pub(crate) storage_controller_node_shards: measured::GaugeVec, + + /// How many _attached_ shard locations on each node + pub(crate) storage_controller_node_attached_shards: measured::GaugeVec, + + /// How many _home_ shard locations on each node (i.e. the node's AZ matches the shard's + /// preferred AZ) + pub(crate) storage_controller_node_home_shards: measured::GaugeVec, + /// How many shards would like to reconcile but were blocked by concurrency limits pub(crate) storage_controller_pending_reconciles: measured::Gauge, @@ -70,6 +81,11 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_pageserver_request_error: measured::CounterVec, + /// Count of HTTP requests to the safekeeper that resulted in an error, + /// broken down by the safekeeper node id, request name and method + pub(crate) storage_controller_safekeeper_request_error: + measured::CounterVec, + /// Latency of HTTP requests to the pageserver, broken down by pageserver /// node id, request name and method. This include both successful and unsuccessful /// requests. @@ -77,6 +93,13 @@ pub(crate) struct StorageControllerMetricGroup { pub(crate) storage_controller_pageserver_request_latency: measured::HistogramVec, + /// Latency of HTTP requests to the safekeeper, broken down by safekeeper + /// node id, request name and method. This include both successful and unsuccessful + /// requests. 
+ #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] + pub(crate) storage_controller_safekeeper_request_latency: + measured::HistogramVec, + /// Count of pass-through HTTP requests to the pageserver that resulted in an error, /// broken down by the pageserver node id, request name and method pub(crate) storage_controller_passthrough_request_error: @@ -132,6 +155,15 @@ impl Default for StorageControllerMetrics { } } +#[derive(measured::LabelGroup, Clone)] +#[label(set = NodeLabelGroupSet)] +pub(crate) struct NodeLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) az: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) node_id: &'a str, +} + #[derive(measured::LabelGroup)] #[label(set = ReconcileCompleteLabelGroupSet)] pub(crate) struct ReconcileCompleteLabelGroup { diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 4cc9b0070d..bc7fe8802a 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -1,21 +1,22 @@ -use std::{str::FromStr, time::Duration}; +use std::str::FromStr; +use std::time::Duration; -use pageserver_api::{ - controller_api::{ - AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, - NodeSchedulingPolicy, TenantLocateResponseShard, - }, - shard::TenantShardId, +use anyhow::anyhow; +use pageserver_api::controller_api::{ + AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, + NodeSchedulingPolicy, TenantLocateResponseShard, }; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use reqwest::StatusCode; use serde::Serialize; use tokio_util::sync::CancellationToken; -use utils::{backoff, id::NodeId}; +use utils::backoff; +use utils::id::NodeId; -use crate::{ - pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule, -}; +use crate::pageserver_client::PageserverClient; +use crate::persistence::NodePersistence; +use crate::scheduler::MaySchedule; /// Represents the in-memory description of a Node. /// @@ -32,12 +33,16 @@ pub(crate) struct Node { listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, listen_pg_addr: String, listen_pg_port: u16, availability_zone_id: AvailabilityZone, + // Flag from storcon's config to use https for pageserver admin API. + // Invariant: if |true|, listen_https_port should contain a value. + use_https: bool, // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. #[serde(skip)] @@ -56,7 +61,16 @@ pub(crate) enum AvailabilityTransition { impl Node { pub(crate) fn base_url(&self) -> String { - format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + if self.use_https { + format!( + "https://{}:{}", + self.listen_http_addr, + self.listen_https_port + .expect("https port should be specified if use_https is on") + ) + } else { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } } pub(crate) fn get_id(&self) -> NodeId { @@ -82,11 +96,20 @@ impl Node { self.id == register_req.node_id && self.listen_http_addr == register_req.listen_http_addr && self.listen_http_port == register_req.listen_http_port + // Note: listen_https_port may change. See [`Self::need_update`] for mode details. 
+ // && self.listen_https_port == register_req.listen_https_port && self.listen_pg_addr == register_req.listen_pg_addr && self.listen_pg_port == register_req.listen_pg_port && self.availability_zone_id == register_req.availability_zone_id } + // Do we need to update an existing record in DB on this registration request? + pub(crate) fn need_update(&self, register_req: &NodeRegisterRequest) -> bool { + // listen_https_port is checked here because it may change during migration to https. + // After migration, this check may be moved to registration_match. + self.listen_https_port != register_req.listen_https_port + } + /// For a shard located on this node, populate a response object /// with this node's address information. pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard { @@ -95,6 +118,7 @@ impl Node { node_id: self.id, listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, + listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, } @@ -175,25 +199,34 @@ impl Node { } } + #[allow(clippy::too_many_arguments)] pub(crate) fn new( id: NodeId, listen_http_addr: String, listen_http_port: u16, + listen_https_port: Option, listen_pg_addr: String, listen_pg_port: u16, availability_zone_id: AvailabilityZone, - ) -> Self { - Self { + use_https: bool, + ) -> anyhow::Result { + if use_https && listen_https_port.is_none() { + return Err(anyhow!("https is enabled, but node has no https port")); + } + + Ok(Self { id, listen_http_addr, listen_http_port, + listen_https_port, listen_pg_addr, listen_pg_port, scheduling: NodeSchedulingPolicy::Active, availability: NodeAvailability::Offline, availability_zone_id, + use_https, cancel: CancellationToken::new(), - } + }) } pub(crate) fn to_persistent(&self) -> NodePersistence { @@ -202,14 +235,19 @@ impl Node { scheduling_policy: self.scheduling.into(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port as i32, + listen_https_port: self.listen_https_port.map(|x| x as i32), listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port as i32, availability_zone_id: self.availability_zone_id.0.clone(), } } - pub(crate) fn from_persistent(np: NodePersistence) -> Self { - Self { + pub(crate) fn from_persistent(np: NodePersistence, use_https: bool) -> anyhow::Result { + if use_https && np.listen_https_port.is_none() { + return Err(anyhow!("https is enabled, but node has no https port")); + } + + Ok(Self { id: NodeId(np.node_id as u64), // At startup we consider a node offline until proven otherwise. 
availability: NodeAvailability::Offline, @@ -217,11 +255,13 @@ impl Node { .expect("Bad scheduling policy in DB"), listen_http_addr: np.listen_http_addr, listen_http_port: np.listen_http_port as u16, + listen_https_port: np.listen_https_port.map(|x| x as u16), listen_pg_addr: np.listen_pg_addr, listen_pg_port: np.listen_pg_port as u16, availability_zone_id: AvailabilityZone(np.availability_zone_id), + use_https, cancel: CancellationToken::new(), - } + }) } /// Wrapper for issuing requests to pageserver management API: takes care of generic @@ -285,8 +325,9 @@ impl Node { warn_threshold, max_retries, &format!( - "Call to node {} ({}:{}) management API", - self.id, self.listen_http_addr, self.listen_http_port + "Call to node {} ({}) management API", + self.id, + self.base_url(), ), cancel, ) @@ -299,8 +340,10 @@ impl Node { id: self.id, availability: self.availability.clone().into(), scheduling: self.scheduling, + availability_zone_id: self.availability_zone_id.0.clone(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, + listen_https_port: self.listen_https_port, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port, } diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index b19cbc4fa3..e9c54414a3 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,16 +1,13 @@ -use pageserver_api::{ - models::{ - detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, - PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, - TenantShardSplitRequest, TenantShardSplitResponse, TimelineArchivalConfigRequest, - TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, - }, - shard::TenantShardId, -}; -use pageserver_client::{ - mgmt_api::{Client, Result}, - BlockUnblock, +use pageserver_api::models::detach_ancestor::AncestorDetached; +use pageserver_api::models::{ + LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, + TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, + TenantWaitLsnRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, + TopTenantShardsRequest, TopTenantShardsResponse, }; +use pageserver_api::shard::TenantShardId; +use pageserver_client::BlockUnblock; +use pageserver_client::mgmt_api::{Client, Result}; use reqwest::StatusCode; use utils::id::{NodeId, TenantId, TimelineId}; @@ -279,6 +276,22 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<()> { + measured_request!( + "download_heatmap_layers", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", @@ -299,4 +312,17 @@ impl PageserverClient { self.inner.top_tenant_shards(request).await ) } + + pub(crate) async fn wait_lsn( + &self, + tenant_shard_id: TenantShardId, + request: TenantWaitLsnRequest, + ) -> Result { + measured_request!( + "wait_lsn", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.wait_lsn(tenant_shard_id, request).await + ) + } } diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index 
ee4eb55294..f3f275dee0 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -1,14 +1,16 @@ -use crate::tenant_shard::ObservedState; -use pageserver_api::shard::TenantShardId; -use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::error::Error as _; use std::time::Duration; -use tokio_util::sync::CancellationToken; +use http_utils::error::HttpErrorBody; use hyper::Uri; +use pageserver_api::shard::TenantShardId; use reqwest::{StatusCode, Url}; -use utils::{backoff, http::error::HttpErrorBody}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use utils::backoff; + +use crate::tenant_shard::ObservedState; #[derive(Debug, Clone)] pub(crate) struct PeerClient { diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 7ca80c7dfe..d34da0fef0 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1,33 +1,39 @@ pub(crate) mod split_state; use std::collections::HashMap; use std::str::FromStr; -use std::time::Duration; -use std::time::Instant; +use std::sync::Arc; +use std::time::{Duration, Instant}; -use self::split_state::SplitState; -use diesel::pg::PgConnection; use diesel::prelude::*; -use diesel::Connection; +use diesel_async::async_connection_wrapper::AsyncConnectionWrapper; +use diesel_async::pooled_connection::bb8::Pool; +use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfig}; +use diesel_async::{AsyncPgConnection, RunQueryDsl}; +use diesel_migrations::{EmbeddedMigrations, embed_migrations}; +use futures::FutureExt; +use futures::future::BoxFuture; use itertools::Itertools; -use pageserver_api::controller_api::AvailabilityZone; -use pageserver_api::controller_api::MetadataHealthRecord; -use pageserver_api::controller_api::ShardSchedulingPolicy; -use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; +use pageserver_api::controller_api::{ + AvailabilityZone, MetadataHealthRecord, NodeSchedulingPolicy, PlacementPolicy, + SafekeeperDescribeResponse, ShardSchedulingPolicy, SkSchedulingPolicy, +}; use pageserver_api::models::TenantConfig; -use pageserver_api::shard::ShardConfigError; -use pageserver_api::shard::ShardIdentity; -use pageserver_api::shard::ShardStripeSize; -use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; +use pageserver_api::shard::{ + ShardConfigError, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, +}; +use rustls::client::WebPkiServerVerifier; +use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; +use rustls::crypto::ring; +use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; +use self::split_state::SplitState; use crate::metrics::{ DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY, }; use crate::node::Node; - -use diesel_migrations::{embed_migrations, EmbeddedMigrations}; const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// ## What do we store? @@ -58,7 +64,7 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); /// updated, and reads of nodes are always from memory, not the database. We only require that /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. 
pub struct Persistence { - connection_pool: diesel::r2d2::Pool>, + connection_pool: Pool, } /// Legacy format, for use in JSON compat objects in test environment @@ -74,7 +80,7 @@ pub(crate) enum DatabaseError { #[error(transparent)] Connection(#[from] diesel::result::ConnectionError), #[error(transparent)] - ConnectionPool(#[from] r2d2::Error), + ConnectionPool(#[from] diesel_async::pooled_connection::bb8::RunError), #[error("Logical error: {0}")] Logical(String), #[error("Migration error: {0}")] @@ -96,6 +102,7 @@ pub(crate) enum DatabaseOperation { TenantGenerations, ShardGenerations, ListTenantShards, + LoadTenant, InsertTenantShards, UpdateTenantShard, DeleteTenant, @@ -104,6 +111,7 @@ pub(crate) enum DatabaseOperation { ListMetadataHealth, ListMetadataHealthUnhealthy, ListMetadataHealthOutdated, + ListSafekeepers, GetLeader, UpdateLeader, SetPreferredAzs, @@ -120,6 +128,7 @@ pub(crate) enum AbortShardSplitStatus { pub(crate) type DatabaseResult = Result; /// Some methods can operate on either a whole tenant or a single shard +#[derive(Clone)] pub(crate) enum TenantFilter { Tenant(TenantId), Shard(TenantShardId), @@ -132,6 +141,11 @@ pub(crate) struct ShardGenerationState { pub(crate) generation_pageserver: Option, } +// A generous allowance for how many times we may retry serializable transactions +// before giving up. This is not expected to be hit: it is a defensive measure in case we +// somehow engineer a situation where duelling transactions might otherwise live-lock. +const MAX_RETRIES: usize = 128; + impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -141,12 +155,18 @@ impl Persistence { const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); - pub fn new(database_url: String) -> Self { - let manager = diesel::r2d2::ConnectionManager::::new(database_url); + pub async fn new(database_url: String) -> Self { + let mut mgr_config = ManagerConfig::default(); + mgr_config.custom_setup = Box::new(establish_connection_rustls); + + let manager = AsyncDieselConnectionManager::::new_with_config( + database_url, + mgr_config, + ); // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time // to execute queries (database queries are not generally on latency-sensitive paths). 
- let connection_pool = diesel::r2d2::Pool::builder() + let connection_pool = Pool::builder() .max_size(Self::MAX_CONNECTIONS) .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) @@ -154,6 +174,7 @@ impl Persistence { .min_idle(Some(1)) .test_on_check_out(true) .build(manager) + .await .expect("Could not build connection pool"); Self { connection_pool } @@ -166,8 +187,10 @@ impl Persistence { timeout: Duration, ) -> Result<(), diesel::ConnectionError> { let started_at = Instant::now(); + log_postgres_connstr_info(database_url) + .map_err(|e| diesel::ConnectionError::InvalidConnectionUrl(e.to_string()))?; loop { - match PgConnection::establish(database_url) { + match establish_connection_rustls(database_url).await { Ok(_) => { tracing::info!("Connected to database."); return Ok(()); @@ -188,57 +211,22 @@ impl Persistence { pub(crate) async fn migration_run(&self) -> DatabaseResult<()> { use diesel_migrations::{HarnessWithOutput, MigrationHarness}; - self.with_conn(move |conn| -> DatabaseResult<()> { - HarnessWithOutput::write_to_stdout(conn) - .run_pending_migrations(MIGRATIONS) - .map(|_| ()) - .map_err(|e| DatabaseError::Migration(e.to_string())) - }) - .await - } - - /// Wraps `with_conn` in order to collect latency and error metrics - async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult - where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, - { - let latency = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_latency; - let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); - - let res = self.with_conn(func).await; - - if let Err(err) = &res { - let error_counter = &METRICS_REGISTRY - .metrics_group - .storage_controller_database_query_error; - error_counter.inc(DatabaseQueryErrorLabelGroup { - error_type: err.error_label(), - operation: op, - }) - } - - res - } - - /// Call the provided function in a tokio blocking thread, with a Diesel database connection. - async fn with_conn(&self, func: F) -> DatabaseResult - where - F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, - R: Send + 'static, - { - // A generous allowance for how many times we may retry serializable transactions - // before giving up. This is not expected to be hit: it is a defensive measure in case we - // somehow engineer a situation where duelling transactions might otherwise live-lock. - const MAX_RETRIES: usize = 128; - - let mut conn = self.connection_pool.get()?; - tokio::task::spawn_blocking(move || -> DatabaseResult { + // Can't use self.with_conn here as we do spawn_blocking which requires static. 
+ let conn = self + .connection_pool + .dedicated_connection() + .await + .map_err(|e| DatabaseError::Migration(e.to_string()))?; + let mut async_wrapper: AsyncConnectionWrapper = + AsyncConnectionWrapper::from(conn); + tokio::task::spawn_blocking(move || { let mut retry_count = 0; loop { - match conn.build_transaction().serializable().run(|c| func(c)) { + let result = HarnessWithOutput::write_to_stdout(&mut async_wrapper) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| DatabaseError::Migration(e.to_string())); + match result { Ok(r) => break Ok(r), Err( err @ DatabaseError::Query(diesel::result::Error::DatabaseError( @@ -267,33 +255,112 @@ impl Persistence { } }) .await - .expect("Task panic") + .map_err(|e| DatabaseError::Migration(e.to_string()))??; + Ok(()) + } + + /// Wraps `with_conn` in order to collect latency and error metrics + async fn with_measured_conn<'a, 'b, F, R>( + &self, + op: DatabaseOperation, + func: F, + ) -> DatabaseResult + where + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, + { + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_latency; + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); + + let res = self.with_conn(func).await; + + if let Err(err) = &res { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_error; + error_counter.inc(DatabaseQueryErrorLabelGroup { + error_type: err.error_label(), + operation: op, + }) + } + + res + } + + /// Call the provided function with a Diesel database connection in a retry loop + async fn with_conn<'a, 'b, F, R>(&self, func: F) -> DatabaseResult + where + F: for<'r> Fn(&'r mut AsyncPgConnection) -> ScopedBoxFuture<'b, 'r, DatabaseResult> + + Send + + std::marker::Sync + + 'a, + R: Send + 'b, + { + let mut retry_count = 0; + loop { + let mut conn = self.connection_pool.get().await?; + match conn + .build_transaction() + .serializable() + .run(|c| func(c)) + .await + { + Ok(r) => break Ok(r), + Err( + err @ DatabaseError::Query(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + _, + )), + ) => { + retry_count += 1; + if retry_count > MAX_RETRIES { + tracing::error!( + "Exceeded max retries on SerializationFailure errors: {err:?}" + ); + break Err(err); + } else { + // Retry on serialization errors: these are expected, because even though our + // transactions don't fight for the same rows, they will occasionally collide + // on index pages (e.g. 
increment_generation for unrelated shards can collide) + tracing::debug!("Retrying transaction on serialization failure {err:?}"); + continue; + } + } + Err(e) => break Err(e), + } + } } /// When a node is first registered, persist it before using it for anything pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { - let np = node.to_persistent(); - self.with_measured_conn( - DatabaseOperation::InsertNode, - move |conn| -> DatabaseResult<()> { + let np = &node.to_persistent(); + self.with_measured_conn(DatabaseOperation::InsertNode, move |conn| { + Box::pin(async move { diesel::insert_into(crate::schema::nodes::table) - .values(&np) - .execute(conn)?; + .values(np) + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult> { let nodes: Vec = self - .with_measured_conn( - DatabaseOperation::ListNodes, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::nodes::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::nodes::table + .load::(conn) + .await?) + }) + }) .await?; tracing::info!("list_nodes: loaded {} nodes", nodes.len()); @@ -301,19 +368,27 @@ impl Persistence { Ok(nodes) } - pub(crate) async fn update_node( + pub(crate) async fn update_node( &self, input_node_id: NodeId, - input_scheduling: NodeSchedulingPolicy, - ) -> DatabaseResult<()> { + values: V, + ) -> DatabaseResult<()> + where + V: diesel::AsChangeset + Clone + Send + Sync, + V::Changeset: diesel::query_builder::QueryFragment + Send, // valid Postgres SQL + { use crate::schema::nodes::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { - let updated = diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .set((scheduling_policy.eq(String::from(input_scheduling)),)) - .execute(conn)?; - Ok(updated) + let values = values.clone(); + Box::pin(async move { + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set(values) + .execute(conn) + .await?; + Ok(updated) + }) }) .await?; @@ -326,15 +401,68 @@ impl Persistence { } } + pub(crate) async fn update_node_scheduling_policy( + &self, + input_node_id: NodeId, + input_scheduling: NodeSchedulingPolicy, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.update_node( + input_node_id, + scheduling_policy.eq(String::from(input_scheduling)), + ) + .await + } + + pub(crate) async fn update_node_on_registration( + &self, + input_node_id: NodeId, + input_https_port: Option, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.update_node( + input_node_id, + listen_https_port.eq(input_https_port.map(|x| x as i32)), + ) + .await + } + /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. - pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { - self.with_measured_conn( - DatabaseOperation::ListTenantShards, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::tenant_shards::table.load::(conn)?) - }, - ) + /// + /// We exclude shards configured to be detached. During startup, if we see any attached locations + /// for such shards, they will automatically be detached as 'orphans'. 
+ pub(crate) async fn load_active_tenant_shards( + &self, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; + self.with_measured_conn(DatabaseOperation::ListTenantShards, move |conn| { + Box::pin(async move { + let query = tenant_shards.filter( + placement_policy.ne(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + ); + let result = query.load::(conn).await?; + + Ok(result) + }) + }) + .await + } + + /// When restoring a previously detached tenant into memory, load it from the database + pub(crate) async fn load_tenant( + &self, + filter_tenant_id: TenantId, + ) -> DatabaseResult> { + use crate::schema::tenant_shards::dsl::*; + self.with_measured_conn(DatabaseOperation::LoadTenant, move |conn| { + Box::pin(async move { + let query = tenant_shards.filter(tenant_id.eq(filter_tenant_id.to_string())); + let result = query.load::(conn).await?; + + Ok(result) + }) + }) .await } @@ -344,8 +472,7 @@ impl Persistence { &self, shards: Vec, ) -> DatabaseResult<()> { - use crate::schema::metadata_health; - use crate::schema::tenant_shards; + use crate::schema::{metadata_health, tenant_shards}; let now = chrono::Utc::now(); @@ -360,19 +487,22 @@ impl Persistence { }) .collect::>(); - self.with_measured_conn( - DatabaseOperation::InsertTenantShards, - move |conn| -> DatabaseResult<()> { + let shards = &shards; + let metadata_health_records = &metadata_health_records; + self.with_measured_conn(DatabaseOperation::InsertTenantShards, move |conn| { + Box::pin(async move { diesel::insert_into(tenant_shards::table) - .values(&shards) - .execute(conn)?; + .values(shards) + .execute(conn) + .await?; diesel::insert_into(metadata_health::table) - .values(&metadata_health_records) - .execute(conn)?; + .values(metadata_health_records) + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -380,31 +510,31 @@ impl Persistence { /// the tenant from memory on this server. pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteTenant, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteTenant, move |conn| { + Box::pin(async move { // `metadata_health` status (if exists) is also deleted based on the cascade behavior. 
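// The cascade referred to in the comment above comes from the schema rather than from
// this code: metadata_health rows share the tenant_shards key and are removed by an
// ON DELETE CASCADE foreign key. A sketch of what such a constraint looks like; the real
// migration is not part of this diff, so treat the exact DDL as illustrative only.
const METADATA_HEALTH_CASCADE_SKETCH: &str = "
    ALTER TABLE metadata_health
        ADD FOREIGN KEY (tenant_id, shard_number, shard_count)
        REFERENCES tenant_shards (tenant_id, shard_number, shard_count)
        ON DELETE CASCADE;
";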
diesel::delete(tenant_shards) .filter(tenant_id.eq(del_tenant_id.to_string())) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; - self.with_measured_conn( - DatabaseOperation::DeleteNode, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::DeleteNode, move |conn| { + Box::pin(async move { diesel::delete(nodes) .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -416,39 +546,45 @@ impl Persistence { &self, input_node_id: NodeId, ) -> DatabaseResult> { - use crate::schema::nodes::dsl::scheduling_policy; - use crate::schema::nodes::dsl::*; + use crate::schema::nodes::dsl::{scheduling_policy, *}; use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { - let rows_updated = diesel::update(tenant_shards) - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .set(generation.eq(generation + 1)) - .execute(conn)?; + Box::pin(async move { + let rows_updated = diesel::update(tenant_shards) + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn) + .await?; - tracing::info!("Incremented {} tenants' generations", rows_updated); + tracing::info!("Incremented {} tenants' generations", rows_updated); - // TODO: UPDATE+SELECT in one query + // TODO: UPDATE+SELECT in one query - let updated = tenant_shards - .filter(generation_pageserver.eq(input_node_id.0 as i64)) - .select(TenantShardPersistence::as_select()) - .load(conn)?; + let updated = tenant_shards + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn) + .await?; - // If the node went through a drain and restart phase before re-attaching, - // then reset it's node scheduling policy to active. - diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .filter( - scheduling_policy - .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining))) - .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))), - ) - .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) - .execute(conn)?; + // If the node went through a drain and restart phase before re-attaching, + // then reset it's node scheduling policy to active. 
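// Only the transient drain/restart states are reset to Active by the update below; any
// policy not named in the filter is left untouched. A stand-in sketch of that transition
// (the real NodeSchedulingPolicy enum lives in pageserver_api and may have more variants):
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SchedulingPolicySketch {
    Active,
    Pause,
    PauseForRestart,
    Draining,
    Filling,
}

fn policy_after_reattach(current: SchedulingPolicySketch) -> SchedulingPolicySketch {
    match current {
        // These states only describe an in-flight drain or restart, so a successful
        // re-attach returns the node to normal scheduling.
        SchedulingPolicySketch::PauseForRestart
        | SchedulingPolicySketch::Draining
        | SchedulingPolicySketch::Filling => SchedulingPolicySketch::Active,
        other => other,
    }
}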
+ diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .filter( + scheduling_policy + .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Draining))) + .or(scheduling_policy + .eq(String::from(NodeSchedulingPolicy::Filling))), + ) + .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) + .execute(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -485,19 +621,22 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let updated = self .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - generation.eq(generation + 1), - generation_pageserver.eq(node_id.0 as i64), - )) - // TODO: only returning() the generation column - .returning(TenantShardPersistence::as_returning()) - .get_result(conn)?; + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -529,12 +668,15 @@ impl Persistence { use crate::schema::tenant_shards::dsl::*; let rows = self .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| { - let result = tenant_shards - .filter(tenant_id.eq(filter_tenant_id.to_string())) - .select(TenantShardPersistence::as_select()) - .order(shard_number) - .load(conn)?; - Ok(result) + Box::pin(async move { + let result = tenant_shards + .filter(tenant_id.eq(filter_tenant_id.to_string())) + .select(TenantShardPersistence::as_select()) + .order(shard_number) + .load(conn) + .await?; + Ok(result) + }) }) .await?; @@ -582,15 +724,18 @@ impl Persistence { break; } + let in_clause = &in_clause; let chunk_rows = self .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| { - // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because - // the inputs are strongly typed and cannot carry any user-supplied raw string content. - let result : Vec = diesel::sql_query( - format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() - ).load(conn)?; + Box::pin(async move { + // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because + // the inputs are strongly typed and cannot carry any user-supplied raw string content. 
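// One way the (tenant_id, shard_number, shard_count) tuple list can be rendered into the
// IN clause used below; this helper is illustrative and not part of the diff. The safety
// argument is the same as in the comment above: the ids stringify from strongly typed
// values and the shard fields are plain integers, so nothing user-controlled can break
// out of the literal.
fn tuple_in_clause(shards: &[(String, u8, u8)]) -> String {
    shards
        .iter()
        .map(|(tenant_id, shard_number, shard_count)| {
            format!("('{tenant_id}', {shard_number}, {shard_count})")
        })
        .collect::<Vec<_>>()
        .join(", ")
}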
+ let result : Vec = diesel::sql_query( + format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() + ).load(conn).await?; - Ok(result) + Ok(result) + }) }) .await?; rows.extend(chunk_rows.into_iter()) @@ -624,80 +769,92 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; + let tenant = &tenant; + let input_placement_policy = &input_placement_policy; + let input_config = &input_config; + let input_generation = &input_generation; + let input_scheduling_policy = &input_scheduling_policy; self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { - let query = match tenant { - TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .into_boxed(), - TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) - .filter(tenant_id.eq(input_tenant_id.to_string())) - .into_boxed(), - }; + Box::pin(async move { + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; - // Clear generation_pageserver if we are moving into a state where we won't have - // any attached pageservers. - let input_generation_pageserver = match input_placement_policy { - None | Some(PlacementPolicy::Attached(_)) => None, - Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), - }; + // Clear generation_pageserver if we are moving into a state where we won't have + // any attached pageservers. 
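// generation_pageserver in the changeset below is a nested Option: the outer Option
// decides whether the UPDATE touches the column at all, the inner Option is the value
// (including NULL) to write. A plain-Rust illustration of the three states:
fn describe_generation_pageserver(update: Option<Option<i64>>) -> &'static str {
    match update {
        None => "column left unchanged by this update",
        Some(None) => "column explicitly set to NULL (no attached pageserver)",
        Some(Some(_)) => "column set to a concrete node id",
    }
}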
+ let input_generation_pageserver = match input_placement_policy { + None | Some(PlacementPolicy::Attached(_)) => None, + Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), + }; - #[derive(AsChangeset)] - #[diesel(table_name = crate::schema::tenant_shards)] - struct ShardUpdate { - generation: Option, - placement_policy: Option, - config: Option, - scheduling_policy: Option, - generation_pageserver: Option>, - } + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, + generation_pageserver: Option>, + } - let update = ShardUpdate { - generation: input_generation.map(|g| g.into().unwrap() as i32), - placement_policy: input_placement_policy - .as_ref() - .map(|p| serde_json::to_string(&p).unwrap()), - config: input_config - .as_ref() - .map(|c| serde_json::to_string(&c).unwrap()), - scheduling_policy: input_scheduling_policy - .map(|p| serde_json::to_string(&p).unwrap()), - generation_pageserver: input_generation_pageserver, - }; + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .as_ref() + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + generation_pageserver: input_generation_pageserver, + }; - query.set(update).execute(conn)?; + query.set(update).execute(conn).await?; - Ok(()) + Ok(()) + }) }) .await?; Ok(()) } + /// Note that passing None for a shard clears the preferred AZ (rather than leaving it unmodified) pub(crate) async fn set_tenant_shard_preferred_azs( &self, - preferred_azs: Vec<(TenantShardId, AvailabilityZone)>, - ) -> DatabaseResult> { + preferred_azs: Vec<(TenantShardId, Option)>, + ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; + let preferred_azs = preferred_azs.as_slice(); self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { - let mut shards_updated = Vec::default(); + Box::pin(async move { + let mut shards_updated = Vec::default(); - for (tenant_shard_id, preferred_az) in preferred_azs.iter() { - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.0.clone())) - .execute(conn)?; + for (tenant_shard_id, preferred_az) in preferred_azs.iter() { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) + .execute(conn) + .await?; - if updated == 1 { - shards_updated.push((*tenant_shard_id, preferred_az.clone())); + if updated == 1 { + shards_updated.push((*tenant_shard_id, preferred_az.clone())); + } } - } - Ok(shards_updated) + Ok(shards_updated) + }) }) .await } @@ -705,17 +862,21 @@ impl Persistence { pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::Detach, move |conn| { - let updated = 
diesel::update(tenant_shards) - .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) - .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) - .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(( - generation_pageserver.eq(Option::::None), - placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), - )) - .execute(conn)?; + Box::pin(async move { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation_pageserver.eq(Option::::None), + placement_policy + .eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn) + .await?; - Ok(updated) + Ok(updated) + }) }) .await?; @@ -734,14 +895,16 @@ impl Persistence { parent_to_children: Vec<(TenantShardId, Vec)>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { + let parent_to_children = parent_to_children.as_slice(); + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| { + Box::pin(async move { // Mark parent shards as splitting let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) .set((splitting.eq(1),)) - .execute(conn)?; + .execute(conn).await?; if u8::try_from(updated) .map_err(|_| DatabaseError::Logical( format!("Overflow existing shard count {} while splitting", updated)) @@ -754,7 +917,7 @@ impl Persistence { } // FIXME: spurious clone to sidestep closure move rules - let parent_to_children = parent_to_children.clone(); + let parent_to_children = parent_to_children.to_vec(); // Insert child shards for (parent_shard_id, children) in parent_to_children { @@ -762,7 +925,7 @@ impl Persistence { .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) - .load::(conn)?; + .load::(conn).await?; let parent = if parent.len() != 1 { return Err(DatabaseError::Logical(format!( "Parent shard {parent_shard_id} not found" @@ -777,12 +940,13 @@ impl Persistence { debug_assert!(shard.splitting == SplitState::Splitting); diesel::insert_into(tenant_shards) .values(shard) - .execute(conn)?; + .execute(conn).await?; } } Ok(()) }) + }) .await } @@ -794,25 +958,26 @@ impl Persistence { old_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( - DatabaseOperation::CompleteShardSplit, - move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| { + Box::pin(async move { // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; // Clear sharding flag let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; debug_assert!(updated > 0); Ok(()) - }, - ) + }) + }) .await } @@ -824,15 +989,15 @@ impl Persistence { new_shard_count: ShardCount, ) -> DatabaseResult { use crate::schema::tenant_shards::dsl::*; - self.with_measured_conn( 
- DatabaseOperation::AbortShardSplit, - move |conn| -> DatabaseResult { + self.with_measured_conn(DatabaseOperation::AbortShardSplit, move |conn| { + Box::pin(async move { // Clear the splitting state on parent shards let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.ne(new_shard_count.literal() as i32)) .set((splitting.eq(0),)) - .execute(conn)?; + .execute(conn) + .await?; // Parent shards are already gone: we cannot abort. if updated == 0 { @@ -852,11 +1017,12 @@ impl Persistence { diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn)?; + .execute(conn) + .await?; Ok(AbortShardSplitStatus::Aborted) - }, - ) + }) + }) .await } @@ -872,25 +1038,28 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::UpdateMetadataHealth, - move |conn| -> DatabaseResult<_> { + let healthy_records = healthy_records.as_slice(); + let unhealthy_records = unhealthy_records.as_slice(); + self.with_measured_conn(DatabaseOperation::UpdateMetadataHealth, move |conn| { + Box::pin(async move { diesel::insert_into(metadata_health) - .values(&healthy_records) + .values(healthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(true), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; diesel::insert_into(metadata_health) - .values(&unhealthy_records) + .values(unhealthy_records) .on_conflict((tenant_id, shard_number, shard_count)) .do_update() .set((healthy.eq(false), last_scrubbed_at.eq(now))) - .execute(conn)?; + .execute(conn) + .await?; Ok(()) - }, - ) + }) + }) .await } @@ -899,15 +1068,13 @@ impl Persistence { pub(crate) async fn list_metadata_health_records( &self, ) -> DatabaseResult> { - self.with_measured_conn( - DatabaseOperation::ListMetadataHealth, - move |conn| -> DatabaseResult<_> { - Ok( - crate::schema::metadata_health::table - .load::(conn)?, - ) - }, - ) + self.with_measured_conn(DatabaseOperation::ListMetadataHealth, move |conn| { + Box::pin(async { + Ok(crate::schema::metadata_health::table + .load::(conn) + .await?) + }) + }) .await } @@ -919,10 +1086,15 @@ impl Persistence { use crate::schema::metadata_health::dsl::*; self.with_measured_conn( DatabaseOperation::ListMetadataHealthUnhealthy, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::metadata_health::table - .filter(healthy.eq(false)) - .load::(conn)?) + move |conn| { + Box::pin(async { + DatabaseResult::Ok( + crate::schema::metadata_health::table + .filter(healthy.eq(false)) + .load::(conn) + .await?, + ) + }) }, ) .await @@ -936,15 +1108,14 @@ impl Persistence { ) -> DatabaseResult> { use crate::schema::metadata_health::dsl::*; - self.with_measured_conn( - DatabaseOperation::ListMetadataHealthOutdated, - move |conn| -> DatabaseResult<_> { + self.with_measured_conn(DatabaseOperation::ListMetadataHealthOutdated, move |conn| { + Box::pin(async move { let query = metadata_health.filter(last_scrubbed_at.lt(earlier)); - let res = query.load::(conn)?; + let res = query.load::(conn).await?; Ok(res) - }, - ) + }) + }) .await } @@ -952,12 +1123,13 @@ impl Persistence { /// It is an error for the table to contain more than one entry. 
pub(crate) async fn get_leader(&self) -> DatabaseResult> { let mut leader: Vec = self - .with_measured_conn( - DatabaseOperation::GetLeader, - move |conn| -> DatabaseResult<_> { - Ok(crate::schema::controllers::table.load::(conn)?) - }, - ) + .with_measured_conn(DatabaseOperation::GetLeader, move |conn| { + Box::pin(async move { + Ok(crate::schema::controllers::table + .load::(conn) + .await?) + }) + }) .await?; if leader.len() > 1 { @@ -980,26 +1152,33 @@ impl Persistence { use crate::schema::controllers::dsl::*; let updated = self - .with_measured_conn( - DatabaseOperation::UpdateLeader, - move |conn| -> DatabaseResult { + .with_measured_conn(DatabaseOperation::UpdateLeader, move |conn| { + let prev = prev.clone(); + let new = new.clone(); + Box::pin(async move { let updated = match &prev { - Some(prev) => diesel::update(controllers) - .filter(address.eq(prev.address.clone())) - .filter(started_at.eq(prev.started_at)) - .set(( - address.eq(new.address.clone()), - started_at.eq(new.started_at), - )) - .execute(conn)?, - None => diesel::insert_into(controllers) - .values(new.clone()) - .execute(conn)?, + Some(prev) => { + diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn) + .await? + } + None => { + diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn) + .await? + } }; Ok(updated) - }, - ) + }) + }) .await?; if updated == 0 { @@ -1011,47 +1190,214 @@ impl Persistence { Ok(()) } - pub(crate) async fn safekeeper_get( - &self, - id: i64, - ) -> Result { - use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; - self.with_conn(move |conn| -> DatabaseResult { - Ok(safekeepers - .filter(id_column.eq(&id)) - .select(SafekeeperPersistence::as_select()) - .get_result(conn)?) - }) - .await + /// At startup, populate the list of nodes which our shards may be placed on + pub(crate) async fn list_safekeepers(&self) -> DatabaseResult> { + let safekeepers: Vec = self + .with_measured_conn(DatabaseOperation::ListNodes, move |conn| { + Box::pin(async move { + Ok(crate::schema::safekeepers::table + .load::(conn) + .await?) 
+ }) + }) + .await?; + + tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len()); + + Ok(safekeepers) } pub(crate) async fn safekeeper_upsert( &self, - record: SafekeeperPersistence, + record: SafekeeperUpsert, ) -> Result<(), DatabaseError> { use crate::schema::safekeepers::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - let bind = record.as_insert_or_update(); + self.with_conn(move |conn| { + let record = record.clone(); + Box::pin(async move { + let bind = record + .as_insert_or_update() + .map_err(|e| DatabaseError::Logical(format!("{e}")))?; - let inserted_updated = diesel::insert_into(safekeepers) - .values(&bind) - .on_conflict(id) - .do_update() - .set(&bind) - .execute(conn)?; + let inserted_updated = diesel::insert_into(safekeepers) + .values(&bind) + .on_conflict(id) + .do_update() + .set(&bind) + .execute(conn) + .await?; - if inserted_updated != 1 { - return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated - ))); - } + if inserted_updated != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))); + } - Ok(()) + Ok(()) + }) }) .await } + + pub(crate) async fn set_safekeeper_scheduling_policy( + &self, + id_: i64, + scheduling_policy_: SkSchedulingPolicy, + ) -> Result<(), DatabaseError> { + use crate::schema::safekeepers::dsl::*; + + self.with_conn(move |conn| { + Box::pin(async move { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_ = String::from(scheduling_policy_); + + let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) + .set(scheduling_policy.eq(scheduling_policy_)) + .execute(conn) + .await?; + + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } + + Ok(()) + }) + }) + .await + } +} + +pub(crate) fn load_certs() -> anyhow::Result> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + anyhow::bail!("could not parse certificates: {:?}", der_certs.errors); + } + + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs.certs); + Ok(Arc::new(store)) +} + +#[derive(Debug)] +/// A verifier that accepts all certificates (but logs an error still) +struct AcceptAll(Arc); +impl ServerCertVerifier for AcceptAll { + fn verify_server_cert( + &self, + end_entity: &rustls::pki_types::CertificateDer<'_>, + intermediates: &[rustls::pki_types::CertificateDer<'_>], + server_name: &rustls::pki_types::ServerName<'_>, + ocsp_response: &[u8], + now: rustls::pki_types::UnixTime, + ) -> Result { + let r = + self.0 + .verify_server_cert(end_entity, intermediates, server_name, ocsp_response, now); + if let Err(err) = r { + tracing::info!( + ?server_name, + "ignoring db connection TLS validation error: {err:?}" + ); + return Ok(ServerCertVerified::assertion()); + } + r + } + fn verify_tls12_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result { + self.0.verify_tls12_signature(message, cert, dss) + } + fn verify_tls13_signature( + &self, + message: &[u8], + cert: &rustls::pki_types::CertificateDer<'_>, + dss: &rustls::DigitallySignedStruct, + ) -> Result { + self.0.verify_tls13_signature(message, cert, dss) + } + fn supported_verify_schemes(&self) -> Vec { + 
self.0.supported_verify_schemes() + } +} + +/// Loads the root certificates and constructs a client config suitable for connecting. +/// This function is blocking. +fn client_config_with_root_certs() -> anyhow::Result { + let client_config = + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions"); + static DO_CERT_CHECKS: std::sync::OnceLock = std::sync::OnceLock::new(); + let do_cert_checks = + DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_DB_CERT_CHECKS").is_ok()); + Ok(if *do_cert_checks { + client_config + .with_root_certificates(load_certs()?) + .with_no_client_auth() + } else { + let verifier = AcceptAll( + WebPkiServerVerifier::builder_with_provider( + load_certs()?, + Arc::new(ring::default_provider()), + ) + .build()?, + ); + client_config + .dangerous() + .with_custom_certificate_verifier(Arc::new(verifier)) + .with_no_client_auth() + }) +} + +fn establish_connection_rustls(config: &str) -> BoxFuture> { + let fut = async { + // We first set up the way we want rustls to work. + let rustls_config = client_config_with_root_certs() + .map_err(|err| ConnectionError::BadConnection(format!("{err:?}")))?; + let tls = tokio_postgres_rustls::MakeRustlsConnect::new(rustls_config); + let (client, conn) = tokio_postgres::connect(config, tls) + .await + .map_err(|e| ConnectionError::BadConnection(e.to_string()))?; + + AsyncPgConnection::try_from_client_and_connection(client, conn).await + }; + fut.boxed() +} + +#[cfg_attr(test, test)] +fn test_config_debug_censors_password() { + let has_pw = + "host=/var/lib/postgresql,localhost port=1234 user=specialuser password='NOT ALLOWED TAG'"; + let has_pw_cfg = has_pw.parse::().unwrap(); + assert!(format!("{has_pw_cfg:?}").contains("specialuser")); + // Ensure that the password is not leaked by the debug impl + assert!(!format!("{has_pw_cfg:?}").contains("NOT ALLOWED TAG")); +} + +fn log_postgres_connstr_info(config_str: &str) -> anyhow::Result<()> { + let config = config_str + .parse::() + .map_err(|_e| anyhow::anyhow!("Couldn't parse config str"))?; + // We use debug formatting here, and use a unit test to ensure that we don't leak the password. + // To make extra sure the test gets ran, run it every time the function is called + // (this is rather cold code, we can afford it). + #[cfg(not(test))] + test_config_debug_censors_password(); + tracing::info!("database connection config: {config:?}"); + Ok(()) } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably @@ -1128,6 +1474,7 @@ pub(crate) struct NodePersistence { pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, pub(crate) availability_zone_id: String, + pub(crate) listen_https_port: Option, } /// Tenant metadata health status that are stored durably. 
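// client_config_with_root_certs above gates strict certificate verification on the
// STORCON_DB_CERT_CHECKS environment variable and caches the answer in a OnceLock so the
// environment is only consulted once. The same pattern in isolation:
fn cert_checks_enabled() -> bool {
    static ENABLED: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
    // Presence of the variable (any value) enables strict verification; otherwise the
    // permissive AcceptAll verifier is used.
    *ENABLED.get_or_init(|| std::env::var("STORCON_DB_CERT_CHECKS").is_ok())
}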
@@ -1195,6 +1542,7 @@ pub(crate) struct ControllerPersistence { pub(crate) started_at: chrono::DateTime, } +// What we store in the database #[derive(Serialize, Deserialize, Queryable, Selectable, Eq, PartialEq, Debug, Clone)] #[diesel(table_name = crate::schema::safekeepers)] pub(crate) struct SafekeeperPersistence { @@ -1206,23 +1554,78 @@ pub(crate) struct SafekeeperPersistence { pub(crate) version: i64, pub(crate) host: String, pub(crate) port: i32, - pub(crate) active: bool, + pub(crate) http_port: i32, + pub(crate) availability_zone_id: String, + pub(crate) scheduling_policy: String, +} + +impl SafekeeperPersistence { + pub(crate) fn from_upsert( + upsert: SafekeeperUpsert, + scheduling_policy: SkSchedulingPolicy, + ) -> Self { + crate::persistence::SafekeeperPersistence { + id: upsert.id, + region_id: upsert.region_id, + version: upsert.version, + host: upsert.host, + port: upsert.port, + http_port: upsert.http_port, + availability_zone_id: upsert.availability_zone_id, + scheduling_policy: String::from(scheduling_policy), + } + } + pub(crate) fn as_describe_response(&self) -> Result { + let scheduling_policy = + SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { + DatabaseError::Logical(format!("can't construct SkSchedulingPolicy: {e:?}")) + })?; + Ok(SafekeeperDescribeResponse { + id: NodeId(self.id as u64), + region_id: self.region_id.clone(), + version: self.version, + host: self.host.clone(), + port: self.port, + http_port: self.http_port, + availability_zone_id: self.availability_zone_id.clone(), + scheduling_policy, + }) + } +} + +/// What we expect from the upsert http api +#[derive(Serialize, Deserialize, Eq, PartialEq, Debug, Clone)] +pub(crate) struct SafekeeperUpsert { + pub(crate) id: i64, + pub(crate) region_id: String, + /// 1 is special, it means just created (not currently posted to storcon). + /// Zero or negative is not really expected. + /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag. + pub(crate) version: i64, + pub(crate) host: String, + pub(crate) port: i32, + /// The active flag will not be stored in the database and will be ignored. + pub(crate) active: Option, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, } -impl SafekeeperPersistence { - fn as_insert_or_update(&self) -> InsertUpdateSafekeeper<'_> { - InsertUpdateSafekeeper { +impl SafekeeperUpsert { + fn as_insert_or_update(&self) -> anyhow::Result> { + if self.version < 0 { + anyhow::bail!("negative version: {}", self.version); + } + Ok(InsertUpdateSafekeeper { id: self.id, region_id: &self.region_id, version: self.version, host: &self.host, port: self.port, - active: self.active, http_port: self.http_port, availability_zone_id: &self.availability_zone_id, - } + // None means a wish to not update this column. We expose abilities to update it via other means. 
+ scheduling_policy: None, + }) } } @@ -1234,7 +1637,7 @@ struct InsertUpdateSafekeeper<'a> { version: i64, host: &'a str, port: i32, - active: bool, http_port: i32, availability_zone_id: &'a str, + scheduling_policy: Option<&'a str>, } diff --git a/storage_controller/src/persistence/split_state.rs b/storage_controller/src/persistence/split_state.rs index bce1a75843..f83191038a 100644 --- a/storage_controller/src/persistence/split_state.rs +++ b/storage_controller/src/persistence/split_state.rs @@ -1,8 +1,8 @@ +use diesel::deserialize::{FromSql, FromSqlRow}; +use diesel::expression::AsExpression; use diesel::pg::{Pg, PgValue}; -use diesel::{ - deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql, - sql_types::Int2, -}; +use diesel::serialize::ToSql; +use diesel::sql_types::Int2; use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)] diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 3ad386a95b..a327f6f50f 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,19 +1,18 @@ -use crate::pageserver_client::PageserverClient; -use crate::persistence::Persistence; -use crate::service; -use pageserver_api::controller_api::PlacementPolicy; +use std::borrow::Cow; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use json_structural_diff::JsonDiff; +use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy}; use pageserver_api::models::{ - LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, + LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest, }; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; use reqwest::StatusCode; -use std::collections::HashMap; -use std::sync::Arc; -use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; use utils::backoff::exponential_backoff; -use utils::failpoint_support; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; @@ -22,9 +21,12 @@ use utils::sync::gate::GateGuard; use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; +use crate::pageserver_client::PageserverClient; +use crate::persistence::Persistence; use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation}; +use crate::{compute_hook, service}; -const DEFAULT_HEATMAP_PERIOD: &str = "60s"; +const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60); /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states. @@ -45,6 +47,7 @@ pub(super) struct Reconciler { pub(crate) reconciler_config: ReconcilerConfig, pub(crate) config: TenantConfig, + pub(crate) preferred_az: Option, /// Observed state from the point of view of the reconciler. /// This gets updated as the reconciliation makes progress. 
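// In the hunk above, DEFAULT_HEATMAP_PERIOD became a strongly typed Duration rather than
// the string "60s", so the value can be used directly where the tenant config is
// assembled (see ha_aware_config further down in this diff). A minimal illustration,
// assuming the heatmap_period field now holds an Option<Duration>:
use std::time::Duration;

const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60);

fn heatmap_period_for(has_secondaries: bool) -> Option<Duration> {
    // Heatmaps only matter when a secondary location exists to consume them; with no
    // secondaries the period is cleared entirely.
    has_secondaries.then_some(DEFAULT_HEATMAP_PERIOD)
}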
@@ -90,9 +93,10 @@ pub(crate) struct ReconcilerConfigBuilder { } impl ReconcilerConfigBuilder { - pub(crate) fn new() -> Self { + /// Priority is special: you must pick one thoughtfully, do not just use 'normal' as the default + pub(crate) fn new(priority: ReconcilerPriority) -> Self { Self { - config: ReconcilerConfig::default(), + config: ReconcilerConfig::new(priority), } } @@ -114,13 +118,32 @@ impl ReconcilerConfigBuilder { } } + pub(crate) fn tenant_creation_hint(self, hint: bool) -> Self { + Self { + config: ReconcilerConfig { + tenant_creation_hint: hint, + ..self.config + }, + } + } + pub(crate) fn build(self) -> ReconcilerConfig { self.config } } -#[derive(Default, Debug, Copy, Clone)] +// Higher priorities are used for user-facing tasks, so that a long backlog of housekeeping work (e.g. reconciling on startup, rescheduling +// things on node changes) does not starve user-facing tasks. +#[derive(Debug, Copy, Clone)] +pub(crate) enum ReconcilerPriority { + Normal, + High, +} + +#[derive(Debug, Copy, Clone)] pub(crate) struct ReconcilerConfig { + pub(crate) priority: ReconcilerPriority, + // During live migration give up on warming-up the secondary // after this timeout. secondary_warmup_timeout: Option, @@ -128,9 +151,25 @@ pub(crate) struct ReconcilerConfig { // During live migrations this is the amount of time that // the pagserver will hold our poll. secondary_download_request_timeout: Option, + + // A hint indicating whether this reconciliation is done on the + // creation of a new tenant. This only informs logging behaviour. + tenant_creation_hint: bool, } impl ReconcilerConfig { + /// Configs are always constructed with an explicit priority, to force callers to think about whether + /// the operation they're scheduling is high-priority or not. Normal priority is not a safe default, because + /// scheduling something user-facing at normal priority can result in it getting starved out by background work. + pub(crate) fn new(priority: ReconcilerPriority) -> Self { + Self { + priority, + secondary_warmup_timeout: None, + secondary_download_request_timeout: None, + tenant_creation_hint: false, + } + } + pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration { const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300); self.secondary_warmup_timeout @@ -142,6 +181,28 @@ impl ReconcilerConfig { self.secondary_download_request_timeout .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT) } + + pub(crate) fn tenant_creation_hint(&self) -> bool { + self.tenant_creation_hint + } +} + +impl From<&MigrationConfig> for ReconcilerConfig { + fn from(value: &MigrationConfig) -> Self { + // Run reconciler at high priority because MigrationConfig comes from human requests that should + // be presumed urgent. 
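// ReconcilerConfigBuilder::new now takes the priority up front and ReconcilerConfig no
// longer derives Default, so every call site has to state whether the work is
// user-facing. A condensed, self-contained sketch of that shape (names shortened,
// fields trimmed; not the actual types from this crate):
#[derive(Debug, Clone, Copy)]
enum Priority {
    Normal,
    High,
}

#[derive(Debug, Clone, Copy)]
struct Config {
    priority: Priority,
    warmup_timeout: Option<std::time::Duration>,
}

struct Builder {
    config: Config,
}

impl Builder {
    // No Default impl: the caller must choose a priority deliberately.
    fn new(priority: Priority) -> Self {
        Self {
            config: Config {
                priority,
                warmup_timeout: None,
            },
        }
    }

    fn warmup_timeout(self, timeout: std::time::Duration) -> Self {
        Self {
            config: Config {
                warmup_timeout: Some(timeout),
                ..self.config
            },
        }
    }

    fn build(self) -> Config {
        self.config
    }
}

fn migration_config_sketch() -> Config {
    // Human-initiated migrations are treated as urgent, mirroring the surrounding
    // From<&MigrationConfig> impl.
    Builder::new(Priority::High)
        .warmup_timeout(std::time::Duration::from_secs(300))
        .build()
}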
+ let mut builder = ReconcilerConfigBuilder::new(ReconcilerPriority::High); + + if let Some(timeout) = value.secondary_warmup_timeout { + builder = builder.secondary_warmup_timeout(timeout) + } + + if let Some(timeout) = value.secondary_download_request_timeout { + builder = builder.secondary_download_request_timeout(timeout) + } + + builder.build() + } } /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O @@ -210,11 +271,12 @@ impl Reconciler { lazy: bool, ) -> Result<(), ReconcileError> { if !node.is_available() && config.mode == LocationConfigMode::Detached { - // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline - // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of - // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`] - tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation"); - self.observed.locations.remove(&node.get_id()); + // [`crate::service::Service::node_activate_reconcile`] will update the observed state + // when the node comes back online. At that point, the intent and observed states will + // be mismatched and a background reconciliation will detach. + tracing::info!( + "Node {node} is unavailable during detach: proceeding anyway, it will be detached via background reconciliation" + ); return Ok(()); } @@ -236,7 +298,7 @@ impl Reconciler { .location_config(tenant_shard_id, config.clone(), flush_ms, lazy) .await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, timeout, @@ -346,6 +408,32 @@ impl Reconciler { Ok(()) } + async fn wait_lsn( + &self, + node: &Node, + tenant_shard_id: TenantShardId, + timelines: HashMap, + ) -> Result { + const TIMEOUT: Duration = Duration::from_secs(10); + + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.service_config.pageserver_jwt_token.as_deref(), + ); + + client + .wait_lsn( + tenant_shard_id, + TenantWaitLsnRequest { + timelines, + timeout: TIMEOUT, + }, + ) + .await + .map_err(|e| e.into()) + } + async fn get_lsns( &self, tenant_shard_id: TenantShardId, @@ -354,7 +442,7 @@ impl Reconciler { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.service_config.jwt_token.as_deref(), + self.service_config.pageserver_jwt_token.as_deref(), ); let timelines = client.timeline_list(&tenant_shard_id).await?; @@ -392,7 +480,7 @@ impl Reconciler { ) .await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, request_download_timeout * 2, @@ -424,7 +512,8 @@ impl Reconciler { } else if status == StatusCode::ACCEPTED { let total_runtime = started_at.elapsed(); if total_runtime > total_download_timeout { - tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", + tracing::warn!( + "Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", total_runtime.as_millis(), progress.layers_downloaded, progress.layers_total, @@ -459,6 +548,39 @@ impl Reconciler { node: &Node, baseline: HashMap, ) -> anyhow::Result<()> { + // Signal to the pageserver that it should ingest up to the baseline LSNs. 
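// If the wait_lsn request below cannot confirm catch-up (or the endpoint is unavailable),
// the code falls back to polling get_lsns and comparing against the baseline captured
// before the migration. A small sketch of that per-timeline comparison, assuming LSNs are
// totally ordered and both maps are keyed by timeline id:
use std::collections::HashMap;
use std::hash::Hash;

fn all_caught_up<K: Hash + Eq, L: Ord>(baseline: &HashMap<K, L>, latest: &HashMap<K, L>) -> bool {
    baseline.iter().all(|(timeline, wanted)| {
        // A timeline missing from the latest snapshot is treated as not caught up yet.
        latest.get(timeline).is_some_and(|seen| seen >= wanted)
    })
}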
+ loop { + match self.wait_lsn(node, tenant_shard_id, baseline.clone()).await { + Ok(StatusCode::OK) => { + // Everything is caught up + return Ok(()); + } + Ok(StatusCode::ACCEPTED) => { + // Some timelines are not caught up yet. + // They'll be polled below. + break; + } + Ok(StatusCode::NOT_FOUND) => { + // None of the timelines are present on the pageserver. + // This is correct if they've all been deleted, but + // let let the polling loop below cross check. + break; + } + Ok(status_code) => { + tracing::warn!( + "Unexpected status code ({status_code}) returned by wait_lsn endpoint" + ); + break; + } + Err(e) => { + tracing::info!("🕑 Can't trigger LSN wait on {node} yet, waiting ({e})",); + tokio::time::sleep(Duration::from_millis(500)).await; + continue; + } + } + } + + // Poll the LSNs until they catch up loop { let latest = match self.get_lsns(tenant_shard_id, node).await { Ok(l) => l, @@ -652,7 +774,7 @@ impl Reconciler { let observed_conf = match attached_node .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 1, Duration::from_secs(5), @@ -694,6 +816,11 @@ impl Reconciler { /// First we apply special case handling (e.g. for live migrations), and then a /// general case reconciliation where we walk through the intent by pageserver /// and call out to the pageserver to apply the desired state. + /// + /// An Ok(()) result indicates that we successfully attached the tenant, but _not_ that + /// all locations for the tenant are in the expected state. When nodes that are to be detached + /// or configured as secondary are unavailable, we may return Ok(()) but leave the shard in a + /// state where it still requires later reconciliation. pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> { // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it self.maybe_refresh_observed().await?; @@ -747,6 +874,8 @@ impl Reconciler { }; if increment_generation { + pausable_failpoint!("reconciler-pre-increment-generation"); + let generation = self .persistence .increment_generation(self.tenant_shard_id, node.get_id()) @@ -754,7 +883,27 @@ impl Reconciler { self.generation = Some(generation); wanted_conf.generation = generation.into(); } - tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + + let diff = match observed { + Some(ObservedStateLocation { + conf: Some(observed), + }) => { + let diff = JsonDiff::diff( + &serde_json::to_value(observed.clone()).unwrap(), + &serde_json::to_value(wanted_conf.clone()).unwrap(), + false, + ); + + if let Some(json_diff) = diff.diff { + serde_json::to_string(&json_diff).unwrap_or("diff err".to_string()) + } else { + "unknown".to_string() + } + } + _ => "full".to_string(), + }; + + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update: {diff}"); // Because `node` comes from a ref to &self, clone it before calling into a &mut self // function: this could be avoided by refactoring the state mutated by location_config into @@ -780,10 +929,18 @@ impl Reconciler { tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") } _ => { - // In all cases other than a matching observed configuration, we will - // reconcile this location. 
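// When an attached location needs reconfiguration, the code above now logs a structural
// JSON diff of observed vs. wanted config instead of only saying "requires update". A
// simplified stand-in for that check using plain serde_json equality (the real code uses
// the json_structural_diff crate to report which fields changed):
fn location_config_changed<T: serde::Serialize>(observed: &T, wanted: &T) -> bool {
    serde_json::to_value(observed).ok() != serde_json::to_value(wanted).ok()
}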
- tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); - changes.push((node.clone(), wanted_conf)) + // Only try and configure secondary locations on nodes that are available. This + // allows the reconciler to "succeed" while some secondaries are offline (e.g. after + // a node failure, where the failed node will have a secondary intent) + if node.is_available() { + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + changes.push((node.clone(), wanted_conf)) + } else { + tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable"); + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + } } } } @@ -809,7 +966,21 @@ impl Reconciler { if self.cancel.is_cancelled() { return Err(ReconcileError::Cancel); } - self.location_config(&node, conf, None, false).await?; + // We only try to configure secondary locations if the node is available. This does + // not stop us succeeding with the reconcile, because our core goal is to make the + // shard _available_ (the attached location), and configuring secondary locations + // can be done lazily when the node becomes available (via background reconciliation). + if node.is_available() { + self.location_config(&node, conf, None, false).await?; + } else { + // If the node is unavailable, we skip and consider the reconciliation successful: this + // is a common case where a pageserver is marked unavailable: we demote a location on + // that unavailable pageserver to secondary. + tracing::info!("Skipping configuring secondary location {node}, it is unavailable"); + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + } } // The condition below identifies a detach. We must have no attached intent and @@ -822,7 +993,7 @@ impl Reconciler { .handle_detach(self.tenant_shard_id, self.shard.stripe_size); } - failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue"); + pausable_failpoint!("reconciler-epilogue"); Ok(()) } @@ -834,23 +1005,45 @@ impl Reconciler { let result = self .compute_hook .notify( - self.tenant_shard_id, - node.get_id(), - self.shard.stripe_size, + compute_hook::ShardUpdate { + tenant_shard_id: self.tenant_shard_id, + node_id: node.get_id(), + stripe_size: self.shard.stripe_size, + preferred_az: self.preferred_az.as_ref().map(Cow::Borrowed), + }, &self.cancel, ) .await; if let Err(e) = &result { - // It is up to the caller whether they want to drop out on this error, but they don't have to: - // in general we should avoid letting unavailability of the cloud control plane stop us from - // making progress. - if !matches!(e, NotifyError::ShuttingDown) { - tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); - } - // Set this flag so that in our ReconcileResult we will set the flag on the shard that it // needs to retry at some point. self.compute_notify_failure = true; + + // It is up to the caller whether they want to drop out on this error, but they don't have to: + // in general we should avoid letting unavailability of the cloud control plane stop us from + // making progress. + match e { + // 404s from cplane during tenant creation are expected. + // Cplane only persists the shards to the database after + // creating the tenant and the timeline. If we notify before + // that, we'll get a 404. + // + // This is fine because tenant creations happen via /location_config + // and that returns the list of locations in the response. 
Hence, we + // silence the error and return Ok(()) here. Reconciliation will still + // be retried because we set [`Reconciler::compute_notify_failure`] above. + NotifyError::Unexpected(hyper::StatusCode::NOT_FOUND) + if self.reconciler_config.tenant_creation_hint() => + { + return Ok(()); + } + NotifyError::ShuttingDown => {} + _ => { + tracing::warn!( + "Failed to notify compute of attached pageserver {node}: {e}" + ); + } + } } result } else { @@ -929,7 +1122,7 @@ impl Reconciler { match origin .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, Duration::from_secs(5), @@ -1010,7 +1203,7 @@ fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig let mut config = config.clone(); if has_secondaries { if config.heatmap_period.is_none() { - config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string()); + config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD); } } else { config.heatmap_period = None; diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs new file mode 100644 index 0000000000..546fbf0726 --- /dev/null +++ b/storage_controller/src/safekeeper.rs @@ -0,0 +1,148 @@ +use std::str::FromStr; +use std::time::Duration; + +use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; +use reqwest::StatusCode; +use safekeeper_client::mgmt_api; +use tokio_util::sync::CancellationToken; +use utils::backoff; +use utils::id::NodeId; +use utils::logging::SecretString; + +use crate::heartbeater::SafekeeperState; +use crate::persistence::{DatabaseError, SafekeeperPersistence}; +use crate::safekeeper_client::SafekeeperClient; + +#[derive(Clone)] +pub struct Safekeeper { + pub(crate) skp: SafekeeperPersistence, + cancel: CancellationToken, + listen_http_addr: String, + listen_http_port: u16, + scheduling_policy: SkSchedulingPolicy, + id: NodeId, + availability: SafekeeperState, +} + +impl Safekeeper { + pub(crate) fn from_persistence(skp: SafekeeperPersistence, cancel: CancellationToken) -> Self { + let scheduling_policy = SkSchedulingPolicy::from_str(&skp.scheduling_policy).unwrap(); + Self { + cancel, + listen_http_addr: skp.host.clone(), + listen_http_port: skp.http_port as u16, + id: NodeId(skp.id as u64), + skp, + availability: SafekeeperState::Offline, + scheduling_policy, + } + } + pub(crate) fn base_url(&self) -> String { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } + + pub(crate) fn get_id(&self) -> NodeId { + self.id + } + pub(crate) fn describe_response(&self) -> Result { + self.skp.as_describe_response() + } + pub(crate) fn set_availability(&mut self, availability: SafekeeperState) { + self.availability = availability; + } + pub(crate) fn scheduling_policy(&self) -> SkSchedulingPolicy { + self.scheduling_policy + } + pub(crate) fn set_scheduling_policy(&mut self, scheduling_policy: SkSchedulingPolicy) { + self.scheduling_policy = scheduling_policy; + self.skp.scheduling_policy = String::from(scheduling_policy); + } + /// Perform an operation (which is given a [`SafekeeperClient`]) with retries + pub(crate) async fn with_client_retries( + &self, + mut op: O, + jwt: &Option, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> mgmt_api::Result + where + O: FnMut(SafekeeperClient) -> F, + F: std::future::Future>, + { + fn is_fatal(e: &mgmt_api::Error) -> bool { + use mgmt_api::Error::*; + match e { 
+ ReceiveBody(_) | ReceiveErrorBody(_) => false, + ApiError(StatusCode::SERVICE_UNAVAILABLE, _) + | ApiError(StatusCode::GATEWAY_TIMEOUT, _) + | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, + ApiError(_, _) => true, + Cancelled => true, + } + } + + backoff::retry( + || { + let http_client = reqwest::ClientBuilder::new() + .timeout(timeout) + .build() + .expect("Failed to construct HTTP client"); + + let client = SafekeeperClient::from_client( + self.get_id(), + http_client, + self.base_url(), + jwt.clone(), + ); + + let node_cancel_fut = self.cancel.cancelled(); + + let op_fut = op(client); + + async { + tokio::select! { + r = op_fut=> {r}, + _ = node_cancel_fut => { + Err(mgmt_api::Error::Cancelled) + }} + } + }, + is_fatal, + warn_threshold, + max_retries, + &format!( + "Call to safekeeper {} ({}:{}) management API", + self.id, self.listen_http_addr, self.listen_http_port + ), + cancel, + ) + .await + .unwrap_or(Err(mgmt_api::Error::Cancelled)) + } + + pub(crate) fn update_from_record(&mut self, record: crate::persistence::SafekeeperUpsert) { + let crate::persistence::SafekeeperUpsert { + active: _, + availability_zone_id: _, + host, + http_port, + id, + port: _, + region_id: _, + version: _, + } = record.clone(); + if id != self.id.0 as i64 { + // The way the function is called ensures this. If we regress on that, it's a bug. + panic!( + "id can't be changed via update_from_record function: {id} != {}", + self.id.0 + ); + } + self.skp = + crate::persistence::SafekeeperPersistence::from_upsert(record, self.scheduling_policy); + self.listen_http_port = http_port as u16; + self.listen_http_addr = host; + } +} diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs new file mode 100644 index 0000000000..fb5be092a0 --- /dev/null +++ b/storage_controller/src/safekeeper_client.rs @@ -0,0 +1,120 @@ +use safekeeper_api::models::{ + PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + TimelineStatus, +}; +use safekeeper_client::mgmt_api::{Client, Result}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::logging::SecretString; + +use crate::metrics::PageserverRequestLabelGroup; + +/// Thin wrapper around [`safekeeper_client::mgmt_api::Client`]. It allows the storage +/// controller to collect metrics in a non-intrusive manner. +/// +/// Analogous to [`crate::pageserver_client::PageserverClient`]. +#[derive(Debug, Clone)] +pub(crate) struct SafekeeperClient { + inner: Client, + node_id_label: String, +} + +macro_rules! 
measured_request { + ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{ + let labels = PageserverRequestLabelGroup { + pageserver_id: $node_id, + path: $name, + method: $method, + }; + + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_safekeeper_request_latency; + let _timer_guard = latency.start_timer(labels.clone()); + + let res = $invoke; + + if res.is_err() { + let error_counters = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_error; + error_counters.inc(labels) + } + + res + }}; +} + +impl SafekeeperClient { + #[allow(dead_code)] + pub(crate) fn new( + node_id: NodeId, + mgmt_api_endpoint: String, + jwt: Option, + ) -> Self { + Self { + inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) fn from_client( + node_id: NodeId, + raw_client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option, + ) -> Self { + Self { + inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + #[allow(dead_code)] + pub(crate) async fn create_timeline( + &self, + req: &TimelineCreateRequest, + ) -> Result { + measured_request!( + "create_timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.create_timeline(req).await + ) + } + + #[allow(dead_code)] + pub(crate) async fn delete_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "delete_timeline", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner.delete_timeline(tenant_id, timeline_id).await + ) + } + + #[allow(dead_code)] + pub(crate) async fn pull_timeline( + &self, + req: &PullTimelineRequest, + ) -> Result { + measured_request!( + "pull_timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.pull_timeline(req).await + ) + } + + pub(crate) async fn get_utilization(&self) -> Result { + measured_request!( + "utilization", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.utilization().await + ) + } +} diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index ecc6b11e47..817cf04fe1 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,9 +1,16 @@ -use crate::{node::Node, tenant_shard::TenantShard}; +use std::collections::HashMap; +use std::fmt::Debug; + +use http_utils::error::ApiError; use itertools::Itertools; -use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; +use pageserver_api::controller_api::AvailabilityZone; +use pageserver_api::models::PageserverUtilization; use serde::Serialize; -use std::{collections::HashMap, fmt::Debug}; -use utils::{http::error::ApiError, id::NodeId}; +use utils::id::NodeId; + +use crate::metrics::NodeLabelGroup; +use crate::node::Node; +use crate::tenant_shard::TenantShard; /// Scenarios in which we cannot find a suitable location for a tenant shard #[derive(thiserror::Error, Debug)] @@ -32,6 +39,9 @@ pub(crate) struct SchedulerNode { shard_count: usize, /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`]. attached_shard_count: usize, + /// How many shards have a location on this node (via [`crate::tenant_shard::IntentState`]) _and_ this node + /// is in their preferred AZ (i.e. 
this is their 'home' location) + home_shard_count: usize, /// Availability zone id in which the node resides az: AvailabilityZone, @@ -47,6 +57,12 @@ pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized { preferred_az: &Option, context: &ScheduleContext, ) -> Option; + + /// Return a score that drops any components based on node utilization: this is useful + /// for finding scores for scheduling optimisation, when we want to avoid rescheduling + /// shards due to e.g. disk usage, to avoid flapping. + fn for_optimization(&self) -> Self; + fn is_overloaded(&self) -> bool; fn node_id(&self) -> NodeId; } @@ -136,17 +152,13 @@ impl PartialOrd for SecondaryAzMatch { /// Ordering is given by member declaration order (top to bottom). #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] pub(crate) struct NodeAttachmentSchedulingScore { - /// The number of shards belonging to the tenant currently being - /// scheduled that are attached to this node. - affinity_score: AffinityScore, /// Flag indicating whether this node matches the preferred AZ /// of the shard. For equal affinity scores, nodes in the matching AZ /// are considered first. az_match: AttachmentAzMatch, - /// Size of [`ScheduleContext::attached_nodes`] for the current node. - /// This normally tracks the number of attached shards belonging to the - /// tenant being scheduled that are already on this node. - attached_shards_in_context: usize, + /// The number of shards belonging to the tenant currently being + /// scheduled that are attached to this node. + affinity_score: AffinityScore, /// Utilisation score that combines shard count and disk utilisation utilization_score: u64, /// Total number of shards attached to this node. When nodes have identical utilisation, this @@ -177,13 +189,25 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore { .copied() .unwrap_or(AffinityScore::FREE), az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())), - attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0), utilization_score: utilization.cached_score(), total_attached_shard_count: node.attached_shard_count, node_id: *node_id, }) } + /// For use in scheduling optimisation, where we only want to consider the aspects + /// of the score that can only be resolved by moving things (such as inter-shard affinity + /// and AZ affinity), and ignore aspects that reflect the total utilization of a node (which + /// can fluctuate for other reasons) + fn for_optimization(&self) -> Self { + Self { + utilization_score: 0, + total_attached_shard_count: 0, + node_id: NodeId(0), + ..*self + } + } + fn is_overloaded(&self) -> bool { PageserverUtilization::is_overloaded(self.utilization_score) } @@ -208,9 +232,9 @@ pub(crate) struct NodeSecondarySchedulingScore { affinity_score: AffinityScore, /// Utilisation score that combines shard count and disk utilisation utilization_score: u64, - /// Total number of shards attached to this node. When nodes have identical utilisation, this - /// acts as an anti-affinity between attached shards. - total_attached_shard_count: usize, + /// Anti-affinity with other non-home locations: this gives the behavior that secondaries + /// will spread out across the nodes in an AZ. 
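// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch.
// The scheduling scores in this file rely on `#[derive(PartialOrd, Ord)]`,
// which compares struct fields lexicographically in declaration order: AZ
// match first, then inter-shard affinity, then utilization, with node id as a
// final deterministic tie-breaker. `for_optimization()` zeroes the
// utilization-style fields so that optimisation passes only move shards for
// reasons a move can fix (affinity/AZ), not because disk usage fluctuated.
// Field names below are hypothetical; lower compares as better, as above.

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Score {
    az_mismatch: u8, // 0 = preferred AZ, 1 = any other AZ
    affinity: u32,   // shards of the same tenant already on this node
    utilization: u64,
    node_id: u64,    // tie-breaker for determinism
}

impl Score {
    fn for_optimization(self) -> Self {
        Self { utilization: 0, node_id: 0, ..self }
    }
}

fn main() {
    let in_az_but_busy = Score { az_mismatch: 0, affinity: 0, utilization: 900, node_id: 1 };
    let other_az_idle = Score { az_mismatch: 1, affinity: 0, utilization: 10, node_id: 2 };

    // Full score: the AZ component dominates utilization, so the busy in-AZ node still wins.
    assert!(in_az_but_busy < other_az_idle);

    // Optimization view: utilization is ignored, so two in-AZ nodes with equal
    // affinity compare as equal and no migration is suggested.
    let peer = Score { az_mismatch: 0, affinity: 0, utilization: 50, node_id: 3 };
    assert_eq!(in_az_but_busy.for_optimization(), peer.for_optimization());
}
// --------------------------------------------------------- end of sketch ---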
+ total_non_home_shard_count: usize, /// Convenience to make selection deterministic in tests and empty systems node_id: NodeId, } @@ -237,11 +261,20 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore { .copied() .unwrap_or(AffinityScore::FREE), utilization_score: utilization.cached_score(), - total_attached_shard_count: node.attached_shard_count, + total_non_home_shard_count: (node.shard_count - node.home_shard_count), node_id: *node_id, }) } + fn for_optimization(&self) -> Self { + Self { + utilization_score: 0, + total_non_home_shard_count: 0, + node_id: NodeId(0), + ..*self + } + } + fn is_overloaded(&self) -> bool { PageserverUtilization::is_overloaded(self.utilization_score) } @@ -293,6 +326,10 @@ impl AffinityScore { pub(crate) fn inc(&mut self) { self.0 += 1; } + + pub(crate) fn dec(&mut self) { + self.0 -= 1; + } } impl std::ops::Add for AffinityScore { @@ -324,9 +361,6 @@ pub(crate) struct ScheduleContext { /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`] pub(crate) nodes: HashMap, - /// Specifically how many _attached_ locations are on each node - pub(crate) attached_nodes: HashMap, - pub(crate) mode: ScheduleMode, } @@ -334,7 +368,6 @@ impl ScheduleContext { pub(crate) fn new(mode: ScheduleMode) -> Self { Self { nodes: HashMap::new(), - attached_nodes: HashMap::new(), mode, } } @@ -348,25 +381,31 @@ impl ScheduleContext { } } - pub(crate) fn push_attached(&mut self, node_id: NodeId) { - let entry = self.attached_nodes.entry(node_id).or_default(); - *entry += 1; - } - - pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore { - self.nodes - .get(&node_id) - .copied() - .unwrap_or(AffinityScore::FREE) - } - - pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize { - self.attached_nodes.get(&node_id).copied().unwrap_or(0) + /// Remove `shard`'s contributions to this context. This is useful when considering scheduling + /// this shard afresh, where we don't want it to e.g. experience anti-affinity to its current location. + pub(crate) fn project_detach(&self, shard: &TenantShard) -> Self { + let mut new_context = self.clone(); + + if let Some(attached) = shard.intent.get_attached() { + if let Some(score) = new_context.nodes.get_mut(attached) { + score.dec(); + } + } + + for secondary in shard.intent.get_secondary() { + if let Some(score) = new_context.nodes.get_mut(secondary) { + score.dec(); + } + } + + new_context } + /// For test, track the sum of AffinityScore values, which is effectively how many + /// attached or secondary locations have been registered with this context. 
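// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch.
// `ScheduleContext::project_detach` above clones the context and decrements
// the affinity contribution of the shard's current attached and secondary
// nodes, so that when the optimizer asks "where would this shard go if
// scheduled afresh?" the shard does not repel itself from its own locations.
// Minimal model over a plain per-node affinity map; names are hypothetical.

use std::collections::HashMap;

type NodeId = u64;

#[derive(Clone, Debug)]
struct Context {
    affinity: HashMap<NodeId, u32>,
}

impl Context {
    fn project_detach(&self, attached: Option<NodeId>, secondaries: &[NodeId]) -> Self {
        let mut new = self.clone();
        for node in attached.into_iter().chain(secondaries.iter().copied()) {
            if let Some(score) = new.affinity.get_mut(&node) {
                *score -= 1;
            }
        }
        new
    }
}

fn main() {
    let ctx = Context { affinity: HashMap::from([(1, 2), (2, 1)]) };
    // A shard attached to node 1 with a secondary on node 2:
    let detached = ctx.project_detach(Some(1), &[2]);
    assert_eq!(detached.affinity[&1], 1);
    assert_eq!(detached.affinity[&2], 0);
    // The original context is untouched; the projection is an independent copy.
    assert_eq!(ctx.affinity[&1], 2);
}
// --------------------------------------------------------- end of sketch ---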
#[cfg(test)] - pub(crate) fn attach_count(&self) -> usize { - self.attached_nodes.values().sum() + pub(crate) fn location_count(&self) -> usize { + self.nodes.values().map(|i| i.0).sum() } } @@ -388,6 +427,7 @@ impl Scheduler { SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }, @@ -415,6 +455,7 @@ impl Scheduler { SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }, @@ -427,6 +468,9 @@ impl Scheduler { Some(node) => { node.shard_count += 1; node.attached_shard_count += 1; + if Some(&node.az) == shard.preferred_az() { + node.home_shard_count += 1; + } } None => anyhow::bail!( "Tenant {} references nonexistent node {}", @@ -438,7 +482,12 @@ impl Scheduler { for node_id in shard.intent.get_secondary() { match expect_nodes.get_mut(node_id) { - Some(node) => node.shard_count += 1, + Some(node) => { + node.shard_count += 1; + if Some(&node.az) == shard.preferred_az() { + node.home_shard_count += 1; + } + } None => anyhow::bail!( "Tenant {} references nonexistent node {}", shard.tenant_shard_id, @@ -482,13 +531,20 @@ impl Scheduler { /// /// It is an error to call this for a node that is not known to the scheduler (i.e. passed into /// [`Self::new`] or [`Self::node_upsert`]) - pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) { + pub(crate) fn update_node_ref_counts( + &mut self, + node_id: NodeId, + preferred_az: Option<&AvailabilityZone>, + update: RefCountUpdate, + ) { let Some(node) = self.nodes.get_mut(&node_id) else { debug_assert!(false); tracing::error!("Scheduler missing node {node_id}"); return; }; + let is_home_az = Some(&node.az) == preferred_az; + match update { RefCountUpdate::PromoteSecondary => { node.attached_shard_count += 1; @@ -496,19 +552,31 @@ impl Scheduler { RefCountUpdate::Attach => { node.shard_count += 1; node.attached_shard_count += 1; + if is_home_az { + node.home_shard_count += 1; + } } RefCountUpdate::Detach => { node.shard_count -= 1; node.attached_shard_count -= 1; + if is_home_az { + node.home_shard_count -= 1; + } } RefCountUpdate::DemoteAttached => { node.attached_shard_count -= 1; } RefCountUpdate::AddSecondary => { node.shard_count += 1; + if is_home_az { + node.home_shard_count += 1; + } } RefCountUpdate::RemoveSecondary => { node.shard_count -= 1; + if is_home_az { + node.home_shard_count -= 1; + } } } @@ -594,6 +662,7 @@ impl Scheduler { entry.insert(SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }); @@ -607,33 +676,20 @@ impl Scheduler { } } - /// Where we have several nodes to choose from, for example when picking a secondary location - /// to promote to an attached location, this method may be used to pick the best choice based - /// on the scheduler's knowledge of utilization and availability. - /// - /// If the input is empty, or all the nodes are not elegible for scheduling, return None: the - /// caller can pick a node some other way. - pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option { - if nodes.is_empty() { - return None; - } - - // TODO: When the utilization score returned by the pageserver becomes meaningful, - // schedule based on that instead of the shard count. 
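// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch.
// The ref-count update above now also maintains `home_shard_count`: a location
// counts as "home" when the node's AZ equals the shard's preferred AZ, and
// only attach/detach and add/remove-secondary change it (promote/demote move
// an existing location between roles on the same node, so the home count is
// unchanged). Stand-alone model of that bookkeeping with hypothetical names.

#[derive(Default, Debug, PartialEq)]
struct Counts {
    shard_count: usize,
    attached_shard_count: usize,
    home_shard_count: usize,
}

#[allow(dead_code)]
enum RefCountUpdate {
    Attach,
    Detach,
    AddSecondary,
    RemoveSecondary,
    PromoteSecondary,
    DemoteAttached,
}

fn apply(counts: &mut Counts, is_home_az: bool, update: RefCountUpdate) {
    use RefCountUpdate::*;
    match update {
        Attach => {
            counts.shard_count += 1;
            counts.attached_shard_count += 1;
            counts.home_shard_count += is_home_az as usize;
        }
        Detach => {
            counts.shard_count -= 1;
            counts.attached_shard_count -= 1;
            counts.home_shard_count -= is_home_az as usize;
        }
        AddSecondary => {
            counts.shard_count += 1;
            counts.home_shard_count += is_home_az as usize;
        }
        RemoveSecondary => {
            counts.shard_count -= 1;
            counts.home_shard_count -= is_home_az as usize;
        }
        PromoteSecondary => counts.attached_shard_count += 1,
        DemoteAttached => counts.attached_shard_count -= 1,
    }
}

fn main() {
    let mut node = Counts::default();
    apply(&mut node, true, RefCountUpdate::Attach);        // home attachment
    apply(&mut node, false, RefCountUpdate::AddSecondary); // non-home secondary
    assert_eq!(node, Counts { shard_count: 2, attached_shard_count: 1, home_shard_count: 1 });
}
// --------------------------------------------------------- end of sketch ---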
- let node = nodes - .iter() - .map(|node_id| { - let may_schedule = self - .nodes - .get(node_id) - .map(|n| !matches!(n.may_schedule, MaySchedule::No)) - .unwrap_or(false); - (*node_id, may_schedule) - }) - .max_by_key(|(_n, may_schedule)| *may_schedule); - - // If even the preferred node has may_schedule==false, return None - node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) + /// Calculate a single node's score, used in optimizer logic to compare specific + /// nodes' scores. + pub(crate) fn compute_node_score( + &mut self, + node_id: NodeId, + preferred_az: &Option, + context: &ScheduleContext, + ) -> Option + where + Score: NodeSchedulingScore, + { + self.nodes + .get_mut(&node_id) + .and_then(|node| Score::generate(&node_id, node, preferred_az, context)) } /// Compute a schedulling score for each node that the scheduler knows of @@ -725,9 +781,10 @@ impl Scheduler { if !matches!(context.mode, ScheduleMode::Speculative) { tracing::info!( - "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", - scores.iter().map(|i| i.node_id().0).collect::>() - ); + "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})", + scores.iter().map(|i| i.node_id().0).collect::>(), + preferred_az, + ); } // Note that we do not update shard count here to reflect the scheduling: that @@ -742,6 +799,77 @@ impl Scheduler { self.schedule_shard::(&[], &None, &ScheduleContext::default()) } + /// For choosing which AZ to schedule a new shard into, use this. It will return the + /// AZ with the the lowest number of shards currently scheduled in this AZ as their home + /// location. + /// + /// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded + /// node, because while tenants start out single sharded, when they grow and undergo + /// shard-split, they will occupy space on many nodes within an AZ. It is important + /// that we pick the AZ in a way that balances this _future_ load. + /// + /// Once we've picked an AZ, subsequent scheduling within that AZ will be driven by + /// nodes' utilization scores. + pub(crate) fn get_az_for_new_tenant(&self) -> Option { + if self.nodes.is_empty() { + return None; + } + + #[derive(Default)] + struct AzScore { + home_shard_count: usize, + scheduleable: bool, + } + + let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new(); + for node in self.nodes.values() { + let az = azs.entry(&node.az).or_default(); + az.home_shard_count += node.home_shard_count; + az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_)); + } + + // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where + // all nodes are overloaded or otherwise unschedulable). + if azs.values().any(|i| i.scheduleable) { + azs.retain(|_, i| i.scheduleable); + } + + // Find the AZ with the lowest number of shards currently allocated + Some( + azs.into_iter() + .min_by_key(|i| (i.1.home_shard_count, i.0)) + .unwrap() + .0 + .clone(), + ) + } + + pub(crate) fn get_node_az(&self, node_id: &NodeId) -> Option { + self.nodes.get(node_id).map(|n| n.az.clone()) + } + + /// For use when choosing a preferred secondary location: filter out nodes that are not + /// available, and gather their AZs. 
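// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch.
// `get_az_for_new_tenant` above aggregates `home_shard_count` per AZ, drops
// AZs where no node is schedulable (unless that would drop everything), and
// picks the AZ with the fewest home shards, breaking ties by AZ id for
// determinism. Self-contained model over a plain node list; the `Node` struct
// and function names are hypothetical stand-ins.

use std::collections::HashMap;

struct Node {
    az: String,
    home_shard_count: usize,
    schedulable: bool,
}

fn az_for_new_tenant(nodes: &[Node]) -> Option<String> {
    if nodes.is_empty() {
        return None;
    }
    let mut azs: HashMap<&str, (usize, bool)> = HashMap::new();
    for node in nodes {
        let entry = azs.entry(node.az.as_str()).or_default();
        entry.0 += node.home_shard_count;
        entry.1 |= node.schedulable;
    }
    // Prefer AZs that still have schedulable capacity, if any exist at all.
    if azs.values().any(|(_, schedulable)| *schedulable) {
        azs.retain(|_, (_, schedulable)| *schedulable);
    }
    azs.into_iter()
        .min_by_key(|(az, (count, _))| (*count, *az))
        .map(|(az, _)| az.to_string())
}

fn main() {
    let nodes = vec![
        Node { az: "az-a".into(), home_shard_count: 10, schedulable: true },
        Node { az: "az-a".into(), home_shard_count: 0, schedulable: true },
        Node { az: "az-b".into(), home_shard_count: 3, schedulable: true },
    ];
    // az-b has the lower aggregate home shard count (3 vs 10), so it wins.
    assert_eq!(az_for_new_tenant(&nodes).as_deref(), Some("az-b"));
}
// --------------------------------------------------------- end of sketch ---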
+ pub(crate) fn filter_usable_nodes( + &self, + nodes: &[NodeId], + ) -> Vec<(NodeId, Option)> { + nodes + .iter() + .filter_map(|node_id| { + let node = self + .nodes + .get(node_id) + .expect("Referenced nodes always exist"); + if matches!(node.may_schedule, MaySchedule::Yes(_)) { + Some((*node_id, Some(node.az.clone()))) + } else { + None + } + }) + .collect() + } + /// Unit test access to internal state #[cfg(test)] pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize { @@ -752,19 +880,46 @@ impl Scheduler { pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize { self.nodes.get(&node_id).unwrap().attached_shard_count } + + /// Some metrics that we only calculate periodically: this is simpler than + /// rigorously updating them on every change. + pub(crate) fn update_metrics(&self) { + for (node_id, node) in &self.nodes { + let node_id_str = format!("{}", node_id); + let label_group = NodeLabelGroup { + az: &node.az.0, + node_id: &node_id_str, + }; + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_shards + .set(label_group.clone(), node.shard_count as i64); + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_attached_shards + .set(label_group.clone(), node.attached_shard_count as i64); + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_home_shards + .set(label_group.clone(), node.home_shard_count as i64); + } + } } #[cfg(test)] pub(crate) mod test_utils { - use crate::node::Node; - use pageserver_api::{ - controller_api::{AvailabilityZone, NodeAvailability}, - models::utilization::test_utilization, - }; use std::collections::HashMap; + + use pageserver_api::controller_api::{AvailabilityZone, NodeAvailability}; + use pageserver_api::models::utilization::test_utilization; use utils::id::NodeId; + use crate::node::Node; + /// Test helper: synthesize the requested number of nodes, all in active state. /// /// Node IDs start at one. 
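// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch.
// `update_metrics` above recomputes per-node gauges (total / attached / home
// shard counts, labelled by AZ and node id) from scratch on a periodic pass
// rather than updating them on every scheduler mutation; a full recomputation
// is O(nodes) and keeps the scheduling hot paths free of metrics bookkeeping.
// Sketch with an in-memory list of (metric, az, node, value) tuples standing
// in for the crate's metrics registry; all names below are hypothetical.

use std::collections::HashMap;

struct NodeState {
    az: String,
    shard_count: i64,
    attached_shard_count: i64,
    home_shard_count: i64,
}

fn export_node_gauges(nodes: &HashMap<u64, NodeState>) -> Vec<(&'static str, String, String, i64)> {
    let mut gauges = Vec::new();
    for (node_id, node) in nodes {
        let node_label = node_id.to_string();
        gauges.push(("node_shards", node.az.clone(), node_label.clone(), node.shard_count));
        gauges.push(("node_attached_shards", node.az.clone(), node_label.clone(), node.attached_shard_count));
        gauges.push(("node_home_shards", node.az.clone(), node_label, node.home_shard_count));
    }
    gauges
}

fn main() {
    let nodes = HashMap::from([(
        1u64,
        NodeState { az: "az-a".into(), shard_count: 4, attached_shard_count: 2, home_shard_count: 3 },
    )]);
    let gauges = export_node_gauges(&nodes);
    assert!(gauges.contains(&("node_home_shards", "az-a".to_string(), "1".to_string(), 3)));
}
// --------------------------------------------------------- end of sketch ---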
@@ -781,13 +936,16 @@ pub(crate) mod test_utils { NodeId(i), format!("httphost-{i}"), 80 + i as u16, + None, format!("pghost-{i}"), 5432 + i as u16, az_iter .next() .cloned() .unwrap_or(AvailabilityZone("test-az".to_string())), - ); + false, + ) + .unwrap(); node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); node @@ -799,18 +957,21 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization}; + use pageserver_api::controller_api::NodeAvailability; + use pageserver_api::models::utilization::test_utilization; + use pageserver_api::shard::ShardIdentity; + use utils::id::TenantId; + use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use super::*; - use crate::tenant_shard::IntentState; #[test] fn scheduler_basic() -> anyhow::Result<()> { let nodes = test_utils::make_test_nodes(2, &[]); let mut scheduler = Scheduler::new(nodes.values()); - let mut t1_intent = IntentState::new(); - let mut t2_intent = IntentState::new(); + let mut t1_intent = IntentState::new(None); + let mut t2_intent = IntentState::new(None); let context = ScheduleContext::default(); @@ -886,7 +1047,7 @@ mod tests { let scheduled = scheduler .schedule_shard::(&[], &None, context) .unwrap(); - let mut intent = IntentState::new(); + let mut intent = IntentState::new(None); intent.set_attached(scheduler, Some(scheduled)); scheduled_intents.push(intent); assert_eq!(scheduled, expect_node); @@ -1019,7 +1180,7 @@ mod tests { let scheduled = scheduler .schedule_shard::(&[], &preferred_az, context) .unwrap(); - let mut intent = IntentState::new(); + let mut intent = IntentState::new(preferred_az.clone()); intent.set_attached(scheduler, Some(scheduled)); scheduled_intents.push(intent); assert_eq!(scheduled, expect_node); @@ -1045,9 +1206,9 @@ mod tests { &mut context, ); - // Node 2 is not in "az-a", but it has the lowest affinity so we prefer that. + // Node 1 and 3 (az-a) have same affinity score, so prefer the lowest node id. assert_scheduler_chooses::( - NodeId(2), + NodeId(1), Some(az_a_tag.clone()), &mut scheduled_intents, &mut scheduler, @@ -1063,28 +1224,315 @@ mod tests { &mut context, ); - // Avoid nodes in "az-b" for the secondary location. - // Nodes 1 and 3 are identically loaded, so prefer the lowest node id. - assert_scheduler_chooses::( - NodeId(1), - Some(az_b_tag.clone()), - &mut scheduled_intents, - &mut scheduler, - &mut context, - ); - - // Avoid nodes in "az-b" for the secondary location. - // Node 3 has lower affinity score than 1, so prefer that. - assert_scheduler_chooses::( - NodeId(3), - Some(az_b_tag.clone()), - &mut scheduled_intents, - &mut scheduler, - &mut context, - ); - for mut intent in scheduled_intents { intent.clear(&mut scheduler); } } + + #[test] + fn az_scheduling_for_new_tenant() { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + let nodes = test_utils::make_test_nodes( + 6, + &[ + az_a_tag.clone(), + az_a_tag.clone(), + az_a_tag.clone(), + az_b_tag.clone(), + az_b_tag.clone(), + az_b_tag.clone(), + ], + ); + + let mut scheduler = Scheduler::new(nodes.values()); + + /// Force the `home_shard_count` of a node directly: this is the metric used + /// by the scheduler when picking AZs. 
+ fn set_shard_count(scheduler: &mut Scheduler, node_id: NodeId, shard_count: usize) { + let node = scheduler.nodes.get_mut(&node_id).unwrap(); + node.home_shard_count = shard_count; + } + + // Initial empty state. Scores are tied, scheduler prefers lower AZ ID. + assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); + + // Home shard count is higher in AZ A, so AZ B will be preferred + set_shard_count(&mut scheduler, NodeId(1), 10); + assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone())); + + // Total home shard count is higher in AZ B, so we revert to preferring AZ A + set_shard_count(&mut scheduler, NodeId(4), 6); + set_shard_count(&mut scheduler, NodeId(5), 6); + assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); + } + + /// Test that when selecting AZs for many new tenants, we get the expected balance across nodes + #[test] + fn az_selection_many() { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + let az_c_tag = AvailabilityZone("az-c".to_string()); + let nodes = test_utils::make_test_nodes( + 6, + &[ + az_a_tag.clone(), + az_b_tag.clone(), + az_c_tag.clone(), + az_a_tag.clone(), + az_b_tag.clone(), + az_c_tag.clone(), + ], + ); + + let mut scheduler = Scheduler::new(nodes.values()); + + // We should get 1/6th of these on each node, give or take a few... + let total_tenants = 300; + + // ...where the 'few' is the number of AZs, because the scheduling will sometimes overshoot + // on one AZ before correcting itself. This is because we select the 'home' AZ based on + // an AZ-wide metric, but we select the location for secondaries on a purely node-based + // metric (while excluding the home AZ). + let grace = 3; + + let mut scheduled_shards = Vec::new(); + for _i in 0..total_tenants { + let preferred_az = scheduler.get_az_for_new_tenant().unwrap(); + + let mut node_home_counts = scheduler + .nodes + .iter() + .map(|(node_id, node)| (node_id, node.home_shard_count)) + .collect::>(); + node_home_counts.sort_by_key(|i| i.0); + eprintln!("Selected {}, vs nodes {:?}", preferred_az, node_home_counts); + + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::generate(), + shard_number: ShardNumber(0), + shard_count: ShardCount(1), + }; + + let shard_identity = ShardIdentity::new( + tenant_shard_id.shard_number, + tenant_shard_id.shard_count, + pageserver_api::shard::ShardStripeSize(1), + ) + .unwrap(); + let mut shard = TenantShard::new( + tenant_shard_id, + shard_identity, + pageserver_api::controller_api::PlacementPolicy::Attached(1), + Some(preferred_az), + ); + + let mut context = ScheduleContext::default(); + shard.schedule(&mut scheduler, &mut context).unwrap(); + eprintln!("Scheduled shard at {:?}", shard.intent); + + scheduled_shards.push(shard); + } + + for (node_id, node) in &scheduler.nodes { + eprintln!( + "Node {}: {} {} {}", + node_id, node.shard_count, node.attached_shard_count, node.home_shard_count + ); + } + + for node in scheduler.nodes.values() { + assert!((node.home_shard_count as i64 - total_tenants as i64 / 6).abs() < grace); + } + + for mut shard in scheduled_shards { + shard.intent.clear(&mut scheduler); + } + } + + #[test] + /// Make sure that when we have an odd number of nodes and an even number of shards, we still + /// get scheduling stability. 
+ fn odd_nodes_stability() { + let az_a = AvailabilityZone("az-a".to_string()); + let az_b = AvailabilityZone("az-b".to_string()); + + let nodes = test_utils::make_test_nodes( + 10, + &[ + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + // Need to keep these alive because they contribute to shard counts via RAII + let mut scheduled_shards = Vec::new(); + + let mut context = ScheduleContext::default(); + + fn schedule_shard( + tenant_shard_id: TenantShardId, + expect_attached: NodeId, + expect_secondary: NodeId, + scheduled_shards: &mut Vec, + scheduler: &mut Scheduler, + preferred_az: Option, + context: &mut ScheduleContext, + ) { + let shard_identity = ShardIdentity::new( + tenant_shard_id.shard_number, + tenant_shard_id.shard_count, + pageserver_api::shard::ShardStripeSize(1), + ) + .unwrap(); + let mut shard = TenantShard::new( + tenant_shard_id, + shard_identity, + pageserver_api::controller_api::PlacementPolicy::Attached(1), + preferred_az, + ); + + shard.schedule(scheduler, context).unwrap(); + + assert_eq!(shard.intent.get_attached().unwrap(), expect_attached); + assert_eq!( + shard.intent.get_secondary().first().unwrap(), + &expect_secondary + ); + + scheduled_shards.push(shard); + } + + let tenant_id = TenantId::generate(); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(8), + }, + NodeId(1), + NodeId(6), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(1), + shard_count: ShardCount(8), + }, + NodeId(2), + NodeId(7), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(2), + shard_count: ShardCount(8), + }, + NodeId(3), + NodeId(8), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(3), + shard_count: ShardCount(8), + }, + NodeId(4), + NodeId(9), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(4), + shard_count: ShardCount(8), + }, + NodeId(5), + NodeId(10), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(5), + shard_count: ShardCount(8), + }, + NodeId(1), + NodeId(6), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(6), + shard_count: ShardCount(8), + }, + NodeId(2), + NodeId(7), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(7), + shard_count: ShardCount(8), + }, + NodeId(3), + NodeId(8), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + // Assert that the optimizer suggests nochanges, i.e. our initial scheduling was stable. 
+ for shard in &scheduled_shards { + assert_eq!(shard.optimize_attachment(&mut scheduler, &context), None); + } + + for mut shard in scheduled_shards { + shard.intent.clear(&mut scheduler); + } + } } diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 1717a9369d..361253bd19 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -26,6 +26,20 @@ diesel::table! { listen_pg_addr -> Varchar, listen_pg_port -> Int4, availability_zone_id -> Varchar, + listen_https_port -> Nullable, + } +} + +diesel::table! { + safekeepers (id) { + id -> Int8, + region_id -> Text, + version -> Int8, + host -> Text, + port -> Int4, + http_port -> Int4, + availability_zone_id -> Text, + scheduling_policy -> Varchar, } } @@ -45,18 +59,10 @@ diesel::table! { } } -diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,); - -diesel::table! { - safekeepers { - id -> Int8, - region_id -> Text, - version -> Int8, - instance_id -> Text, - host -> Text, - port -> Int4, - active -> Bool, - http_port -> Int4, - availability_zone_id -> Text, - } -} +diesel::allow_tables_to_appear_in_same_query!( + controllers, + metadata_health, + nodes, + safekeepers, + tenant_shards, +); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 7e4ee53b4c..8671e340bd 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,104 +1,97 @@ pub mod chaos_injector; mod context_iterator; -use hyper::Uri; -use std::{ - borrow::Cow, - cmp::Ordering, - collections::{BTreeMap, HashMap, HashSet}, - error::Error, - ops::Deref, - path::PathBuf, - str::FromStr, - sync::Arc, - time::{Duration, Instant}, -}; +use std::borrow::Cow; +use std::cmp::Ordering; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::error::Error; +use std::ops::Deref; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::Arc; +use std::time::{Duration, Instant}; -use crate::{ - background_node_operations::{ - Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, - }, - compute_hook::NotifyError, - drain_utils::{self, TenantShardDrain, TenantShardIterator}, - id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, - leadership::Leadership, - metrics, - peer_client::GlobalObservedState, - persistence::{ - AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, - ShardGenerationState, TenantFilter, - }, - reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, - scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, - tenant_shard::{ - MigrateAttachment, ObservedStateDelta, ReconcileNeeded, ReconcilerStatus, - ScheduleOptimization, ScheduleOptimizationAction, - }, -}; use anyhow::Context; +use context_iterator::TenantShardContextIterator; use control_plane::storage_controller::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, }; use diesel::result::DatabaseErrorKind; -use futures::{stream::FuturesUnordered, StreamExt}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; +use http_utils::error::ApiError; +use hyper::Uri; use itertools::Itertools; -use pageserver_api::{ - controller_api::{ - AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, - NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, - ShardSchedulingPolicy, 
ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, - TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, - TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, - TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, - }, - models::{ - SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest, - TopTenantShardsRequest, - }, +use pageserver_api::controller_api::{ + AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, + NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, + SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, }; +use pageserver_api::models::{ + self, LocationConfig, LocationConfigListResponse, LocationConfigMode, PageserverUtilization, + SecondaryProgress, ShardParameters, TenantConfig, TenantConfigPatchRequest, + TenantConfigRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, + TenantShardLocation, TenantShardSplitRequest, TenantShardSplitResponse, + TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, + TopTenantShardsRequest, +}; +use pageserver_api::shard::{ + ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, +}; +use pageserver_api::upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse, + ValidateResponseTenant, +}; +use pageserver_client::{BlockUnblock, mgmt_api}; use reqwest::StatusCode; -use tracing::{instrument, Instrument}; - -use crate::pageserver_client::PageserverClient; -use pageserver_api::{ - models::{ - self, LocationConfig, LocationConfigListResponse, LocationConfigMode, - PageserverUtilization, ShardParameters, TenantConfig, TenantLocationConfigRequest, - TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, - TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, - }, - shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, - upcall_api::{ - ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, - ValidateResponse, ValidateResponseTenant, - }, -}; -use pageserver_client::{mgmt_api, BlockUnblock}; +use safekeeper_api::models::SafekeeperUtilization; +use tokio::sync::TryAcquireError; use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; -use utils::{ - completion::Barrier, - failpoint_support, - generation::Generation, - http::error::ApiError, - id::{NodeId, TenantId, TimelineId}, - sync::gate::Gate, +use tracing::{Instrument, instrument}; +use utils::completion::Barrier; +use utils::generation::Generation; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::sync::gate::Gate; +use utils::{failpoint_support, pausable_failpoint}; + +use crate::background_node_operations::{ + Drain, Fill, MAX_RECONCILES_PER_OPERATION, Operation, OperationError, OperationHandler, +}; +use crate::compute_hook::{self, ComputeHook, NotifyError}; +use crate::drain_utils::{self, TenantShardDrain, TenantShardIterator}; +use crate::heartbeater::{Heartbeater, PageserverState, SafekeeperState}; +use crate::id_lock_map::{ + IdLockMap, TracingExclusiveGuard, 
trace_exclusive_lock, trace_shared_lock, +}; +use crate::leadership::Leadership; +use crate::metrics; +use crate::node::{AvailabilityTransition, Node}; +use crate::pageserver_client::PageserverClient; +use crate::peer_client::GlobalObservedState; +use crate::persistence::split_state::SplitState; +use crate::persistence::{ + AbortShardSplitStatus, ControllerPersistence, DatabaseError, DatabaseResult, + MetadataHealthPersistence, Persistence, ShardGenerationState, TenantFilter, + TenantShardPersistence, +}; +use crate::reconciler::{ + ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, ReconcilerPriority, + attached_location_conf, +}; +use crate::safekeeper::Safekeeper; +use crate::scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode, Scheduler}; +use crate::tenant_shard::{ + IntentState, MigrateAttachment, ObservedState, ObservedStateDelta, ObservedStateLocation, + ReconcileNeeded, ReconcileResult, ReconcileWaitError, ReconcilerStatus, ReconcilerWaiter, + ScheduleOptimization, ScheduleOptimizationAction, TenantShard, }; -use crate::{ - compute_hook::ComputeHook, - heartbeater::{Heartbeater, PageserverState}, - node::{AvailabilityTransition, Node}, - persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, - reconciler::attached_location_conf, - scheduler::Scheduler, - tenant_shard::{ - IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, - ReconcilerWaiter, TenantShard, - }, -}; - -use context_iterator::TenantShardContextIterator; +const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500); // For operations that should be quick, like attaching a new tenant const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); @@ -139,6 +132,7 @@ enum TenantOperations { Create, LocationConfig, ConfigSet, + ConfigPatch, TimeTravelRemoteStorage, Delete, UpdatePolicy, @@ -150,6 +144,8 @@ enum TenantOperations { TimelineArchivalConfig, TimelineDetachAncestor, TimelineGcBlockUnblock, + DropDetached, + DownloadHeatmapLayers, } #[derive(Clone, strum_macros::Display)] @@ -186,6 +182,7 @@ pub(crate) enum LeadershipStatus { } pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; +pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly @@ -200,6 +197,8 @@ struct ServiceState { nodes: Arc>, + safekeepers: Arc>, + scheduler: Scheduler, /// Ongoing background operation on the cluster if any is running. 
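// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch.
// `ServiceState` now carries a `safekeepers` map alongside `nodes`, and the
// `parts_mut_sk` accessor added further below hands back mutable references to
// several fields at once. Splitting borrows through a single method works
// because the compiler can see the returned references point at disjoint
// fields, which callers could not express by calling `&mut self` methods
// twice. Minimal model of that borrow-splitting pattern; field types below
// are simplified stand-ins.

use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;

#[derive(Default)]
struct State {
    nodes: Arc<HashMap<u64, String>>,
    safekeepers: Arc<HashMap<u64, String>>,
    tenants: BTreeMap<u64, String>,
}

impl State {
    /// Borrow-split accessor: each returned reference aliases a different field.
    fn parts_mut(
        &mut self,
    ) -> (&mut Arc<HashMap<u64, String>>, &mut Arc<HashMap<u64, String>>, &mut BTreeMap<u64, String>) {
        (&mut self.nodes, &mut self.safekeepers, &mut self.tenants)
    }
}

fn main() {
    let mut state = State::default();
    let (nodes, safekeepers, tenants) = state.parts_mut();
    // All three mutable borrows are alive at the same time.
    Arc::make_mut(nodes).insert(1, "pageserver-1".to_string());
    Arc::make_mut(safekeepers).insert(7, "safekeeper-7".to_string());
    tenants.insert(42, "tenant".to_string());
    assert_eq!(state.tenants.len(), 1);
}
// --------------------------------------------------------- end of sketch ---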
@@ -266,6 +265,7 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { impl ServiceState { fn new( nodes: HashMap, + safekeepers: HashMap, tenants: BTreeMap, scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, @@ -277,6 +277,7 @@ impl ServiceState { leadership_status: initial_leadership_status, tenants, nodes: Arc::new(nodes), + safekeepers: Arc::new(safekeepers), scheduler, ongoing_operation: None, delayed_reconcile_rx, @@ -293,6 +294,23 @@ impl ServiceState { (&mut self.nodes, &mut self.tenants, &mut self.scheduler) } + #[allow(clippy::type_complexity)] + fn parts_mut_sk( + &mut self, + ) -> ( + &mut Arc>, + &mut Arc>, + &mut BTreeMap, + &mut Scheduler, + ) { + ( + &mut self.nodes, + &mut self.safekeepers, + &mut self.tenants, + &mut self.scheduler, + ) + } + fn get_leadership_status(&self) -> LeadershipStatus { self.leadership_status } @@ -313,7 +331,12 @@ pub struct Config { // All pageservers managed by one instance of this service must have // the same public key. This JWT token will be used to authenticate // this service to the pageservers it manages. - pub jwt_token: Option, + pub pageserver_jwt_token: Option, + + // All safekeepers managed by one instance of this service must have + // the same public key. This JWT token will be used to authenticate + // this service to the safekeepers it manages. + pub safekeeper_jwt_token: Option, // This JWT token will be used to authenticate this service to the control plane. pub control_plane_jwt_token: Option, @@ -336,9 +359,12 @@ pub struct Config { /// and/or upon handling the re-attach request from a node. pub max_warming_up_interval: Duration, - /// How many Reconcilers may be spawned concurrently + /// How many normal-priority Reconcilers may be spawned concurrently pub reconciler_concurrency: usize, + /// How many high-priority Reconcilers may be spawned concurrently + pub priority_reconciler_concurrency: usize, + /// How large must a shard grow in bytes before we split it? /// None disables auto-splitting. pub split_threshold: Option, @@ -361,6 +387,8 @@ pub struct Config { pub http_service_port: i32, pub long_reconcile_threshold: Duration, + + pub use_https_pageserver_api: bool, } impl From for ApiError { @@ -391,7 +419,8 @@ pub struct Service { compute_hook: Arc, result_tx: tokio::sync::mpsc::UnboundedSender, - heartbeater: Heartbeater, + heartbeater_ps: Heartbeater, + heartbeater_sk: Heartbeater, // Channel for background cleanup from failed operations that require cleanup, such as shard split abort_tx: tokio::sync::mpsc::UnboundedSender, @@ -405,14 +434,19 @@ pub struct Service { // that transition it to/from Active. node_op_locks: IdLockMap, - // Limit how many Reconcilers we will spawn concurrently + // Limit how many Reconcilers we will spawn concurrently for normal-priority tasks such as background reconciliations + // and reconciliation on startup. reconciler_concurrency: Arc, + // Limit how many Reconcilers we will spawn concurrently for high-priority tasks such as tenant/timeline CRUD, which + // a human user might be waiting for. + priority_reconciler_concurrency: Arc, + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile /// Send into this queue to promptly attempt to reconcile this shard next time units are available. /// - /// Note that this state logically lives inside ServiceInner, but carrying Sender here makes the code simpler - /// by avoiding needing a &mut ref to something inside the ServiceInner. 
This could be optimized to + /// Note that this state logically lives inside ServiceState, but carrying Sender here makes the code simpler + /// by avoiding needing a &mut ref to something inside the ServiceState. This could be optimized to /// use a VecDeque instead of a channel to reduce synchronization overhead, at the cost of some code complexity. delayed_reconcile_tx: tokio::sync::mpsc::Sender, @@ -601,7 +635,8 @@ impl Service { let locked = self.inner.read().unwrap(); locked.nodes.clone() }; - let mut nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await; + let (mut nodes_online, mut sks_online) = + self.initial_heartbeat_round(all_nodes.keys()).await; // List of tenants for which we will attempt to notify compute of their location at startup let mut compute_notifications = Vec::new(); @@ -610,7 +645,7 @@ impl Service { tracing::info!("Populating tenant shards' states from initial pageserver scan..."); let shard_count = { let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + let (nodes, safekeepers, tenants, scheduler) = locked.parts_mut_sk(); // Mark nodes online if they responded to us: nodes are offline by default after a restart. let mut new_nodes = (**nodes).clone(); @@ -622,6 +657,17 @@ impl Service { } *nodes = Arc::new(new_nodes); + let mut new_sks = (**safekeepers).clone(); + for (node_id, node) in new_sks.iter_mut() { + if let Some((utilization, last_seen_at)) = sks_online.remove(node_id) { + node.set_availability(SafekeeperState::Available { + utilization, + last_seen_at, + }); + } + } + *safekeepers = Arc::new(new_sks); + for (tenant_shard_id, observed_state) in observed.0 { let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { for node_id in observed_state.locations.keys() { @@ -653,11 +699,14 @@ impl Service { // emit a compute notification for this. In the case where our observed state does not // yet match our intent, we will eventually reconcile, and that will emit a compute notification. 
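// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch.
// The startup path above updates the shared node/safekeeper maps with a
// copy-on-write pattern: clone the map behind the Arc, mutate the clone, then
// swap in a fresh Arc. Readers that already cloned the old Arc keep a
// consistent snapshot and never observe a half-applied update. Minimal model
// of that pattern; the `Availability` enum and function name are hypothetical.

use std::collections::HashMap;
use std::sync::Arc;

#[derive(Clone, Debug, PartialEq)]
enum Availability {
    Offline,
    Online,
}

fn mark_online(nodes: &mut Arc<HashMap<u64, Availability>>, heard_from: &[u64]) {
    // Clone the whole map (availability changes are rare), mutate the private
    // copy, then publish it by replacing the Arc.
    let mut new_nodes = (**nodes).clone();
    for node_id in heard_from {
        if let Some(state) = new_nodes.get_mut(node_id) {
            *state = Availability::Online;
        }
    }
    *nodes = Arc::new(new_nodes);
}

fn main() {
    let mut nodes = Arc::new(HashMap::from([(1, Availability::Offline), (2, Availability::Offline)]));
    let snapshot = nodes.clone(); // a reader holding the old version
    mark_online(&mut nodes, &[1]);
    assert_eq!(nodes.get(&1), Some(&Availability::Online));
    assert_eq!(snapshot.get(&1), Some(&Availability::Offline)); // old snapshot unchanged
}
// --------------------------------------------------------- end of sketch ---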
if let Some(attached_at) = tenant_shard.stably_attached() { - compute_notifications.push(( - *tenant_shard_id, - attached_at, - tenant_shard.shard.stripe_size, - )); + compute_notifications.push(compute_hook::ShardUpdate { + tenant_shard_id: *tenant_shard_id, + node_id: attached_at, + stripe_size: tenant_shard.shard.stripe_size, + preferred_az: tenant_shard + .preferred_az() + .map(|az| Cow::Owned(az.clone())), + }); } } } @@ -721,13 +770,18 @@ impl Service { }); } - tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + tracing::info!( + "Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)" + ); } async fn initial_heartbeat_round<'a>( &self, node_ids: impl Iterator, - ) -> HashMap { + ) -> ( + HashMap, + HashMap, + ) { assert!(!self.startup_complete.is_ready()); let all_nodes = { @@ -747,14 +801,21 @@ impl Service { } } + let all_sks = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + tracing::info!("Sending initial heartbeats..."); - let res = self - .heartbeater - .heartbeat(Arc::new(nodes_to_heartbeat)) - .await; + // Put a small, but reasonable timeout to get the initial heartbeats of the safekeepers to avoid a storage controller downtime + const SK_TIMEOUT: Duration = Duration::from_secs(5); + let (res_ps, res_sk) = tokio::join!( + self.heartbeater_ps.heartbeat(Arc::new(nodes_to_heartbeat)), + tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)) + ); let mut online_nodes = HashMap::new(); - if let Ok(deltas) = res { + if let Ok(deltas) = res_ps { for (node_id, status) in deltas.0 { match status { PageserverState::Available { utilization, .. } => { @@ -768,7 +829,22 @@ impl Service { } } - online_nodes + let mut online_sks = HashMap::new(); + if let Ok(Ok(deltas)) = res_sk { + for (node_id, status) in deltas.0 { + match status { + SafekeeperState::Available { + utilization, + last_seen_at, + } => { + online_sks.insert(node_id, (utilization, last_seen_at)); + } + SafekeeperState::Offline => {} + } + } + } + + (online_nodes, online_sks) } /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline. @@ -796,7 +872,7 @@ impl Service { let response = node .with_client_retries( |client| async move { client.list_location_config().await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, timeout, @@ -897,7 +973,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); match client .location_config( @@ -948,12 +1024,11 @@ impl Service { let reconciles_spawned = self.reconcile_all(); if reconciles_spawned == 0 { // Run optimizer only when we didn't find any other work to do - let optimizations = self.optimize_all().await; - if optimizations == 0 { - // Run new splits only when no optimizations are pending - self.autosplit_tenants().await; - } + self.optimize_all().await; } + // Always attempt autosplits. Sharding is crucial for bulk ingest performance, so we + // must be responsive when new projects begin ingesting and reach the threshold. 
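// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch.
// The heartbeat rounds above poll pageservers and safekeepers concurrently
// with `tokio::join!` and wrap the safekeeper half in a short
// `tokio::time::timeout`, so slow or unreachable safekeepers cannot delay
// storage controller startup or the main loop. Self-contained tokio sketch of
// that join-with-timeout shape (assuming the `tokio` crate with timers and
// macros enabled, as this codebase already uses); the heartbeat functions are
// stand-ins.

use std::time::Duration;
use tokio::time::{sleep, timeout};

async fn heartbeat_pageservers() -> Vec<u64> {
    sleep(Duration::from_millis(10)).await;
    vec![1, 2]
}

async fn heartbeat_safekeepers() -> Vec<u64> {
    // Pretend this round is stuck, e.g. an unreachable node holding it up.
    sleep(Duration::from_secs(60)).await;
    vec![7]
}

#[tokio::main]
async fn main() {
    const SK_TIMEOUT: Duration = Duration::from_millis(50);
    let (pageservers, safekeepers) = tokio::join!(
        heartbeat_pageservers(),
        timeout(SK_TIMEOUT, heartbeat_safekeepers()),
    );
    assert_eq!(pageservers, vec![1, 2]);
    // The safekeeper round timed out; the caller simply proceeds without it.
    assert!(safekeepers.is_err());
    println!("heartbeat round finished without waiting for slow safekeepers");
}
// --------------------------------------------------------- end of sketch ---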
+ self.autosplit_tenants().await; } _ = self.reconcilers_cancel.cancelled() => return } @@ -975,8 +1050,18 @@ impl Service { locked.nodes.clone() }; - let res = self.heartbeater.heartbeat(nodes).await; - if let Ok(deltas) = res { + let safekeepers = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + + const SK_TIMEOUT: Duration = Duration::from_secs(3); + let (res_ps, res_sk) = tokio::join!( + self.heartbeater_ps.heartbeat(nodes), + tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(safekeepers)) + ); + + if let Ok(deltas) = res_ps { let mut to_handle = Vec::default(); for (node_id, state) in deltas.0 { @@ -1017,6 +1102,8 @@ impl Service { ) .await; + pausable_failpoint!("heartbeat-pre-node-state-configure"); + // This is the code path for geniune availability transitions (i.e node // goes unavailable and/or comes back online). let res = self @@ -1036,6 +1123,9 @@ impl Service { // on a snapshot of the nodes. tracing::info!("Node {} was not found after heartbeat round", node_id); } + Err(ApiError::ShuttingDown) => { + // No-op: we're shutting down, no need to try and update any nodes' statuses + } Err(err) => { // Transition to active involves reconciling: if a node responds to a heartbeat then // becomes unavailable again, we may get an error here. @@ -1072,6 +1162,20 @@ impl Service { } } } + if let Ok(Ok(deltas)) = res_sk { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + for (id, state) in deltas.0 { + let Some(sk) = safekeepers.get_mut(&id) else { + tracing::info!( + "Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}" + ); + continue; + }; + sk.set_availability(state); + } + locked.safekeepers = Arc::new(safekeepers); + } } } @@ -1155,13 +1259,30 @@ impl Service { } } + // If we just finished detaching all shards for a tenant, it might be time to drop it from memory. + if tenant.policy == PlacementPolicy::Detached { + // We may only drop a tenant from memory while holding the exclusive lock on the tenant ID: this protects us + // from concurrent execution wrt a request handler that might expect the tenant to remain in memory for the + // duration of the request. + let guard = self.tenant_op_locks.try_exclusive( + tenant.tenant_shard_id.tenant_id, + TenantOperations::DropDetached, + ); + if let Some(guard) = guard { + self.maybe_drop_tenant(tenant.tenant_shard_id.tenant_id, &mut locked, &guard); + } + } + // Maybe some other work can proceed now that this job finished. + // + // Only bother with this if we have some semaphore units available in the normal-priority semaphore (these + // reconciles are scheduled at `[ReconcilerPriority::Normal]`). if self.reconciler_concurrency.available_permits() > 0 { while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { let (nodes, tenants, _scheduler) = locked.parts_mut(); if let Some(shard) = tenants.get_mut(&tenant_shard_id) { shard.delayed_reconcile = false; - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal); } if self.reconciler_concurrency.available_permits() == 0 { @@ -1274,8 +1395,8 @@ impl Service { .list_nodes() .await? 
.into_iter() - .map(Node::from_persistent) - .collect::>(); + .map(|x| Node::from_persistent(x, config.use_https_pageserver_api)) + .collect::>>()?; let nodes: HashMap = nodes.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} nodes from database.", nodes.len()); metrics::METRICS_REGISTRY @@ -1283,8 +1404,19 @@ impl Service { .storage_controller_pageserver_nodes .set(nodes.len() as i64); + tracing::info!("Loading safekeepers from database..."); + let safekeepers = persistence + .list_safekeepers() + .await? + .into_iter() + .map(|skp| Safekeeper::from_persistence(skp, CancellationToken::new())) + .collect::>(); + let safekeepers: HashMap = + safekeepers.into_iter().map(|n| (n.get_id(), n)).collect(); + tracing::info!("Loaded {} safekeepers from database.", safekeepers.len()); + tracing::info!("Loading shards from database..."); - let mut tenant_shard_persistence = persistence.list_tenant_shards().await?; + let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?; tracing::info!( "Loaded {} shards from database.", tenant_shard_persistence.len() @@ -1363,10 +1495,13 @@ impl Service { NodeId(node_id as u64), "".to_string(), 123, + None, "".to_string(), 123, AvailabilityZone("test_az".to_string()), - ); + false, + ) + .unwrap(); scheduler.node_upsert(&node); } @@ -1376,7 +1511,11 @@ impl Service { // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. - let mut intent = IntentState::new(); + let mut intent = IntentState::new( + tsp.preferred_az_id + .as_ref() + .map(|az| AvailabilityZone(az.clone())), + ); if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64)) { if nodes.contains_key(&generation_pageserver) { @@ -1385,7 +1524,9 @@ impl Service { // If a node was removed before being completely drained, it is legal for it to leave behind a `generation_pageserver` referring // to a non-existent node, because node deletion doesn't block on completing the reconciliations that will issue new generations // on different pageservers. 
- tracing::warn!("Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled"); + tracing::warn!( + "Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled" + ); } } let new_tenant = TenantShard::from_persistent(tsp, intent)?; @@ -1405,8 +1546,15 @@ impl Service { let cancel = CancellationToken::new(); let reconcilers_cancel = cancel.child_token(); - let heartbeater = Heartbeater::new( - config.jwt_token.clone(), + let heartbeater_ps = Heartbeater::new( + config.pageserver_jwt_token.clone(), + config.max_offline_interval, + config.max_warming_up_interval, + cancel.clone(), + ); + + let heartbeater_sk = Heartbeater::new( + config.safekeeper_jwt_token.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), @@ -1421,6 +1569,7 @@ impl Service { let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( nodes, + safekeepers, tenants, scheduler, delayed_reconcile_rx, @@ -1430,10 +1579,14 @@ impl Service { persistence, compute_hook: Arc::new(ComputeHook::new(config.clone())), result_tx, - heartbeater, + heartbeater_ps, + heartbeater_sk, reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( config.reconciler_concurrency, )), + priority_reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( + config.priority_reconciler_concurrency, + )), delayed_reconcile_tx, abort_tx, startup_complete: startup_complete.clone(), @@ -1536,8 +1689,14 @@ impl Service { // the pageserver API (not via this service), we will auto-create any missing tenant // shards with default state. let insert = { - let locked = self.inner.write().unwrap(); - !locked.tenants.contains_key(&attach_req.tenant_shard_id) + match self + .maybe_load_tenant(attach_req.tenant_shard_id.tenant_id, &_tenant_lock) + .await + { + Ok(_) => false, + Err(ApiError::NotFound(_)) => true, + Err(e) => return Err(e.into()), + } }; if insert { @@ -1579,6 +1738,7 @@ impl Service { attach_req.tenant_shard_id, ShardIdentity::unsharded(), PlacementPolicy::Attached(0), + None, ), ); tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); @@ -1696,7 +1856,7 @@ impl Service { } Ok(AttachHookResponse { - gen: attach_req + r#gen: attach_req .node_id .map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), }) @@ -1741,7 +1901,7 @@ impl Service { let configs = match node .with_client_retries( |client| async move { client.list_location_config().await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -1799,7 +1959,7 @@ impl Service { .location_config(tenant_shard_id, config, None, false) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -1868,7 +2028,7 @@ impl Service { let new_gen = *new_gen; response.tenants.push(ReAttachResponseTenant { id: *tenant_shard_id, - gen: Some(new_gen.into().unwrap()), + r#gen: Some(new_gen.into().unwrap()), // A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`] // execution. 
If a pageserver is restarted during that process, then the reconcile pass will // fail, and start from scratch, so it doesn't make sense for us to try and preserve @@ -1905,7 +2065,7 @@ impl Service { response.tenants.push(ReAttachResponseTenant { id: *tenant_shard_id, - gen: None, + r#gen: None, mode: LocationConfigMode::Secondary, }); @@ -1967,15 +2127,19 @@ impl Service { let locked = self.inner.read().unwrap(); for req_tenant in validate_req.tenants { if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); + let valid = tenant_shard.generation == Some(Generation::new(req_tenant.r#gen)); tracing::info!( "handle_validate: {}(gen {}): valid={valid} (latest {:?})", req_tenant.id, - req_tenant.gen, + req_tenant.r#gen, tenant_shard.generation ); - in_memory_result.push((req_tenant.id, Generation::new(req_tenant.gen), valid)); + in_memory_result.push(( + req_tenant.id, + Generation::new(req_tenant.r#gen), + valid, + )); } else { // This is legal: for example during a shard split the pageserver may still // have deletions in its queue from the old pre-split shard, or after deletion @@ -1994,13 +2158,11 @@ impl Service { // in case of controller split-brain, where some other controller process might have incremented the generation. let db_generations = self .persistence - .shard_generations(in_memory_result.iter().filter_map(|i| { - if i.2 { - Some(&i.0) - } else { - None - } - })) + .shard_generations( + in_memory_result + .iter() + .filter_map(|i| if i.2 { Some(&i.0) } else { None }), + ) .await?; let db_generations = db_generations.into_iter().collect::>(); @@ -2106,6 +2268,16 @@ impl Service { ) }; + let preferred_az_id = { + let locked = self.inner.read().unwrap(); + // Idempotency: take the existing value if the tenant already exists + if let Some(shard) = locked.tenants.get(create_ids.first().unwrap()) { + shard.preferred_az().cloned() + } else { + locked.scheduler.get_az_for_new_tenant() + } + }; + // Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller // to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart // during the creation, rather than risking leaving orphan objects in S3. @@ -2125,7 +2297,7 @@ impl Service { splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), - preferred_az_id: None, + preferred_az_id: preferred_az_id.as_ref().map(|az| az.to_string()), }) .collect(); @@ -2142,7 +2314,9 @@ impl Service { // Unique key violation: this is probably a retry. Because the shard count is part of the unique key, // if we see a unique key violation it means that the creation request's shard count matches the previous // creation's shard count. - tracing::info!("Tenant shards already present in database, proceeding with idempotent creation..."); + tracing::info!( + "Tenant shards already present in database, proceeding with idempotent creation..." + ); } // Any other database error is unexpected and a bug. 
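// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch.
// Tenant creation above treats a unique-key violation from the database as a
// retry of an earlier creation with the same shard count and proceeds
// idempotently, while any other database error is surfaced as a bug. Minimal
// model of that "insert, and tolerate already-exists" decision; the
// `DatabaseError` enum and function names are hypothetical.

#[allow(dead_code)]
#[derive(Debug)]
enum DatabaseError {
    UniqueKeyViolation,
    Other(String),
}

fn persist_shards(already_present: bool) -> Result<(), DatabaseError> {
    if already_present {
        Err(DatabaseError::UniqueKeyViolation)
    } else {
        Ok(())
    }
}

fn create_tenant(already_present: bool) -> Result<&'static str, DatabaseError> {
    match persist_shards(already_present) {
        Ok(()) => Ok("created"),
        // A retry of the same creation: the rows are already there, carry on.
        Err(DatabaseError::UniqueKeyViolation) => Ok("already existed, proceeding idempotently"),
        // Anything else is unexpected; surface it to the caller.
        Err(e) => Err(e),
    }
}

fn main() {
    assert_eq!(create_tenant(false).unwrap(), "created");
    assert_eq!(create_tenant(true).unwrap(), "already existed, proceeding idempotently");
}
// --------------------------------------------------------- end of sketch ---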
Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))), @@ -2161,6 +2335,7 @@ impl Service { &create_req.shard_parameters, create_req.config.clone(), placement_policy.clone(), + preferred_az_id.as_ref(), &mut schedule_context, ) .await; @@ -2174,44 +2349,6 @@ impl Service { } } - let preferred_azs = { - let locked = self.inner.read().unwrap(); - response_shards - .iter() - .filter_map(|resp| { - let az_id = locked - .nodes - .get(&resp.node_id) - .map(|n| n.get_availability_zone_id().clone())?; - - Some((resp.shard_id, az_id)) - }) - .collect::>() - }; - - // Note that we persist the preferred AZ for the new shards separately. - // In theory, we could "peek" the scheduler to determine where the shard will - // land, but the subsequent "real" call into the scheduler might select a different - // node. Hence, we do this awkward update to keep things consistent. - let updated = self - .persistence - .set_tenant_shard_preferred_azs(preferred_azs) - .await - .map_err(|err| { - ApiError::InternalServerError(anyhow::anyhow!( - "Failed to persist preferred az ids: {err}" - )) - })?; - - { - let mut locked = self.inner.write().unwrap(); - for (tid, az_id) in updated { - if let Some(shard) = locked.tenants.get_mut(&tid) { - shard.set_preferred_az(az_id); - } - } - } - // If we failed to schedule shards, then they are still created in the controller, // but we return an error to the requester to avoid a silent failure when someone // tries to e.g. create a tenant whose placement policy requires more nodes than @@ -2226,9 +2363,14 @@ impl Service { let waiters = { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = locked.parts_mut(); + let config = ReconcilerConfigBuilder::new(ReconcilerPriority::High) + .tenant_creation_hint(true) + .build(); tenants .range_mut(TenantShardId::tenant_range(tenant_id)) - .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes)) + .filter_map(|(_shard_id, shard)| { + self.maybe_configured_reconcile_shard(shard, nodes, config) + }) .collect::>() }; @@ -2242,6 +2384,7 @@ impl Service { /// Helper for tenant creation that does the scheduling for an individual shard. Covers both the /// case of a new tenant and a pre-existing one. + #[allow(clippy::too_many_arguments)] async fn do_initial_shard_scheduling( &self, tenant_shard_id: TenantShardId, @@ -2249,6 +2392,7 @@ impl Service { shard_params: &ShardParameters, config: TenantConfig, placement_policy: PlacementPolicy, + preferred_az_id: Option<&AvailabilityZone>, schedule_context: &mut ScheduleContext, ) -> InitialShardScheduleOutcome { let mut locked = self.inner.write().unwrap(); @@ -2259,10 +2403,6 @@ impl Service { Entry::Occupied(mut entry) => { tracing::info!("Tenant shard {tenant_shard_id} already exists while creating"); - // TODO: schedule() should take an anti-affinity expression that pushes - // attached and secondary locations (independently) away frorm those - // pageservers also holding a shard for this tenant. - if let Err(err) = entry.get_mut().schedule(scheduler, schedule_context) { return InitialShardScheduleOutcome::ShardScheduleError(err); } @@ -2286,6 +2426,7 @@ impl Service { tenant_shard_id, ShardIdentity::from_params(tenant_shard_id.shard_number, shard_params), placement_policy, + preferred_az_id.cloned(), )); state.generation = initial_generation; @@ -2456,6 +2597,122 @@ impl Service { } } + /// For APIs that might act on tenants with [`PlacementPolicy::Detached`], first check if + /// the tenant is present in memory. 
If not, load it from the database. If it is found + /// in neither location, return a NotFound error. + /// + /// Caller must demonstrate they hold a lock guard, as otherwise two callers might try and load + /// it at the same time, or we might race with [`Self::maybe_drop_tenant`] + async fn maybe_load_tenant( + &self, + tenant_id: TenantId, + _guard: &TracingExclusiveGuard, + ) -> Result<(), ApiError> { + // Check if the tenant is present in memory, and select an AZ to use when loading + // if we will load it. + let load_in_az = { + let locked = self.inner.read().unwrap(); + let existing = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .next(); + + // If the tenant is not present in memory, we expect to load it from database, + // so let's figure out what AZ to load it into while we have self.inner locked. + if existing.is_none() { + locked + .scheduler + .get_az_for_new_tenant() + .ok_or(ApiError::BadRequest(anyhow::anyhow!( + "No AZ with nodes found to load tenant" + )))? + } else { + // We already have this tenant in memory + return Ok(()); + } + }; + + let tenant_shards = self.persistence.load_tenant(tenant_id).await?; + if tenant_shards.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", tenant_id).into(), + )); + } + + // Update the persistent shards with the AZ that we are about to apply to in-memory state + self.persistence + .set_tenant_shard_preferred_azs( + tenant_shards + .iter() + .map(|t| { + ( + t.get_tenant_shard_id().expect("Corrupt shard in database"), + Some(load_in_az.clone()), + ) + }) + .collect(), + ) + .await?; + + let mut locked = self.inner.write().unwrap(); + tracing::info!( + "Loaded {} shards for tenant {}", + tenant_shards.len(), + tenant_id + ); + + locked.tenants.extend(tenant_shards.into_iter().map(|p| { + let intent = IntentState::new(Some(load_in_az.clone())); + let shard = + TenantShard::from_persistent(p, intent).expect("Corrupt shard row in database"); + + // Sanity check: when loading on-demand, we should always be loaded something Detached + debug_assert!(shard.policy == PlacementPolicy::Detached); + if shard.policy != PlacementPolicy::Detached { + tracing::error!( + "Tenant shard {} loaded on-demand, but has non-Detached policy {:?}", + shard.tenant_shard_id, + shard.policy + ); + } + + (shard.tenant_shard_id, shard) + })); + + Ok(()) + } + + /// If all shards for a tenant are detached, and in a fully quiescent state (no observed locations on pageservers), + /// and have no reconciler running, then we can drop the tenant from memory. It will be reloaded on-demand + /// if we are asked to attach it again (see [`Self::maybe_load_tenant`]). + /// + /// Caller must demonstrate they hold a lock guard, as otherwise it is unsafe to drop a tenant from + /// memory while some other function might assume it continues to exist while not holding the lock on Self::inner. 
+ fn maybe_drop_tenant( + &self, + tenant_id: TenantId, + locked: &mut std::sync::RwLockWriteGuard, + _guard: &TracingExclusiveGuard, + ) { + let mut tenant_shards = locked.tenants.range(TenantShardId::tenant_range(tenant_id)); + if tenant_shards.all(|(_id, shard)| { + shard.policy == PlacementPolicy::Detached + && shard.reconciler.is_none() + && shard.observed.is_empty() + }) { + let keys = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(id, _)| id) + .copied() + .collect::>(); + for key in keys { + tracing::info!("Dropping detached tenant shard {} from memory", key); + locked.tenants.remove(&key); + } + } + } + /// This API is used by the cloud control plane to migrate unsharded tenants that it created /// directly with pageservers into this service. /// @@ -2482,14 +2739,26 @@ impl Service { ) .await; - if !tenant_shard_id.is_unsharded() { + let tenant_id = if !tenant_shard_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( "This API is for importing single-sharded or unsharded tenants" ))); - } + } else { + tenant_shard_id.tenant_id + }; + + // In case we are waking up a Detached tenant + match self.maybe_load_tenant(tenant_id, &_tenant_lock).await { + Ok(()) | Err(ApiError::NotFound(_)) => { + // This is a creation or an update + } + Err(e) => { + return Err(e); + } + }; // First check if this is a creation or an update - let create_or_update = self.tenant_location_config_prepare(tenant_shard_id.tenant_id, req); + let create_or_update = self.tenant_location_config_prepare(tenant_id, req); let mut result = TenantLocationConfigResponse { shards: Vec::new(), @@ -2512,6 +2781,7 @@ impl Service { // Persist updates // Ordering: write to the database before applying changes in-memory, so that // we will not appear time-travel backwards on a restart. + let mut schedule_context = ScheduleContext::default(); for ShardUpdate { tenant_shard_id, @@ -2568,7 +2838,8 @@ impl Service { shard.schedule(scheduler, &mut schedule_context)?; - let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); + let maybe_waiter = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); if let Some(waiter) = maybe_waiter { waiters.push(waiter); } @@ -2602,6 +2873,59 @@ impl Service { Ok(result) } + pub(crate) async fn tenant_config_patch( + &self, + req: TenantConfigPatchRequest, + ) -> Result<(), ApiError> { + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + req.tenant_id, + TenantOperations::ConfigPatch, + ) + .await; + + let tenant_id = req.tenant_id; + let patch = req.config; + + self.maybe_load_tenant(tenant_id, &_tenant_lock).await?; + + let base = { + let locked = self.inner.read().unwrap(); + let shards = locked + .tenants + .range(TenantShardId::tenant_range(req.tenant_id)); + + let mut configs = shards.map(|(_sid, shard)| &shard.config).peekable(); + + let first = match configs.peek() { + Some(first) => (*first).clone(), + None => { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(), + )); + } + }; + + if !configs.all_equal() { + tracing::error!("Tenant configs for {} are mismatched. ", req.tenant_id); + // This can't happen because we atomically update the database records + // of all shards to the new value in [`Self::set_tenant_config_and_reconcile`]. 
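// --- Illustrative sketch (not part of the patch): maybe_load_tenant and maybe_drop_tenant
// above both take `_guard: &TracingExclusiveGuard`, so the caller proves at the type level
// that it holds the per-tenant operation lock before the map is mutated. A minimal version of
// that convention using std types; `Registry` and `drop_if_idle` are hypothetical names.
use std::collections::HashMap;
use std::sync::{Mutex, MutexGuard};

struct Registry {
    tenants: Mutex<HashMap<u64, String>>,
}

impl Registry {
    // Taking `_op_guard` by reference forces callers to acquire the operation lock first;
    // the guard is never used inside, it exists only as evidence.
    fn drop_if_idle(&self, tenant_id: u64, _op_guard: &MutexGuard<'_, ()>) {
        let mut tenants = self.tenants.lock().unwrap();
        // Toy quiescence check: an empty string stands in for "no observed locations".
        if tenants.get(&tenant_id).map(|s| s.is_empty()).unwrap_or(false) {
            tenants.remove(&tenant_id);
        }
    }
}

fn main() {
    let op_lock = Mutex::new(());
    let registry = Registry {
        tenants: Mutex::new(HashMap::from([(1, String::new())])),
    };
    let guard = op_lock.lock().unwrap(); // acquire the operation lock...
    registry.drop_if_idle(1, &guard); // ...and hand the guard over as proof
    assert!(registry.tenants.lock().unwrap().is_empty());
}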
+ return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Tenant configs for {} are mismatched", + req.tenant_id + ))); + } + + first + }; + + let updated_config = base + .apply_patch(patch) + .map_err(|err| ApiError::BadRequest(anyhow::anyhow!(err)))?; + self.set_tenant_config_and_reconcile(tenant_id, updated_config) + .await + } + pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { // We require an exclusive lock, because we are updating persistent and in-memory state let _tenant_lock = trace_exclusive_lock( @@ -2611,12 +2935,20 @@ impl Service { ) .await; - let tenant_id = req.tenant_id; - let config = req.config; + self.maybe_load_tenant(req.tenant_id, &_tenant_lock).await?; + self.set_tenant_config_and_reconcile(req.tenant_id, req.config) + .await + } + + async fn set_tenant_config_and_reconcile( + &self, + tenant_id: TenantId, + config: TenantConfig, + ) -> Result<(), ApiError> { self.persistence .update_tenant_shard( - TenantFilter::Tenant(req.tenant_id), + TenantFilter::Tenant(tenant_id), None, Some(config.clone()), None, @@ -2630,7 +2962,9 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { shard.config = config.clone(); - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { waiters.push(waiter); } } @@ -2663,7 +2997,7 @@ impl Service { None => { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant not found").into(), - )) + )); } } }; @@ -2730,7 +3064,9 @@ impl Service { }) .find(|(_, _, mode)| *mode != LocationConfigMode::Detached); if let Some((node_id, _observed_location, mode)) = maybe_attached { - return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}"))); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}" + ))); } } let scheduler = &mut locked.scheduler; @@ -2766,7 +3102,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); @@ -2827,7 +3163,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); futs.push(async move { let result = client @@ -2896,6 +3232,8 @@ impl Service { let _tenant_lock = trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; + self.maybe_load_tenant(tenant_id, &_tenant_lock).await?; + // Detach all shards. This also deletes local pageserver shard data. 
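// --- Illustrative sketch (not part of the patch): tenant_config_patch above peeks the first
// shard's config as the patch base and then requires `all_equal()` across shards. A minimal
// standalone version of that check; `ConfigSketch` is a hypothetical stand-in, and the
// itertools crate is assumed (the surrounding code already uses it, e.g. for group_by).
use itertools::Itertools;

#[derive(Clone, Debug, PartialEq)]
struct ConfigSketch {
    compaction_period_secs: u64,
}

fn base_config(shard_configs: &[ConfigSketch]) -> Result<ConfigSketch, String> {
    let mut iter = shard_configs.iter().peekable();
    let first = match iter.peek() {
        Some(first) => (*first).clone(),
        None => return Err("tenant not found".to_string()),
    };
    // all_equal() consumes the iterator (and is true for zero/one elements), so the
    // peek above has to happen first.
    if !iter.all_equal() {
        return Err("shard configs are mismatched".to_string());
    }
    Ok(first)
}

fn main() {
    let same = vec![ConfigSketch { compaction_period_secs: 20 }; 3];
    assert!(base_config(&same).is_ok());

    let mut mixed = same;
    mixed[2].compaction_period_secs = 60;
    assert!(base_config(&mixed).is_err());
}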
let (detach_waiters, node) = { let mut detach_waiters = Vec::new(); @@ -2910,7 +3248,9 @@ impl Service { debug_assert!(shard.intent.get_attached().is_none()); debug_assert!(shard.intent.get_secondary().is_empty()); - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { detach_waiters.push(waiter); } } @@ -2946,7 +3286,7 @@ impl Service { .tenant_delete(TenantShardId::unsharded(tenant_id)) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 3, RECONCILE_TIMEOUT, @@ -3015,6 +3355,8 @@ impl Service { ) .await; + self.maybe_load_tenant(tenant_id, &_tenant_lock).await?; + failpoint_support::sleep_millis_async!("tenant-update-policy-exclusive-lock"); let TenantPolicyRequest { @@ -3060,7 +3402,7 @@ impl Service { // In case scheduling is being switched back on, try it now. shard.schedule(scheduler, &mut schedule_context).ok(); - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); } Ok(()) @@ -3163,7 +3505,7 @@ impl Service { let timeline_info = create_one( shard_zero_tid, shard_zero_locations, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), create_req.clone(), ) .await?; @@ -3179,7 +3521,7 @@ impl Service { // Create timeline on remaining shards with number >0 if !targets.0.is_empty() { // If we had multiple shards, issue requests for the remainder now. - let jwt = &self.config.jwt_token; + let jwt = &self.config.pageserver_jwt_token; self.tenant_for_shards( targets .0 @@ -3262,7 +3604,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), req.clone(), )) }) @@ -3343,7 +3685,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), )) }) .await?; @@ -3417,7 +3759,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), dir, )) }) @@ -3427,6 +3769,61 @@ impl Service { Ok(()) } + pub(crate) async fn tenant_timeline_download_heatmap_layers( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + concurrency: Option, + ) -> Result<(), ApiError> { + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_shard_id.tenant_id, + TenantOperations::DownloadHeatmapLayers, + ) + .await; + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + // If the request got an unsharded tenant id, then apply + // the operation to all shards. Otherwise, apply it to a specific shard. 
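// --- Illustrative sketch (not part of the patch): the heatmap-layers handler just below
// chooses between `TenantShardId::tenant_range(..)` (all shards of the tenant, when the id is
// unsharded) and `tenant_shard_id.range()` (one shard), then walks a BTreeMap range. The same
// idea with a simplified (tenant id, shard number) key instead of the real TenantShardId type.
use std::collections::BTreeMap;
use std::ops::RangeInclusive;

type Key = (u64, u8); // (tenant id, shard number) -- hypothetical stand-in

fn target_range(tenant_id: u64, shard: Option<u8>) -> RangeInclusive<Key> {
    match shard {
        // A specific shard: a one-element range.
        Some(n) => (tenant_id, n)..=(tenant_id, n),
        // "Unsharded" request: every shard of the tenant.
        None => (tenant_id, 0)..=(tenant_id, u8::MAX),
    }
}

fn main() {
    let mut shards: BTreeMap<Key, &str> = BTreeMap::new();
    shards.insert((1, 0), "shard 0");
    shards.insert((1, 1), "shard 1");
    shards.insert((2, 0), "other tenant");

    let all: Vec<_> = shards.range(target_range(1, None)).map(|(_, v)| *v).collect();
    assert_eq!(all, vec!["shard 0", "shard 1"]);

    let one: Vec<_> = shards.range(target_range(1, Some(1))).map(|(_, v)| *v).collect();
    assert_eq!(one, vec!["shard 1"]);
}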
+ let shards_range = if tenant_shard_id.is_unsharded() { + TenantShardId::tenant_range(tenant_shard_id.tenant_id) + } else { + tenant_shard_id.range() + }; + + for (tenant_shard_id, shard) in locked.tenants.range(shards_range) { + if let Some(node_id) = shard.intent.get_attached() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + } + targets + }; + + self.tenant_for_shards_api( + targets, + |tenant_shard_id, client| async move { + client + .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + Ok(()) + } + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// /// On success, the returned vector contains exactly the same number of elements as the input `locations`. @@ -3477,7 +3874,7 @@ impl Service { futs.push(async move { node.with_client_retries( |client| op(tenant_shard_id, client), - &self.config.jwt_token, + &self.config.pageserver_jwt_token, warn_threshold, max_retries, timeout, @@ -3523,6 +3920,11 @@ impl Service { .iter() .any(|i| i.generation.is_none() || i.generation_pageserver.is_none()) { + let shard_generations = generations + .into_iter() + .map(|i| (i.tenant_shard_id, (i.generation, i.generation_pageserver))) + .collect::>(); + // One or more shards has not been attached to a pageserver. Check if this is because it's configured // to be detached (409: caller should give up), or because it's meant to be attached but isn't yet (503: caller should retry) let locked = self.inner.read().unwrap(); @@ -3533,6 +3935,34 @@ impl Service { PlacementPolicy::Attached(_) => { // This shard is meant to be attached: the caller is not wrong to try and // use this function, but we can't service the request right now. + let Some(generation) = shard_generations.get(shard_id) else { + // This can only happen if there is a split brain controller modifying the database. This should + // never happen when testing, and if it happens in production we can only log the issue. + debug_assert!(false); + tracing::error!( + "Shard {shard_id} not found in generation state! Is another rogue controller running?" + ); + continue; + }; + let (generation, generation_pageserver) = generation; + if let Some(generation) = generation { + if generation_pageserver.is_none() { + // This is legitimate only in a very narrow window where the shard was only just configured into + // Attached mode after being created in Secondary or Detached mode, and it has had its generation + // set but not yet had a Reconciler run (reconciler is the only thing that sets generation_pageserver). + tracing::warn!( + "Shard {shard_id} generation is set ({generation:?}) but generation_pageserver is None, reconciler not run yet?" + ); + } + } else { + // This should never happen: a shard with no generation is only permitted when it was created in some state + // other than PlacementPolicy::Attached (and generation is always written to DB before setting Attached in memory) + debug_assert!(false); + tracing::error!( + "Shard {shard_id} generation is None, but it is in PlacementPolicy::Attached mode!" 
+ ); + continue; + } } PlacementPolicy::Secondary | PlacementPolicy::Detached => { return Err(ApiError::Conflict(format!( @@ -3699,7 +4129,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), )) }) .await?; @@ -3721,7 +4151,7 @@ impl Service { shard_zero_tid, timeline_id, shard_zero_locations.latest.node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), ) .await?; Ok(shard_zero_status) @@ -3928,17 +4358,42 @@ impl Service { .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into())) } - pub(crate) fn tenant_list(&self) -> Vec { + /// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not + /// avoid traversing data, it just avoid returning it. This is suitable for our purposes, since our in memory + /// maps are small enough to traverse fast, our pagination is just to avoid serializing huge JSON responses + /// in our external API. + pub(crate) fn tenant_list( + &self, + limit: Option, + start_after: Option, + ) -> Vec { let locked = self.inner.read().unwrap(); + // Apply start_from parameter + let shard_range = match start_after { + None => locked.tenants.range(..), + Some(tenant_id) => locked.tenants.range( + TenantShardId { + tenant_id, + shard_number: ShardNumber(u8::MAX), + shard_count: ShardCount(u8::MAX), + }.., + ), + }; + let mut result = Vec::new(); - for (_tenant_id, tenant_shards) in - &locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id) - { + for (_tenant_id, tenant_shards) in &shard_range.group_by(|(id, _shard)| id.tenant_id) { result.push( self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v)) .expect("Groups are always non-empty"), ); + + // Enforce `limit` parameter + if let Some(limit) = limit { + if result.len() >= limit { + break; + } + } } result @@ -4033,6 +4488,26 @@ impl Service { } tracing::info!("Restoring parent shard {tenant_shard_id}"); + + // Drop any intents that refer to unavailable nodes, to enable this abort to proceed even + // if the original attachment location is offline. + if let Some(node_id) = shard.intent.get_attached() { + if !nodes.get(node_id).unwrap().is_available() { + tracing::info!( + "Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}" + ); + shard.intent.demote_attached(scheduler, *node_id); + } + } + for node_id in shard.intent.get_secondary().clone() { + if !nodes.get(&node_id).unwrap().is_available() { + tracing::info!( + "Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}" + ); + shard.intent.remove_secondary(scheduler, node_id); + } + } + shard.splitting = SplitState::Idle; if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) { // If this shard can't be scheduled now (perhaps due to offline nodes or @@ -4041,7 +4516,7 @@ impl Service { tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}") } - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High); } // We don't expect any new_shard_count shards to exist here, but drop them just in case @@ -4056,7 +4531,9 @@ impl Service { // rely on the reconciliation that happens when a node transitions to Active to clean up. Since we have // removed child shards from our in-memory state and database, the reconciliation will implicitly remove // them from the node. - tracing::warn!("Node {node} unavailable, can't clean up during split abort. 
It will be cleaned up when it is reactivated."); + tracing::warn!( + "Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated." + ); continue; } @@ -4079,7 +4556,7 @@ impl Service { client.location_config(child_id, config, None, false).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 10, Duration::from_secs(5), @@ -4184,16 +4661,15 @@ impl Service { }, ); - let mut child_state = TenantShard::new(child, child_shard, policy.clone()); - child_state.intent = IntentState::single(scheduler, Some(pageserver)); + let mut child_state = + TenantShard::new(child, child_shard, policy.clone(), preferred_az.clone()); + child_state.intent = + IntentState::single(scheduler, Some(pageserver), preferred_az.clone()); child_state.observed = ObservedState { locations: child_observed, }; child_state.generation = Some(generation); child_state.config = config.clone(); - if let Some(preferred_az) = &preferred_az { - child_state.set_preferred_az(preferred_az.clone()); - } // The child's TenantShard::splitting is intentionally left at the default value of Idle, // as at this point in the split process we have succeeded and this part is infallible: @@ -4208,7 +4684,11 @@ impl Service { tracing::warn!("Failed to schedule child shard {child}: {e}"); } // In the background, attach secondary locations for the new shards - if let Some(waiter) = self.maybe_reconcile_shard(&mut child_state, nodes) { + if let Some(waiter) = self.maybe_reconcile_shard( + &mut child_state, + nodes, + ReconcilerPriority::High, + ) { waiters.push(waiter); } @@ -4498,7 +4978,10 @@ impl Service { // applies the new stripe size to the children. let mut shard_ident = shard_ident.unwrap(); if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size { - return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size))); + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", + shard_ident.stripe_size + ))); } shard_ident.stripe_size = new_stripe_size; @@ -4573,7 +5056,9 @@ impl Service { shard.intent.clear_secondary(scheduler); // Run Reconciler to execute detach fo secondary locations. 
- if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + if let Some(waiter) = + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) + { waiters.push(waiter); } } @@ -4674,7 +5159,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); let response = client .tenant_shard_split( @@ -4740,11 +5225,22 @@ impl Service { for (child_id, child_ps, stripe_size) in child_locations { if let Err(e) = self .compute_hook - .notify(child_id, child_ps, stripe_size, &self.cancel) + .notify( + compute_hook::ShardUpdate { + tenant_shard_id: child_id, + node_id: child_ps, + stripe_size, + preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed), + }, + &self.cancel, + ) .await { - tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", - child_id, child_ps); + tracing::warn!( + "Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", + child_id, + child_ps + ); failed_notifications.push(child_id); } } @@ -4800,7 +5296,13 @@ impl Service { match shard.policy { PlacementPolicy::Attached(n) => { // If our new attached node was a secondary, it no longer should be. - shard.intent.remove_secondary(scheduler, migrate_req.node_id); + shard + .intent + .remove_secondary(scheduler, migrate_req.node_id); + + shard + .intent + .set_attached(scheduler, Some(migrate_req.node_id)); // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { @@ -4813,8 +5315,6 @@ impl Service { shard.intent.push_secondary(scheduler, old_attached); } } - - shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); } PlacementPolicy::Secondary => { shard.intent.clear(scheduler); @@ -4823,7 +5323,7 @@ impl Service { PlacementPolicy::Detached => { return Err(ApiError::BadRequest(anyhow::anyhow!( "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first" - ))) + ))); } } @@ -4831,7 +5331,77 @@ impl Service { shard.sequence = shard.sequence.next(); } - self.maybe_reconcile_shard(shard, nodes) + let reconciler_config = match migrate_req.migration_config { + Some(cfg) => (&cfg).into(), + None => ReconcilerConfig::new(ReconcilerPriority::High), + }; + + self.maybe_configured_reconcile_shard(shard, nodes, reconciler_config) + }; + + if let Some(waiter) = waiter { + waiter.wait_timeout(RECONCILE_TIMEOUT).await?; + } else { + tracing::info!("Migration is a no-op"); + } + + Ok(TenantShardMigrateResponse {}) + } + + pub(crate) async fn tenant_shard_migrate_secondary( + &self, + tenant_shard_id: TenantShardId, + migrate_req: TenantShardMigrateRequest, + ) -> Result { + let waiter = { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let Some(node) = nodes.get(&migrate_req.node_id) else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Node {} not found", + migrate_req.node_id + ))); + }; + + if !node.is_available() { + // Warn but proceed: the caller may intend to manually adjust the placement of + // a shard even if the node is down, e.g. if intervening during an incident. 
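// --- Illustrative sketch (not part of the patch): the compute-hook notification above passes
// `preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed)`, so this call site lends its AZ
// value without cloning while the same field can also carry an owned value elsewhere. A minimal
// standalone example; `AzId` and `UpdateSketch` are hypothetical stand-ins.
use std::borrow::Cow;

#[derive(Clone, Debug, PartialEq)]
struct AzId(String);

struct UpdateSketch<'a> {
    preferred_az: Option<Cow<'a, AzId>>,
}

fn describe(update: &UpdateSketch<'_>) -> String {
    match &update.preferred_az {
        Some(az) => format!("preferred AZ {}", az.0),
        None => "no AZ preference".to_string(),
    }
}

fn main() {
    let az = Some(AzId("eu-west-1a".to_string()));

    // Borrowed path: what the split-notification call site above does.
    let borrowed = UpdateSketch { preferred_az: az.as_ref().map(Cow::Borrowed) };
    assert_eq!(describe(&borrowed), "preferred AZ eu-west-1a");

    // Owned path: the same field type also accepts an owned value.
    let owned = UpdateSketch { preferred_az: az.clone().map(Cow::Owned) };
    assert_eq!(describe(&owned), "preferred AZ eu-west-1a");
}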
+ tracing::warn!("Migrating to unavailable node {node}"); + } + + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + if shard.intent.get_secondary().len() == 1 + && shard.intent.get_secondary()[0] == migrate_req.node_id + { + tracing::info!( + "Migrating secondary to {node}: intent is unchanged {:?}", + shard.intent + ); + } else if shard.intent.get_attached() == &Some(migrate_req.node_id) { + tracing::info!( + "Migrating secondary to {node}: already attached where we were asked to create a secondary" + ); + } else { + let old_secondaries = shard.intent.get_secondary().clone(); + for secondary in old_secondaries { + shard.intent.remove_secondary(scheduler, secondary); + } + + shard.intent.push_secondary(scheduler, migrate_req.node_id); + shard.sequence = shard.sequence.next(); + tracing::info!( + "Migrating secondary to {node}: new intent {:?}", + shard.intent + ); + } + + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High) }; if let Some(waiter) = waiter { @@ -4924,7 +5494,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); let scan_result = client @@ -5044,7 +5614,8 @@ impl Service { expect_nodes.sort_by_key(|n| n.node_id); nodes.sort_by_key(|n| n.node_id); - if nodes != expect_nodes { + // Errors relating to nodes are deferred so that we don't skip the shard checks below if we have a node error + let node_result = if nodes != expect_nodes { tracing::error!("Consistency check failed on nodes."); tracing::error!( "Nodes in memory: {}", @@ -5056,17 +5627,31 @@ impl Service { serde_json::to_string(&nodes) .map_err(|e| ApiError::InternalServerError(e.into()))? ); - return Err(ApiError::InternalServerError(anyhow::anyhow!( + Err(ApiError::InternalServerError(anyhow::anyhow!( "Node consistency failure" - ))); - } + ))) + } else { + Ok(()) + }; + + let mut persistent_shards = self.persistence.load_active_tenant_shards().await?; + persistent_shards + .sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); - let mut shards = self.persistence.list_tenant_shards().await?; - shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); - if shards != expect_shards { + // Because JSON contents of persistent tenants might disagree with the fields in current `TenantConfig` + // definition, we will do an encode/decode cycle to ensure any legacy fields are dropped and any new + // fields are added, before doing a comparison. + for tsp in &mut persistent_shards { + let config: TenantConfig = serde_json::from_str(&tsp.config) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + tsp.config = serde_json::to_string(&config).expect("Encoding config is infallible"); + } + + if persistent_shards != expect_shards { tracing::error!("Consistency check failed on shards."); + tracing::error!( "Shards in memory: {}", serde_json::to_string(&expect_shards) @@ -5074,15 +5659,60 @@ impl Service { ); tracing::error!( "Shards in database: {}", - serde_json::to_string(&shards) + serde_json::to_string(&persistent_shards) .map_err(|e| ApiError::InternalServerError(e.into()))? ); + + // The total dump log lines above are useful in testing but in the field grafana will + // usually just drop them because they're so large. 
So we also do some explicit logging + // of just the diffs. + let persistent_shards = persistent_shards + .into_iter() + .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp)) + .collect::>(); + let expect_shards = expect_shards + .into_iter() + .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp)) + .collect::>(); + for (tenant_shard_id, persistent_tsp) in &persistent_shards { + match expect_shards.get(tenant_shard_id) { + None => { + tracing::error!( + "Shard {} found in database but not in memory", + tenant_shard_id + ); + } + Some(expect_tsp) => { + if expect_tsp != persistent_tsp { + tracing::error!( + "Shard {} is inconsistent. In memory: {}, database has: {}", + tenant_shard_id, + serde_json::to_string(expect_tsp).unwrap(), + serde_json::to_string(&persistent_tsp).unwrap() + ); + } + } + } + } + + // Having already logged any differences, log any shards that simply aren't present in the database + for (tenant_shard_id, memory_tsp) in &expect_shards { + if !persistent_shards.contains_key(tenant_shard_id) { + tracing::error!( + "Shard {} found in memory but not in database: {}", + tenant_shard_id, + serde_json::to_string(memory_tsp) + .map_err(|e| ApiError::InternalServerError(e.into()))? + ); + } + } + return Err(ApiError::InternalServerError(anyhow::anyhow!( "Shard consistency failure" ))); } - Ok(()) + node_result } /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that @@ -5183,7 +5813,7 @@ impl Service { ) } - self.maybe_reconcile_shard(shard, nodes); + self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal); } // Here we remove an existing observed location for the node we're removing, and it will @@ -5269,7 +5899,7 @@ impl Service { return Err(ApiError::InternalServerError(anyhow::anyhow!( "{} attached as primary+secondary on the same node", tid - ))) + ))); } (true, false) => Some(false), (false, true) => Some(true), @@ -5308,8 +5938,10 @@ impl Service { ) .await; + #[derive(PartialEq)] enum RegistrationStatus { - Matched, + UpToDate, + NeedUpdate, Mismatched, New, } @@ -5318,7 +5950,11 @@ impl Service { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { if node.registration_match(®ister_req) { - RegistrationStatus::Matched + if node.need_update(®ister_req) { + RegistrationStatus::NeedUpdate + } else { + RegistrationStatus::UpToDate + } } else { RegistrationStatus::Mismatched } @@ -5328,9 +5964,9 @@ impl Service { }; match registration_status { - RegistrationStatus::Matched => { + RegistrationStatus::UpToDate => { tracing::info!( - "Node {} re-registered with matching address", + "Node {} re-registered with matching address and is up to date", register_req.node_id ); @@ -5348,7 +5984,7 @@ impl Service { "Node is already registered with different address".to_string(), )); } - RegistrationStatus::New => { + RegistrationStatus::New | RegistrationStatus::NeedUpdate => { // fallthrough } } @@ -5377,6 +6013,16 @@ impl Service { )); } + if self.config.use_https_pageserver_api && register_req.listen_https_port.is_none() { + return Err(ApiError::PreconditionFailed( + format!( + "Node {} has no https port, but use_https is enabled", + register_req.node_id + ) + .into(), + )); + } + // Ordering: we must persist the new node _before_ adding it to in-memory state. // This ensures that before we use it for anything or expose it via any external // API, it is guaranteed to be available after a restart. 
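// --- Illustrative sketch (not part of the patch): the consistency check above round-trips each
// persisted config string through the current `TenantConfig` definition before comparing, so
// dropped legacy fields and newly added defaulted fields do not produce spurious mismatches.
// A minimal version of that normalization; `ConfigSketch` is hypothetical, serde/serde_json assumed.
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct ConfigSketch {
    #[serde(default)]
    compaction_threshold: u32,
    // Unknown (legacy) fields in the input are silently dropped on deserialize,
    // because this struct does not opt into `deny_unknown_fields`.
}

fn normalize(stored: &str) -> serde_json::Result<String> {
    let parsed: ConfigSketch = serde_json::from_str(stored)?;
    serde_json::to_string(&parsed)
}

fn main() -> serde_json::Result<()> {
    // A row written by an older version: has a legacy field, lacks the new one.
    let stored = r#"{"legacy_gc_horizon": 1024}"#;
    // What the in-memory state would serialize to today.
    let in_memory = r#"{"compaction_threshold":0}"#;

    assert_ne!(stored, in_memory); // naive string comparison: a false mismatch
    assert_eq!(normalize(stored)?, in_memory); // after the round trip: they agree
    Ok(())
}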
@@ -5384,13 +6030,29 @@ impl Service { register_req.node_id, register_req.listen_http_addr, register_req.listen_http_port, + register_req.listen_https_port, register_req.listen_pg_addr, register_req.listen_pg_port, - register_req.availability_zone_id, + register_req.availability_zone_id.clone(), + self.config.use_https_pageserver_api, ); + let new_node = match new_node { + Ok(new_node) => new_node, + Err(error) => return Err(ApiError::InternalServerError(error)), + }; - // TODO: idempotency if the node already exists in the database - self.persistence.insert_node(&new_node).await?; + match registration_status { + RegistrationStatus::New => self.persistence.insert_node(&new_node).await?, + RegistrationStatus::NeedUpdate => { + self.persistence + .update_node_on_registration( + register_req.node_id, + register_req.listen_https_port, + ) + .await? + } + _ => unreachable!("Other statuses have been processed earlier"), + } let mut locked = self.inner.write().unwrap(); let mut new_nodes = (*locked.nodes).clone(); @@ -5405,11 +6067,24 @@ impl Service { .storage_controller_pageserver_nodes .set(locked.nodes.len() as i64); - tracing::info!( - "Registered pageserver {}, now have {} pageservers", - register_req.node_id, - locked.nodes.len() - ); + match registration_status { + RegistrationStatus::New => { + tracing::info!( + "Registered pageserver {} ({}), now have {} pageservers", + register_req.node_id, + register_req.availability_zone_id, + locked.nodes.len() + ); + } + RegistrationStatus::NeedUpdate => { + tracing::info!( + "Re-registered and updated node {} ({})", + register_req.node_id, + register_req.availability_zone_id, + ); + } + _ => unreachable!("Other statuses have been processed earlier"), + } Ok(()) } @@ -5427,7 +6102,9 @@ impl Service { if let Some(scheduling) = scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before // applying them in memory - self.persistence.update_node(node_id, scheduling).await?; + self.persistence + .update_node_scheduling_policy(node_id, scheduling) + .await?; } // If we're activating a node, then before setting it active we must reconcile any shard locations @@ -5551,7 +6228,14 @@ impl Service { tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); } Ok(()) => { - if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() { + if self + .maybe_reconcile_shard( + tenant_shard, + nodes, + ReconcilerPriority::Normal, + ) + .is_some() + { tenants_affected += 1; }; } @@ -5582,7 +6266,11 @@ impl Service { if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { - self.maybe_reconcile_shard(tenant_shard, nodes); + self.maybe_reconcile_shard( + tenant_shard, + nodes, + ReconcilerPriority::Normal, + ); } } } @@ -5946,8 +6634,36 @@ impl Service { &self, shard: &mut TenantShard, nodes: &Arc>, + priority: ReconcilerPriority, ) -> Option { - self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default()) + self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::new(priority)) + } + + /// Before constructing a Reconciler, acquire semaphore units from the appropriate concurrency limit (depends on priority) + fn get_reconciler_units( + &self, + priority: ReconcilerPriority, + ) -> Result { + let units = match priority { + ReconcilerPriority::Normal => self.reconciler_concurrency.clone().try_acquire_owned(), + ReconcilerPriority::High => { + match self + .priority_reconciler_concurrency + 
.clone() + .try_acquire_owned() + { + Ok(u) => Ok(u), + Err(TryAcquireError::NoPermits) => { + // If the high priority semaphore is exhausted, then high priority tasks may steal units from + // the normal priority semaphore. + self.reconciler_concurrency.clone().try_acquire_owned() + } + Err(e) => Err(e), + } + } + }; + + units.map(ReconcileUnits::new) } /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], @@ -5959,16 +6675,17 @@ impl Service { ) -> Option { let reconcile_needed = shard.get_reconcile_needed(nodes); - match reconcile_needed { + let reconcile_reason = match reconcile_needed { ReconcileNeeded::No => return None, ReconcileNeeded::WaitExisting(waiter) => return Some(waiter), - ReconcileNeeded::Yes => { + ReconcileNeeded::Yes(reason) => { // Fall through to try and acquire units for spawning reconciler + reason } }; - let units = match self.reconciler_concurrency.clone().try_acquire_owned() { - Ok(u) => ReconcileUnits::new(u), + let units = match self.get_reconciler_units(reconciler_config.priority) { + Ok(u) => u, Err(_) => { tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(), "Concurrency limited: enqueued for reconcile later"); @@ -6002,6 +6719,7 @@ impl Service { }; shard.spawn_reconciler( + reconcile_reason, &self.result_tx, nodes, &self.compute_hook, @@ -6022,7 +6740,7 @@ impl Service { /// available. A return value of 0 indicates that everything is fully reconciled already. fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, _scheduler) = locked.parts_mut(); + let (nodes, tenants, scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); // This function is an efficient place to update lazy statistics, since we are walking @@ -6030,6 +6748,10 @@ impl Service { let mut pending_reconciles = 0; let mut az_violations = 0; + // If we find any tenants to drop from memory, stash them to offload after + // we're done traversing the map of tenants. + let mut drop_detached_tenants = Vec::new(); + let mut reconciles_spawned = 0; for shard in tenants.values_mut() { // Accumulate scheduling statistics @@ -6057,12 +6779,37 @@ impl Service { // Eventual consistency: if an earlier reconcile job failed, and the shard is still // dirty, spawn another rone - if self.maybe_reconcile_shard(shard, &pageservers).is_some() { + if self + .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal) + .is_some() + { reconciles_spawned += 1; } else if shard.delayed_reconcile { // Shard wanted to reconcile but for some reason couldn't. pending_reconciles += 1; } + + // If this tenant is detached, try dropping it from memory. This is usually done + // proactively in [`Self::process_results`], but we do it here to handle the edge + // case where a reconcile completes while someone else is holding an op lock for the tenant. 
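// --- Illustrative sketch (not part of the patch): get_reconciler_units above draws from a
// high-priority semaphore and, only on TryAcquireError::NoPermits, spills over into the normal
// pool, while normal-priority work can never consume high-priority permits. A minimal standalone
// version of that two-tier scheme; pool sizes are arbitrary, the tokio crate is assumed.
use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError};

#[derive(Clone, Copy)]
enum Priority {
    Normal,
    High,
}

fn try_get_units(
    normal: &Arc<Semaphore>,
    high: &Arc<Semaphore>,
    priority: Priority,
) -> Result<OwnedSemaphorePermit, TryAcquireError> {
    match priority {
        Priority::Normal => normal.clone().try_acquire_owned(),
        Priority::High => match high.clone().try_acquire_owned() {
            Ok(permit) => Ok(permit),
            // High-priority pool exhausted: steal a unit from the normal pool instead.
            Err(TryAcquireError::NoPermits) => normal.clone().try_acquire_owned(),
            Err(e) => Err(e),
        },
    }
}

fn main() {
    let normal = Arc::new(Semaphore::new(1));
    let high = Arc::new(Semaphore::new(1));

    let _h1 = try_get_units(&normal, &high, Priority::High).unwrap(); // served from `high`
    let _h2 = try_get_units(&normal, &high, Priority::High).unwrap(); // spills into `normal`
    assert!(try_get_units(&normal, &high, Priority::Normal).is_err()); // nothing left to take
}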
+ if shard.tenant_shard_id.shard_number == ShardNumber(0) + && shard.policy == PlacementPolicy::Detached + { + if let Some(guard) = self.tenant_op_locks.try_exclusive( + shard.tenant_shard_id.tenant_id, + TenantOperations::DropDetached, + ) { + drop_detached_tenants.push((shard.tenant_shard_id.tenant_id, guard)); + } + } + } + + // Some metrics are calculated from SchedulerNode state, update these periodically + scheduler.update_metrics(); + + // Process any deferred tenant drops + for (tenant_id, guard) in drop_detached_tenants { + self.maybe_drop_tenant(tenant_id, &mut locked, &guard); } metrics::METRICS_REGISTRY @@ -6097,7 +6844,7 @@ impl Service { // with the frequency of background calls, this acts as an implicit rate limit that runs a small // trickle of optimizations in the background, rather than executing a large number in parallel // when a change occurs. - const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 2; + const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 16; // Synchronous prepare: scan shards for possible scheduling optimizations let candidate_work = self.optimize_all_plan(); @@ -6118,9 +6865,13 @@ impl Service { // Shard was dropped between planning and execution; continue; }; + tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}"); if shard.apply_optimization(scheduler, optimization) { optimizations_applied += 1; - if self.maybe_reconcile_shard(shard, nodes).is_some() { + if self + .maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal) + .is_some() + { reconciles_spawned += 1; } } @@ -6144,11 +6895,17 @@ impl Service { // How many candidate optimizations we will generate, before evaluating them for readniess: setting // this higher than the execution limit gives us a chance to execute some work even if the first // few optimizations we find are not ready. - const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8; + const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 64; let mut work = Vec::new(); let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + // We are going to plan a bunch of optimisations before applying any of them, so the + // utilisation stats on nodes will be effectively stale for the >1st optimisation we + // generate. To avoid this causing unstable migrations/flapping, it's important that the + // code in TenantShard for finding optimisations uses [`NodeAttachmentSchedulingScore::disregard_utilization`] + // to ignore the utilisation component of the score. for (_tenant_id, schedule_context, shards) in TenantShardContextIterator::new(tenants, ScheduleMode::Speculative) @@ -6179,13 +6936,32 @@ impl Service { continue; } - // TODO: optimization calculations are relatively expensive: create some fast-path for - // the common idle case (avoiding the search on tenants that we have recently checked) + // Fast path: we may quickly identify shards that don't have any possible optimisations + if !shard.maybe_optimizable(scheduler, &schedule_context) { + if cfg!(feature = "testing") { + // Check that maybe_optimizable doesn't disagree with the actual optimization functions. + // Only do this in testing builds because it is not a correctness-critical check, so we shouldn't + // panic in prod if we hit this, or spend cycles on it in prod. 
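// --- Illustrative sketch (not part of the patch): the planner above adds a cheap
// `maybe_optimizable` pre-check and, in testing builds only, asserts that it never disagrees
// with the full optimize_attachment/optimize_secondary search. A toy version of that
// fast-path-plus-cross-check shape; it gates the assertion on `cfg!(debug_assertions)` as a
// stand-in, since the real `testing` feature flag is specific to this crate.
fn maybe_has_candidate(values: &[u32], threshold: u32) -> bool {
    // Cheap, conservative pre-check: may say "yes" unnecessarily, but must never say
    // "no" when find_candidate below would have found something.
    values.iter().copied().max().unwrap_or(0) > threshold
}

fn find_candidate(values: &[u32], threshold: u32) -> Option<usize> {
    // Stand-in for the expensive planning step.
    values.iter().position(|&v| v > threshold)
}

fn plan(values: &[u32], threshold: u32) -> Option<usize> {
    if !maybe_has_candidate(values, threshold) {
        // Testing-only cross-check that the fast path agrees with the full search;
        // skipped in release builds because it would defeat the optimisation.
        if cfg!(debug_assertions) {
            assert!(find_candidate(values, threshold).is_none());
        }
        return None;
    }
    find_candidate(values, threshold)
}

fn main() {
    assert_eq!(plan(&[1, 2, 3], 10), None); // fast path: nothing to do
    assert_eq!(plan(&[1, 20, 3], 10), Some(1)); // slow path runs and finds work
}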
+ assert!( + shard + .optimize_attachment(scheduler, &schedule_context) + .is_none() + ); + assert!( + shard + .optimize_secondary(scheduler, &schedule_context) + .is_none() + ); + } + continue; + } + if let Some(optimization) = - // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to + // If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to // its primary location based on soft constraints, cut it over. - shard.optimize_attachment(nodes, &schedule_context) + shard.optimize_attachment(scheduler, &schedule_context) { + tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for attachment: {optimization:?}"); work.push((shard.tenant_shard_id, optimization)); break; } else if let Some(optimization) = @@ -6195,6 +6971,7 @@ impl Service { // in the same tenant with secondary locations on the node where they originally split. shard.optimize_secondary(scheduler, &schedule_context) { + tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for secondary: {optimization:?}"); work.push((shard.tenant_shard_id, optimization)); break; } @@ -6230,7 +7007,9 @@ impl Service { } Some(node) => { if !node.is_available() { - tracing::info!("Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable"); + tracing::info!( + "Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable" + ); } else { // Accumulate optimizations that require fetching secondary status, so that we can execute these // remote API requests concurrently. @@ -6243,8 +7022,10 @@ impl Service { } } } - ScheduleOptimizationAction::ReplaceSecondary(_) => { - // No extra checks needed to replace a secondary: this does not interrupt client access + ScheduleOptimizationAction::ReplaceSecondary(_) + | ScheduleOptimizationAction::CreateSecondary(_) + | ScheduleOptimizationAction::RemoveSecondary(_) => { + // No extra checks needed to manage secondaries: this does not interrupt client access validated_work.push((tenant_shard_id, optimization)) } }; @@ -6274,7 +7055,9 @@ impl Service { { match secondary_status { Err(e) => { - tracing::info!("Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}"); + tracing::info!( + "Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}" + ); } Ok(progress) => { // We require secondary locations to have less than 10GiB of downloads pending before we will use @@ -6287,7 +7070,9 @@ impl Service { || progress.bytes_total - progress.bytes_downloaded > DOWNLOAD_FRESHNESS_THRESHOLD { - tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + tracing::info!( + "Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}" + ); #[cfg(feature = "testing")] if progress.heatmap_mtime.is_none() { @@ -6316,50 +7101,98 @@ impl Service { /// we have this helper to move things along faster. 
#[cfg(feature = "testing")] async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { - let (attached_node, secondary_node) = { + let (attached_node, secondaries) = { let locked = self.inner.read().unwrap(); let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + tracing::warn!( + "Skipping kick of secondary download for {tenant_shard_id}: not found" + ); return; }; - let (Some(attached), Some(secondary)) = ( - shard.intent.get_attached(), - shard.intent.get_secondary().first(), - ) else { + + let Some(attached) = shard.intent.get_attached() else { + tracing::warn!( + "Skipping kick of secondary download for {tenant_shard_id}: no attached" + ); return; }; - ( - locked.nodes.get(attached).unwrap().clone(), - locked.nodes.get(secondary).unwrap().clone(), - ) + + let secondaries = shard + .intent + .get_secondary() + .iter() + .map(|n| locked.nodes.get(n).unwrap().clone()) + .collect::>(); + + (locked.nodes.get(attached).unwrap().clone(), secondaries) }; // Make remote API calls to upload + download heatmaps: we ignore errors because this is just // a 'kick' to let scheduling optimisation run more promptly. - attached_node + match attached_node .with_client_retries( |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 10, SHORT_RECONCILE_TIMEOUT, &self.cancel, ) - .await; + .await + { + Some(Err(e)) => { + tracing::info!( + "Failed to upload heatmap from {attached_node} for {tenant_shard_id}: {e}" + ); + } + None => { + tracing::info!( + "Cancelled while uploading heatmap from {attached_node} for {tenant_shard_id}" + ); + } + Some(Ok(_)) => { + tracing::info!( + "Successfully uploaded heatmap from {attached_node} for {tenant_shard_id}" + ); + } + } - secondary_node - .with_client_retries( - |client| async move { - client - .tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1))) - .await - }, - &self.config.jwt_token, - 3, - 10, - SHORT_RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; + for secondary_node in secondaries { + match secondary_node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download( + tenant_shard_id, + Some(Duration::from_secs(1)), + ) + .await + }, + &self.config.pageserver_jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + Some(Err(e)) => { + tracing::info!( + "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}" + ); + } + None => { + tracing::info!( + "Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}" + ); + } + Some(Ok(progress)) => { + tracing::info!( + "Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}" + ); + } + } + } } /// Look for shards which are oversized and in need of splitting @@ -6394,7 +7227,7 @@ impl Service { let request = request_ref.clone(); client.top_tenant_shards(request.clone()).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 3, Duration::from_secs(5), @@ -6441,7 +7274,9 @@ impl Service { // We spawn a task to run this, so it's exactly like some external API client requesting it. We don't // want to block the background reconcile loop on this. 
- tracing::info!("Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}"); + tracing::info!( + "Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}" + ); let this = self.clone(); tokio::spawn( @@ -6567,7 +7402,7 @@ impl Service { match node .with_client_retries( |client| async move { client.tenant_secondary_status(tenant_shard_id).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 3, Duration::from_millis(250), @@ -6602,7 +7437,7 @@ impl Service { // to not stall the operation when a cold secondary is encountered. const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); - let reconciler_config = ReconcilerConfigBuilder::new() + let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) .build(); @@ -6728,7 +7563,7 @@ impl Service { } waiters = self - .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT) .await; failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel); @@ -6782,61 +7617,104 @@ impl Service { Ok(()) } - /// Create a node fill plan (pick secondaries to promote) that meets the following requirements: - /// 1. The node should be filled until it reaches the expected cluster average of - /// attached shards. If there are not enough secondaries on the node, the plan stops early. - /// 2. Select tenant shards to promote such that the number of attached shards is balanced - /// throughout the cluster. We achieve this by picking tenant shards from each node, - /// starting from the ones with the largest number of attached shards, until the node - /// reaches the expected cluster average. - /// 3. Avoid promoting more shards of the same tenant than required. The upper bound - /// for the number of tenants from the same shard promoted to the node being filled is: - /// shard count for the tenant divided by the number of nodes in the cluster. + /// Create a node fill plan (pick secondaries to promote), based on: + /// 1. Shards which have a secondary on this node, and this node is in their home AZ, and are currently attached to a node + /// outside their home AZ, should be migrated back here. + /// 2. If after step 1 we have not migrated enough shards for this node to have its fair share of + /// attached shards, we will promote more shards from the nodes with the most attached shards, unless + /// those shards have a home AZ that doesn't match the node we're filling. fn fill_node_plan(&self, node_id: NodeId) -> Vec { let mut locked = self.inner.write().unwrap(); - let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); + let (nodes, tenants, _scheduler) = locked.parts_mut(); - let mut tids_by_node = locked - .tenants - .iter_mut() - .filter_map(|(tid, tenant_shard)| { - if !matches!( - tenant_shard.get_scheduling_policy(), - ShardSchedulingPolicy::Active - ) { - // Only include tenants in fills if they have a normal (Active) scheduling policy. We - // even exclude Essential, because moving to fill a node is not essential to keeping this - // tenant available. 
- return None; - } + let node_az = nodes + .get(&node_id) + .expect("Node must exist") + .get_availability_zone_id() + .clone(); - if tenant_shard.intent.get_secondary().contains(&node_id) { + // The tenant shard IDs that we plan to promote from secondary to attached on this node + let mut plan = Vec::new(); + + // Collect shards which do not have a preferred AZ & are elegible for moving in stage 2 + let mut free_tids_by_node: HashMap> = HashMap::new(); + + // Don't respect AZ preferences if there is only one AZ. This comes up in tests, but it could + // conceivably come up in real life if deploying a single-AZ region intentionally. + let respect_azs = nodes + .values() + .map(|n| n.get_availability_zone_id()) + .unique() + .count() + > 1; + + // Step 1: collect all shards that we are required to migrate back to this node because their AZ preference + // requires it. + for (tsid, tenant_shard) in tenants { + if !tenant_shard.intent.get_secondary().contains(&node_id) { + // Shard doesn't have a secondary on this node, ignore it. + continue; + } + + // AZ check: when filling nodes after a restart, our intent is to move _back_ the + // shards which belong on this node, not to promote shards whose scheduling preference + // would be on their currently attached node. So will avoid promoting shards whose + // home AZ doesn't match the AZ of the node we're filling. + match tenant_shard.preferred_az() { + _ if !respect_azs => { if let Some(primary) = tenant_shard.intent.get_attached() { - return Some((*primary, *tid)); + free_tids_by_node.entry(*primary).or_default().push(*tsid); } } + None => { + // Shard doesn't have an AZ preference: it is elegible to be moved, but we + // will only do so if our target shard count requires it. + if let Some(primary) = tenant_shard.intent.get_attached() { + free_tids_by_node.entry(*primary).or_default().push(*tsid); + } + } + Some(az) if az == &node_az => { + // This shard's home AZ is equal to the node we're filling: it should + // be moved back to this node as part of filling, unless its currently + // attached location is also in its home AZ. + if let Some(primary) = tenant_shard.intent.get_attached() { + if nodes + .get(primary) + .expect("referenced node must exist") + .get_availability_zone_id() + != tenant_shard + .preferred_az() + .expect("tenant must have an AZ preference") + { + plan.push(*tsid) + } + } else { + plan.push(*tsid) + } + } + Some(_) => { + // This shard's home AZ is somewhere other than the node we're filling, + // it may not be moved back to this node as part of filling. 
Ignore it + } + } + } - None - }) - .into_group_map(); + // Step 2: also promote any AZ-agnostic shards as required to achieve the target number of attachments + let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); let expected_attached = locked.scheduler.expected_attached_shard_count(); let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count(); let mut promoted_per_tenant: HashMap = HashMap::new(); - let mut plan = Vec::new(); for (node_id, attached) in nodes_by_load { - let available = locked - .nodes - .get(&node_id) - .map_or(false, |n| n.is_available()); + let available = locked.nodes.get(&node_id).is_some_and(|n| n.is_available()); if !available { continue; } if plan.len() >= fill_requirement - || tids_by_node.is_empty() + || free_tids_by_node.is_empty() || attached <= expected_attached { break; @@ -6848,7 +7726,7 @@ impl Service { let mut remove_node = false; while take > 0 { - match tids_by_node.get_mut(&node_id) { + match free_tids_by_node.get_mut(&node_id) { Some(tids) => match tids.pop() { Some(tid) => { let max_promote_for_tenant = std::cmp::max( @@ -6874,7 +7752,7 @@ impl Service { } if remove_node { - tids_by_node.remove(&node_id); + free_tids_by_node.remove(&node_id); } } @@ -6892,7 +7770,7 @@ impl Service { ) -> Result<(), OperationError> { const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); - let reconciler_config = ReconcilerConfigBuilder::new() + let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) .build(); @@ -6981,7 +7859,7 @@ impl Service { } waiters = self - .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT) .await; } @@ -7113,18 +7991,80 @@ impl Service { global_observed } + pub(crate) async fn safekeepers_list( + &self, + ) -> Result, DatabaseError> { + let locked = self.inner.read().unwrap(); + let mut list = locked + .safekeepers + .iter() + .map(|sk| sk.1.describe_response()) + .collect::, _>>()?; + list.sort_by_key(|v| v.id); + Ok(list) + } + pub(crate) async fn get_safekeeper( &self, id: i64, - ) -> Result { - self.persistence.safekeeper_get(id).await + ) -> Result { + let locked = self.inner.read().unwrap(); + let sk = locked + .safekeepers + .get(&NodeId(id as u64)) + .ok_or(diesel::result::Error::NotFound)?; + sk.describe_response() } pub(crate) async fn upsert_safekeeper( &self, - record: crate::persistence::SafekeeperPersistence, + record: crate::persistence::SafekeeperUpsert, ) -> Result<(), DatabaseError> { - self.persistence.safekeeper_upsert(record).await + let node_id = NodeId(record.id as u64); + self.persistence.safekeeper_upsert(record.clone()).await?; + { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + match safekeepers.entry(node_id) { + std::collections::hash_map::Entry::Occupied(mut entry) => { + entry.get_mut().update_from_record(record); + } + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert(Safekeeper::from_persistence( + crate::persistence::SafekeeperPersistence::from_upsert( + record, + SkSchedulingPolicy::Pause, + ), + CancellationToken::new(), + )); + } + } + locked.safekeepers = Arc::new(safekeepers); + } + Ok(()) + } + + pub(crate) async fn set_safekeeper_scheduling_policy( + &self, + id: i64, + 
scheduling_policy: SkSchedulingPolicy, + ) -> Result<(), DatabaseError> { + self.persistence + .set_safekeeper_scheduling_policy(id, scheduling_policy) + .await?; + let node_id = NodeId(id as u64); + // After the change has been persisted successfully, update the in-memory state + { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + let sk = safekeepers + .get_mut(&node_id) + .ok_or(DatabaseError::Logical("Not found".to_string()))?; + sk.set_scheduling_policy(scheduling_policy); + + locked.safekeepers = Arc::new(safekeepers); + } + Ok(()) } pub(crate) async fn update_shards_preferred_azs( diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 0e551beaa7..2ff68d7037 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -1,38 +1,152 @@ -use std::{sync::Arc, time::Duration}; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; +use std::time::Duration; use pageserver_api::controller_api::ShardSchedulingPolicy; use rand::seq::SliceRandom; use rand::thread_rng; use tokio_util::sync::CancellationToken; +use utils::id::NodeId; +use utils::shard::TenantShardId; -use super::Service; +use super::{Node, Scheduler, Service, TenantShard}; pub struct ChaosInjector { service: Arc, interval: Duration, + chaos_exit_crontab: Option, +} + +fn cron_to_next_duration(cron: &cron::Schedule) -> anyhow::Result { + use chrono::Utc; + let next = cron.upcoming(Utc).next().unwrap(); + let duration = (next - Utc::now()).to_std()?; + Ok(tokio::time::sleep(duration)) +} + +async fn maybe_sleep(sleep: Option) -> Option<()> { + if let Some(sleep) = sleep { + sleep.await; + Some(()) + } else { + None + } } impl ChaosInjector { - pub fn new(service: Arc, interval: Duration) -> Self { - Self { service, interval } + pub fn new( + service: Arc, + interval: Duration, + chaos_exit_crontab: Option, + ) -> Self { + Self { + service, + interval, + chaos_exit_crontab, + } } pub async fn run(&mut self, cancel: CancellationToken) { let mut interval = tokio::time::interval(self.interval); - - loop { - tokio::select! { - _ = interval.tick() => {} - _ = cancel.cancelled() => { - tracing::info!("Shutting down"); - return; + let cron_interval = { + if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab { + match cron_to_next_duration(chaos_exit_crontab) { + Ok(interval_exit) => Some(interval_exit), + Err(e) => { + tracing::error!("Error processing the cron schedule: {e}"); + None + } } + } else { + None } - - self.inject_chaos().await; - - tracing::info!("Chaos iteration..."); + }; + enum ChaosEvent { + ShuffleTenant, + ForceKill, } + let chaos_type = tokio::select! { + _ = interval.tick() => { + ChaosEvent::ShuffleTenant + } + Some(_) = maybe_sleep(cron_interval) => { + ChaosEvent::ForceKill + } + _ = cancel.cancelled() => { + tracing::info!("Shutting down"); + return; + } + }; + + match chaos_type { + ChaosEvent::ShuffleTenant => { + self.inject_chaos().await; + } + ChaosEvent::ForceKill => { + self.force_kill().await; + } + } + + tracing::info!("Chaos iteration..."); + } + + /// If a shard has a secondary and attached location, then re-assign the secondary to be + /// attached and the attached to be secondary. + /// + /// Only modifies tenants if they're in Active scheduling policy. 
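The hunk above gives the chaos injector an optional crontab-driven exit, converting the next firing of the schedule into a one-shot async sleep. A minimal sketch of that conversion, assuming the `cron`, `chrono`, `tokio`, and `anyhow` crates as in the diff; the `sleep_until_next` name and the example expression are illustrative only, not part of the controller:

    use std::str::FromStr;
    use chrono::Utc;

    // Turn a crontab expression into a sleep that completes at its next firing.
    // Same shape as `cron_to_next_duration` in the hunk above, with the parse folded in.
    fn sleep_until_next(expr: &str) -> anyhow::Result<tokio::time::Sleep> {
        let schedule = cron::Schedule::from_str(expr)
            .map_err(|e| anyhow::anyhow!("bad cron expression {expr:?}: {e}"))?;
        let next = schedule
            .upcoming(Utc)
            .next()
            .ok_or_else(|| anyhow::anyhow!("schedule has no upcoming firing"))?;
        // chrono delta -> std::time::Duration; errors if the delta is negative.
        let wait = (next - Utc::now()).to_std()?;
        Ok(tokio::time::sleep(wait))
    }

    #[tokio::main]
    async fn main() -> anyhow::Result<()> {
        // The `cron` crate includes a seconds field, so this fires daily at 03:00:00 UTC.
        sleep_until_next("0 0 3 * * *")?.await;
        println!("chaos exit point reached");
        Ok(())
    }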
+ fn maybe_migrate_to_secondary( + &self, + tenant_shard_id: TenantShardId, + nodes: &Arc>, + tenants: &mut BTreeMap, + scheduler: &mut Scheduler, + ) { + let shard = tenants + .get_mut(&tenant_shard_id) + .expect("Held lock between choosing ID and this get"); + + if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) { + // Skip non-active scheduling policies, so that a shard with a policy like Pause can + // be pinned without being disrupted by us. + tracing::info!( + "Skipping shard {tenant_shard_id}: scheduling policy is {:?}", + shard.get_scheduling_policy() + ); + return; + } + + // Pick a secondary to promote + let Some(new_location) = shard + .intent + .get_secondary() + .choose(&mut thread_rng()) + .cloned() + else { + tracing::info!( + "Skipping shard {tenant_shard_id}: no secondary location, can't migrate" + ); + return; + }; + + let Some(old_location) = *shard.intent.get_attached() else { + tracing::info!("Skipping shard {tenant_shard_id}: currently has no attached location"); + return; + }; + + tracing::info!("Injecting chaos: migrate {tenant_shard_id} {old_location}->{new_location}"); + + shard.intent.demote_attached(scheduler, old_location); + shard.intent.promote_attached(scheduler, new_location); + self.service.maybe_reconcile_shard( + shard, + nodes, + crate::reconciler::ReconcilerPriority::Normal, + ); + } + + async fn force_kill(&mut self) { + tracing::warn!("Injecting chaos: force kill"); + std::process::exit(1); } async fn inject_chaos(&mut self) { @@ -40,45 +154,47 @@ impl ChaosInjector { let batch_size = 128; let mut inner = self.service.inner.write().unwrap(); let (nodes, tenants, scheduler) = inner.parts_mut(); - let tenant_ids = tenants.keys().cloned().collect::>(); - let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size); + + // Prefer to migrate tenants that are currently outside their home AZ. This avoids the chaos injector + // continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some + // random tenants to move, and then on next chaos iteration moving them back, then picking some new + // random tenants on the next iteration. 
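The comment above describes the victim-selection strategy: prefer shards that are currently attached outside their home AZ, so chaos tends to cycle shards back home rather than push them further away, and only pad the batch with randomly chosen well-placed shards. A condensed, unit-testable restatement of that strategy as a pure function, using plain `u64` ids and a boolean in place of the controller's real types:

    use rand::seq::SliceRandom;
    use rand::thread_rng;

    /// `shards` pairs an id with "is currently attached outside its home AZ".
    fn pick_victims(shards: &[(u64, bool)], batch_size: usize) -> Vec<u64> {
        let (outside, inside): (Vec<(u64, bool)>, Vec<(u64, bool)>) = shards
            .iter()
            .copied()
            .partition(|&(_, outside_home)| outside_home);

        let mut outside: Vec<u64> = outside.into_iter().map(|(id, _)| id).collect();
        let mut inside: Vec<u64> = inside.into_iter().map(|(id, _)| id).collect();

        let mut victims = Vec::with_capacity(batch_size);
        if outside.len() >= batch_size {
            // Plenty of misplaced shards: move a random batch of them back home.
            outside.shuffle(&mut thread_rng());
            victims.extend(outside.into_iter().take(batch_size));
        } else {
            // Take every misplaced shard, then top up with random well-placed ones.
            victims.extend(outside);
            inside.shuffle(&mut thread_rng());
            victims.extend(inside.into_iter().take(batch_size - victims.len()));
        }
        victims
    }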
+ let (out_of_home_az, in_home_az): (Vec<_>, Vec<_>) = tenants + .values() + .map(|shard| { + ( + shard.tenant_shard_id, + shard.is_attached_outside_preferred_az(nodes), + ) + }) + .partition(|(_id, is_outside)| *is_outside); + + let mut out_of_home_az: Vec<_> = out_of_home_az.into_iter().map(|(id, _)| id).collect(); + let mut in_home_az: Vec<_> = in_home_az.into_iter().map(|(id, _)| id).collect(); + + let mut victims = Vec::with_capacity(batch_size); + if out_of_home_az.len() >= batch_size { + tracing::info!( + "Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", + out_of_home_az.len() + ); + + out_of_home_az.shuffle(&mut thread_rng()); + victims.extend(out_of_home_az.into_iter().take(batch_size)); + } else { + tracing::info!( + "Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", + out_of_home_az.len(), + std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len()) + ); + + victims.extend(out_of_home_az); + in_home_az.shuffle(&mut thread_rng()); + victims.extend(in_home_az.into_iter().take(batch_size - victims.len())); + } for victim in victims { - let shard = tenants - .get_mut(victim) - .expect("Held lock between choosing ID and this get"); - - if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) { - // Skip non-active scheduling policies, so that a shard with a policy like Pause can - // be pinned without being disrupted by us. - tracing::info!( - "Skipping shard {victim}: scheduling policy is {:?}", - shard.get_scheduling_policy() - ); - continue; - } - - // Pick a secondary to promote - let Some(new_location) = shard - .intent - .get_secondary() - .choose(&mut thread_rng()) - .cloned() - else { - tracing::info!("Skipping shard {victim}: no secondary location, can't migrate"); - continue; - }; - - let Some(old_location) = *shard.intent.get_attached() else { - tracing::info!("Skipping shard {victim}: currently has no attached location"); - continue; - }; - - tracing::info!("Injecting chaos: migrate {victim} {old_location}->{new_location}"); - - shard.intent.demote_attached(scheduler, old_location); - shard.intent.promote_attached(scheduler, new_location); - self.service.maybe_reconcile_shard(shard, nodes); + self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler); } } } diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs index d38010a27e..c4784e5e36 100644 --- a/storage_controller/src/service/context_iterator.rs +++ b/storage_controller/src/service/context_iterator.rs @@ -43,9 +43,6 @@ impl<'a> Iterator for TenantShardContextIterator<'a> { // Accumulate the schedule context for all the shards in a tenant schedule_context.avoid(&shard.intent.all_pageservers()); - if let Some(attached) = shard.intent.get_attached() { - schedule_context.push_attached(*attached); - } tenant_shards.push(shard); if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 { @@ -57,17 +54,16 @@ impl<'a> Iterator for TenantShardContextIterator<'a> { #[cfg(test)] mod tests { - use std::{collections::BTreeMap, str::FromStr}; + use std::collections::BTreeMap; + use std::str::FromStr; use pageserver_api::controller_api::PlacementPolicy; use utils::shard::{ShardCount, ShardNumber}; - use crate::{ - scheduler::test_utils::make_test_nodes, service::Scheduler, - tenant_shard::tests::make_test_tenant_with_id, - }; - use super::*; + use crate::scheduler::test_utils::make_test_nodes; + use 
crate::service::Scheduler; + use crate::tenant_shard::tests::make_test_tenant_with_id; #[test] fn test_context_iterator() { @@ -115,7 +111,7 @@ mod tests { assert_eq!(tenant_id, t1_id); assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); assert_eq!(shards.len(), 1); - assert_eq!(context.attach_count(), 1); + assert_eq!(context.location_count(), 2); let (tenant_id, context, shards) = iter.next().unwrap(); assert_eq!(tenant_id, t2_id); @@ -124,13 +120,13 @@ mod tests { assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2)); assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3)); assert_eq!(shards.len(), 4); - assert_eq!(context.attach_count(), 4); + assert_eq!(context.location_count(), 8); let (tenant_id, context, shards) = iter.next().unwrap(); assert_eq!(tenant_id, t3_id); assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); assert_eq!(shards.len(), 1); - assert_eq!(context.attach_count(), 1); + assert_eq!(context.location_count(), 2); for shard in tenants.values_mut() { shard.intent.clear(&mut scheduler); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 2eb98ee825..34fd244023 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1,51 +1,39 @@ -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, - time::Duration, -}; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::Duration; -use crate::{ - metrics::{ - self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome, - }, - persistence::TenantShardPersistence, - reconciler::{ReconcileUnits, ReconcilerConfig}, - scheduler::{ - AffinityScore, AttachedShardTag, MaySchedule, RefCountUpdate, ScheduleContext, - SecondaryShardTag, - }, - service::ReconcileResultRequest, -}; use futures::future::{self, Either}; use itertools::Itertools; -use pageserver_api::controller_api::{ - AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, -}; -use pageserver_api::{ - models::{LocationConfig, LocationConfigMode, TenantConfig}, - shard::{ShardIdentity, TenantShardId}, -}; +use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy, ShardSchedulingPolicy}; +use pageserver_api::models::{LocationConfig, LocationConfigMode, TenantConfig}; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use serde::{Deserialize, Serialize}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; -use tracing::{instrument, Instrument}; -use utils::{ - generation::Generation, - id::NodeId, - seqwait::{SeqWait, SeqWaitError}, - sync::gate::GateGuard, -}; +use tracing::{Instrument, instrument}; +use utils::generation::Generation; +use utils::id::NodeId; +use utils::seqwait::{SeqWait, SeqWaitError}; +use utils::shard::ShardCount; +use utils::sync::gate::GateGuard; -use crate::{ - compute_hook::ComputeHook, - node::Node, - persistence::{split_state::SplitState, Persistence}, - reconciler::{ - attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState, - }, - scheduler::{ScheduleError, Scheduler}, - service, Sequence, +use crate::compute_hook::ComputeHook; +use crate::metrics::{ + self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome, }; +use crate::node::Node; +use crate::persistence::split_state::SplitState; +use crate::persistence::{Persistence, TenantShardPersistence}; +use crate::reconciler::{ + ReconcileError, ReconcileUnits, Reconciler, 
ReconcilerConfig, TargetState, + attached_location_conf, secondary_location_conf, +}; +use crate::scheduler::{ + AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore, + RefCountUpdate, ScheduleContext, ScheduleError, Scheduler, SecondaryShardTag, ShardTag, +}; +use crate::service::ReconcileResultRequest; +use crate::{Sequence, service}; /// Serialization helper fn read_last_error(v: &std::sync::Mutex>, serializer: S) -> Result @@ -147,45 +135,67 @@ pub(crate) struct TenantShard { // Support/debug tool: if something is going wrong or flapping with scheduling, this may // be set to a non-active state to avoid making changes while the issue is fixed. scheduling_policy: ShardSchedulingPolicy, +} + +#[derive(Clone, Debug, Serialize)] +pub(crate) struct IntentState { + attached: Option, + secondary: Vec, // We should attempt to schedule this shard in the provided AZ to // decrease chances of cross-AZ compute. preferred_az_id: Option, } -#[derive(Default, Clone, Debug, Serialize)] -pub(crate) struct IntentState { - attached: Option, - secondary: Vec, -} - impl IntentState { - pub(crate) fn new() -> Self { + pub(crate) fn new(preferred_az_id: Option) -> Self { Self { attached: None, secondary: vec![], + preferred_az_id, } } - pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option) -> Self { + pub(crate) fn single( + scheduler: &mut Scheduler, + node_id: Option, + preferred_az_id: Option, + ) -> Self { if let Some(node_id) = node_id { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach); + scheduler.update_node_ref_counts( + node_id, + preferred_az_id.as_ref(), + RefCountUpdate::Attach, + ); } Self { attached: node_id, secondary: vec![], + preferred_az_id, } } pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option) { if self.attached != new_attached { if let Some(old_attached) = self.attached.take() { - scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); + scheduler.update_node_ref_counts( + old_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Detach, + ); } if let Some(new_attached) = &new_attached { - scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach); + scheduler.update_node_ref_counts( + *new_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Attach, + ); } self.attached = new_attached; } + + if let Some(new_attached) = &new_attached { + assert!(!self.secondary.contains(new_attached)); + } } /// Like set_attached, but the node is from [`Self::secondary`]. 
This swaps the node from @@ -204,15 +214,28 @@ impl IntentState { let demoted = self.attached; self.attached = Some(promote_secondary); - scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary); + scheduler.update_node_ref_counts( + promote_secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::PromoteSecondary, + ); if let Some(demoted) = demoted { - scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached); + scheduler.update_node_ref_counts( + demoted, + self.preferred_az_id.as_ref(), + RefCountUpdate::DemoteAttached, + ); } } pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) { - debug_assert!(!self.secondary.contains(&new_secondary)); - scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary); + assert!(!self.secondary.contains(&new_secondary)); + assert!(self.attached != Some(new_secondary)); + scheduler.update_node_ref_counts( + new_secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::AddSecondary, + ); self.secondary.push(new_secondary); } @@ -220,27 +243,43 @@ impl IntentState { pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) { let index = self.secondary.iter().position(|n| *n == node_id); if let Some(index) = index { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); self.secondary.remove(index); } } pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) { for secondary in self.secondary.drain(..) { - scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); } } /// Remove the last secondary node from the list of secondaries pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) { if let Some(node_id) = self.secondary.pop() { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); } } pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) { if let Some(old_attached) = self.attached.take() { - scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); + scheduler.update_node_ref_counts( + old_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Detach, + ); } self.clear_secondary(scheduler); @@ -275,7 +314,11 @@ impl IntentState { if self.attached == Some(node_id) { self.attached = None; self.secondary.push(node_id); - scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::DemoteAttached, + ); true } else { false @@ -315,6 +358,7 @@ pub(crate) struct ObservedStateLocation { /// we know that we might have some state on this node. pub(crate) conf: Option, } + pub(crate) struct ReconcilerWaiter { // For observability purposes, remember the ID of the shard we're // waiting for. 
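In the hunk above, `IntentState` now threads the shard's `preferred_az_id` into every `scheduler.update_node_ref_counts(...)` call, presumably so the scheduler can keep AZ-aware counters next to the plain attachment counts it already maintains. A toy model of that kind of bookkeeping, with simplified stand-in types rather than the controller's real `Scheduler`, showing why the preferred AZ has to ride along with each ref-count update:

    use std::collections::HashMap;

    #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
    struct NodeId(u64);
    #[derive(Clone, PartialEq, Eq, Debug)]
    struct AvailabilityZone(String);

    #[derive(Default, Debug)]
    struct NodeCounts {
        attached: usize,
        // Attachments whose shard's preferred AZ matches this node's AZ.
        home_az_attached: usize,
    }

    struct ToyScheduler {
        node_az: HashMap<NodeId, AvailabilityZone>,
        counts: HashMap<NodeId, NodeCounts>,
    }

    impl ToyScheduler {
        // Without the preferred AZ, only `attached` could be maintained here;
        // passing it allows the home-AZ-aware counter to stay in sync too.
        fn attach(&mut self, node: NodeId, preferred_az: Option<&AvailabilityZone>) {
            let in_home_az = preferred_az.is_some() && preferred_az == self.node_az.get(&node);
            let counts = self.counts.entry(node).or_default();
            counts.attached += 1;
            if in_home_az {
                counts.home_az_attached += 1;
            }
        }

        fn detach(&mut self, node: NodeId, preferred_az: Option<&AvailabilityZone>) {
            let in_home_az = preferred_az.is_some() && preferred_az == self.node_az.get(&node);
            let counts = self.counts.entry(node).or_default();
            counts.attached = counts.attached.saturating_sub(1);
            if in_home_az {
                counts.home_az_attached = counts.home_az_attached.saturating_sub(1);
            }
        }
    }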
@@ -360,6 +404,10 @@ pub(crate) enum ScheduleOptimizationAction { ReplaceSecondary(ReplaceSecondary), // Migrate attachment to an existing secondary location MigrateAttachment(MigrateAttachment), + // Create a secondary location, with the intent of later migrating to it + CreateSecondary(NodeId), + // Remove a secondary location that we previously created to facilitate a migration + RemoveSecondary(NodeId), } #[derive(Eq, PartialEq, Debug, Clone)] @@ -422,7 +470,14 @@ pub(crate) enum ReconcileNeeded { /// spawned: wait for the existing reconciler rather than spawning a new one. WaitExisting(ReconcilerWaiter), /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] - Yes, + Yes(ReconcileReason), +} + +#[derive(Debug)] +pub(crate) enum ReconcileReason { + ActiveNodesDirty, + UnknownLocation, + PendingComputeNotification, } /// Pending modification to the observed state of a tenant shard. @@ -465,6 +520,10 @@ impl ObservedState { locations: HashMap::new(), } } + + pub(crate) fn is_empty(&self) -> bool { + self.locations.is_empty() + } } impl TenantShard { @@ -472,6 +531,7 @@ impl TenantShard { tenant_shard_id: TenantShardId, shard: ShardIdentity, policy: PlacementPolicy, + preferred_az_id: Option, ) -> Self { metrics::METRICS_REGISTRY .metrics_group @@ -481,7 +541,7 @@ impl TenantShard { Self { tenant_shard_id, policy, - intent: IntentState::default(), + intent: IntentState::new(preferred_az_id), generation: Some(Generation::new(0)), shard, observed: ObservedState::default(), @@ -495,7 +555,6 @@ impl TenantShard { last_error: Arc::default(), pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), - preferred_az_id: None, } } @@ -558,7 +617,7 @@ impl TenantShard { return Ok((false, node_id)); } - if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) { + if let Some(promote_secondary) = self.preferred_secondary(scheduler) { // Promote a secondary tracing::debug!("Promoted secondary {} to attached", promote_secondary); self.intent.promote_attached(scheduler, promote_secondary); @@ -567,7 +626,7 @@ impl TenantShard { // Pick a fresh node: either we had no secondaries or none were schedulable let node_id = scheduler.schedule_shard::( &self.intent.secondary, - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; tracing::debug!("Selected {} as attached", node_id); @@ -589,9 +648,6 @@ impl TenantShard { let r = self.do_schedule(scheduler, context); context.avoid(&self.intent.all_pageservers()); - if let Some(attached) = self.intent.get_attached() { - context.push_attached(*attached); - } r } @@ -626,24 +682,7 @@ impl TenantShard { use PlacementPolicy::*; match self.policy { Attached(secondary_count) => { - let retain_secondaries = if self.intent.attached.is_none() - && scheduler.node_preferred(&self.intent.secondary).is_some() - { - // If we have no attached, and one of the secondaries is elegible to be promoted, retain - // one more secondary than we usually would, as one of them will become attached futher down this function. 
- secondary_count + 1 - } else { - secondary_count - }; - - while self.intent.secondary.len() > retain_secondaries { - // We have no particular preference for one secondary location over another: just - // arbitrarily drop from the end - self.intent.pop_secondary(scheduler); - modified = true; - } - - // Should have exactly one attached, and N secondaries + // Should have exactly one attached, and at least N secondaries let (modified_attached, attached_node_id) = self.schedule_attached(scheduler, context)?; modified |= modified_attached; @@ -652,7 +691,7 @@ impl TenantShard { while self.intent.secondary.len() < secondary_count { let node_id = scheduler.schedule_shard::( &used_pageservers, - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; self.intent.push_secondary(scheduler, node_id); @@ -664,21 +703,34 @@ impl TenantShard { if let Some(node_id) = self.intent.get_attached() { // Populate secondary by demoting the attached node self.intent.demote_attached(scheduler, *node_id); + modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node - let node_id = scheduler.schedule_shard::( + // + // We use [`AttachedShardTag`] because when a secondary location is the only one + // a shard has, we expect that its next use will be as an attached location: we want + // the tenant to be ready to warm up and run fast in their preferred AZ. + let node_id = scheduler.schedule_shard::( &[], - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; self.intent.push_secondary(scheduler, node_id); modified = true; } while self.intent.secondary.len() > 1 { - // We have no particular preference for one secondary location over another: just - // arbitrarily drop from the end - self.intent.pop_secondary(scheduler); + // If we have multiple secondaries (e.g. when transitioning from Attached to Secondary and + // having just demoted our attached location), then we should prefer to keep the location + // in our preferred AZ. Tenants in Secondary mode want to be in the preferred AZ so that + // they have a warm location to become attached when transitioning back into Attached. + + let mut candidates = self.intent.get_secondary().clone(); + // Sort to get secondaries outside preferred AZ last + candidates + .sort_by_key(|n| scheduler.get_node_az(n).as_ref() != self.preferred_az()); + let secondary_to_remove = candidates.pop().unwrap(); + self.intent.remove_secondary(scheduler, secondary_to_remove); modified = true; } } @@ -713,7 +765,7 @@ impl TenantShard { ) -> Result<(), ScheduleError> { let promote_to = match promote_to { Some(node) => node, - None => match scheduler.node_preferred(self.intent.get_secondary()) { + None => match self.preferred_secondary(scheduler) { Some(node) => node, None => { return Err(ScheduleError::ImpossibleConstraint); @@ -740,90 +792,306 @@ impl TenantShard { Ok(()) } + /// Returns None if the current location's score is unavailable, i.e. 
cannot draw a conclusion + fn is_better_location( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + current: NodeId, + candidate: NodeId, + ) -> Option { + let Some(candidate_score) = scheduler.compute_node_score::( + candidate, + &self.intent.preferred_az_id, + schedule_context, + ) else { + // The candidate node is unavailable for scheduling or otherwise couldn't get a score + return None; + }; + + match scheduler.compute_node_score::( + current, + &self.intent.preferred_az_id, + schedule_context, + ) { + Some(current_score) => { + // Ignore utilization components when comparing scores: we don't want to migrate + // because of transient load variations, it risks making the system thrash, and + // migrating for utilization requires a separate high level view of the system to + // e.g. prioritize moving larger or smaller tenants, rather than arbitrarily + // moving things around in the order that we hit this function. + let candidate_score = candidate_score.for_optimization(); + let current_score = current_score.for_optimization(); + + if candidate_score < current_score { + tracing::info!( + "Found a lower scoring location! {candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})" + ); + Some(true) + } else { + // The candidate node is no better than our current location, so don't migrate + tracing::debug!( + "Candidate node {candidate} is no better than our current location {current} (candidate {candidate_score:?} vs current {current_score:?})", + ); + Some(false) + } + } + None => { + // The current node is unavailable for scheduling, so we can't make any sensible + // decisions about optimisation. This should be a transient state -- if the node + // is offline then it will get evacuated, if is blocked by a scheduling mode + // then we will respect that mode by doing nothing. + tracing::debug!("Current node {current} is unavailable for scheduling"); + None + } + } + } + + fn find_better_location( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + current: NodeId, + hard_exclude: &[NodeId], + ) -> Option { + // Look for a lower-scoring location to attach to + let Ok(candidate_node) = scheduler.schedule_shard::( + hard_exclude, + &self.intent.preferred_az_id, + schedule_context, + ) else { + // A scheduling error means we have no possible candidate replacements + tracing::debug!("No candidate node found"); + return None; + }; + + if candidate_node == current { + // We're already at the best possible location, so don't migrate + tracing::debug!("Candidate node {candidate_node} is already in use"); + return None; + } + + self.is_better_location::(scheduler, schedule_context, current, candidate_node) + .and_then(|better| if better { Some(candidate_node) } else { None }) + } + + /// This function is an optimization, used to avoid doing large numbers of scheduling operations + /// when looking for optimizations. This function uses knowledge of how scores work to do some + /// fast checks for whether it may to be possible to improve a score. + /// + /// If we return true, it only means that optimization _might_ be possible, not that it necessarily is. If we + /// return no, it definitely means that calling [`Self::optimize_attachment`] or [`Self::optimize_secondary`] would do no + /// work. 
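The doc comment above positions `maybe_optimizable` as a cheap, conservative filter: it may report that optimization is possible when it is not, but never the reverse, so callers can skip the expensive `optimize_attachment` / `optimize_secondary` passes for most shards. A generic illustration of that pattern (not the controller's actual call site), where a fast necessary-condition check guards a slower planning step:

    /// Toy item with a cached lower bound on its achievable cost.
    struct Item {
        cost: u32,
        lower_bound: u32,
    }

    impl Item {
        /// Cheap necessary condition: if we're already at the lower bound,
        /// no amount of expensive planning can improve this item.
        fn maybe_improvable(&self) -> bool {
            self.cost > self.lower_bound
        }

        /// Expensive step, only worth running when the cheap check passes.
        fn plan_improvement(&self) -> Option<u32> {
            (self.cost > self.lower_bound).then_some(self.lower_bound)
        }
    }

    /// Run the cheap check first, then the expensive planner on survivors only.
    fn optimize_all(items: &[Item]) -> Vec<u32> {
        items
            .iter()
            .filter(|item| item.maybe_improvable())
            .filter_map(|item| item.plan_improvement())
            .collect()
    }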
+ pub(crate) fn maybe_optimizable( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + ) -> bool { + // Sharded tenant: check if any locations have a nonzero affinity score + if self.shard.count >= ShardCount(1) { + let schedule_context = schedule_context.project_detach(self); + for node in self.intent.all_pageservers() { + if let Some(af) = schedule_context.nodes.get(&node) { + if *af > AffinityScore(0) { + return true; + } + } + } + } + + // Attached tenant: check if the attachment is outside the preferred AZ + if let PlacementPolicy::Attached(_) = self.policy { + if let Some(attached) = self.intent.get_attached() { + if scheduler.get_node_az(attached) != self.intent.preferred_az_id { + return true; + } + } + } + + // Tenant with secondary locations: check if any are within the preferred AZ + for secondary in self.intent.get_secondary() { + if scheduler.get_node_az(secondary) == self.intent.preferred_az_id { + return true; + } + } + + // Does the tenant have excess secondaries? + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + return true; + } + + // Fall through: no optimizations possible + false + } + /// Optimize attachments: if a shard has a secondary location that is preferable to /// its primary location based on soft constraints, switch that secondary location /// to be attached. #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn optimize_attachment( &self, - nodes: &HashMap, + scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> Option { let attached = (*self.intent.get_attached())?; - if self.intent.secondary.is_empty() { - // We can only do useful work if we have both attached and secondary locations: this - // function doesn't schedule new locations, only swaps between attached and secondaries. - return None; - } - let current_affinity_score = schedule_context.get_node_affinity(attached); - let current_attachment_count = schedule_context.get_node_attachments(attached); + let schedule_context = schedule_context.project_detach(self); - // Generate score for each node, dropping any un-schedulable nodes. - let all_pageservers = self.intent.all_pageservers(); - let mut scores = all_pageservers - .iter() - .flat_map(|node_id| { - let node = nodes.get(node_id); - if node.is_none() { - None - } else if matches!( - node.unwrap().get_scheduling(), - NodeSchedulingPolicy::Filling - ) { - // If the node is currently filling, don't count it as a candidate to avoid, - // racing with the background fill. - None - } else if matches!(node.unwrap().may_schedule(), MaySchedule::No) { - None - } else { - let affinity_score = schedule_context.get_node_affinity(*node_id); - let attachment_count = schedule_context.get_node_attachments(*node_id); - Some((*node_id, affinity_score, attachment_count)) - } - }) - .collect::>(); - - // Sort precedence: - // 1st - prefer nodes with the lowest total affinity score - // 2nd - prefer nodes with the lowest number of attachments in this context - // 3rd - if all else is equal, sort by node ID for determinism in tests. - scores.sort_by_key(|i| (i.1, i.2, i.0)); - - if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) = - scores.first() - { - if attached != *preferred_node { - // The best alternative must be more than 1 better than us, otherwise we could end - // up flapping back next time we're called (e.g. 
there's no point migrating from - // a location with score 1 to a score zero, because on next location the situation - // would be the same, but in reverse). - if current_affinity_score > *preferred_affinity_score + AffinityScore(1) - || current_attachment_count > *preferred_attachment_count + 1 - { - tracing::info!( - "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", - self.intent.get_secondary() - ); - return Some(ScheduleOptimization { - sequence: self.sequence, - action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: attached, - new_attached_node_id: *preferred_node, - }), - }); - } - } else { - tracing::debug!( - "Node {} is already preferred (score {:?})", - preferred_node, - preferred_affinity_score - ); + // If we already have a secondary that is higher-scoring than out current location, + // then simply migrate to it. + for secondary in self.intent.get_secondary() { + if let Some(true) = self.is_better_location::( + scheduler, + &schedule_context, + attached, + *secondary, + ) { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: *secondary, + }), + }); } } - // Fall-through: we didn't find an optimization - None + // Given that none of our current secondaries is a better location than our current + // attached location (checked above), we may trim any secondaries that are not needed + // for the placement policy. + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + // This code path cleans up extra secondaries after migrating, and/or + // trims extra secondaries after a PlacementPolicy::Attached(N) was + // modified to decrease N. + + let secondary_scores = self + .intent + .get_secondary() + .iter() + .map(|node_id| { + ( + *node_id, + scheduler.compute_node_score::( + *node_id, + &self.intent.preferred_az_id, + &schedule_context, + ), + ) + }) + .collect::>(); + + if secondary_scores.iter().any(|score| score.1.is_none()) { + // Trivial case: if we only have one secondary, drop that one + if self.intent.get_secondary().len() == 1 { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary( + *self.intent.get_secondary().first().unwrap(), + ), + }); + } + + // Try to find a "good" secondary to keep, without relying on scores (one or more nodes is in a state + // where its score can't be calculated), and drop the others. This enables us to make progress in + // most cases, even if some nodes are offline or have scheduling=pause set. + + debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this + // logic presumes we are in a mode where we want secondaries to be in non-home AZ + if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| { + let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id; + let is_available = secondary_scores + .get(n) + .expect("Built from same list of nodes") + .is_some(); + is_available && !in_home_az + }) { + // Great, we found one to retain. Pick some other to drop. + if let Some(victim) = self + .intent + .get_secondary() + .iter() + .find(|n| n != &retain_secondary) + { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(*victim), + }); + } + } + + // Fall through: we didn't identify one to remove. 
This ought to be rare. + tracing::warn!( + "Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", + self.intent.get_secondary() + ); + } else { + let victim = secondary_scores + .iter() + .max_by_key(|score| score.1.unwrap()) + .unwrap() + .0; + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(*victim), + }); + } + } + + let replacement = self.find_better_location::( + scheduler, + &schedule_context, + attached, + &[], // Don't exclude secondaries: our preferred attachment location may be a secondary + ); + + // We have found a candidate and confirmed that its score is preferable + // to our current location. See if we have a secondary location in the preferred location already: if not, + // then create one. + if let Some(replacement) = replacement { + // If we are currently in non-preferred AZ, then the scheduler might suggest a location that is better, but still + // not in our preferred AZ. Migration has a cost in resources an impact to the workload, so we want to avoid doing + // multiple hops where we might go to some other AZ before eventually finding a suitable location in our preferred + // AZ: skip this optimization if it is not in our final, preferred AZ. + // + // This should be a transient state, there should always be capacity eventually in our preferred AZ (even if nodes + // there are too overloaded for scheduler to suggest them, more should be provisioned eventually). + if self.intent.preferred_az_id.is_some() + && scheduler.get_node_az(&replacement) != self.intent.preferred_az_id + { + tracing::debug!( + "Candidate node {replacement} is not in preferred AZ {:?}", + self.intent.preferred_az_id + ); + + // This should only happen if our current location is not in the preferred AZ, otherwise + // [`Self::find_better_location`]` should have rejected any other location outside the preferred Az, because + // AZ is the highest priority part of NodeAttachmentSchedulingScore. + debug_assert!(scheduler.get_node_az(&attached) != self.intent.preferred_az_id); + + return None; + } + + if !self.intent.get_secondary().contains(&replacement) { + Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::CreateSecondary(replacement), + }) + } else { + // We already have a secondary in the preferred location, let's try migrating to it. Our caller + // will check the warmth of the destination before deciding whether to really execute this. + Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: replacement, + }), + }) + } + } else { + // We didn't find somewhere we'd rather be, and we don't have any excess secondaries + // to clean up: no action required. + None + } } #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] @@ -832,50 +1100,59 @@ impl TenantShard { scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> Option { - if self.intent.secondary.is_empty() { - // We can only do useful work if we have both attached and secondary locations: this - // function doesn't schedule new locations, only swaps between attached and secondaries. 
+ if self.intent.get_secondary().len() > self.policy.want_secondaries() { + // We have extra secondaries, perhaps to facilitate a migration of the attached location: + // do nothing, it is up to [`Self::optimize_attachment`] to clean them up. When that's done, + // and we are called again, we will proceed. + tracing::debug!("Too many secondaries: skipping"); return None; } + let schedule_context = schedule_context.project_detach(self); + for secondary in self.intent.get_secondary() { - let Some(affinity_score) = schedule_context.nodes.get(secondary) else { - // We're already on a node unaffected any affinity constraints, - // so we won't change it. - continue; + // Make sure we don't try to migrate a secondary to our attached location: this case happens + // easily in environments without multiple AZs. + let exclude = match self.intent.attached { + Some(attached) => vec![attached], + None => vec![], }; - // Let the scheduler suggest a node, where it would put us if we were scheduling afresh - // This implicitly limits the choice to nodes that are available, and prefers nodes - // with lower utilization. - let Ok(candidate_node) = scheduler.schedule_shard::( - &self.intent.all_pageservers(), - &self.preferred_az_id, - schedule_context, - ) else { - // A scheduling error means we have no possible candidate replacements - continue; + let replacement = match &self.policy { + PlacementPolicy::Attached(_) => { + // Secondaries for an attached shard should be scheduled using `SecondaryShardTag` + // to avoid placing them in the preferred AZ. + self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ) + } + PlacementPolicy::Secondary => { + // In secondary-only mode, we want our secondary locations in the preferred AZ, + // so that they're ready to take over as an attached location when we transition + // into PlacementPolicy::Attached. + self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ) + } + PlacementPolicy::Detached => None, }; - let candidate_affinity_score = schedule_context - .nodes - .get(&candidate_node) - .unwrap_or(&AffinityScore::FREE); - - // The best alternative must be more than 1 better than us, otherwise we could end - // up flapping back next time we're called. - if *candidate_affinity_score + AffinityScore(1) < *affinity_score { - // If some other node is available and has a lower score than this node, then - // that other node is a good place to migrate to. - tracing::info!( - "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", - self.intent.get_secondary() - ); + assert!(replacement != Some(*secondary)); + if let Some(replacement) = replacement { + // We have found a candidate and confirmed that its score is preferable + // to our current location. See if we have a secondary location in the preferred location already: if not, + // then create one. 
return Some(ScheduleOptimization { sequence: self.sequence, action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { old_node_id: *secondary, - new_node_id: candidate_node, + new_node_id: replacement, }), }); } @@ -916,11 +1193,54 @@ impl TenantShard { self.intent.remove_secondary(scheduler, old_node_id); self.intent.push_secondary(scheduler, new_node_id); } + ScheduleOptimizationAction::CreateSecondary(new_node_id) => { + self.intent.push_secondary(scheduler, new_node_id); + } + ScheduleOptimizationAction::RemoveSecondary(old_secondary) => { + self.intent.remove_secondary(scheduler, old_secondary); + } } true } + /// When a shard has several secondary locations, we need to pick one in situations where + /// we promote one of them to an attached location: + /// - When draining a node for restart + /// - When responding to a node failure + /// + /// In this context, 'preferred' does not mean the node with the best scheduling score: instead + /// we want to pick the node which is best for use _temporarily_ while the previous attached location + /// is unavailable (e.g. because it's down or deploying). That means we prefer to use secondary + /// locations in a non-preferred AZ, as they're more likely to have awarm cache than a temporary + /// secondary in the preferred AZ (which are usually only created for migrations, and if they exist + /// they're probably not warmed up yet). The latter behavior is based oni + /// + /// If the input is empty, or all the nodes are not elegible for scheduling, return None: the + /// caller needs to a pick a node some other way. + pub(crate) fn preferred_secondary(&self, scheduler: &Scheduler) -> Option { + let candidates = scheduler.filter_usable_nodes(&self.intent.secondary); + + // We will sort candidates to prefer nodes which are _not_ in our preferred AZ, i.e. we prefer + // to migrate to a long-lived secondary location (which would have been scheduled in a non-preferred AZ), + // rather than a short-lived secondary location being used for optimization/migration (which would have + // been scheduled in our preferred AZ). + let mut candidates = candidates + .iter() + .map(|(node_id, node_az)| { + if node_az == &self.intent.preferred_az_id { + (1, *node_id) + } else { + (0, *node_id) + } + }) + .collect::>(); + + candidates.sort(); + + candidates.first().map(|i| i.1) + } + /// Query whether the tenant's observed state for attached node matches its intent state, and if so, /// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there. @@ -1020,12 +1340,18 @@ impl TenantShard { let active_nodes_dirty = self.dirty(pageservers); - // Even if there is no pageserver work to be done, if we have a pending notification to computes, - // wake up a reconciler to send it. 
- let do_reconcile = - active_nodes_dirty || dirty_observed || self.pending_compute_notification; + let reconcile_needed = match ( + active_nodes_dirty, + dirty_observed, + self.pending_compute_notification, + ) { + (true, _, _) => ReconcileNeeded::Yes(ReconcileReason::ActiveNodesDirty), + (_, true, _) => ReconcileNeeded::Yes(ReconcileReason::UnknownLocation), + (_, _, true) => ReconcileNeeded::Yes(ReconcileReason::PendingComputeNotification), + _ => ReconcileNeeded::No, + }; - if !do_reconcile { + if matches!(reconcile_needed, ReconcileNeeded::No) { tracing::debug!("Not dirty, no reconciliation needed."); return ReconcileNeeded::No; } @@ -1068,7 +1394,7 @@ impl TenantShard { } } - ReconcileNeeded::Yes + reconcile_needed } /// Ensure the sequence number is set to a value where waiting for this value will make us wait @@ -1117,10 +1443,15 @@ impl TenantShard { let result = reconciler.reconcile().await; // If we know we had a pending compute notification from some previous action, send a notification irrespective - // of whether the above reconcile() did any work + // of whether the above reconcile() did any work. It has to be Ok() though, because otherwise we might be + // sending a notification of a location that isn't really attached. if result.is_ok() && must_notify { // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] reconciler.compute_notify().await.ok(); + } else if must_notify { + // Carry this flag so that the reconciler's result will indicate that it still needs to retry + // the compute hook notification eventually. + reconciler.compute_notify_failure = true; } // Update result counter @@ -1153,6 +1484,7 @@ impl TenantShard { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( &mut self, + reason: ReconcileReason, result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, @@ -1197,6 +1529,7 @@ impl TenantShard { detach, reconciler_config, config: self.config.clone(), + preferred_az: self.intent.preferred_az_id.clone(), observed: self.observed.clone(), original_observed: self.observed.clone(), compute_hook: compute_hook.clone(), @@ -1211,7 +1544,7 @@ impl TenantShard { let reconcile_seq = self.sequence; let long_reconcile_threshold = service_config.long_reconcile_threshold; - tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); + tracing::info!(seq=%reconcile_seq, "Spawning Reconciler ({reason:?})"); let must_notify = self.pending_compute_notification; let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, tenant_id=%reconciler.tenant_shard_id.tenant_id, @@ -1417,7 +1750,6 @@ impl TenantShard { pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), - preferred_az_id: tsp.preferred_az_id.map(AvailabilityZone), }) } @@ -1433,16 +1765,16 @@ impl TenantShard { config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), - preferred_az_id: self.preferred_az_id.as_ref().map(|az| az.0.clone()), + preferred_az_id: self.intent.preferred_az_id.as_ref().map(|az| az.0.clone()), } } pub(crate) fn preferred_az(&self) -> Option<&AvailabilityZone> { - self.preferred_az_id.as_ref() + self.intent.preferred_az_id.as_ref() } - pub(crate) fn set_preferred_az(&mut self, 
preferred_az_id: AvailabilityZone) { - self.preferred_az_id = Some(preferred_az_id); + pub(crate) fn set_preferred_az(&mut self, preferred_az_id: Option) { + self.intent.preferred_az_id = preferred_az_id; } /// Returns all the nodes to which this tenant shard is attached according to the @@ -1458,8 +1790,8 @@ impl TenantShard { let conf = observed.conf.as_ref()?; match (conf.generation, conf.mode) { - (Some(gen), AttachedMulti | AttachedSingle | AttachedStale) => { - Some((*node_id, gen)) + (Some(gen_), AttachedMulti | AttachedSingle | AttachedStale) => { + Some((*node_id, gen_)) } _ => None, } @@ -1467,7 +1799,7 @@ impl TenantShard { .sorted_by(|(_lhs_node_id, lhs_gen), (_rhs_node_id, rhs_gen)| { lhs_gen.cmp(rhs_gen).reverse() }) - .map(|(node_id, gen)| (node_id, Generation::new(gen))) + .map(|(node_id, gen_)| (node_id, Generation::new(gen_))) .collect() } @@ -1499,7 +1831,10 @@ impl TenantShard { (Some(crnt), Some(new)) if crnt_gen > new_gen => { tracing::warn!( "Skipping observed state update {}: {:?} and using None due to stale generation ({} > {})", - node_id, loc, crnt, new + node_id, + loc, + crnt, + new ); self.observed @@ -1526,6 +1861,23 @@ impl TenantShard { } } } + + /// Returns true if the tenant shard is attached to a node that is outside the preferred AZ. + /// + /// If the shard does not have a preferred AZ, returns false. + pub(crate) fn is_attached_outside_preferred_az(&self, nodes: &HashMap) -> bool { + self.intent + .get_attached() + .map(|node_id| { + Some( + nodes + .get(&node_id) + .expect("referenced node exists") + .get_availability_zone_id(), + ) != self.intent.preferred_az_id.as_ref() + }) + .unwrap_or(false) + } } impl Drop for TenantShard { @@ -1539,18 +1891,17 @@ impl Drop for TenantShard { #[cfg(test)] pub(crate) mod tests { - use std::{cell::RefCell, rc::Rc}; + use std::cell::RefCell; + use std::rc::Rc; - use pageserver_api::{ - controller_api::NodeAvailability, - shard::{ShardCount, ShardNumber}, - }; - use rand::{rngs::StdRng, SeedableRng}; + use pageserver_api::controller_api::NodeAvailability; + use pageserver_api::shard::{ShardCount, ShardNumber}; + use rand::SeedableRng; + use rand::rngs::StdRng; use utils::id::TenantId; - use crate::scheduler::test_utils::make_test_nodes; - use super::*; + use crate::scheduler::test_utils::make_test_nodes; fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard { let tenant_id = TenantId::generate(); @@ -1571,6 +1922,7 @@ pub(crate) mod tests { ) .unwrap(), policy, + None, ) } @@ -1597,7 +1949,7 @@ pub(crate) mod tests { shard_number, shard_count, }; - let mut ts = TenantShard::new( + TenantShard::new( tenant_shard_id, ShardIdentity::new( shard_number, @@ -1606,13 +1958,8 @@ pub(crate) mod tests { ) .unwrap(), policy.clone(), - ); - - if let Some(az) = &preferred_az { - ts.set_preferred_az(az.clone()); - } - - ts + preferred_az.clone(), + ) }) .collect() } @@ -1732,16 +2079,20 @@ pub(crate) mod tests { // In pause mode, schedule() shouldn't do anything tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause; - assert!(tenant_shard - .schedule(&mut scheduler, &mut ScheduleContext::default()) - .is_ok()); + assert!( + tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok() + ); assert!(tenant_shard.intent.all_pageservers().is_empty()); // In active mode, schedule() works tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active; - assert!(tenant_shard - .schedule(&mut scheduler, &mut ScheduleContext::default()) - .is_ok()); + assert!( + tenant_shard + .schedule(&mut 
scheduler, &mut ScheduleContext::default()) + .is_ok() + ); assert!(!tenant_shard.intent.all_pageservers().is_empty()); tenant_shard.intent.clear(&mut scheduler); @@ -1749,65 +2100,90 @@ pub(crate) mod tests { } #[test] - fn optimize_attachment() -> anyhow::Result<()> { - let nodes = make_test_nodes(3, &[]); + /// Simple case: moving attachment to somewhere better where we already have a secondary + fn optimize_attachment_simple() -> anyhow::Result<()> { + let nodes = make_test_nodes( + 3, + &[ + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); let mut scheduler = Scheduler::new(nodes.values()); let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_b.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); // Initially: both nodes attached on shard 1, and both have secondary locations // on different nodes. - shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); - shard_a.intent.push_secondary(&mut scheduler, NodeId(2)); + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(1)); shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1))); - shard_b.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.push_secondary(&mut scheduler, NodeId(2)); - let mut schedule_context = ScheduleContext::default(); - schedule_context.avoid(&shard_a.intent.all_pageservers()); - schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); - schedule_context.avoid(&shard_b.intent.all_pageservers()); - schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); + fn make_schedule_context(shard_a: &TenantShard, shard_b: &TenantShard) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context + } - let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context); - - // Either shard should recognize that it has the option to switch to a secondary location where there - // would be no other shards from the same tenant, and request to do so. + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a = shard_a.optimize_attachment(&mut scheduler, &schedule_context); assert_eq!( optimization_a, Some(ScheduleOptimization { sequence: shard_a.sequence, action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(2) + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) }) }) ); - - // Note that these optimizing two shards in the same tenant with the same ScheduleContext is - // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility - // of [`Service::optimize_all`] to avoid trying - // to do optimizations for multiple shards in the same tenant at the same time. 
Generating - // both optimizations is just done for test purposes - let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context); - assert_eq!( - optimization_b, - Some(ScheduleOptimization { - sequence: shard_b.sequence, - action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(3) - }) - }) - ); - - // Applying these optimizations should result in the end state proposed shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); - assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2))); - assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); - shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap()); - assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3))); - assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]); + + // // Either shard should recognize that it has the option to switch to a secondary location where there + // // would be no other shards from the same tenant, and request to do so. + // assert_eq!( + // optimization_a_prepare, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::CreateSecondary(NodeId(2)) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + // assert_eq!( + // optimization_a_migrate, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + // old_attached_node_id: NodeId(1), + // new_attached_node_id: NodeId(2) + // }) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + // assert_eq!( + // optimization_a_cleanup, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::RemoveSecondary(NodeId(1)) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // // Shard B should not be moved anywhere, since the pressure on node 1 was relieved by moving shard A + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // assert_eq!(shard_b.optimize_attachment(&mut scheduler, &schedule_context), None); shard_a.intent.clear(&mut scheduler); shard_b.intent.clear(&mut scheduler); @@ -1815,6 +2191,190 @@ pub(crate) mod tests { Ok(()) } + #[test] + /// Complicated case: moving attachment to somewhere better where we do not have a secondary + /// already, creating one as needed. 
+ fn optimize_attachment_multistep() -> anyhow::Result<()> { + let nodes = make_test_nodes( + 3, + &[ + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + // Two shards of a tenant that wants to be in AZ A + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); + let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_b.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); + + // Both shards are initially attached in non-home AZ _and_ have secondaries in non-home AZs + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.set_attached(&mut scheduler, Some(NodeId(3))); + shard_b.intent.push_secondary(&mut scheduler, NodeId(2)); + + fn make_schedule_context(shard_a: &TenantShard, shard_b: &TenantShard) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context + } + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_prepare = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_prepare, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::CreateSecondary(NodeId(1)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_migrate, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) + }) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_cleanup, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // // Shard B should not be moved anywhere, since the pressure on node 1 was relieved by moving shard A + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // assert_eq!(shard_b.optimize_attachment(&mut scheduler, &schedule_context), None); + + shard_a.intent.clear(&mut scheduler); + shard_b.intent.clear(&mut scheduler); + + Ok(()) + } + + #[test] + /// Check that multi-step migration works when moving to somewhere that is only better by + /// 1 AffinityScore -- this ensures that we don't have a bug like the intermediate secondary + /// counting toward the affinity score such that it prevents the rest of the migration from happening. 
+ fn optimize_attachment_marginal() -> anyhow::Result<()> { + let nodes = make_test_nodes(2, &[]); + let mut scheduler = Scheduler::new(nodes.values()); + + // Multi-sharded tenant, we will craft a situation where affinity + // scores differ only slightly + let mut shards = make_test_tenant(PlacementPolicy::Attached(0), ShardCount::new(4), None); + + // 1 attached on node 1 + shards[0] + .intent + .set_attached(&mut scheduler, Some(NodeId(1))); + // 3 attached on node 2 + shards[1] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + shards[2] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + shards[3] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + + // The scheduler should figure out that we need to: + // - Create a secondary for shard 3 on node 1 + // - Migrate shard 3 to node 1 + // - Remove shard 3's location on node 2 + + fn make_schedule_context(shards: &Vec) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + for shard in shards { + schedule_context.avoid(&shard.intent.all_pageservers()); + } + schedule_context + } + + let schedule_context = make_schedule_context(&shards); + let optimization_a_prepare = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_prepare, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::CreateSecondary(NodeId(1)) + }) + ); + shards[1].apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + let schedule_context = make_schedule_context(&shards); + let optimization_a_migrate = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_migrate, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) + }) + }) + ); + shards[1].apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + let schedule_context = make_schedule_context(&shards); + let optimization_a_cleanup = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_cleanup, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2)) + }) + ); + shards[1].apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // Everything should be stable now + let schedule_context = make_schedule_context(&shards); + assert_eq!( + shards[0].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[1].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[2].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[3].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + + for mut shard in shards { + shard.intent.clear(&mut scheduler); + } + + Ok(()) + } + #[test] fn optimize_secondary() -> anyhow::Result<()> { let nodes = make_test_nodes(4, &[]); @@ -1832,9 +2392,7 @@ pub(crate) mod tests { let mut schedule_context = ScheduleContext::default(); schedule_context.avoid(&shard_a.intent.all_pageservers()); - schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); schedule_context.avoid(&shard_b.intent.all_pageservers()); - schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context); @@ -1861,11 
+2419,114 @@ pub(crate) mod tests { Ok(()) } + /// Test how the optimisation code behaves with an extra secondary + #[test] + fn optimize_removes_secondary() -> anyhow::Result<()> { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + let mut nodes = make_test_nodes( + 4, + &[ + az_a_tag.clone(), + az_b_tag.clone(), + az_a_tag.clone(), + az_b_tag.clone(), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut schedule_context = ScheduleContext::default(); + + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(az_a_tag.clone()); + shard_a + .schedule(&mut scheduler, &mut schedule_context) + .unwrap(); + + // Attached on node 1, secondary on node 2 + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(2)]); + + // Initially optimiser is idle + assert_eq!( + shard_a.optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shard_a.optimize_secondary(&mut scheduler, &schedule_context), + None + ); + + // A spare secondary in the home AZ: it should be removed -- this is the situation when we're midway through a graceful migration, after cutting over + // to our new location + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + + // A spare secondary in the non-home AZ, and one of them is offline + shard_a.intent.push_secondary(&mut scheduler, NodeId(4)); + nodes + .get_mut(&NodeId(4)) + .unwrap() + .set_availability(NodeAvailability::Offline); + scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(4)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + + // A spare secondary when should have none + shard_a.policy = PlacementPolicy::Attached(0); + let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization.unwrap()); + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![]); + + // Check that in secondary mode, we preserve the secondary in the preferred AZ + let mut schedule_context = ScheduleContext::default(); // Fresh context, we're about to call schedule() + shard_a.policy = PlacementPolicy::Secondary; + shard_a + .schedule(&mut scheduler, &mut schedule_context) + .unwrap(); + assert_eq!(shard_a.intent.get_attached(), &None); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); + assert_eq!( + shard_a.optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shard_a.optimize_secondary(&mut scheduler, &schedule_context), + None + ); + + shard_a.intent.clear(&mut scheduler); + + Ok(()) + } + // Optimize til quiescent: this emulates what 
Service::optimize_all does, when // called repeatedly in the background. // Returns the applied optimizations fn optimize_til_idle( - nodes: &HashMap, scheduler: &mut Scheduler, shards: &mut [TenantShard], ) -> Vec { @@ -1877,14 +2538,18 @@ pub(crate) mod tests { for shard in shards.iter() { schedule_context.avoid(&shard.intent.all_pageservers()); - if let Some(attached) = shard.intent.get_attached() { - schedule_context.push_attached(*attached); - } } for shard in shards.iter_mut() { - let optimization = shard.optimize_attachment(nodes, &schedule_context); + let optimization = shard.optimize_attachment(scheduler, &schedule_context); + tracing::info!( + "optimize_attachment({})={:?}", + shard.tenant_shard_id, + optimization + ); if let Some(optimization) = optimization { + // Check that maybe_optimizable wouldn't have wrongly claimed this optimization didn't exist + assert!(shard.maybe_optimizable(scheduler, &schedule_context)); optimizations.push(optimization.clone()); shard.apply_optimization(scheduler, optimization); any_changed = true; @@ -1892,7 +2557,15 @@ pub(crate) mod tests { } let optimization = shard.optimize_secondary(scheduler, &schedule_context); + tracing::info!( + "optimize_secondary({})={:?}", + shard.tenant_shard_id, + optimization + ); if let Some(optimization) = optimization { + // Check that maybe_optimizable wouldn't have wrongly claimed this optimization didn't exist + assert!(shard.maybe_optimizable(scheduler, &schedule_context)); + optimizations.push(optimization.clone()); shard.apply_optimization(scheduler, optimization); any_changed = true; @@ -1916,45 +2589,87 @@ pub(crate) mod tests { /// that it converges. #[test] fn optimize_add_nodes() -> anyhow::Result<()> { - let nodes = make_test_nodes(4, &[]); + let nodes = make_test_nodes( + 9, + &[ + // Initial 6 nodes + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + AvailabilityZone("az-c".to_string()), + // Three we will add later + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); - // Only show the scheduler a couple of nodes + // Only show the scheduler two nodes in each AZ to start with let mut scheduler = Scheduler::new([].iter()); - scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); - scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); - - let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None); - let mut schedule_context = ScheduleContext::default(); - for shard in &mut shards { - assert!(shard - .schedule(&mut scheduler, &mut schedule_context) - .is_ok()); + for i in 1..=6 { + scheduler.node_upsert(nodes.get(&NodeId(i)).unwrap()); } - // We should see equal number of locations on the two nodes. - assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2); + let mut shards = make_test_tenant( + PlacementPolicy::Attached(1), + ShardCount::new(4), + Some(AvailabilityZone("az-a".to_string())), + ); + let mut schedule_context = ScheduleContext::default(); + for shard in &mut shards { + assert!( + shard + .schedule(&mut scheduler, &mut schedule_context) + .is_ok() + ); + } - assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4); + // Initial: attached locations land in the tenant's home AZ. 
+ assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2); - // Add another two nodes: we should see the shards spread out when their optimize - // methods are called - scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap()); - scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); - optimize_til_idle(&nodes, &mut scheduler, &mut shards); + // Initial: secondary locations in a remote AZ + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(5)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(5)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(6)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(6)), 0); - assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); + // Add another three nodes: we should see the shards spread out when their optimize + // methods are called + scheduler.node_upsert(nodes.get(&NodeId(7)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(8)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(9)).unwrap()); + optimize_til_idle(&mut scheduler, &mut shards); + + // We expect one attached location was moved to the new node in the tenant's home AZ + assert_eq!(scheduler.get_node_shard_count(NodeId(7)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(7)), 1); + // The original node has one less attached shard + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + // One of the original nodes still has two attachments, since there are an odd number of nodes assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2); - assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1); - - assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1); + // None of our secondaries moved, since we already had enough nodes for those to be + // scheduled perfectly + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(5)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(5)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(6)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(6)), 0); for shard in shards.iter_mut() { shard.intent.clear(&mut scheduler); @@ -1994,10 +2709,10 @@ pub(crate) mod tests { shard.schedule(&mut scheduler, context).unwrap(); } - let applied_to_a = optimize_til_idle(&nodes, &mut scheduler, &mut a); + let applied_to_a = optimize_til_idle(&mut scheduler, &mut a); assert_eq!(applied_to_a, vec![]); - let applied_to_b = optimize_til_idle(&nodes, &mut scheduler, &mut b); + let applied_to_b = optimize_til_idle(&mut scheduler, &mut b); 
assert_eq!(applied_to_b, vec![]); for shard in a.iter_mut().chain(b.iter_mut()) { @@ -2149,4 +2864,108 @@ pub(crate) mod tests { } Ok(()) } + + /// Check how the shard's scheduling behaves when in PlacementPolicy::Secondary mode. + #[test] + fn tenant_secondary_scheduling() -> anyhow::Result<()> { + let az_a = AvailabilityZone("az-a".to_string()); + let nodes = make_test_nodes( + 3, + &[ + az_a.clone(), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); + + let mut scheduler = Scheduler::new(nodes.values()); + let mut context = ScheduleContext::default(); + + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Secondary); + tenant_shard.intent.preferred_az_id = Some(az_a.clone()); + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_none()); + + // Should have scheduled into the preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + // Switch to PlacementPolicy::Attached + tenant_shard.policy = PlacementPolicy::Attached(1); + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_some()); + // Secondary should now be in non-preferred AZ + assert_ne!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + // Attached should be in preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.attached.unwrap()) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + // Switch back to PlacementPolicy::Secondary + tenant_shard.policy = PlacementPolicy::Secondary; + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_none()); + // When we picked a location to keep, we should have kept the one in the preferred AZ + assert_eq!( + scheduler + .get_node_az(&tenant_shard.intent.secondary[0]) + .as_ref(), + tenant_shard.preferred_az() + ); + + // Optimizer should agree + assert_eq!( + tenant_shard.optimize_attachment(&mut scheduler, &context), + None + ); + assert_eq!( + tenant_shard.optimize_secondary(&mut scheduler, &context), + None + ); + + tenant_shard.intent.clear(&mut scheduler); + + Ok(()) + } } diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index 609f3bf009..7f6544b894 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "storage_scrubber" version = "0.1.0" -edition.workspace = true +edition = "2024" license.workspace = true [dependencies] diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 1b4ff01a17..f0ba632fd4 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,11 +1,19 @@ use 
std::collections::{HashMap, HashSet}; +use std::time::SystemTime; +use futures_util::StreamExt; use itertools::Itertools; +use pageserver::tenant::IndexPart; use pageserver::tenant::checks::check_valid_layermap; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::remote_timeline_client::manifest::TenantManifest; +use pageserver::tenant::remote_timeline_client::{ + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, +}; +use pageserver::tenant::storage_layer::LayerName; use pageserver_api::shard::ShardIndex; +use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; @@ -14,14 +22,7 @@ use utils::shard::TenantShardId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; -use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; -use futures_util::StreamExt; -use pageserver::tenant::remote_timeline_client::{ - parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, -}; -use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::IndexPart; -use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; +use crate::{RootTarget, TenantShardTimelineId, download_object_with_retries}; pub(crate) struct TimelineAnalysis { /// Anomalies detected @@ -88,9 +89,14 @@ pub(crate) async fn branch_cleanup_and_check_errors( match s3_data.blob_data { BlobDataParseResult::Parsed { index_part, - index_part_generation: _index_part_generation, - s3_layers: _s3_layers, + index_part_generation: _, + s3_layers: _, + index_part_last_modified_time, + index_part_snapshot_time, } => { + // Ignore missing file error if index_part downloaded is different from the one when listing the layer files. + let ignore_error = index_part_snapshot_time < index_part_last_modified_time + && !cfg!(debug_assertions); if !IndexPart::KNOWN_VERSIONS.contains(&index_part.version()) { result .errors @@ -171,7 +177,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( is_l0, ); - if is_l0 { + if is_l0 || ignore_error { result.warnings.push(msg); } else { result.errors.push(msg); @@ -308,9 +314,11 @@ pub(crate) enum BlobDataParseResult { Parsed { index_part: Box, index_part_generation: Generation, + index_part_last_modified_time: SystemTime, + index_part_snapshot_time: SystemTime, s3_layers: HashSet<(LayerName, Generation)>, }, - /// The remains of a deleted Timeline (i.e. an initdb archive only) + /// The remains of an uncleanly deleted Timeline or aborted timeline creation(e.g. an initdb archive only, or some layer without an index) Relic, Incorrect { errors: Vec, @@ -321,11 +329,11 @@ pub(crate) enum BlobDataParseResult { pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? 
- Some((layer_filename, gen)) if gen.len() == 8 => { + Some((layer_filename, gen_)) if gen_.len() == 8 => { let layer = layer_filename.parse::<LayerName>()?; - let gen = - Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?; - Ok((layer, gen)) + let gen_ = + Generation::parse_suffix(gen_).ok_or("Malformed generation suffix".to_string())?; + Ok((layer, gen_)) } _ => Ok((name.parse::<LayerName>()?, Generation::none())), } } @@ -346,7 +354,7 @@ pub(crate) async fn list_timeline_blobs( match res { ListTimelineBlobsResult::Ready(data) => Ok(data), ListTimelineBlobsResult::MissingIndexPart(_) => { - // Retry if index is missing. + // Retry if listing raced with removal of an index let data = list_timeline_blobs_impl(remote_client, id, root_target) .await? .into_data(); @@ -358,7 +366,7 @@ enum ListTimelineBlobsResult { /// Blob data is ready to be interpreted. Ready(RemoteTimelineBlobData), - /// List timeline blobs has layer files but is missing [`IndexPart`]. + /// The listing contained an index but when we tried to fetch it, we couldn't MissingIndexPart(RemoteTimelineBlobData), } @@ -415,9 +423,9 @@ async fn list_timeline_blobs_impl( tracing::info!("initdb archive preserved {key}"); } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { - Ok((new_layer, gen)) => { - tracing::debug!("Parsed layer key: {new_layer} {gen:?}"); - s3_layers.insert((new_layer, gen)); + Ok((new_layer, gen_)) => { + tracing::debug!("Parsed layer key: {new_layer} {gen_:?}"); + s3_layers.insert((new_layer, gen_)); } Err(e) => { tracing::info!("Error parsing {maybe_layer_name} as layer name: {e}"); @@ -457,7 +465,7 @@ async fn list_timeline_blobs_impl( .max_by_key(|i| i.1) .map(|(k, g)| (k.clone(), g)) { - Some((key, gen)) => (Some::<ListingObject>(key.to_owned()), gen), + Some((key, gen_)) => (Some::<ListingObject>(key.to_owned()), gen_), None => { // Legacy/missing case: one or zero index parts, which did not have a generation (index_part_keys.pop(), Generation::none()) @@ -467,26 +475,26 @@ match index_part_object.as_ref() { Some(selected) => index_part_keys.retain(|k| k != selected), None => { - // It is possible that the branch gets deleted after we got some layer files listed - // and we no longer have the index file in the listing. - errors.push( + // This case does not indicate corruption, but it should be very unusual. It can + // happen if: + // - timeline creation is in progress (first layer is written before index is written) + // - timeline deletion happened while a stale pageserver was still attached, which might upload + // a layer after the deletion is done.
+ tracing::info!( "S3 list response got no index_part.json file but still has layer files" - .to_string(), ); - return Ok(ListTimelineBlobsResult::MissingIndexPart( - RemoteTimelineBlobData { - blob_data: BlobDataParseResult::Incorrect { errors, s3_layers }, - unused_index_keys: index_part_keys, - unknown_keys, - }, - )); + return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData { + blob_data: BlobDataParseResult::Relic, + unused_index_keys: index_part_keys, + unknown_keys, + })); } } if let Some(index_part_object_key) = index_part_object.as_ref() { - let index_part_bytes = + let (index_part_bytes, index_part_last_modified_time) = match download_object_with_retries(remote_client, &index_part_object_key.key).await { - Ok(index_part_bytes) => index_part_bytes, + Ok(data) => data, Err(e) => { // It is possible that the branch gets deleted in-between we list the objects // and we download the index part file. @@ -500,7 +508,7 @@ async fn list_timeline_blobs_impl( )); } }; - + let index_part_snapshot_time = index_part_object_key.last_modified; match serde_json::from_slice(&index_part_bytes) { Ok(index_part) => { return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData { @@ -508,10 +516,12 @@ async fn list_timeline_blobs_impl( index_part: Box::new(index_part), index_part_generation, s3_layers, + index_part_last_modified_time, + index_part_snapshot_time, }, unused_index_keys: index_part_keys, unknown_keys, - })) + })); } Err(index_parse_error) => errors.push(format!( "index_part.json body parsing error: {index_parse_error}" @@ -533,8 +543,9 @@ async fn list_timeline_blobs_impl( } pub(crate) struct RemoteTenantManifestInfo { - pub(crate) latest_generation: Option, - pub(crate) manifests: Vec<(Generation, ListingObject)>, + pub(crate) generation: Generation, + pub(crate) manifest: TenantManifest, + pub(crate) listing_object: ListingObject, } pub(crate) enum ListTenantManifestResult { @@ -543,7 +554,10 @@ pub(crate) enum ListTenantManifestResult { #[allow(dead_code)] unknown_keys: Vec, }, - NoErrors(RemoteTenantManifestInfo), + NoErrors { + latest_generation: Option, + manifests: Vec<(Generation, ListingObject)>, + }, } /// Lists the tenant manifests in remote storage and parses the latest one, returning a [`ListTenantManifestResult`] object. @@ -592,14 +606,6 @@ pub(crate) async fn list_tenant_manifests( unknown_keys.push(obj); } - if manifests.is_empty() { - tracing::debug!("No manifest for timeline."); - - return Ok(ListTenantManifestResult::WithErrors { - errors, - unknown_keys, - }); - } if !unknown_keys.is_empty() { errors.push(((*prefix_str).to_owned(), "unknown keys listed".to_string())); @@ -609,6 +615,15 @@ pub(crate) async fn list_tenant_manifests( }); } + if manifests.is_empty() { + tracing::debug!("No manifest for timeline."); + + return Ok(ListTenantManifestResult::NoErrors { + latest_generation: None, + manifests, + }); + } + // Find the manifest with the highest generation let (latest_generation, latest_listing_object) = manifests .iter() @@ -616,9 +631,11 @@ pub(crate) async fn list_tenant_manifests( .map(|(g, obj)| (*g, obj.clone())) .unwrap(); + manifests.retain(|(gen_, _obj)| gen_ != &latest_generation); + let manifest_bytes = match download_object_with_retries(remote_client, &latest_listing_object.key).await { - Ok(bytes) => bytes, + Ok((bytes, _)) => bytes, Err(e) => { // It is possible that the tenant gets deleted in-between we list the objects // and we download the manifest file. 
@@ -634,13 +651,15 @@ pub(crate) async fn list_tenant_manifests( }; match TenantManifest::from_json_bytes(&manifest_bytes) { - Ok(_manifest) => { - return Ok(ListTenantManifestResult::NoErrors( - RemoteTenantManifestInfo { - latest_generation: Some(latest_generation), - manifests, - }, - )); + Ok(manifest) => { + return Ok(ListTenantManifestResult::NoErrors { + latest_generation: Some(RemoteTenantManifestInfo { + generation: latest_generation, + manifest, + listing_object: latest_listing_object, + }), + manifests, + }); } Err(parse_error) => errors.push(( latest_listing_object.key.get_path().as_str().to_owned(), diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index b1dfe3a53f..5cf286c662 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -3,11 +3,9 @@ use std::error::Error as _; use chrono::{DateTime, Utc}; use futures::Future; use hex::FromHex; - -use reqwest::{header, Client, StatusCode, Url}; +use reqwest::{Client, StatusCode, Url, header}; use serde::Deserialize; use tokio::sync::Semaphore; - use tokio_util::sync::CancellationToken; use utils::backoff; use utils::id::{TenantId, TimelineId}; diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs index 95d3af1453..efb05fb55e 100644 --- a/storage_scrubber/src/find_large_objects.rs +++ b/storage_scrubber/src/find_large_objects.rs @@ -5,10 +5,9 @@ use pageserver::tenant::storage_layer::LayerName; use remote_storage::ListingMode; use serde::{Deserialize, Serialize}; -use crate::{ - checks::parse_layer_object_name, init_remote, metadata_stream::stream_tenants, - stream_objects_with_retries, BucketConfig, NodeKind, -}; +use crate::checks::parse_layer_object_name; +use crate::metadata_stream::stream_tenants; +use crate::{BucketConfig, NodeKind, init_remote, stream_objects_with_retries}; #[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] enum LargeObjectKind { diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index a4e5107e3d..e4f69a1669 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -3,11 +3,9 @@ //! Garbage means S3 objects which are either not referenced by any metadata, //! or are referenced by a control plane tenant/timeline in a deleted state. 
-use std::{ - collections::{HashMap, HashSet}, - sync::Arc, - time::Duration, -}; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::Duration; use anyhow::Context; use futures_util::TryStreamExt; @@ -16,13 +14,14 @@ use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePat use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; -use utils::{backoff, id::TenantId}; +use utils::backoff; +use utils::id::TenantId; +use crate::cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix}; use crate::{ - cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, + BucketConfig, ConsoleConfig, MAX_RETRIES, NodeKind, TenantShardTimelineId, TraversingDepth, init_remote, list_objects_with_retries, - metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix}, - BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES, }; #[derive(Serialize, Deserialize, Debug)] @@ -259,14 +258,21 @@ async fn find_garbage_inner( .await?; if let Some(object) = tenant_objects.keys.first() { if object.key.get_path().as_str().ends_with("heatmap-v1.json") { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + tracing::info!( + "Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)" + ); garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); continue; } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + tracing::info!( + "Tenant {tenant_shard_id} is missing in console and contains one object: {}", + object.key + ); } } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran"); + tracing::info!( + "Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran" + ); } } else { // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial @@ -295,9 +301,13 @@ async fn find_garbage_inner( } if any_non_initdb { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb"); + tracing::info!( + "Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb" + ); } else { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb"); + tracing::info!( + "Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb" + ); garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); continue; } @@ -546,7 +556,9 @@ pub async fn purge_garbage( .any(|g| matches!(g.entity, GarbageEntity::Timeline(_))) && garbage_list.active_timeline_count == 0 { - anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines"); + anyhow::bail!( + "Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines" + ); } let filtered_items = garbage_list diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index be526daaf0..34e43fcc0b 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -13,19 +13,18 @@ pub mod tenant_snapshot; use std::env; use 
std::fmt::Display; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, SystemTime}; use anyhow::Context; use aws_config::retry::{RetryConfigBuilder, RetryMode}; +use aws_sdk_s3::Client; use aws_sdk_s3::config::Region; use aws_sdk_s3::error::DisplayErrorContext; -use aws_sdk_s3::Client; - use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; use futures::{Stream, StreamExt}; -use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver::tenant::TENANTS_SEGMENT_NAME; +use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path}; use pageserver_api::shard::TenantShardId; use remote_storage::{ DownloadOpts, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, @@ -38,7 +37,8 @@ use tokio::io::AsyncReadExt; use tokio_util::sync::CancellationToken; use tracing::{error, warn}; use tracing_appender::non_blocking::WorkerGuard; -use tracing_subscriber::{fmt, prelude::*, EnvFilter}; +use tracing_subscriber::prelude::*; +use tracing_subscriber::{EnvFilter, fmt}; use utils::fs_ext; use utils::id::{TenantId, TenantTimelineId, TimelineId}; @@ -411,10 +411,10 @@ async fn init_remote( let default_prefix = default_prefix_in_bucket(node_kind).to_string(); match &mut storage_config.0.storage { - RemoteStorageKind::AwsS3(ref mut config) => { + RemoteStorageKind::AwsS3(config) => { config.prefix_in_bucket.get_or_insert(default_prefix); } - RemoteStorageKind::AzureContainer(ref mut config) => { + RemoteStorageKind::AzureContainer(config) => { config.prefix_in_container.get_or_insert(default_prefix); } RemoteStorageKind::LocalFs { .. } => (), @@ -509,10 +509,11 @@ async fn list_objects_with_retries( panic!("MAX_RETRIES is not allowed to be 0"); } +/// Returns content, last modified time async fn download_object_with_retries( remote_client: &GenericRemoteStorage, key: &RemotePath, -) -> anyhow::Result> { +) -> anyhow::Result<(Vec, SystemTime)> { let cancel = CancellationToken::new(); for trial in 0..MAX_RETRIES { let mut buf = Vec::new(); @@ -535,7 +536,7 @@ async fn download_object_with_retries( { Ok(bytes_read) => { tracing::debug!("Downloaded {bytes_read} bytes for object {key}"); - return Ok(buf); + return Ok((buf, download.last_modified)); } Err(e) => { error!("Failed to stream object body for key {key}: {e}"); diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index fa6ee90b66..fb2ab02565 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,24 +1,20 @@ -use anyhow::{anyhow, bail, Context}; +use anyhow::{Context, anyhow, bail}; use camino::Utf8PathBuf; +use clap::{Parser, Subcommand}; use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; use reqwest::{Method, Url}; use storage_controller_client::control_api; -use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use storage_scrubber::pageserver_physical_gc::GcMode; +use storage_scrubber::garbage::{PurgeMode, find_garbage, purge_garbage}; +use storage_scrubber::pageserver_physical_gc::{GcMode, pageserver_physical_gc}; use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata; -use storage_scrubber::scan_safekeeper_metadata::DatabaseOrList; +use storage_scrubber::scan_safekeeper_metadata::{DatabaseOrList, scan_safekeeper_metadata}; use storage_scrubber::tenant_snapshot::SnapshotDownloader; -use storage_scrubber::{find_large_objects, ControllerClientConfig}; use 
storage_scrubber::{ - init_logging, pageserver_physical_gc::pageserver_physical_gc, - scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, - TraversingDepth, + BucketConfig, ConsoleConfig, ControllerClientConfig, NodeKind, TraversingDepth, + find_large_objects, init_logging, }; - -use clap::{Parser, Subcommand}; use utils::id::TenantId; - use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); @@ -173,15 +169,23 @@ async fn main() -> anyhow::Result<()> { if let NodeKind::Safekeeper = node_kind { let db_or_list = match (timeline_lsns, dump_db_connstr) { (Some(timeline_lsns), _) => { - let timeline_lsns = serde_json::from_str(&timeline_lsns).context("parsing timeline_lsns")?; + let timeline_lsns = serde_json::from_str(&timeline_lsns) + .context("parsing timeline_lsns")?; DatabaseOrList::List(timeline_lsns) } (None, Some(dump_db_connstr)) => { - let dump_db_table = dump_db_table.ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?; + let dump_db_table = dump_db_table + .ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?; let tenant_ids = tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(); - DatabaseOrList::Database { tenant_ids, connstr: dump_db_connstr, table: dump_db_table } + DatabaseOrList::Database { + tenant_ids, + connstr: dump_db_connstr, + table: dump_db_table, + } } - (None, None) => anyhow::bail!("neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`"), + (None, None) => anyhow::bail!( + "neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`" + ), }; let summary = scan_safekeeper_metadata(bucket_config.clone(), db_or_list).await?; if json { @@ -371,7 +375,9 @@ pub async fn scan_pageserver_metadata_cmd( exit_code: bool, ) -> anyhow::Result<()> { if controller_client.is_none() && post_to_storcon { - return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); + return Err(anyhow!( + "Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run" + )); } match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids, verbose).await { Err(e) => { diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs index 47447d681c..af2407856d 100644 --- a/storage_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -1,17 +1,17 @@ use std::str::FromStr; -use anyhow::{anyhow, Context}; +use anyhow::{Context, anyhow}; use async_stream::{stream, try_stream}; use futures::StreamExt; +use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath}; use tokio_stream::Stream; +use utils::id::{TenantId, TimelineId}; use crate::{ - list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target, - TenantShardTimelineId, + RootTarget, S3Target, TenantShardTimelineId, list_objects_with_retries, + stream_objects_with_retries, }; -use pageserver_api::shard::TenantShardId; -use utils::id::{TenantId, TimelineId}; /// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes pub fn stream_tenants<'a>( diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 20cb9c3633..c956b1abbc 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ 
-2,18 +2,16 @@ use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::time::Duration; -use crate::checks::{ - list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult, -}; -use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES}; +use async_stream::try_stream; +use futures::future::Either; use futures_util::{StreamExt, TryStreamExt}; +use pageserver::tenant::IndexPart; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest; use pageserver::tenant::remote_timeline_client::{ parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, }; use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::IndexPart; use pageserver_api::controller_api::TenantDescribeResponse; use pageserver_api::shard::{ShardIndex, TenantShardId}; use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; @@ -21,11 +19,18 @@ use reqwest::Method; use serde::Serialize; use storage_controller_client::control_api; use tokio_util::sync::CancellationToken; -use tracing::{info_span, Instrument}; +use tracing::{Instrument, info_span}; use utils::backoff; use utils::generation::Generation; use utils::id::{TenantId, TenantTimelineId}; +use crate::checks::{ + BlobDataParseResult, ListTenantManifestResult, RemoteTenantManifestInfo, list_tenant_manifests, + list_timeline_blobs, +}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{BucketConfig, MAX_RETRIES, NodeKind, RootTarget, TenantShardTimelineId, init_remote}; + #[derive(Serialize, Default)] pub struct GcSummary { indices_deleted: usize, @@ -448,6 +453,8 @@ async fn gc_ancestor( index_part: _, index_part_generation: _, s3_layers, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } => s3_layers, BlobDataParseResult::Relic => { // Post-deletion tenant location: don't try and GC it. @@ -527,7 +534,7 @@ async fn gc_tenant_manifests( target: &RootTarget, mode: GcMode, tenant_shard_id: TenantShardId, -) -> anyhow::Result { +) -> anyhow::Result<(GcSummary, Option)> { let mut gc_summary = GcSummary::default(); match list_tenant_manifests(remote_client, tenant_shard_id, target).await? 
{ ListTenantManifestResult::WithErrors { @@ -537,33 +544,35 @@ async fn gc_tenant_manifests( for (_key, error) in errors { tracing::warn!(%tenant_shard_id, "list_tenant_manifests: {error}"); } + Ok((gc_summary, None)) } - ListTenantManifestResult::NoErrors(mut manifest_info) => { - let Some(latest_gen) = manifest_info.latest_generation else { - return Ok(gc_summary); + ListTenantManifestResult::NoErrors { + latest_generation, + mut manifests, + } => { + let Some(latest_generation) = latest_generation else { + return Ok((gc_summary, None)); }; - manifest_info - .manifests - .sort_by_key(|(generation, _obj)| *generation); + manifests.sort_by_key(|(generation, _obj)| *generation); // skip the two latest generations (they don't neccessarily have to be 1 apart from each other) - let candidates = manifest_info.manifests.iter().rev().skip(2); + let candidates = manifests.iter().rev().skip(2); for (_generation, key) in candidates { maybe_delete_tenant_manifest( remote_client, &min_age, - latest_gen, + latest_generation.generation, key, mode, &mut gc_summary, ) .instrument( - info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_gen, %key.key), + info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_generation.generation, %key.key), ) .await; } + Ok((gc_summary, Some(latest_generation))) } } - Ok(gc_summary) } async fn gc_timeline( @@ -572,7 +581,8 @@ async fn gc_timeline( target: &RootTarget, mode: GcMode, ttid: TenantShardTimelineId, - accumulator: &Arc>, + accumulator: &std::sync::Mutex, + tenant_manifest_info: Arc>, ) -> anyhow::Result { let mut summary = GcSummary::default(); let data = list_timeline_blobs(remote_client, ttid, target).await?; @@ -581,7 +591,9 @@ async fn gc_timeline( BlobDataParseResult::Parsed { index_part, index_part_generation, - s3_layers: _s3_layers, + s3_layers: _, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } => (index_part, *index_part_generation, data.unused_index_keys), BlobDataParseResult::Relic => { // Post-deletion tenant location: don't try and GC it. @@ -597,6 +609,60 @@ async fn gc_timeline( } }; + if let Some(tenant_manifest_info) = &*tenant_manifest_info { + // TODO: this is O(n^2) in the number of offloaded timelines. Do a hashmap lookup instead. + let maybe_offloaded = tenant_manifest_info + .manifest + .offloaded_timelines + .iter() + .find(|offloaded_timeline| offloaded_timeline.timeline_id == ttid.timeline_id); + if let Some(offloaded) = maybe_offloaded { + let warnings = validate_index_part_with_offloaded(index_part, offloaded); + let warn = if warnings.is_empty() { + false + } else { + // Verify that the manifest hasn't changed. If it has, a potential racing change could have been cause for our troubles. + match list_tenant_manifests(remote_client, ttid.tenant_shard_id, target).await? 
{ + ListTenantManifestResult::WithErrors { + errors, + unknown_keys: _, + } => { + for (_key, error) in errors { + tracing::warn!(%ttid, "list_tenant_manifests in gc_timeline: {error}"); + } + true + } + ListTenantManifestResult::NoErrors { + latest_generation, + manifests: _, + } => { + if let Some(new_latest_gen) = latest_generation { + let manifest_changed = ( + new_latest_gen.generation, + new_latest_gen.listing_object.last_modified, + ) == ( + tenant_manifest_info.generation, + tenant_manifest_info.listing_object.last_modified, + ); + if manifest_changed { + tracing::debug!(%ttid, "tenant manifest changed since it was loaded, suppressing {} warnings", warnings.len()); + } + manifest_changed + } else { + // The latest generation is gone. This timeline is in the process of being deleted? + false + } + } + } + }; + if warn { + for warning in warnings { + tracing::warn!(%ttid, "{}", warning); + } + } + } + } + + accumulator.lock().unwrap().update(ttid, index_part); for key in candidates { @@ -608,6 +674,35 @@ Ok(summary) } +fn validate_index_part_with_offloaded( + index_part: &IndexPart, + offloaded: &OffloadedTimelineManifest, +) -> Vec<String> { + let mut warnings = Vec::new(); + if let Some(archived_at_index_part) = index_part.archived_at { + if archived_at_index_part + .signed_duration_since(offloaded.archived_at) + .num_seconds() + != 0 + { + warnings.push(format!( + "index-part archived_at={} differs from manifest archived_at={}", + archived_at_index_part, offloaded.archived_at + )); + } + } else { + warnings.push("Timeline offloaded in manifest but not archived in index-part".to_string()); + } + if index_part.metadata.ancestor_timeline() != offloaded.ancestor_timeline_id { + warnings.push(format!( + "index-part ancestor={:?} differs from manifest ancestor={:?}", + index_part.metadata.ancestor_timeline(), + offloaded.ancestor_timeline_id + )); + } + warnings +} + /// Physical garbage collection: removing unused S3 objects. /// /// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level @@ -629,9 +724,9 @@ pub async fn pageserver_physical_gc( let remote_client = Arc::new(remote_client); let tenants = if tenant_shard_ids.is_empty() { - futures::future::Either::Left(stream_tenants(&remote_client, &target)) + Either::Left(stream_tenants(&remote_client, &target)) } else { - futures::future::Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok))) + Either::Right(futures::stream::iter(tenant_shard_ids.into_iter().map(Ok))) }; // How many tenants to process in parallel.
We need to be mindful of pageservers @@ -639,69 +734,74 @@ pub async fn pageserver_physical_gc( const CONCURRENCY: usize = 32; // Accumulate information about each tenant for cross-shard GC step we'll do at the end - let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); + let accumulator = std::sync::Mutex::new(TenantRefAccumulator::default()); + + // Accumulate information about how many manifests we have GCd + let manifest_gc_summary = std::sync::Mutex::new(GcSummary::default()); // Generate a stream of TenantTimelineId - enum GcSummaryOrContent { - Content(T), - GcSummary(GcSummary), - } let timelines = tenants.map_ok(|tenant_shard_id| { let target_ref = ⌖ let remote_client_ref = &remote_client; + let manifest_gc_summary_ref = &manifest_gc_summary; async move { - let summaries_from_manifests = match gc_tenant_manifests( + let gc_manifest_result = gc_tenant_manifests( remote_client_ref, min_age, target_ref, mode, tenant_shard_id, ) - .await - { - Ok(gc_summary) => vec![Ok(GcSummaryOrContent::::GcSummary( - gc_summary, - ))], + .await; + let (summary_from_manifest, tenant_manifest_opt) = match gc_manifest_result { + Ok((gc_summary, tenant_manifest)) => (gc_summary, tenant_manifest), Err(e) => { tracing::warn!(%tenant_shard_id, "Error in gc_tenant_manifests: {e}"); - Vec::new() + (GcSummary::default(), None) } }; - stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id) - .await - .map(|stream| { - stream - .map_ok(GcSummaryOrContent::Content) - .chain(futures::stream::iter(summaries_from_manifests.into_iter())) - }) + manifest_gc_summary_ref + .lock() + .unwrap() + .merge(summary_from_manifest); + let tenant_manifest_arc = Arc::new(tenant_manifest_opt); + let mut timelines = Box::pin( + stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id).await?, + ); + Ok(try_stream! 
{ + while let Some(ttid_res) = timelines.next().await { + let ttid = ttid_res?; + yield (ttid, tenant_manifest_arc.clone()); + } + }) } }); - let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); - let timelines = timelines.try_flatten(); let mut summary = GcSummary::default(); - - // Drain futures for per-shard GC, populating accumulator as a side effect { - let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid { - GcSummaryOrContent::Content(ttid) => futures::future::Either::Left(gc_timeline( + let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + let timelines = timelines.try_flatten(); + + let timelines = timelines.map_ok(|(ttid, tenant_manifest_arc)| { + gc_timeline( &remote_client, &min_age, &target, mode, ttid, &accumulator, - )), - GcSummaryOrContent::GcSummary(gc_summary) => { - futures::future::Either::Right(futures::future::ok(gc_summary)) - } + tenant_manifest_arc, + ) }); let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + // Drain futures for per-shard GC, populating accumulator as a side effect while let Some(i) = timelines.next().await { summary.merge(i?); } } + // Streams are lazily evaluated, so only now do we have access to the inner object + summary.merge(manifest_gc_summary.into_inner().unwrap()); // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC let Some(client) = controller_client else { @@ -709,8 +809,7 @@ pub async fn pageserver_physical_gc( return Ok(summary); }; - let (ancestor_shards, ancestor_refs) = Arc::into_inner(accumulator) - .unwrap() + let (ancestor_shards, ancestor_refs) = accumulator .into_inner() .unwrap() .into_gc_ancestors(client, &mut summary) diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index c8de6e46b3..ba75f25984 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -1,21 +1,22 @@ use std::collections::{HashMap, HashSet}; -use crate::checks::{ - branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, - RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis, -}; -use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver_api::controller_api::MetadataHealthUpdateRequest; use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use serde::Serialize; -use tracing::{info_span, Instrument}; +use tracing::{Instrument, info_span}; use utils::id::TenantId; use utils::shard::ShardCount; +use crate::checks::{ + BlobDataParseResult, RemoteTimelineBlobData, TenantObjectListing, TimelineAnalysis, + branch_cleanup_and_check_errors, list_timeline_blobs, +}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, init_remote}; + #[derive(Serialize, Default)] pub struct MetadataSummary { tenant_count: usize, @@ -47,6 +48,8 @@ impl MetadataSummary { index_part, index_part_generation: _, s3_layers: _, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } = &data.blob_data { *self @@ -195,7 +198,9 @@ pub async fn scan_pageserver_metadata( if let BlobDataParseResult::Parsed { index_part, index_part_generation, - s3_layers: _s3_layers, 
+ s3_layers: _, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } = &data.blob_data { if index_part.deleted_at.is_some() { @@ -318,9 +323,11 @@ pub async fn scan_pageserver_metadata( match &data.blob_data { BlobDataParseResult::Parsed { - index_part: _index_part, + index_part: _, index_part_generation: _index_part_generation, s3_layers, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } => { tenant_objects.push(ttid, s3_layers.clone()); } diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 0a4d4266a0..f10d758097 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -1,23 +1,24 @@ -use std::{collections::HashSet, str::FromStr, sync::Arc}; +use std::collections::HashSet; +use std::str::FromStr; +use std::sync::Arc; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use futures::stream::{StreamExt, TryStreamExt}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; -use postgres_ffi::{XLogFileName, PG_TLI}; +use postgres_ffi::{PG_TLI, XLogFileName}; use remote_storage::GenericRemoteStorage; use rustls::crypto::ring; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{debug, error, info}; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; +use crate::cloud_admin_api::CloudAdminApiClient; +use crate::metadata_stream::stream_listing; use crate::{ - cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, - BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, init_remote, }; /// Generally we should ask safekeepers, but so far we use everywhere default 16MB. 
diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 39e0b5c9b4..e17409c20e 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -1,25 +1,26 @@ use std::collections::HashMap; use std::sync::Arc; -use crate::checks::{list_timeline_blobs, BlobDataParseResult, RemoteTimelineBlobData}; -use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; -use crate::{ - download_object_to_file_s3, init_remote, init_remote_s3, BucketConfig, NodeKind, RootTarget, - TenantShardTimelineId, -}; use anyhow::Context; use async_stream::stream; use aws_sdk_s3::Client; use camino::Utf8PathBuf; use futures::{StreamExt, TryStreamExt}; +use pageserver::tenant::IndexPart; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::IndexPart; use pageserver_api::shard::TenantShardId; use remote_storage::{GenericRemoteStorage, S3Config}; use utils::generation::Generation; use utils::id::TenantId; +use crate::checks::{BlobDataParseResult, RemoteTimelineBlobData, list_timeline_blobs}; +use crate::metadata_stream::{stream_tenant_shards, stream_tenant_timelines}; +use crate::{ + BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, download_object_to_file_s3, + init_remote, init_remote_s3, +}; + pub struct SnapshotDownloader { s3_client: Arc, s3_root: RootTarget, @@ -268,6 +269,8 @@ impl SnapshotDownloader { index_part, index_part_generation, s3_layers: _, + index_part_last_modified_time: _, + index_part_snapshot_time: _, } => { self.download_timeline( ttid, diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 887bfef478..4b591d3316 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -8,10 +8,12 @@ pytest_plugins = ( "fixtures.compute_reconfigure", "fixtures.storage_controller_proxy", "fixtures.paths", + "fixtures.compute_migrations", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", "fixtures.compare_fixtures", "fixtures.slow", "fixtures.reruns", + "fixtures.fast_import", ) diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 6c22b31e00..c82c7578d1 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -208,6 +208,10 @@ class ShardIndex: shard_count=int(input[2:4], 16), ) + @property + def is_sharded(self) -> bool: + return self.shard_count != 0 + class TenantShardId: def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int): diff --git a/test_runner/fixtures/compute_migrations.py b/test_runner/fixtures/compute_migrations.py new file mode 100644 index 0000000000..ea99785af0 --- /dev/null +++ b/test_runner/fixtures/compute_migrations.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +import os +from typing import TYPE_CHECKING + +import pytest + +from fixtures.paths import BASE_DIR + +if TYPE_CHECKING: + from collections.abc import Iterator + from pathlib import Path + +COMPUTE_MIGRATIONS_DIR = BASE_DIR / "compute_tools" / "src" / "migrations" +COMPUTE_MIGRATIONS_TEST_DIR = COMPUTE_MIGRATIONS_DIR / "tests" + +COMPUTE_MIGRATIONS = sorted(next(os.walk(COMPUTE_MIGRATIONS_DIR))[2]) +NUM_COMPUTE_MIGRATIONS = len(COMPUTE_MIGRATIONS) + + +@pytest.fixture(scope="session") +def compute_migrations_dir() -> Iterator[Path]: + """ + Retrieve the path to the compute migrations directory. 
+ """ + yield COMPUTE_MIGRATIONS_DIR + + +@pytest.fixture(scope="session") +def compute_migrations_test_dir() -> Iterator[Path]: + """ + Retrieve the path to the compute migrations test directory. + """ + yield COMPUTE_MIGRATIONS_TEST_DIR diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 33f01f80fb..425abef935 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -69,7 +69,10 @@ def compute_reconfigure_listener(make_httpserver: HTTPServer): # This causes the endpoint to query storage controller for its location, which # is redundant since we already have it here, but this avoids extending the # neon_local CLI to take full lists of locations - reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] + fut = reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[misc] + + # To satisfy semantics of notify-attach API, we must wait for the change to be applied before returning 200 + fut.result() return Response(status=200) diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 1cd9158c68..cdc162fca2 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -9,49 +9,60 @@ from requests.adapters import HTTPAdapter class EndpointHttpClient(requests.Session): def __init__( self, - port: int, + external_port: int, + internal_port: int, ): super().__init__() - self.port = port + self.external_port: int = external_port + self.internal_port: int = internal_port self.mount("http://", HTTPAdapter()) def dbs_and_roles(self): - res = self.get(f"http://localhost:{self.port}/dbs_and_roles") + res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles") res.raise_for_status() return res.json() def database_schema(self, database: str): res = self.get( - f"http://localhost:{self.port}/database_schema?database={urllib.parse.quote(database, safe='')}" + f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}" ) res.raise_for_status() return res.text - def installed_extensions(self): - res = self.get(f"http://localhost:{self.port}/installed_extensions") - res.raise_for_status() - return res.json() - def extensions(self, extension: str, version: str, database: str): body = { "extension": extension, "version": version, "database": database, } - res = self.post(f"http://localhost:{self.port}/extensions", json=body) + res = self.post(f"http://localhost:{self.internal_port}/extensions", json=body) res.raise_for_status() return res.json() def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]): res = self.post( - f"http://localhost:{self.port}/grants", + f"http://localhost:{self.internal_port}/grants", json={"database": database, "schema": schema, "role": role, "privileges": privileges}, ) res.raise_for_status() return res.json() def metrics(self) -> str: - res = self.get(f"http://localhost:{self.port}/metrics") + res = self.get(f"http://localhost:{self.external_port}/metrics") res.raise_for_status() return res.text + + def configure_failpoints(self, *args: tuple[str, str]) -> None: + body: list[dict[str, str]] = [] + + for fp in args: + body.append( + { + "name": fp[0], + "action": fp[1], + } + ) + + res = self.post(f"http://localhost:{self.internal_port}/failpoints", json=body) + res.raise_for_status() diff --git a/test_runner/fixtures/fast_import.py 
b/test_runner/fixtures/fast_import.py new file mode 100644 index 0000000000..d674be99de --- /dev/null +++ b/test_runner/fixtures/fast_import.py @@ -0,0 +1,150 @@ +import os +import shutil +import subprocess +import tempfile +from collections.abc import Iterator +from pathlib import Path +from typing import cast + +import pytest +from _pytest.config import Config + +from fixtures.log_helper import log +from fixtures.neon_cli import AbstractNeonCli +from fixtures.pg_version import PgVersion + + +class FastImport(AbstractNeonCli): + COMMAND = "fast_import" + cmd: subprocess.CompletedProcess[str] | None = None + + def __init__( + self, + extra_env: dict[str, str] | None, + binpath: Path, + pg_distrib_dir: Path, + pg_version: PgVersion, + workdir: Path, + cleanup: bool = True, + ): + if extra_env is None: + env_vars = {} + else: + env_vars = extra_env.copy() + + if not (binpath / self.COMMAND).exists(): + raise Exception(f"{self.COMMAND} binary not found at '{binpath}'") + super().__init__(env_vars, binpath) + + pg_dir = pg_distrib_dir / pg_version.v_prefixed + self.pg_distrib_dir = pg_distrib_dir + self.pg_version = pg_version + self.pg_bin = pg_dir / "bin" + if not (self.pg_bin / "postgres").exists(): + raise Exception(f"postgres binary was not found at '{self.pg_bin}'") + self.pg_lib = pg_dir / "lib" + if env_vars.get("LD_LIBRARY_PATH") is not None: + self.pg_lib = Path(env_vars["LD_LIBRARY_PATH"]) + elif os.getenv("LD_LIBRARY_PATH") is not None: + self.pg_lib = Path(str(os.getenv("LD_LIBRARY_PATH"))) + if not workdir.exists(): + raise Exception(f"Working directory '{workdir}' does not exist") + self.workdir = workdir + self.cleanup = cleanup + + def run_pgdata( + self, + s3prefix: str | None = None, + pg_port: int | None = None, + source_connection_string: str | None = None, + interactive: bool = False, + ): + return self.run( + "pgdata", + s3prefix=s3prefix, + pg_port=pg_port, + source_connection_string=source_connection_string, + interactive=interactive, + ) + + def run_dump_restore( + self, + s3prefix: str | None = None, + source_connection_string: str | None = None, + destination_connection_string: str | None = None, + ): + return self.run( + "dump-restore", + s3prefix=s3prefix, + source_connection_string=source_connection_string, + destination_connection_string=destination_connection_string, + ) + + def run( + self, + command: str, + s3prefix: str | None = None, + pg_port: int | None = None, + source_connection_string: str | None = None, + destination_connection_string: str | None = None, + interactive: bool = False, + ) -> subprocess.CompletedProcess[str]: + if self.cmd is not None: + raise Exception("Command already executed") + args = [ + f"--pg-bin-dir={self.pg_bin}", + f"--pg-lib-dir={self.pg_lib}", + f"--working-directory={self.workdir}", + ] + if s3prefix is not None: + args.append(f"--s3-prefix={s3prefix}") + args.append(command) + if pg_port is not None: + args.append(f"--pg-port={pg_port}") + if source_connection_string is not None: + args.append(f"--source-connection-string={source_connection_string}") + if destination_connection_string is not None: + args.append(f"--destination-connection-string={destination_connection_string}") + if interactive: + args.append("--interactive") + + self.cmd = self.raw_cli(args) + return self.cmd + + def __enter__(self): + return self + + def __exit__(self, *args): + if self.workdir.exists() and self.cleanup: + shutil.rmtree(self.workdir) + + +@pytest.fixture(scope="function") +def fast_import( + pg_version: PgVersion, + test_output_dir: 
Path, + neon_binpath: Path, + pg_distrib_dir: Path, + pytestconfig: Config, +) -> Iterator[FastImport]: + workdir = Path(tempfile.mkdtemp(dir=test_output_dir, prefix="fast_import_")) + with FastImport( + None, + neon_binpath, + pg_distrib_dir, + pg_version, + workdir, + cleanup=not cast(bool, pytestconfig.getoption("--preserve-database-files")), + ) as fi: + yield fi + + if fi.cmd is None: + return + + # dump stdout & stderr into test log dir + with open(test_output_dir / "fast_import.stdout", "w") as f: + f.write(fi.cmd.stdout) + with open(test_output_dir / "fast_import.stderr", "w") as f: + f.write(fi.cmd.stderr) + + log.info("Written logs to %s", test_output_dir) diff --git a/test_runner/fixtures/httpserver.py b/test_runner/fixtures/httpserver.py index f653fd804c..1f46bb22b2 100644 --- a/test_runner/fixtures/httpserver.py +++ b/test_runner/fixtures/httpserver.py @@ -7,24 +7,25 @@ from pytest_httpserver import HTTPServer if TYPE_CHECKING: from collections.abc import Iterator + from ssl import SSLContext from fixtures.port_distributor import PortDistributor -# TODO: mypy fails with: -# Module "fixtures.neon_fixtures" does not explicitly export attribute "PortDistributor" [attr-defined] -# from fixtures.neon_fixtures import PortDistributor + ListenAddress = tuple[str, int] # compared to the fixtures from pytest_httpserver with same names, these are # always function scoped, so you can check and stop the server in tests. @pytest.fixture(scope="function") -def httpserver_ssl_context(): - return None +def httpserver_ssl_context() -> Iterator[SSLContext | None]: + yield None @pytest.fixture(scope="function") -def make_httpserver(httpserver_listen_address, httpserver_ssl_context) -> Iterator[HTTPServer]: +def make_httpserver( + httpserver_listen_address: ListenAddress, httpserver_ssl_context: SSLContext | None +) -> Iterator[HTTPServer]: host, port = httpserver_listen_address if not host: host = HTTPServer.DEFAULT_LISTEN_HOST @@ -47,6 +48,6 @@ def httpserver(make_httpserver: HTTPServer) -> Iterator[HTTPServer]: @pytest.fixture(scope="function") -def httpserver_listen_address(port_distributor: PortDistributor) -> tuple[str, int]: +def httpserver_listen_address(port_distributor: PortDistributor) -> ListenAddress: port = port_distributor.get_port() return ("localhost", port) diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index a591e088ef..83a1a87611 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -126,13 +126,8 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = ( "pageserver_page_cache_read_accesses_total", "pageserver_page_cache_size_current_bytes", "pageserver_page_cache_size_max_bytes", - "pageserver_getpage_reconstruct_seconds_bucket", - "pageserver_getpage_reconstruct_seconds_count", - "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_layers_visited_per_read_global"), - *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), *histogram("pageserver_io_operations_seconds"), @@ -163,6 +158,9 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] 
= ( "pageserver_pitr_history_size", "pageserver_layer_bytes", "pageserver_layer_count", + "pageserver_layers_per_read_bucket", + "pageserver_layers_per_read_count", + "pageserver_layers_per_read_sum", "pageserver_visible_physical_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", @@ -178,6 +176,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( counter("pageserver_timeline_wal_records_received"), counter("pageserver_page_service_pagestream_flush_in_progress_micros"), *histogram("pageserver_page_service_batch_size"), + *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index a85a191455..97a5a36814 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -478,7 +478,8 @@ class NeonLocalCli(AbstractNeonCli): self, branch_name: str, pg_port: int, - http_port: int, + external_http_port: int, + internal_http_port: int, tenant_id: TenantId, pg_version: PgVersion, endpoint_id: str | None = None, @@ -486,6 +487,7 @@ class NeonLocalCli(AbstractNeonCli): lsn: Lsn | None = None, pageserver_id: int | None = None, allow_multiple=False, + update_catalog: bool = False, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", @@ -501,8 +503,10 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--lsn", str(lsn)]) if pg_port is not None: args.extend(["--pg-port", str(pg_port)]) - if http_port is not None: - args.extend(["--http-port", str(http_port)]) + if external_http_port is not None: + args.extend(["--external-http-port", str(external_http_port)]) + if internal_http_port is not None: + args.extend(["--internal-http-port", str(internal_http_port)]) if endpoint_id is not None: args.append(endpoint_id) if hot_standby: @@ -511,6 +515,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--pageserver-id", str(pageserver_id)]) if allow_multiple: args.extend(["--allow-multiple"]) + if update_catalog: + args.extend(["--update-catalog"]) res = self.raw_cli(args) res.check_returncode() @@ -522,14 +528,16 @@ class NeonLocalCli(AbstractNeonCli): safekeepers: list[int] | None = None, remote_ext_config: str | None = None, pageserver_id: int | None = None, - allow_multiple=False, + allow_multiple: bool = False, + create_test_user: bool = False, basebackup_request_tries: int | None = None, + env: dict[str, str] | None = None, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", "start", ] - extra_env_vars = {} + extra_env_vars = env or {} if basebackup_request_tries is not None: extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) if remote_ext_config is not None: @@ -543,6 +551,8 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--pageserver-id", str(pageserver_id)]) if allow_multiple: args.extend(["--allow-multiple"]) + if create_test_user: + args.extend(["--create-test-user"]) res = self.raw_cli(args, extra_env_vars) res.check_returncode() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8354432c0c..1d282971b1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3,6 +3,7 @@ from __future__ import annotations import abc import asyncio import concurrent.futures +import dataclasses 
import filecmp import json import os @@ -26,6 +27,7 @@ from urllib.parse import quote, urlparse import asyncpg import backoff +import boto3 import httpx import psycopg2 import psycopg2.sql @@ -36,6 +38,8 @@ from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from jwcrypto import jwk +from mypy_boto3_kms import KMSClient +from mypy_boto3_s3 import S3Client # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -54,6 +58,7 @@ from fixtures.common_types import ( TimelineArchivalState, TimelineId, ) +from fixtures.compute_migrations import NUM_COMPUTE_MIGRATIONS from fixtures.endpoint.http import EndpointHttpClient from fixtures.h2server import H2Server from fixtures.log_helper import log @@ -91,7 +96,7 @@ from fixtures.utils import ( ATTACHMENT_NAME_REGEX, COMPONENT_BINARIES, USE_LFC, - allure_add_grafana_links, + allure_add_grafana_link, assert_no_errors, get_dir_size, print_gc_result, @@ -134,6 +139,9 @@ DEFAULT_BRANCH_NAME: str = "main" BASE_PORT: int = 15000 +# By default we create pageservers with this phony AZ +DEFAULT_AZ_ID: str = "us-east-2a" + @pytest.fixture(scope="session") def neon_api_key() -> str: @@ -194,6 +202,30 @@ def mock_s3_server(port_distributor: PortDistributor) -> Iterator[MockS3Server]: mock_s3_server.kill() +@pytest.fixture(scope="session") +def mock_kms(mock_s3_server: MockS3Server) -> Iterator[KMSClient]: + yield boto3.client( + "kms", + endpoint_url=mock_s3_server.endpoint(), + region_name=mock_s3_server.region(), + aws_access_key_id=mock_s3_server.access_key(), + aws_secret_access_key=mock_s3_server.secret_key(), + aws_session_token=mock_s3_server.session_token(), + ) + + +@pytest.fixture(scope="session") +def mock_s3_client(mock_s3_server: MockS3Server) -> Iterator[S3Client]: + yield boto3.client( + "s3", + endpoint_url=mock_s3_server.endpoint(), + region_name=mock_s3_server.region(), + aws_access_key_id=mock_s3_server.access_key(), + aws_secret_access_key=mock_s3_server.secret_key(), + aws_session_token=mock_s3_server.session_token(), + ) + + class PgProtocol: """Reusable connection logic""" @@ -309,6 +341,10 @@ class PgProtocol: """ return self.safe_psql(query, log_query=log_query)[0][0] + def show_timeline_id(self) -> TimelineId: + """SHOW neon.timeline_id""" + return TimelineId(cast("str", self.safe_psql("show neon.timeline_id")[0][0])) + class PageserverWalReceiverProtocol(StrEnum): VANILLA = "vanilla" @@ -366,6 +402,7 @@ class NeonEnvBuilder: pageserver_config_override: str | Callable[[dict[str, Any]], None] | None = None, num_safekeepers: int = 1, num_pageservers: int = 1, + num_azs: int = 1, # Use non-standard SK ids to check for various parsing bugs safekeepers_id_start: int = 0, # fsync is disabled by default to make the tests go faster @@ -382,6 +419,7 @@ class NeonEnvBuilder: storage_controller_port_override: int | None = None, pageserver_virtual_file_io_mode: str | None = None, pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None, + pageserver_get_vectored_concurrent_io: str | None = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -397,6 +435,7 @@ class NeonEnvBuilder: self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers self.num_pageservers = num_pageservers + self.num_azs = num_azs self.safekeepers_id_start = safekeepers_id_start self.safekeepers_enable_fsync = safekeepers_enable_fsync self.auth_enabled = auth_enabled @@ -420,6 +459,9 @@ class 
NeonEnvBuilder: self.storage_controller_config: dict[Any, Any] | None = None self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine + self.pageserver_get_vectored_concurrent_io: str | None = ( + pageserver_get_vectored_concurrent_io + ) self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = ( pageserver_default_tenant_config_compaction_algorithm @@ -435,7 +477,10 @@ class NeonEnvBuilder: self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode - self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol + if pageserver_wal_receiver_protocol is not None: + self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol + else: + self.pageserver_wal_receiver_protocol = PageserverWalReceiverProtocol.INTERPRETED assert test_name.startswith( "test_" @@ -443,8 +488,10 @@ class NeonEnvBuilder: self.test_name = test_name self.compatibility_neon_binpath = compatibility_neon_binpath self.compatibility_pg_distrib_dir = compatibility_pg_distrib_dir + self.test_may_use_compatibility_snapshot_binaries = False self.version_combination = combination self.mixdir = self.test_output_dir / "mixdir_neon" + if self.version_combination is not None: assert ( self.compatibility_neon_binpath is not None @@ -454,6 +501,7 @@ class NeonEnvBuilder: ), "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required when using mixed versions" self.mixdir.mkdir(mode=0o755, exist_ok=True) self._mix_versions() + self.test_may_use_compatibility_snapshot_binaries = True def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEnv: # Cannot create more than one environment from one builder @@ -655,6 +703,11 @@ class NeonEnvBuilder: def _mix_versions(self): assert self.version_combination is not None, "version combination must be set" + + # Always use a newer version of `neon_local` + (self.mixdir / "neon_local").hardlink_to(self.neon_binpath / "neon_local") + self.neon_local_binpath = self.mixdir + for component, paths in COMPONENT_BINARIES.items(): directory = ( self.neon_binpath @@ -663,10 +716,11 @@ class NeonEnvBuilder: ) for filename in paths: destination = self.mixdir / filename - destination.symlink_to(directory / filename) + destination.hardlink_to(directory / filename) + self.neon_binpath = self.mixdir + if self.version_combination["compute"] == "old": self.pg_distrib_dir = self.compatibility_pg_distrib_dir - self.neon_binpath = self.mixdir def overlay_mount(self, ident: str, srcdir: Path, dstdir: Path): """ @@ -983,6 +1037,7 @@ class NeonEnv: self.endpoints = EndpointFactory(self) self.safekeepers: list[Safekeeper] = [] self.pageservers: list[NeonPageserver] = [] + self.num_azs = config.num_azs self.broker = NeonBroker(self) self.pageserver_remote_storage = config.pageserver_remote_storage self.safekeepers_remote_storage = config.safekeepers_remote_storage @@ -1052,6 +1107,7 @@ class NeonEnv: self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol + self.pageserver_get_vectored_concurrent_io = config.pageserver_get_vectored_concurrent_io # Create the neon_local's `NeonLocalInitConf` cfg: dict[str, Any] = { @@ -1083,14 +1139,21 @@ class NeonEnv: http=self.port_distributor.get_port(), ) + # Availability zones may also be configured manually with `NeonEnvBuilder.pageserver_config_override` + if self.num_azs > 1: +
# Round-robin assignment of AZ names like us-east-2a, us-east-2b, etc. + az_prefix = DEFAULT_AZ_ID[:-1] + availability_zone = f"{az_prefix}{chr(ord('a') + (ps_id - 1) % self.num_azs)}" + else: + availability_zone = DEFAULT_AZ_ID + ps_cfg: dict[str, Any] = { "id": ps_id, "listen_pg_addr": f"localhost:{pageserver_port.pg}", "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, - # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override` - "availability_zone": "us-east-2a", + "availability_zone": availability_zone, # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, @@ -1098,12 +1161,24 @@ class NeonEnv: # Batching (https://github.com/neondatabase/neon/issues/9377): # enable batching by default in tests and benchmarks. - # Compat tests are exempt because old versions fail to parse the new config. - if not config.compatibility_neon_binpath: - ps_cfg["page_service_pipelining"] = { - "mode": "pipelined", - "execution": "concurrent-futures", - "max_batch_size": 32, + ps_cfg["page_service_pipelining"] = { + "mode": "pipelined", + "execution": "concurrent-futures", + "max_batch_size": 32, + } + + if config.test_may_use_compatibility_snapshot_binaries: + log.info( + "Skipping WAL contiguity validation to avoid forward-compatibility related test failures" + ) + else: + # Look for gaps in WAL received from safekeepers + ps_cfg["validate_wal_contiguity"] = True + + get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io + if get_vectored_concurrent_io is not None: + ps_cfg["get_vectored_concurrent_io"] = { + "mode": self.pageserver_get_vectored_concurrent_io, } if self.pageserver_virtual_file_io_engine is not None: @@ -1440,6 +1515,7 @@ def neon_simple_env( pageserver_virtual_file_io_engine: str, pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, pageserver_virtual_file_io_mode: str | None, + pageserver_get_vectored_concurrent_io: str | None, ) -> Iterator[NeonEnv]: """ Simple Neon environment, with 1 safekeeper and 1 pageserver. No authentication, no fsync. @@ -1472,6 +1548,7 @@ def neon_simple_env( pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, + pageserver_get_vectored_concurrent_io=pageserver_get_vectored_concurrent_io, combination=combination, ) as builder: env = builder.init_start() @@ -1498,6 +1575,7 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None, record_property: Callable[[str, object], None], pageserver_virtual_file_io_mode: str | None, + pageserver_get_vectored_concurrent_io: str | None, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
@@ -1540,6 +1618,7 @@ def neon_env_builder( test_overlay_dir=test_overlay_dir, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, + pageserver_get_vectored_concurrent_io=pageserver_get_vectored_concurrent_io, ) as builder: yield builder # Propogate `preserve_database_files` to make it possible to use in other fixtures, @@ -1551,6 +1630,7 @@ def neon_env_builder( class PageserverPort: pg: int http: int + https: int | None = None class LogUtils: @@ -1631,6 +1711,12 @@ class StorageControllerLeadershipStatus(StrEnum): CANDIDATE = "candidate" +@dataclass +class StorageControllerMigrationConfig: + secondary_warmup_timeout: str | None + secondary_download_request_timeout: str | None + + class NeonStorageController(MetricsGetter, LogUtils): def __init__(self, env: NeonEnv, port: int, auth_enabled: bool): self.env = env @@ -1801,6 +1887,7 @@ class NeonStorageController(MetricsGetter, LogUtils): "node_id": int(node.id), "listen_http_addr": "localhost", "listen_http_port": node.service_port.http, + "listen_https_port": node.service_port.https, "listen_pg_addr": "localhost", "listen_pg_port": node.service_port.pg, "availability_zone_id": node.az_id, @@ -1877,7 +1964,10 @@ class NeonStorageController(MetricsGetter, LogUtils): ) return response.json() - def tenant_list(self): + def tenant_shard_dump(self): + """ + Debug listing API: dumps the internal map of tenant shards + """ response = self.request( "GET", f"{self.api}/debug/v1/tenant", @@ -1885,6 +1975,18 @@ class NeonStorageController(MetricsGetter, LogUtils): ) return response.json() + def tenant_list(self, **kwargs): + """ + Control API tenant listing: a vector of the same content returned by tenant_describe + """ + response = self.request( + "GET", + f"{self.api}/control/v1/tenant", + headers=self.headers(TokenScope.ADMIN), + params=kwargs, + ) + return response.json() + def node_configure(self, node_id, body: dict[str, Any]): log.info(f"node_configure({node_id}, {body})") body["node_id"] = node_id @@ -2009,11 +2111,20 @@ class NeonStorageController(MetricsGetter, LogUtils): shards: list[TenantShardId] = body["new_shards"] return shards - def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int): + def tenant_shard_migrate( + self, + tenant_shard_id: TenantShardId, + dest_ps_id: int, + config: StorageControllerMigrationConfig | None = None, + ): + payload = {"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id} + if config is not None: + payload["migration_config"] = dataclasses.asdict(config) + self.request( "PUT", f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate", - json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id}, + json=payload, headers=self.headers(TokenScope.ADMIN), ) log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") @@ -2231,7 +2342,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ Get the intent and observed placements of all tenants known to the storage controller. 
""" - tenants = self.tenant_list() + tenants = self.tenant_shard_dump() tenant_placement: defaultdict[str, dict[str, Any]] = defaultdict( lambda: { @@ -2314,6 +2425,14 @@ class NeonStorageController(MetricsGetter, LogUtils): json=body, ) + def safekeeper_scheduling_policy(self, id: int, scheduling_policy: str): + self.request( + "POST", + f"{self.api}/control/v1/safekeeper/{id}/scheduling_policy", + headers=self.headers(TokenScope.ADMIN), + json={"id": id, "scheduling_policy": scheduling_policy}, + ) + def get_safekeeper(self, id: int) -> dict[str, Any] | None: try: response = self.request( @@ -2329,6 +2448,16 @@ class NeonStorageController(MetricsGetter, LogUtils): return None raise e + def get_safekeepers(self) -> list[dict[str, Any]]: + response = self.request( + "GET", + f"{self.api}/control/v1/safekeeper", + headers=self.headers(TokenScope.ADMIN), + ) + json = response.json() + assert isinstance(json, list) + return json + def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]: response = self.request( "PUT", @@ -2340,6 +2469,14 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return [TenantShardId.parse(tid) for tid in response.json()["updated"]] + def download_heatmap_layers(self, tenant_shard_id: TenantShardId, timeline_id: TimelineId): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + def __enter__(self) -> Self: return self @@ -2504,6 +2641,7 @@ class NeonPageserver(PgProtocol, LogUtils): self, extra_env_vars: dict[str, str] | None = None, timeout_in_seconds: int | None = None, + await_active: bool = True, ) -> Self: """ Start the page server. 
@@ -2530,8 +2668,10 @@ class NeonPageserver(PgProtocol, LogUtils): ) self.running = True - if self.env.storage_controller.running and self.env.storage_controller.node_registered( - self.id + if ( + await_active + and self.env.storage_controller.running + and self.env.storage_controller.node_registered(self.id) ): self.env.storage_controller.poll_node_status( self.id, PageserverAvailability.ACTIVE, None, max_attempts=200, backoff=0.1 @@ -2686,6 +2826,11 @@ class NeonPageserver(PgProtocol, LogUtils): log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}") raise + def heatmap_content(self, tenant_shard_id: TenantId | TenantShardId) -> Any: + path = self.tenant_dir(tenant_shard_id) / "heatmap-v1.json" + with open(path) as f: + return json.load(f) + def tenant_create( self, tenant_id: TenantId, @@ -3112,7 +3257,7 @@ def remote_pg( end_ms = int(datetime.utcnow().timestamp() * 1000) if is_neon: # Add 10s margin to the start and end times - allure_add_grafana_links( + allure_add_grafana_link( host, timeline_id, start_ms - 10_000, @@ -3206,10 +3351,9 @@ class NeonProxy(PgProtocol): # Link auth backend params *["--auth-backend", "link"], *["--uri", NeonProxy.link_auth_uri], - *["--allow-self-signed-compute", "true"], ] - class Console(AuthBackend): + class ProxyV1(AuthBackend): def __init__(self, endpoint: str, fixed_rate_limit: int | None = None): self.endpoint = endpoint self.fixed_rate_limit = fixed_rate_limit @@ -3217,7 +3361,7 @@ class NeonProxy(PgProtocol): def extra_args(self) -> list[str]: args = [ # Console auth backend params - *["--auth-backend", "console"], + *["--auth-backend", "cplane-v1"], *["--auth-endpoint", self.endpoint], *["--sql-over-http-pool-opt-in", "false"], ] @@ -3261,7 +3405,7 @@ class NeonProxy(PgProtocol): metric_collection_interval: str | None = None, ): host = "127.0.0.1" - domain = "proxy.localtest.me" # resolves to 127.0.0.1 + domain = "proxy.local.neon.build" # resolves to 127.0.0.1 super().__init__(dsn=auth_backend.default_conn_url, host=domain, port=proxy_port) self.domain = domain @@ -3284,7 +3428,7 @@ class NeonProxy(PgProtocol): # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("*.localtest.me", key_path, crt_path) + generate_proxy_tls_certs("*.local.neon.build", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3465,13 +3609,13 @@ class NeonProxy(PgProtocol): class NeonAuthBroker: - class ControlPlane: + class ProxyV1: def __init__(self, endpoint: str): self.endpoint = endpoint def extra_args(self) -> list[str]: args = [ - *["--auth-backend", "console"], + *["--auth-backend", "cplane-v1"], *["--auth-endpoint", self.endpoint], ] return args @@ -3483,9 +3627,9 @@ class NeonAuthBroker: http_port: int, mgmt_port: int, external_http_port: int, - auth_backend: NeonAuthBroker.ControlPlane, + auth_backend: NeonAuthBroker.ProxyV1, ): - self.domain = "apiauth.localtest.me" # resolves to 127.0.0.1 + self.domain = "apiauth.local.neon.build" # resolves to 127.0.0.1 self.host = "127.0.0.1" self.http_port = http_port self.external_http_port = external_http_port @@ -3502,7 +3646,7 @@ class NeonAuthBroker: # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("apiauth.localtest.me", key_path, crt_path) + generate_proxy_tls_certs("apiauth.local.neon.build", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3669,7 
+3813,7 @@ def static_auth_broker( local_proxy_addr = f"{http2_echoserver.host}:{http2_echoserver.port}" # return local_proxy addr on ProxyWakeCompute. - httpserver.expect_request("/cplane/proxy_wake_compute").respond_with_json( + httpserver.expect_request("/cplane/wake_compute").respond_with_json( { "address": local_proxy_addr, "aux": { @@ -3709,7 +3853,7 @@ def static_auth_broker( http_port=http_port, mgmt_port=mgmt_port, external_http_port=external_http_port, - auth_backend=NeonAuthBroker.ControlPlane(httpserver.url_for("/cplane")), + auth_backend=NeonAuthBroker.ProxyV1(httpserver.url_for("/cplane")), ) as proxy: proxy.start() yield proxy @@ -3723,7 +3867,8 @@ class Endpoint(PgProtocol, LogUtils): env: NeonEnv, tenant_id: TenantId, pg_port: int, - http_port: int, + external_http_port: int, + internal_http_port: int, check_stop_result: bool = True, ): super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") @@ -3733,7 +3878,8 @@ class Endpoint(PgProtocol, LogUtils): self.pgdata_dir: Path | None = None # Path to computenode PGDATA self.tenant_id = tenant_id self.pg_port = pg_port - self.http_port = http_port + self.external_http_port = external_http_port + self.internal_http_port = internal_http_port self.check_stop_result = check_stop_result # passed to endpoint create and endpoint reconfigure self.active_safekeepers: list[int] = list(map(lambda sk: sk.id, env.safekeepers)) @@ -3750,7 +3896,8 @@ class Endpoint(PgProtocol, LogUtils): self, auth_token: str | None = None, retries: Retry | None = None ) -> EndpointHttpClient: return EndpointHttpClient( - port=self.http_port, + external_port=self.external_http_port, + internal_port=self.internal_http_port, ) def create( @@ -3762,6 +3909,7 @@ class Endpoint(PgProtocol, LogUtils): config_lines: list[str] | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, + update_catalog: bool = False, ) -> Self: """ Create a new Postgres endpoint. @@ -3782,10 +3930,12 @@ class Endpoint(PgProtocol, LogUtils): lsn=lsn, hot_standby=hot_standby, pg_port=self.pg_port, - http_port=self.http_port, + external_http_port=self.external_http_port, + internal_http_port=self.internal_http_port, pg_version=self.env.pg_version, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + update_catalog=update_catalog, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = self.env.repo_dir / path @@ -3839,7 +3989,9 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id: int | None = None, safekeepers: list[int] | None = None, allow_multiple: bool = False, + create_test_user: bool = False, basebackup_request_tries: int | None = None, + env: dict[str, str] | None = None, ) -> Self: """ Start the Postgres instance. @@ -3859,7 +4011,9 @@ class Endpoint(PgProtocol, LogUtils): remote_ext_config=remote_ext_config, pageserver_id=pageserver_id, allow_multiple=allow_multiple, + create_test_user=create_test_user, basebackup_request_tries=basebackup_request_tries, + env=env, ) self._running.release(1) self.log_config_value("shared_buffers") @@ -3973,14 +4127,17 @@ class Endpoint(PgProtocol, LogUtils): log.info("Updating compute spec to: %s", json.dumps(data_dict, indent=4)) json.dump(data_dict, file, indent=4) - # Please note: Migrations only run if pg_skip_catalog_updates is false - def wait_for_migrations(self, num_migrations: int = 11): + def wait_for_migrations(self, wait_for: int = NUM_COMPUTE_MIGRATIONS) -> None: + """ + Wait for all compute migrations to be ran. 
Remember that migrations only + run if "pg_skip_catalog_updates" is set in the compute spec to false. + """ with self.cursor() as cur: def check_migrations_done(): cur.execute("SELECT id FROM neon_migration.migration_id") migration_id: int = cur.fetchall()[0][0] - assert migration_id >= num_migrations + assert migration_id >= wait_for wait_until(check_migrations_done) @@ -4096,7 +4253,7 @@ class Endpoint(PgProtocol, LogUtils): # Checkpoints running endpoint and returns pg_wal size in MB. def get_pg_wal_size(self): - log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}') + log.info(f"checkpointing at LSN {self.safe_psql('select pg_current_wal_lsn()')[0][0]}") self.safe_psql("checkpoint") assert self.pgdata_dir is not None # please mypy return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024 @@ -4167,7 +4324,8 @@ class EndpointFactory: self.env, tenant_id=tenant_id or self.env.initial_tenant, pg_port=self.env.port_distributor.get_port(), - http_port=self.env.port_distributor.get_port(), + external_http_port=self.env.port_distributor.get_port(), + internal_http_port=self.env.port_distributor.get_port(), ) self.num_instances += 1 self.endpoints.append(ep) @@ -4192,12 +4350,14 @@ class EndpointFactory: hot_standby: bool = False, config_lines: list[str] | None = None, pageserver_id: int | None = None, + update_catalog: bool = False, ) -> Endpoint: ep = Endpoint( self.env, tenant_id=tenant_id or self.env.initial_tenant, pg_port=self.env.port_distributor.get_port(), - http_port=self.env.port_distributor.get_port(), + external_http_port=self.env.port_distributor.get_port(), + internal_http_port=self.env.port_distributor.get_port(), ) endpoint_id = endpoint_id or self.env.generate_endpoint_id() @@ -4212,6 +4372,7 @@ class EndpointFactory: hot_standby=hot_standby, config_lines=config_lines, pageserver_id=pageserver_id, + update_catalog=update_catalog, ) def stop_all(self, fail_on_error=True) -> Self: @@ -4304,6 +4465,7 @@ class Safekeeper(LogUtils): "1s", "--eviction-min-resident", "10s", + "--wal-reader-fanout", ] self.extra_opts = extra_opts @@ -4585,7 +4747,8 @@ class StorageScrubber: ] args = base_args + args - log.info(f"Invoking scrubber command {args} with env: {env}") + log.info(f"Invoking scrubber command {args}") + (output_path, stdout, status_code) = subprocess_capture( self.log_dir, args, @@ -4869,8 +5032,13 @@ def check_restored_datadir_content( restored_files = list_files_to_compare(restored_dir_path) + # pg_notify files are always ignored + pgdata_files = [f for f in pgdata_files if not f.startswith("pg_notify")] + restored_files = [f for f in restored_files if not f.startswith("pg_notify")] + + # pg_xact and pg_multixact files are optional in basebackup: depending on our configuration they + # may be omitted and loaded on demand. 
if pgdata_files != restored_files: - # filter pg_xact and multixact files which are downloaded on demand pgdata_files = [ f for f in pgdata_files @@ -4908,20 +5076,59 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def logical_replication_sync(subscriber: PgProtocol, publisher: PgProtocol) -> Lsn: +# wait for subscriber to catch up with publisher +def logical_replication_sync( + subscriber: PgProtocol, + publisher: PgProtocol, + # pass subname explicitly to avoid confusion + # when multiple subscriptions are present + subname: str, + sub_dbname: str | None = None, + pub_dbname: str | None = None, +): """Wait logical replication subscriber to sync with publisher.""" - publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - while True: - res = subscriber.safe_psql("select latest_end_lsn from pg_catalog.pg_stat_subscription")[0][ - 0 - ] - if res: - log.info(f"subscriber_lsn={res}") - subscriber_lsn = Lsn(res) - log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={ publisher_lsn}") - if subscriber_lsn >= publisher_lsn: - return subscriber_lsn - time.sleep(0.5) + + def initial_sync(): + # first check if the subscription is active `s`=`synchronized`, `r` = `ready` + query = f"""SELECT 1 FROM pg_subscription_rel join pg_catalog.pg_subscription + on pg_subscription_rel.srsubid = pg_subscription.oid + WHERE srsubstate NOT IN ('r', 's') and subname='{subname}'""" + + if sub_dbname is not None: + res = subscriber.safe_psql(query, dbname=sub_dbname) + else: + res = subscriber.safe_psql(query) + + assert (res is None) or (len(res) == 0) + + wait_until(initial_sync) + + # wait for the subscription to catch up with current state of publisher + # caller is responsible to call checkpoint before calling this function + if pub_dbname is not None: + publisher_lsn = Lsn( + publisher.safe_psql("SELECT pg_current_wal_flush_lsn()", dbname=pub_dbname)[0][0] + ) + else: + publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + + def subscriber_catch_up(): + query = f"select latest_end_lsn from pg_catalog.pg_stat_subscription where latest_end_lsn is NOT NULL and subname='{subname}'" + + if sub_dbname is not None: + res = subscriber.safe_psql(query, dbname=sub_dbname) + else: + res = subscriber.safe_psql(query) + + assert res is not None + + res_lsn = res[0][0] + log.info(f"subscriber_lsn={res_lsn}") + subscriber_lsn = Lsn(res_lsn) + log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}") + assert subscriber_lsn >= publisher_lsn + + wait_until(subscriber_catch_up) def tenant_get_shards( @@ -4990,12 +5197,14 @@ def wait_for_last_flush_lsn( timeline: TimelineId, pageserver_id: int | None = None, auth_token: str | None = None, + last_flush_lsn: Lsn | None = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" shards = tenant_get_shards(env, tenant, pageserver_id) - last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + if last_flush_lsn is None: + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) results = [] for tenant_shard_id, pageserver in shards: diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 5059039678..748ac0d569 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -99,8 +99,11 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( 
".*WARN.*path=/v1/utilization .*request was dropped before completing", # Can happen during shutdown ".*scheduling deletion on drop failed: queue is in state Stopped.*", - # Too many frozen layers error is normal during intensive benchmarks - ".*too many frozen layers.*", + # L0 flush backpressure delays are expected under heavy ingest load. We want to exercise + # this backpressure in tests. + ".*delaying layer flush by \\S+ for compaction backpressure.*", + ".*stalling layer flushes for compaction backpressure.*", + ".*layer roll waiting for flush due to compaction backpressure.*", ) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 0832eac22f..364aff325d 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -15,7 +15,6 @@ from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from fixtures.common_types import ( - Id, Lsn, TenantId, TenantShardId, @@ -25,7 +24,7 @@ from fixtures.common_types import ( from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion -from fixtures.utils import Fn +from fixtures.utils import EnhancedJSONEncoder, Fn class PageserverApiException(Exception): @@ -83,14 +82,6 @@ class TimelineCreateRequest: mode: TimelineCreateRequestMode def to_json(self) -> str: - class EnhancedJSONEncoder(json.JSONEncoder): - def default(self, o): - if dataclasses.is_dataclass(o) and not isinstance(o, type): - return dataclasses.asdict(o) - elif isinstance(o, Id): - return o.id.hex() - return super().default(o) - # mode is flattened this = dataclasses.asdict(self) mode = this.pop("mode") @@ -488,7 +479,20 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) - def patch_tenant_config_client_side( + def patch_tenant_config(self, tenant_id: TenantId | TenantShardId, updates: dict[str, Any]): + """ + Only use this via storage_controller.pageserver_api(). + + See `set_tenant_config` for more information. + """ + assert "tenant_id" not in updates.keys() + res = self.patch( + f"http://localhost:{self.port}/v1/tenant/config", + json={**updates, "tenant_id": str(tenant_id)}, + ) + self.verbose_error(res) + + def update_tenant_config( self, tenant_id: TenantId, inserts: dict[str, Any] | None = None, @@ -499,13 +503,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter): See `set_tenant_config` for more information. 
""" - current = self.tenant_config(tenant_id).tenant_specific_overrides - if inserts is not None: - current.update(inserts) - if removes is not None: - for key in removes: - del current[key] - self.set_tenant_config(tenant_id, current) + if inserts is None: + inserts = {} + if removes is None: + removes = [] + + patch = inserts | {remove: None for remove in removes} + self.patch_tenant_config(tenant_id, patch) def tenant_size(self, tenant_id: TenantId | TenantShardId) -> int: return self.tenant_size_and_modelinputs(tenant_id)[0] @@ -725,6 +729,18 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res_json = res.json() assert res_json is None + def timeline_compact_info( + self, + tenant_id: TenantId | TenantShardId, + timeline_id: TimelineId, + ) -> Any: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_compact( self, tenant_id: TenantId | TenantShardId, @@ -736,7 +752,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter): enhanced_gc_bottom_most_compaction=False, body: dict[str, Any] | None = None, ): - self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index f57c0f801f..c33342c89e 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -44,6 +44,11 @@ def pageserver_virtual_file_io_mode() -> str | None: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") +@pytest.fixture(scope="function", autouse=True) +def pageserver_get_vectored_concurrent_io() -> str | None: + return os.getenv("PAGESERVER_GET_VECTORED_CONCURRENT_IO") + + def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") if toml_table is None: @@ -116,6 +121,11 @@ def pytest_runtest_makereport(*args, **kwargs): }.get(os.uname().machine, "UNKNOWN") arch = os.getenv("RUNNER_ARCH", uname_m) allure.dynamic.parameter("__arch", arch) - allure.dynamic.parameter("__lfc", os.getenv("USE_LFC") != "false") + allure.dynamic.parameter( + "__lfc", "with-lfc" if os.getenv("USE_LFC") != "false" else "without-lfc" + ) + allure.dynamic.parameter( + "__sanitizers", "enabled" if os.getenv("SANITIZERS") == "enabled" else "disabled" + ) yield diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index 80777d65e9..fc4fb3629b 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -21,8 +21,8 @@ if TYPE_CHECKING: BASE_DIR = Path(__file__).parents[2] -COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc" DEFAULT_OUTPUT_DIR: str = "test_output" +COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc" def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | None = None) -> Path: diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 4e1e8a884f..4df2b2df2b 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -70,6 +70,9 @@ class MockS3Server: def secret_key(self) -> str: return "test" + def session_token(self) -> str: + return "test" + def kill(self): self.server.stop() @@ -161,6 +164,7 @@ class S3Storage: bucket_region: str access_key: str | None secret_key: str | None + session_token: str | None aws_profile: str | None prefix_in_bucket: str client: S3Client @@ 
-181,13 +185,18 @@ class S3Storage: if home is not None: env["HOME"] = home return env - if self.access_key is not None and self.secret_key is not None: + if ( + self.access_key is not None + and self.secret_key is not None + and self.session_token is not None + ): return { "AWS_ACCESS_KEY_ID": self.access_key, "AWS_SECRET_ACCESS_KEY": self.secret_key, + "AWS_SESSION_TOKEN": self.session_token, } raise RuntimeError( - "Either AWS_PROFILE or (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY) have to be set for S3Storage" + "Either AWS_PROFILE or (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN) have to be set for S3Storage" ) def to_string(self) -> str: @@ -273,18 +282,35 @@ class S3Storage: def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str: return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}" + def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str: + """ + Gets the latest generation key from a list of keys. + + @param keys: A list of keys of different generations, which start with `prefix` + """ + + def parse_gen(key: str) -> int: + shortname = key.split("/")[-1] + generation_str = shortname.removeprefix(prefix).removesuffix(suffix) + try: + return int(generation_str, base=16) + except ValueError: + log.info(f"Ignoring non-matching key: {key}") + return -1 + + if len(keys) == 0: + raise IndexError("No keys found") + + return max(keys, key=parse_gen) + def get_latest_index_key(self, index_keys: list[str]) -> str: """ Gets the latest index file key. @param index_keys: A list of index keys of different generations. """ - - def parse_gen(index_key: str) -> int: - parts = index_key.split("index_part.json-") - return int(parts[-1], base=16) if len(parts) == 2 else -1 - - return max(index_keys, key=parse_gen) + key = self.get_latest_generation_key(prefix="index_part.json-", suffix="", keys=index_keys) + return key def download_index_part(self, index_key: str) -> IndexPartDump: """ @@ -297,6 +323,29 @@ class S3Storage: log.info(f"index_part.json: {body}") return IndexPartDump.from_json(json.loads(body)) + def download_tenant_manifest(self, tenant_id: TenantId) -> dict[str, Any] | None: + tenant_prefix = self.tenant_path(tenant_id) + + objects = self.client.list_objects_v2(Bucket=self.bucket_name, Prefix=f"{tenant_prefix}/")[ "Contents" ] + keys = [obj["Key"] for obj in objects if obj["Key"].find("tenant-manifest") != -1] + try: + manifest_key = self.get_latest_generation_key("tenant-manifest-", ".json", keys) + except IndexError: + log.info( + f"No manifest found for tenant {tenant_id}, this is normal if it didn't offload anything yet" + ) + return None + + response = self.client.get_object(Bucket=self.bucket_name, Key=manifest_key) + body = response["Body"].read().decode("utf-8") + log.info(f"Downloaded manifest {manifest_key}: {body}") + + manifest = json.loads(body) + assert isinstance(manifest, dict) + return manifest + def heatmap_key(self, tenant_id: TenantId) -> str: return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" @@ -352,6 +401,7 @@ class RemoteStorageKind(StrEnum): mock_region = mock_s3_server.region() access_key, secret_key = mock_s3_server.access_key(), mock_s3_server.secret_key() + session_token = mock_s3_server.session_token() client = boto3.client( "s3", @@ -359,6 +409,7 @@ class RemoteStorageKind(StrEnum): region_name=mock_region, aws_access_key_id=access_key, aws_secret_access_key=secret_key, + aws_session_token=session_token, ) bucket_name = 
to_bucket_name(user, test_name) @@ -372,6 +423,7 @@ class RemoteStorageKind(StrEnum): bucket_region=mock_region, access_key=access_key, secret_key=secret_key, + session_token=session_token, aws_profile=None, prefix_in_bucket="", client=client, @@ -383,9 +435,10 @@ class RemoteStorageKind(StrEnum): env_access_key = os.getenv("AWS_ACCESS_KEY_ID") env_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") + env_access_token = os.getenv("AWS_SESSION_TOKEN") env_profile = os.getenv("AWS_PROFILE") assert ( - env_access_key and env_secret_key + env_access_key and env_secret_key and env_access_token ) or env_profile, "need to specify either access key and secret access key or profile" bucket_name = bucket_name or os.getenv("REMOTE_STORAGE_S3_BUCKET") @@ -398,6 +451,9 @@ class RemoteStorageKind(StrEnum): client = boto3.client( "s3", region_name=bucket_region, + aws_access_key_id=env_access_key, + aws_secret_access_key=env_secret_key, + aws_session_token=env_access_token, ) return S3Storage( @@ -405,6 +461,7 @@ class RemoteStorageKind(StrEnum): bucket_region=bucket_region, access_key=env_access_key, secret_key=env_secret_key, + session_token=env_access_token, aws_profile=env_profile, prefix_in_bucket=prefix_in_bucket, client=client, diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 286f80ba69..493ce7334e 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -10,7 +10,7 @@ import requests from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics -from fixtures.utils import wait_until +from fixtures.utils import EnhancedJSONEncoder, wait_until if TYPE_CHECKING: from typing import Any @@ -25,6 +25,7 @@ class Walreceiver: @dataclass class SafekeeperTimelineStatus: + mconf: Configuration | None term: int last_log_term: int pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 @@ -69,6 +70,56 @@ class TermBumpResponse: ) +@dataclass +class SafekeeperId: + id: int + host: str + pg_port: int + + +@dataclass +class Configuration: + generation: int + members: list[SafekeeperId] + new_members: list[SafekeeperId] | None + + @classmethod + def from_json(cls, d: dict[str, Any]) -> Configuration: + generation = d["generation"] + members = d["members"] + new_members = d.get("new_members") + return Configuration(generation, members, new_members) + + def to_json(self) -> str: + return json.dumps(self, cls=EnhancedJSONEncoder) + + +@dataclass +class TimelineCreateRequest: + tenant_id: TenantId + timeline_id: TimelineId + mconf: Configuration + # not exactly PgVersion, for example 150002 for 15.2 + pg_version: int + start_lsn: Lsn + commit_lsn: Lsn | None + + def to_json(self) -> str: + return json.dumps(self, cls=EnhancedJSONEncoder) + + +@dataclass +class TimelineMembershipSwitchResponse: + previous_conf: Configuration + current_conf: Configuration + + @classmethod + def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse: + previous_conf = Configuration.from_json(d["previous_conf"]) + current_conf = Configuration.from_json(d["current_conf"]) + return TimelineMembershipSwitchResponse(previous_conf, current_conf) + + class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError @@ -131,20 +182,8 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): resj = res.json() return 
[TenantTimelineId.from_json(ttidj) for ttidj in resj] - def timeline_create( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 - commit_lsn: Lsn, - ): - body = { - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - "pg_version": pg_version, - "commit_lsn": str(commit_lsn), - } - res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body) + def timeline_create(self, r: TimelineCreateRequest): + res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", data=r.to_json()) res.raise_for_status() def timeline_status( @@ -154,7 +193,10 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): res.raise_for_status() resj = res.json() walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] + # It is always normally not None, it is allowed only to make forward compat tests happy. + mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None return SafekeeperTimelineStatus( + mconf=mconf, term=resj["acceptor_state"]["term"], last_log_term=resj["acceptor_state"]["epoch"], pg_version=resj["pg_info"]["pg_version"], @@ -180,6 +222,11 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: return self.timeline_status(tenant_id, timeline_id).commit_lsn + # Get timeline membership configuration. + def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration: + # make mypy happy + return self.timeline_status(tenant_id, timeline_id).mconf # type: ignore + # only_local doesn't remove segments in the remote storage. def timeline_delete( self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False @@ -226,6 +273,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def membership_switch( + self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + ) -> TimelineMembershipSwitchResponse: + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership", + data=to.to_json(), + ) + res.raise_for_status() + return TimelineMembershipSwitchResponse.from_json(res.json()) + def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: dict[str, Any]): res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index c34ac298d1..84d62fb877 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import contextlib +import dataclasses import json import os import re @@ -21,6 +22,7 @@ import zstandard from psycopg2.extensions import cursor from typing_extensions import override +from fixtures.common_types import Id, Lsn from fixtures.log_helper import log from fixtures.pageserver.common_types import ( parse_delta_layer, @@ -50,11 +52,11 @@ COMPONENT_BINARIES = { # Disable auto-formatting for better readability # fmt: off VERSIONS_COMBINATIONS = ( - {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "old"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": 
"old", "pageserver": "new"}, - {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, - {"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, + {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, # combination: nnnnn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "old"}, # combination: ooonn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "new"}, # combination: ononn + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, # combination: onnnn + {"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, # combination: nnnoo ) # fmt: on @@ -62,6 +64,8 @@ VERSIONS_COMBINATIONS = ( # If it is not set or set to a value not equal to "false", LFC is enabled by default. USE_LFC = os.environ.get("USE_LFC") != "false" +WITH_SANITIZERS = os.environ.get("SANITIZERS") == "enabled" + def subprocess_capture( capture_dir: Path, @@ -308,62 +312,46 @@ def allure_attach_from_dir(dir: Path, preserve_database_files: bool = False): GRAFANA_URL = "https://neonprod.grafana.net" -GRAFANA_EXPLORE_URL = f"{GRAFANA_URL}/explore" -GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL = f"{GRAFANA_URL}/d/8G011dlnk/timeline-inspector" -LOGS_STAGING_DATASOURCE_ID = "xHHYY0dVz" +GRAFANA_DASHBOARD_URL = f"{GRAFANA_URL}/d/cdya0okb81zwga/cross-service-endpoint-debugging" -def allure_add_grafana_links(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int): - """Add links to server logs in Grafana to Allure report""" - links: dict[str, str] = {} - # We expect host to be in format like ep-divine-night-159320.us-east-2.aws.neon.build +def allure_add_grafana_link(host: str, timeline_id: TimelineId, start_ms: int, end_ms: int): + """ + Add a link to the cross-service endpoint debugging dashboard in Grafana to Allure report. + + Args: + host (str): The host string in the format 'ep-..'. + timeline_id (TimelineId): The timeline identifier for the Grafana dashboard. + (currently ignored but may be needed in future verions of the dashboard) + start_ms (int): The start time in milliseconds for the Grafana dashboard. + end_ms (int): The end time in milliseconds for the Grafana dashboard. 
+ + Example: + Given + host = '' + timeline_id = '996926d1f5ddbe7381b8840083f8fc9a' + + The generated link would be something like: + https://neonprod.grafana.net/d/cdya0okb81zwga/cross-service-endpoint-debugging?orgId=1&from=2025-02-17T21:10:00.000Z&to=2025-02-17T21:20:00.000Z&timezone=utc&var-env=dev%7Cstaging&var-input_endpoint_id=ep-holy-mouse-w2u462gi + + """ + # We expect host to be in format like ep-holy-mouse-w2u462gi.us-east-2.aws.neon.build endpoint_id, region_id, _ = host.split(".", 2) - expressions = { - "compute logs": f'{{app="compute-node-{endpoint_id}", neon_region="{region_id}"}}', - "k8s events": f'{{job="integrations/kubernetes/eventhandler"}} |~ "name=compute-node-{endpoint_id}-"', - "console logs": f'{{neon_service="console", neon_region="{region_id}"}} | json | endpoint_id = "{endpoint_id}"', - "proxy logs": f'{{neon_service="proxy-scram", neon_region="{region_id}"}}', + params = { + "orgId": 1, + "from": start_ms, + "to": end_ms, + "timezone": "utc", + "var-env": "dev|staging", + "var-input_endpoint_id": endpoint_id, } - params: dict[str, Any] = { - "datasource": LOGS_STAGING_DATASOURCE_ID, - "queries": [ - { - "expr": "", - "refId": "A", - "datasource": {"type": "loki", "uid": LOGS_STAGING_DATASOURCE_ID}, - "editorMode": "code", - "queryType": "range", - } - ], - "range": { - "from": str(start_ms), - "to": str(end_ms), - }, - } - for name, expr in expressions.items(): - params["queries"][0]["expr"] = expr - query_string = urlencode({"orgId": 1, "left": json.dumps(params)}) - links[name] = f"{GRAFANA_EXPLORE_URL}?{query_string}" + query_string = urlencode(params) + link = f"{GRAFANA_DASHBOARD_URL}?{query_string}" - timeline_qs = urlencode( - { - "orgId": 1, - "var-environment": "victoria-metrics-aws-dev", - "var-timeline_id": timeline_id, - "var-endpoint_id": endpoint_id, - "var-log_datasource": "grafanacloud-neonstaging-logs", - "from": start_ms, - "to": end_ms, - } - ) - link = f"{GRAFANA_TIMELINE_INSPECTOR_DASHBOARD_URL}?{timeline_qs}" - links["Timeline Inspector"] = link - - for name, link in links.items(): - allure.dynamic.link(link, name=name) - log.info(f"{name}: {link}") + allure.dynamic.link(link, name="Cross-Service Endpoint Debugging") + log.info(f"Cross-Service Endpoint Debugging: {link}") def start_in_background( @@ -605,6 +593,22 @@ class PropagatingThread(threading.Thread): return self.ret +class EnhancedJSONEncoder(json.JSONEncoder): + """ + Default json.JSONEncoder works only on primitive builtins. Extend it to any + dataclass plus our custom types. + """ + + def default(self, o): + if dataclasses.is_dataclass(o) and not isinstance(o, type): + return dataclasses.asdict(o) + elif isinstance(o, Id): + return o.id.hex() + elif isinstance(o, Lsn): + return str(o) # standard hex notation + return super().default(o) + + def human_bytes(amt: float) -> str: """ Render a bytes amount into nice IEC bytes string. 
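Since the new safekeeper request/response dataclasses above serialize themselves through `EnhancedJSONEncoder`, a short usage sketch may help. The `Configuration`/`SafekeeperId` values below are the dataclasses from this diff; the import paths assume the usual `test_runner/fixtures` layout.

```python
# Minimal sketch: dataclasses fall back to dataclasses.asdict(), Id values render as hex,
# and Lsn values as their standard string form -- mirrors Configuration.to_json() above.
import json

from fixtures.safekeeper.http import Configuration, SafekeeperId
from fixtures.utils import EnhancedJSONEncoder

conf = Configuration(generation=1, members=[SafekeeperId(id=1, host="sk-1", pg_port=5454)], new_members=None)
print(json.dumps(conf, cls=EnhancedJSONEncoder))
# -> {"generation": 1, "members": [{"id": 1, "host": "sk-1", "pg_port": 5454}], "new_members": null}
```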
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 1b8c9fef44..1947a9c3fb 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -53,6 +53,24 @@ class Workload: self._endpoint: Endpoint | None = None self._endpoint_opts = endpoint_opts or {} + self._configured_pageserver: int | None = None + + def branch( + self, + timeline_id: TimelineId, + branch_name: str | None = None, + endpoint_opts: dict[str, Any] | None = None, + ) -> Workload: + """ + Checkpoint the current status of the workload in case of branching + """ + branch_workload = Workload( + self.env, self.tenant_id, timeline_id, branch_name, endpoint_opts + ) + branch_workload.expect_rows = self.expect_rows + branch_workload.churn_cursor = self.churn_cursor + return branch_workload + def reconfigure(self) -> None: """ Request the endpoint to reconfigure based on location reported by storage controller @@ -76,8 +94,12 @@ class Workload: **self._endpoint_opts, ) self._endpoint.start(pageserver_id=pageserver_id) + self._configured_pageserver = pageserver_id else: - self._endpoint.reconfigure(pageserver_id=pageserver_id) + if self._configured_pageserver != pageserver_id: + self._configured_pageserver = pageserver_id + self._endpoint.reconfigure(pageserver_id=pageserver_id) + self._endpoint_config = pageserver_id connstring = self._endpoint.safe_psql( "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" @@ -106,6 +128,7 @@ class Workload: def write_rows(self, n: int, pageserver_id: int | None = None, upload: bool = True): endpoint = self.endpoint(pageserver_id) + start = self.expect_rows end = start + n - 1 self.expect_rows += n diff --git a/test_runner/logical_repl/README.md b/test_runner/logical_repl/README.md index 8eca056dda..449e56e21d 100644 --- a/test_runner/logical_repl/README.md +++ b/test_runner/logical_repl/README.md @@ -1,13 +1,18 @@ # Logical replication tests +> [!NOTE] +> Neon project should have logical replication enabled: +> +> https://neon.tech/docs/guides/logical-replication-postgres#enable-logical-replication-in-the-source-neon-project + ## Clickhouse ```bash export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb -docker compose -f clickhouse/docker-compose.yml up -d -pytest -m remote_cluster -k test_clickhouse -docker compose -f clickhouse/docker-compose.yml down +docker compose -f test_runner/logical_repl/clickhouse/docker-compose.yml up -d +./scripts/pytest -m remote_cluster -k test_clickhouse +docker compose -f test_runner/logical_repl/clickhouse/docker-compose.yml down ``` ## Debezium @@ -15,8 +20,7 @@ docker compose -f clickhouse/docker-compose.yml down ```bash export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb -docker compose -f debezium/docker-compose.yml up -d -pytest -m remote_cluster -k test_debezium -docker compose -f debezium/docker-compose.yml down - -``` \ No newline at end of file +docker compose -f test_runner/logical_repl/debezium/docker-compose.yml up -d +./scripts/pytest -m remote_cluster -k test_debezium +docker compose -f test_runner/logical_repl/debezium/docker-compose.yml down +``` diff --git a/test_runner/performance/many_relations/create_many_relations.sql b/test_runner/performance/many_relations/create_many_relations.sql new file mode 100644 index 0000000000..1b3673c9e1 --- /dev/null +++ b/test_runner/performance/many_relations/create_many_relations.sql @@ -0,0 +1,199 @@ +-- create a schema that simulates Neon control plane 
operations table +-- however use partitioned operations tables with many (e.g. 500) child partition tables per table +-- in summary we create multiple of these partitioned operations tables (with 500 childs each) - until we reach the requested number of tables + + +-- first we need some other tables that can be referenced by the operations table + +-- Table for branches +CREATE TABLE public.branches ( + id text PRIMARY KEY +); + +-- Table for endpoints +CREATE TABLE public.endpoints ( + id text PRIMARY KEY +); + +-- Table for projects +CREATE TABLE public.projects ( + id text PRIMARY KEY +); + +INSERT INTO public.branches (id) +VALUES ('branch_1'); + +-- Insert one row into endpoints +INSERT INTO public.endpoints (id) +VALUES ('endpoint_1'); + +-- Insert one row into projects +INSERT INTO public.projects (id) +VALUES ('project_1'); + +-- now we create a procedure that can create n operations tables +-- we do that in a procedure to save roundtrip latency when scaling the test to many tables +-- prefix is the base table name, e.g. 'operations_scale_1000' if we create 1000 tables +CREATE OR REPLACE PROCEDURE create_partitioned_tables(prefix text, n INT) +LANGUAGE plpgsql AS $$ +DECLARE + table_name TEXT; -- Variable to hold table names dynamically + i INT; -- Counter for the loop +BEGIN + -- Loop to create n partitioned tables + FOR i IN 1..n LOOP + table_name := format('%s_%s', prefix, i); + + -- Create the partitioned table + EXECUTE format( + 'CREATE TABLE public.%s ( + project_id character varying NOT NULL, + id uuid NOT NULL, + status integer, + action character varying NOT NULL, + error character varying, + created_at timestamp with time zone NOT NULL DEFAULT now(), + updated_at timestamp with time zone NOT NULL DEFAULT now(), + spec jsonb, + retry_at timestamp with time zone, + failures_count integer DEFAULT 0, + metadata jsonb NOT NULL DEFAULT ''{}''::jsonb, + executor_id text NOT NULL, + attempt_duration_ms integer, + metrics jsonb DEFAULT ''{}''::jsonb, + branch_id text, + endpoint_id text, + next_operation_id uuid, + compute_id text, + connection_attempt_at timestamp with time zone, + concurrency_key text, + queue_id text, + CONSTRAINT %s_pkey PRIMARY KEY (id, created_at), + CONSTRAINT %s_branch_id_fk FOREIGN KEY (branch_id) REFERENCES branches(id) ON DELETE CASCADE, + CONSTRAINT %s_endpoint_id_fk FOREIGN KEY (endpoint_id) REFERENCES endpoints(id) ON DELETE CASCADE, + CONSTRAINT %s_next_operation_id_fk FOREIGN KEY (next_operation_id, created_at) REFERENCES %s(id, created_at), + CONSTRAINT %s_project_id_fk FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE + ) PARTITION BY RANGE (created_at)', + table_name, table_name, table_name, table_name, table_name, table_name, table_name + ); + + -- Add indexes for the partitioned table + EXECUTE format('CREATE INDEX index_%s_on_next_operation_id ON public.%s (next_operation_id)', table_name, table_name); + EXECUTE format('CREATE INDEX index_%s_on_project_id ON public.%s (project_id)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_branch_id ON public.%s (branch_id)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_branch_id_created_idx ON public.%s (branch_id, created_at)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_created_at_idx ON public.%s (created_at)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_created_at_project_id_id_cond_idx ON public.%s (created_at, project_id, id)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_endpoint_id ON public.%s 
(endpoint_id)', table_name, table_name); + EXECUTE format( + 'CREATE INDEX %s_for_redo_worker_idx ON public.%s (executor_id) WHERE status <> 1', + table_name, table_name + ); + EXECUTE format( + 'CREATE INDEX %s_project_id_status_index ON public.%s ((project_id::text), status)', + table_name, table_name + ); + EXECUTE format( + 'CREATE INDEX %s_status_not_finished ON public.%s (status) WHERE status <> 1', + table_name, table_name + ); + EXECUTE format('CREATE INDEX %s_updated_at_desc_idx ON public.%s (updated_at DESC)', table_name, table_name); + EXECUTE format( + 'CREATE INDEX %s_with_failures ON public.%s (failures_count) WHERE failures_count > 0', + table_name, table_name + ); + END LOOP; +END; +$$; + +-- next we create a procedure that can add the child partitions (one per day) to each of the operations tables +CREATE OR REPLACE PROCEDURE create_operations_partitions( + table_name TEXT, + start_date DATE, + end_date DATE +) +LANGUAGE plpgsql AS $$ +DECLARE + partition_date DATE; + partition_name TEXT; + counter INT := 0; -- Counter to track the number of tables created in the current transaction +BEGIN + partition_date := start_date; + + -- Create partitions in batches + WHILE partition_date < end_date LOOP + partition_name := format('%s_%s', table_name, to_char(partition_date,'YYYY_MM_DD')); + + EXECUTE format( + 'CREATE TABLE IF NOT EXISTS public.%s PARTITION OF public.%s + FOR VALUES FROM (''%s'') TO (''%s'')', + partition_name, + table_name, + partition_date, + partition_date + INTERVAL '1 day' + ); + + counter := counter + 1; + + -- Commit and reset counter after every 100 partitions + IF counter >= 100 THEN + COMMIT; + counter := 0; -- Reset the counter + END IF; + + -- Advance to the next day + partition_date := partition_date + INTERVAL '1 day'; + END LOOP; + + -- Final commit for remaining partitions + IF counter > 0 THEN + COMMIT; + END IF; + + -- Insert synthetic rows into each partition + EXECUTE format( + 'INSERT INTO %I ( + project_id, + branch_id, + endpoint_id, + id, + status, + action, + created_at, + updated_at, + spec, + metadata, + executor_id, + failures_count + ) + SELECT + ''project_1'', -- project_id + ''branch_1'', -- branch_id + ''endpoint_1'', -- endpoint_id + ''e8bba687-0df9-4291-bfcd-7d5f6aa7c158'', -- unique id + 1, -- status + ''SYNTHETIC_ACTION'', -- action + gs::timestamp + interval ''0 ms'', -- created_at + gs::timestamp + interval ''1 minute'', -- updated_at + ''{"key": "value"}'', -- spec (JSONB) + ''{"metadata_key": "metadata_value"}'', -- metadata (JSONB) + ''executor_1'', -- executor_id + 0 -- failures_count + FROM generate_series(%L, %L::DATE - INTERVAL ''1 day'', INTERVAL ''1 day'') AS gs', + table_name, start_date, end_date + ); + + -- Commit the inserted rows + COMMIT; +END; +$$; + +-- we can now create partitioned tables using something like +-- CALL create_partitioned_tables('operations_scale_1000' ,10); + +-- and we can create the child partitions for a table using something like +-- CALL create_operations_partitions( +-- 'operations_scale_1000_1', +-- '2000-01-01', -- Start date +-- ('2000-01-01'::DATE + INTERVAL '1 day' * 500)::DATE -- End date (start date + number of days) +-- ); diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 0cd1080fa7..eaa89ae754 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -75,6 +75,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): # Initially disable compaction so that we will build 
up a stack of L0s "compaction_period": "0s", "gc_period": "0s", + "compaction_upper_limit": 12, } ) neon_compare.tenant = tenant_id @@ -91,6 +92,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): tenant_conf = pageserver_http.tenant_config(tenant_id) assert tenant_conf.effective_config["checkpoint_distance"] == 256 * 1024 * 1024 assert tenant_conf.effective_config["compaction_threshold"] == 10 + assert tenant_conf.effective_config["compaction_upper_limit"] == 12 # Aim to write about 20 L0s, so that we will hit the limit on how many # to compact at once diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 07f244da0c..acb7b56fd0 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -22,7 +22,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma "checkpoint_distance": f"{1024 ** 2}", "compaction_target_size": f"{1024 ** 2}", # set PITR interval to be small, so we can do GC - "pitr_interval": "60 s", + "pitr_interval": "10 s", # "compaction_threshold": "3", # "image_creation_threshold": "2", } @@ -32,6 +32,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma n_steps = 10 n_update_iters = 100 step_size = 10000 + branch_created = 0 with endpoint.cursor() as cur: cur.execute("SET statement_timeout='1000s'") cur.execute( @@ -66,6 +67,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma if mode == "with_snapshots": if step == n_steps / 2: env.create_branch("child") + branch_created += 1 max_num_of_deltas_above_image = 0 max_total_num_of_deltas = 0 @@ -142,6 +144,15 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma with layer_map_path.open("w") as f: f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id))) + # We should have collected all garbage + if mode == "normal": + # in theory we should get physical size ~= logical size, but given that gc interval is 10s, + # and the layer has indexes that might contribute to the fluctuation, we allow a small margin + # of 1 here, and the end ratio we are asserting is 1 (margin) + 1 (expected) = 2. + assert physical_size / logical_size < 2 + elif mode == "with_snapshots": + assert physical_size / logical_size < (2 + branch_created) + @pytest.mark.timeout(10000) def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): diff --git a/test_runner/performance/test_ingest_logical_message.py b/test_runner/performance/test_ingest_logical_message.py index d3118eb15a..b55cb68b64 100644 --- a/test_runner/performance/test_ingest_logical_message.py +++ b/test_runner/performance/test_ingest_logical_message.py @@ -76,6 +76,9 @@ def test_ingest_logical_message( log.info("Waiting for Pageserver to catch up") wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn) + recover_to_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + endpoint.stop() + # Now that all data is ingested, delete and recreate the tenant in the pageserver. This will # reingest all the WAL from the safekeeper without any other constraints. This gives us a # baseline of how fast the pageserver can ingest this WAL in isolation. 
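The hunk above snapshots the compute's WAL position before shutting it down, so that the re-ingest measurement below can wait for a fixed LSN rather than whatever happens to be current at that moment. A minimal sketch of that pattern with a hypothetical helper name; the query and the `Lsn` wrapper are exactly what the test uses:

```python
# Sketch only: the recovery target has to be captured while the endpoint is still running,
# since pg_current_wal_lsn() needs a live compute to query.
from fixtures.common_types import Lsn

def snapshot_wal_position(endpoint) -> Lsn:
    return Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])

# recover_to_lsn = snapshot_wal_position(endpoint)
# endpoint.stop()
```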
@@ -88,7 +91,13 @@ def test_ingest_logical_message( with zenbenchmark.record_duration("pageserver_recover_ingest"): log.info("Recovering WAL into pageserver") client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline) - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + wait_for_last_flush_lsn( + env, endpoint, env.initial_tenant, env.initial_timeline, last_flush_lsn=recover_to_lsn + ) + + # Check endpoint can start, i.e. we really recovered + endpoint.start() + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) # Emit metrics. wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 8a4ad2d399..6c00944005 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -23,24 +23,31 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): "checkpoint_distance": "16384", "compaction_period": "1 s", "compaction_threshold": "1", + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", "compaction_target_size": "16384", } ) endpoint = env.endpoints.create_start("main", tenant_id=tenant) cur = endpoint.connect().cursor() + cur.execute("set log_statement = 'all'") cur.execute("create table t(x integer)") for _ in range(n_iters): - cur.execute(f"insert into t values (generate_series(1,{n_records}))") + with zenbenchmark.record_duration(f"insert into t values (generate_series(1,{n_records}))"): + cur.execute(f"insert into t values (generate_series(1,{n_records}))") time.sleep(1) - cur.execute("vacuum t") + with zenbenchmark.record_duration("vacuum t"): + cur.execute("vacuum t") - with zenbenchmark.record_duration("test_query"): + with zenbenchmark.record_duration("SELECT count(*) from t"): cur.execute("SELECT count(*) from t") assert cur.fetchone() == (n_iters * n_records,) - flush_ep_to_pageserver(env, endpoint, tenant, timeline) - env.pageserver.http_client().timeline_checkpoint( - tenant, timeline, compact=False, wait_until_uploaded=True - ) + with zenbenchmark.record_duration("flush_ep_to_pageserver"): + flush_ep_to_pageserver(env, endpoint, tenant, timeline) + with zenbenchmark.record_duration("timeline_checkpoint"): + env.pageserver.http_client().timeline_checkpoint( + tenant, timeline, compact=False, wait_until_uploaded=True + ) diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py index 704073fe3b..3bf3ef890f 100644 --- a/test_runner/performance/test_lazy_startup.py +++ b/test_runner/performance/test_lazy_startup.py @@ -79,7 +79,9 @@ def test_lazy_startup(slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: assert sum == 1000000 # Get metrics - metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + metrics = requests.get( + f"http://localhost:{endpoint.external_http_port}/metrics.json" + ).json() durations = { "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 9d653d1a1e..fdc56cc496 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -44,13 +44,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg vanilla_pg.safe_psql(f"create subscription sub1 connection 
'{connstr}' publication pub1") # Wait logical replication channel to be established - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") pg_bin.run_capture(["pgbench", "-c10", "-T100", "-Mprepared", endpoint.connstr()]) # Wait logical replication to sync start = time.time() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") log.info(f"Sync with master took {time.time() - start} seconds") sum_master = cast("int", endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]) diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/performance/test_parallel_copy.py similarity index 100% rename from test_runner/regress/test_parallel_copy.py rename to test_runner/performance/test_parallel_copy.py diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py index f0a0c1f5a2..da62422fca 100644 --- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py +++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path): "LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}", "PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")), "PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")), - "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", + "PGOPTIONS": "-c idle_in_transaction_session_timeout=0 -c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", } # Combine the current environment with custom variables env = os.environ.copy() diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py new file mode 100644 index 0000000000..2570c55f6c --- /dev/null +++ b/test_runner/performance/test_perf_many_relations.py @@ -0,0 +1,117 @@ +import os +from pathlib import Path + +import pytest +from fixtures.benchmark_fixture import NeonBenchmarker +from fixtures.compare_fixtures import RemoteCompare +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +def get_num_relations(default: int = 1000) -> list[int]: + # We parametrize each run with scale specifying the number of wanted child partitions. + # Databases are pre-created and passed through BENCHMARK_CONNSTR env variable. + scales = os.getenv("TEST_NUM_RELATIONS", default=str(default)) + rv = [] + for s in scales.split(","): + scale = int(s) + rv.append(scale) + return rv + + +@pytest.mark.parametrize("num_relations", get_num_relations()) +@pytest.mark.remote_cluster +def test_perf_many_relations(remote_compare: RemoteCompare, num_relations: int): + """ + Test creating many relations in a single database. + We use partitioned tables with child tables, indexes and constraints to have a realistic schema. + Also we include some common data types like text, uuid, timestamp, JSONB, etc. 
+ + see many_relations/create_many_relations.sql + """ + env = remote_compare + + # prepare some base tables and the plpgsql procedures that we use to create the tables + sql_file = Path(__file__).parent / "many_relations" / "create_many_relations.sql" + env.pg_bin.run_capture(["psql", env.pg.connstr(), "-f", str(sql_file)]) + + num_parent_tables = num_relations // 500 + 1 + log.info(f"Creating {num_relations} relations in {num_parent_tables} parent tables") + + log.info(f"Creating {num_parent_tables} parent tables") + sql = f"CALL create_partitioned_tables('operations_scale_{num_relations}', {num_parent_tables})" + log.info(sql) + env.pg_bin.run_capture(["psql", env.pg.connstr(), "-c", sql]) + + current_table = 0 + num_relations_remaining = num_relations + + # now run and measure the actual relation creation + while num_relations_remaining > 0: + current_table += 1 + parent_table_name = f"operations_scale_{num_relations}_{current_table}" + if num_relations_remaining > 500: + num_relations_to_create = 500 + else: + num_relations_to_create = num_relations_remaining + num_relations_remaining -= num_relations_to_create + log.info( + f"Creating {num_relations_to_create} child tables in partitioned parent table '{parent_table_name}'" + ) + sql = f"CALL create_operations_partitions( '{parent_table_name}', '2000-01-01', ('2000-01-01'::DATE + INTERVAL '1 day' * {num_relations_to_create})::DATE)" + log.info(sql) + with env.zenbenchmark.record_duration( + f"CREATE_TABLE/{current_table}/{num_relations_to_create}" + ): + env.pg_bin.run_capture( + ["psql", env.pg.connstr(options="-cstatement_timeout=1000s "), "-c", sql] + ) + + +def test_perf_simple_many_relations_reldir_v2( + neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker +): + """ + Test creating many relations in a single database. 
+ """ + env = neon_env_builder.init_start(initial_tenant_conf={"rel_size_v2_enabled": "true"}) + ep = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers=1000MB", + "max_locks_per_transaction=16384", + ], + ) + + n = 100000 + step = 5000 + # Create many relations + log.info(f"Creating {n} relations...") + begin = 0 + with zenbenchmark.record_duration("create_first_relation"): + ep.safe_psql("CREATE TABLE IF NOT EXISTS table_begin (id SERIAL PRIMARY KEY, data TEXT)") + with zenbenchmark.record_duration("create_many_relations"): + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; + END LOOP; + END $$; + """, + "COMMIT", + ] + ) + begin = end + if begin >= n: + break + with zenbenchmark.record_duration("create_last_relation"): + ep.safe_psql(f"CREATE TABLE IF NOT EXISTS table_{begin} (id SERIAL PRIMARY KEY, data TEXT)") diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index caa89955e3..e5a9f17da8 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -2,6 +2,7 @@ from __future__ import annotations import concurrent.futures import re +import threading from pathlib import Path import pytest @@ -188,7 +189,20 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): check_pgbench_output(out_path) - with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + stop_pump = threading.Event() + + def pump_controller(): + # Run a background loop to force the storage controller to run its + # background work faster than it otherwise would: this helps + # us: + # A) to create a test that runs in a shorter time + # B) to create a test that is more intensive by doing the shard migrations + # after splits happen more rapidly. + while not stop_pump.is_set(): + env.storage_controller.reconcile_all() + stop_pump.wait(0.1) + + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count + 1) as pgbench_threads: pgbench_futs = [] for tenant_state in tenants.values(): fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint) @@ -198,6 +212,8 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): for fut in pgbench_futs: fut.result() + pump_fut = pgbench_threads.submit(pump_controller) + pgbench_futs = [] for tenant_state in tenants.values(): fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint) @@ -207,6 +223,9 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): for fut in pgbench_futs: fut.result() + stop_pump.set() + pump_fut.result() + def assert_all_split(): for tenant_id in tenants.keys(): shards = tenant_get_shards(env, tenant_id) @@ -228,7 +247,7 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info(f"{shard_zero_id} timeline: {timeline_info}") # Run compaction for all tenants, restart endpoint so that on subsequent reads we will - # definitely hit pageserver for reads. This compaction passis expected to drop unwanted + # definitely hit pageserver for reads. 
This compaction pass is expected to drop unwanted # layers but not do any rewrites (we're still in the same generation) for tenant_id, tenant_state in tenants.items(): tenant_state.endpoint.stop() @@ -277,6 +296,16 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): for fut in pgbench_futs: fut.result() + # Run a full forced compaction, to detect any data corruption. + for tenant_id, tenant_state in tenants.items(): + for shard_id, shard_ps in tenant_get_shards(env, tenant_id): + shard_ps.http_client().timeline_compact( + shard_id, + tenant_state.timeline_id, + force_image_layer_creation=True, + force_l0_compaction=True, + ) + # Assert that some rewrites happened # TODO: uncomment this after https://github.com/neondatabase/neon/pull/7531 is merged # assert any(ps.log_contains(".*Rewriting layer after shard split.*") for ps in env.pageservers) diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index d051717e92..60d8b5be30 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -56,7 +56,9 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc endpoint.safe_psql("select 1;") # Get metrics - metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + metrics = requests.get( + f"http://localhost:{endpoint.external_http_port}/metrics.json" + ).json() durations = { "wait_for_spec_ms": f"{i}_wait_for_spec", "sync_safekeepers_ms": f"{i}_sync_safekeepers", diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 49f41483ec..d45db28c78 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -13,11 +13,13 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserver, PageserverAvailability, PageserverSchedulingPolicy, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pg_version import PgVersion +from fixtures.utils import wait_until def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]: @@ -85,8 +87,12 @@ def test_storage_controller_many_tenants( ) AZS = ["alpha", "bravo", "charlie"] + + def az_selector(node_id): + return f"az-{AZS[(node_id - 1) % len(AZS)]}" + neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update( - {"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"} + {"availability_zone": az_selector(ps_cfg["id"])} ) # A small sleep on each call into the notify hook, to simulate the latency of doing a database write @@ -168,6 +174,31 @@ def test_storage_controller_many_tenants( log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)") assert rss < expect_memory_per_shard * total_shards + def assert_all_tenants_scheduled_in_home_az(): + for tenant_id in tenant_ids: + desc = env.storage_controller.tenant_describe(tenant_id) + preferred_az = None + for shard in desc["shards"]: + # All shards in a tenant should have the same preferred AZ + if preferred_az is None: + preferred_az = shard["preferred_az_id"] + else: + assert preferred_az == shard["preferred_az_id"] + + # Attachment should be in the preferred AZ + assert shard["preferred_az_id"] == az_selector( + shard["node_attached"] + ), f"Shard {shard['tenant_shard_id']} not in {shard['preferred_az_id']}" + + # Secondary locations should not be in 
the preferred AZ + for node_secondary in shard["node_secondary"]: + assert ( + shard["preferred_az_id"] != az_selector(node_secondary) + ), f"Shard {shard['tenant_shard_id']} secondary should be in {shard['preferred_az_id']}" + + # There should only be one secondary location (i.e. no migrations in flight) + assert len(shard["node_secondary"]) == 1 + # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore # permits, to ensure that we are exercising stressing that. api_concurrency = 135 @@ -242,6 +273,22 @@ def test_storage_controller_many_tenants( f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s" ) + # Check initial scheduling + assert_all_tenants_scheduled_in_home_az() + az_attached_counts: defaultdict[str, int] = defaultdict(int) + az_secondary_counts: defaultdict[str, int] = defaultdict(int) + node_attached_counts: defaultdict[str, int] = defaultdict(int) + for tenant_id in tenants.keys(): + desc = env.storage_controller.tenant_describe(tenant_id) + for shard in desc["shards"]: + az_attached_counts[az_selector(shard["node_attached"])] += 1 + node_attached_counts[shard["node_attached"]] += 1 + for node_secondary in shard["node_secondary"]: + az_secondary_counts[az_selector(node_secondary)] += 1 + + log.info(f"Initial node attached counts: {node_attached_counts}") + log.info(f"Initial AZ shard counts: {az_attached_counts}, {az_secondary_counts}") + # Plan operations: ensure each tenant with a timeline gets at least # one of each operation type. Then add other tenants to make up the # numbers. @@ -450,11 +497,77 @@ def test_storage_controller_many_tenants( env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() + # Since we did `reconcile_until_idle` during the above loop, the system should be left in + # an optimally scheduled state. Validate that this includes all the tenants being scheduled + # in their home AZ. + assert_all_tenants_scheduled_in_home_az() + # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, # as they were not offline long enough to trigger any scheduling changes. 
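As a point of reference for the scheduling assertions above, `az_selector` maps 1-based pageserver node ids round-robin onto the three AZs; a tiny worked example using the exact definition from this diff:

```python
# az_selector as defined above: node ids are 1-based, hence the (node_id - 1) offset.
AZS = ["alpha", "bravo", "charlie"]

def az_selector(node_id: int) -> str:
    return f"az-{AZS[(node_id - 1) % len(AZS)]}"

assert [az_selector(n) for n in (1, 2, 3, 4)] == ["az-alpha", "az-bravo", "az-charlie", "az-alpha"]
```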
env.storage_controller.consistency_check() check_memory() + # Simulate loss of an AZ + victim_az = "az-alpha" + killed_pageservers = [] + for ps in env.pageservers: + if az_selector(ps.id) == victim_az: + ps.stop(immediate=True) + killed_pageservers.append(ps) + log.info(f"Killed pageserver {ps.id}") + + assert killed_pageservers + + # Wait for the controller to notice the pageservers are dead + def assert_pageservers_availability( + pageservers: list[NeonPageserver], expected_availability: PageserverAvailability + ): + nodes = env.storage_controller.nodes() + checked_any = False + node_ids = [ps.id for ps in pageservers] + for node in nodes: + if node["id"] in node_ids: + checked_any = True + assert ( + node["availability"] == expected_availability + ), f"Node {node['id']} is not {expected_availability} yet: {node['availability']}" + + assert checked_any + + wait_until( + lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.OFFLINE), + timeout=60, + ) + + # Let the controller finish all its rescheduling + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + + # Check that all the tenants are rescheduled to the remaining pageservers + for tenant_id in tenant_ids: + desc = env.storage_controller.tenant_describe(tenant_id) + for shard in desc["shards"]: + # Attachment should be outside the AZ where we killed the pageservers + assert ( + az_selector(shard["node_attached"]) != victim_az + ), f"Shard {shard['tenant_shard_id']} still in {victim_az} (node {shard['node_attached']})" + + # Bring back the pageservers + for ps in killed_pageservers: + ps.start() + + wait_until( + lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.ACTIVE), + timeout=60, + ) + + # A very long timeout is required: we will be migrating all the tenants on all the pageservers + # in the region that we just restored. Assume it'll take up to twice as long as it took to fill + # a single node + env.storage_controller.reconcile_until_idle( + max_interval=0.1, timeout_secs=DRAIN_FILL_TIMEOUT * 4 + ) + assert_all_tenants_scheduled_in_home_az() + # Stop the storage controller before tearing down fixtures, because it otherwise might log # errors trying to call our `ComputeReconfigure`. 
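Before the final `env.storage_controller.stop()` below: the AZ-failure simulation above leans on `wait_until` retrying an assertion callback until it stops raising. A usage sketch under that assumption; the wrapper below is hypothetical, while `wait_until`, `PageserverAvailability`, and `env.storage_controller.nodes()` are the calls this diff already uses:

```python
# Sketch: poll the storage controller's node list until the given pageservers report the
# expected availability (assumes wait_until keeps retrying the callback until it passes).
from fixtures.neon_fixtures import PageserverAvailability
from fixtures.utils import wait_until

def wait_for_node_availability(env, node_ids, expected: PageserverAvailability, timeout=60):
    def check():
        by_id = {node["id"]: node["availability"] for node in env.storage_controller.nodes()}
        for node_id in node_ids:
            assert by_id[node_id] == expected, f"node {node_id} is {by_id[node_id]}, want {expected}"

    wait_until(check, timeout=timeout)
```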
env.storage_controller.stop() diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 354fc15745..0b138bf167 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -421,9 +421,9 @@ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.66" +version = "0.10.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" +checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6" dependencies = [ "bitflags 2.6.0", "cfg-if", @@ -453,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.103" +version = "0.9.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" +checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc" dependencies = [ "cc", "libc", diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 45112fd67e..07600dd911 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -139,6 +139,12 @@ def test_fully_custom_config(positive_env: NeonEnv): fully_custom_config = { "compaction_period": "1h", "compaction_threshold": 13, + "compaction_upper_limit": 100, + "compaction_l0_first": False, + "compaction_l0_semaphore": False, + "l0_flush_delay_threshold": 25, + "l0_flush_stall_threshold": 42, + "l0_flush_wait_upload": False, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", @@ -171,11 +177,16 @@ def test_fully_custom_config(positive_env: NeonEnv): "image_layer_creation_check_threshold": 1, "lsn_lease_length": "1m", "lsn_lease_length_for_ts": "5s", - "timeline_offloading": True, + "timeline_offloading": False, "wal_receiver_protocol_override": { "type": "interpreted", "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, }, + "rel_size_v2_enabled": True, + "gc_compaction_enabled": True, + "gc_compaction_initial_threshold_kb": 1024000, + "gc_compaction_ratio_percent": 200, + "image_creation_preempt_threshold": 5, } vps_http = env.storage_controller.pageserver_api() diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index c0c9537421..bfc5cb174e 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -7,6 +7,7 @@ import psycopg2.errors import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import USE_LFC @pytest.mark.timeout(600) @@ -80,3 +81,193 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): # do a graceful shutdown which would had caught the allowed_errors before # https://github.com/neondatabase/neon/pull/8632 env.pageserver.stop() + + +def test_compute_pageserver_hung_connections(neon_env_builder: NeonEnvBuilder): + """ + Test timeouts in waiting for response to pageserver request + """ + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*slow GetPage.*") + pageserver_http = env.pageserver.http_client() + endpoint = env.endpoints.create_start( + "main", + 
tenant_id=env.initial_tenant, + config_lines=["autovacuum = off"], + ) + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.debug(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + # Print the backend PID so that it can be compared with the logs easily + cur.execute("SELECT pg_backend_pid()") + row = cur.fetchone() + assert row is not None + log.info(f"running test workload in backend PID {row[0]}") + + def run_workload(duration: float): + end_time = time.time() + duration + times_executed = 0 + while time.time() < end_time: + if random.random() < 0.5: + cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") + else: + cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") + cur.fetchall() + times_executed += 1 + log.info(f"Workload executed {times_executed} times") + assert times_executed > 0 + + ## Test short connection hiccups + ## + ## This is to exercise the logging timeout. + log.info("running workload with log timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '500ms'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%3*return(3000)")) + run_workload(20) + + # check that the message was logged + assert endpoint.log_contains("no response received from pageserver for .* s, still waiting") + assert endpoint.log_contains("received response from pageserver after .* s") + + ## Test connections that are hung for longer + ## + ## This exercises the disconnect timeout. We'll disconnect and + ## reconnect after 500 ms. + log.info("running workload with disconnect timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '250ms'") + cur.execute("SET neon.pageserver_response_disconnect_timeout = '500ms'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%3*return(3000)")) + run_workload(15) + + assert endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() + + +def test_compute_pageserver_statement_timeout(neon_env_builder: NeonEnvBuilder): + """ + Test statement_timeout while waiting for response to pageserver request + """ + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*slow GetPage.*") + pageserver_http = env.pageserver.http_client() + + # Make sure the shared_buffers and LFC are tiny, to ensure the queries + # hit the storage. Disable autovacuum to make the test more deterministic. 
+ config_lines = [ + "shared_buffers='512kB'", + "autovacuum = off", + ] + if USE_LFC: + config_lines = ["neon.max_file_cache_size = 1MB", "neon.file_cache_size_limit = 1MB"] + endpoint = env.endpoints.create_start( + "main", + tenant_id=env.initial_tenant, + config_lines=config_lines, + ) + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Disable parallel query. Parallel workers open their own pageserver connections, + # which messes up the test logic. + cur.execute("SET max_parallel_workers_per_gather=0") + cur.execute("SET effective_io_concurrency=0") + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.debug(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + ## Run a query until the compute->pageserver connection hits the failpoint and + ## get stuck. This tests that the statement_timeout is obeyed while waiting on a + ## GetPage request. + log.info("running workload with statement_timeout") + cur.execute("SET neon.pageserver_response_log_timeout = '2000ms'") + cur.execute("SET neon.pageserver_response_disconnect_timeout = '30000ms'") + cur.execute("SET statement_timeout='10s'") + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "10%return(60000)")) + + start_time = time.time() + with pytest.raises(psycopg2.errors.QueryCanceled): + cur.execute("SELECT count(*) FROM foo") + cur.fetchall() + log.info("Statement timeout reached") + end_time = time.time() + # Verify that the statement_timeout canceled the query before + # neon.pageserver_response_disconnect_timeout expired + assert end_time - start_time < 40 + times_canceled = 1 + + # Should not have disconnected yet + assert not endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # Clear the failpoint. This doesn't affect the connection that already hit it. It + # will keep waiting. But subsequent connections will work normally. + pageserver_http.configure_failpoints(("before-pagestream-msg-flush", "off")) + + # If we keep retrying, we should eventually succeed. (This tests that the + # neon.pageserver_response_disconnect_timeout is not reset on query + # cancellation.) 
+ while times_canceled < 10: + try: + cur.execute("SELECT count(*) FROM foo") + cur.fetchall() + log.info("Statement succeeded") + break + except psycopg2.errors.QueryCanceled: + log.info("Statement timed out, retrying") + times_canceled += 1 + assert times_canceled > 1 and times_canceled < 10 + + assert endpoint.log_contains("no response from pageserver for .* s, disconnecting") + + # do a graceful shutdown which would had caught the allowed_errors before + # https://github.com/neondatabase/neon/pull/8632 + env.pageserver.stop() diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index fccfbc7f09..0e28231a86 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -64,6 +64,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # tweak the default settings to allow quickly create image layers and L1 layers "compaction_period": "1 s", "compaction_threshold": "2", + "l0_flush_delay_threshold": "20", + "l0_flush_stall_threshold": "40", "image_creation_threshold": "1", # Disable PITR, this test will set an explicit space-based GC limit "pitr_interval": "0 s", diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 124e62999a..d49686b57c 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -29,6 +29,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): ".*failed to load metadata.*", ".*load failed.*load local timeline.*", ".*: layer load failed, assuming permanent failure:.*", + ".*failed to get checkpoint bytes.*", + ".*failed to get control bytes.*", ] ) @@ -75,7 +77,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err: + with pytest.raises(Exception, match="failed to get checkpoint bytes") as err: pg1.start() log.info( f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py index 10027ce689..2ae38e6d88 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -1,18 +1,19 @@ from __future__ import annotations import os -import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv -from fixtures.utils import query_scalar +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import query_scalar, wait_until # # Test compute node start after clog truncation # -def test_clog_truncate(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_clog_truncate(neon_env_builder: NeonEnvBuilder): + # Use a multi-sharded tenant because WAL ingest logic is shard-dependent, and + # this test is one of the very few that exercises a CLogTruncate WAL record. 
+ env = neon_env_builder.init_start(initial_tenant_shard_count=2) # set aggressive autovacuum to make sure that truncation will happen config = [ @@ -31,6 +32,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): endpoint.safe_psql("CREATE EXTENSION neon_test_utils") # Consume many xids to advance clog + log.info("Consuming xids...") with endpoint.cursor() as cur: cur.execute("select test_consume_xids(1000*1000*10);") log.info("xids consumed") @@ -47,11 +49,17 @@ def test_clog_truncate(neon_simple_env: NeonEnv): pg_xact_0000_path = os.path.join(endpoint.pg_xact_dir_path(), "0000") log.info(f"pg_xact_0000_path = {pg_xact_0000_path}") - while os.path.isfile(pg_xact_0000_path): - log.info(f"file exists. wait for truncation: {pg_xact_0000_path=}") - time.sleep(5) + def assert_file_removed(): + exists = os.path.isfile(pg_xact_0000_path) + if exists: + log.info(f"file exists. wait for truncation: {pg_xact_0000_path=}") + assert not exists + + log.info("Waiting for truncation...") + wait_until(assert_file_removed) # checkpoint to advance latest lsn + log.info("Checkpointing...") with endpoint.cursor() as cur: cur.execute("CHECKPOINT;") lsn_after_truncation = query_scalar(cur, "select pg_current_wal_insert_lsn()") diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 810a9723e0..ce8ed3c7c5 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -1,6 +1,8 @@ from __future__ import annotations import json +import math +import random import time from enum import StrEnum @@ -27,6 +29,21 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = { # "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later } +PREEMPT_COMPACTION_TENANT_CONF = { + "gc_period": "5s", + "compaction_period": "5s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 1, + "image_creation_preempt_threshold": 1, + # compact more frequently + "compaction_threshold": 3, + "compaction_upper_limit": 6, + "lsn_lease_length": "0s", +} + @skip_in_debug_build("only run with release build") @pytest.mark.parametrize( @@ -34,7 +51,8 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = { [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], ) def test_pageserver_compaction_smoke( - neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol + neon_env_builder: NeonEnvBuilder, + wal_receiver_protocol: PageserverWalReceiverProtocol, ): """ This is a smoke test that compaction kicks in. 
The workload repeatedly churns @@ -52,7 +70,8 @@ def test_pageserver_compaction_smoke( page_cache_size=10 """ - env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF) + conf = AGGRESSIVE_COMPACTION_TENANT_CONF.copy() + env = neon_env_builder.init_start(initial_tenant_conf=conf) tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -84,12 +103,9 @@ page_cache_size=10 log.info("Checking layer access metrics ...") layer_access_metric_names = [ - "pageserver_layers_visited_per_read_global_sum", - "pageserver_layers_visited_per_read_global_count", - "pageserver_layers_visited_per_read_global_bucket", - "pageserver_layers_visited_per_vectored_read_global_sum", - "pageserver_layers_visited_per_vectored_read_global_count", - "pageserver_layers_visited_per_vectored_read_global_bucket", + "pageserver_layers_per_read_global_sum", + "pageserver_layers_per_read_global_count", + "pageserver_layers_per_read_global_bucket", ] metrics = env.pageserver.http_client().get_metrics() @@ -97,14 +113,8 @@ page_cache_size=10 layer_access_metrics = metrics.query_all(name) log.info(f"Got metrics: {layer_access_metrics}") - non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") - non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") - if non_vectored_count.value != 0: - non_vectored_average = non_vectored_sum.value / non_vectored_count.value - else: - non_vectored_average = 0 - vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") - vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + vectored_sum = metrics.query_one("pageserver_layers_per_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_per_read_global_count") if vectored_count.value > 0: assert vectored_sum.value > 0 vectored_average = vectored_sum.value / vectored_count.value @@ -113,16 +123,54 @@ page_cache_size=10 assert vectored_sum.value == 0 vectored_average = 0 - log.info(f"{non_vectored_average=} {vectored_average=}") + log.info(f"{vectored_average=}") # The upper bound for average number of layer visits below (8) # was chosen empirically for this workload. - assert non_vectored_average < 8 assert vectored_average < 8 @skip_in_debug_build("only run with release build") -def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): +def test_pageserver_compaction_preempt( + neon_env_builder: NeonEnvBuilder, +): + # Ideally we should be able to do unit tests for this, but we need real Postgres + # WALs in order to do unit testing... 
+ + conf = PREEMPT_COMPACTION_TENANT_CONF.copy() + env = neon_env_builder.init_start(initial_tenant_conf=conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 200000 + churn_rounds = 10 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id, upload=False) + workload.validate(env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True) + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + # ensure image layer creation gets preempted and then resumed + env.pageserver.assert_log_contains("resuming image layer creation") + + +@skip_in_debug_build("only run with release build") +@pytest.mark.parametrize( + "with_branches", + ["with_branches", "no_branches"], +) +def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_branches: str): SMOKE_CONF = { # Run both gc and gc-compaction. "gc_period": "5s", @@ -134,7 +182,6 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): } env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) - tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -149,11 +196,20 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): log.info("Writing initial data ...") workload.write_rows(row_count, env.pageserver.id) - for i in range(1, churn_rounds + 1): - if i % 10 == 0: - log.info(f"Running churn round {i}/{churn_rounds} ...") + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated - # Run gc-compaction every 10 rounds to ensure the test doesn't take too long time. + child_workloads: list[Workload] = [] + + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + if i % 10 == 5 and with_branches == "with_branches": + branch_name = f"child-{i}" + branch_timeline_id = env.create_branch(branch_name) + child_workloads.append(workload.branch(branch_timeline_id, branch_name)) + if (i - 1) % 10 == 0 or (i - 1) % 10 == 1: + # Run gc-compaction twice every 10 rounds to ensure the test doesn't take too long time. 
ps_http.timeline_compact( tenant_id, timeline_id, @@ -161,19 +217,246 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): body={ "scheduled": True, "sub_compaction": True, - "compact_range": { + "compact_key_range": { "start": "000000000000000000000000000000000000", "end": "030000000000000000000000000000000000", }, + "sub_compaction_max_job_size_mb": 16, }, ) + # do not wait for upload so that we can see if gc_compaction works well with data being ingested + workload.churn_rows(row_count, env.pageserver.id, upload=False) + time.sleep(1) + workload.validate(env.pageserver.id) - workload.churn_rows(row_count, env.pageserver.id) + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + wait_until(compaction_finished, timeout=60) # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) - env.pageserver.assert_log_contains( - "scheduled_compact_timeline.*picked .* layers for compaction" + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + for child_workload in child_workloads: + log.info(f"Validating at branch {child_workload.branch_name}") + child_workload.validate(env.pageserver.id) + + # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction. + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_gc(tenant_id, timeline_id, None) + + +@pytest.mark.parametrize( + "compaction_mode", + ["before_restart", "after_restart"], +) +def test_pageserver_gc_compaction_idempotent( + neon_env_builder: NeonEnvBuilder, compaction_mode: str +): + """ + Do gc-compaction twice without writing any new data and see if anything breaks. + We run this test in two modes: + - before_restart: run two gc-compactions before pageserver restart + - after_restart: run one gc-compaction before and one after pageserver restart + """ + SMOKE_CONF = { + # Run both gc and gc-compaction. + "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": 1024, + "lsn_lease_length": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Only in testing mode: the warning is expected because we rewrite a layer file of different generations. + # We could potentially patch the sanity-check code to not emit the warning in the future. 
+ env.pageserver.allowed_errors.append(".*was unlinked but was not dangling.*") + + row_count = 10000 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + workload.write_rows(row_count, env.pageserver.id) + + child_workloads: list[Workload] = [] + + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + workload.churn_rows(row_count, env.pageserver.id) + env.create_branch("child_branch") # so that we have a retain_lsn + workload.churn_rows(row_count, env.pageserver.id) + env.create_branch("child_branch_2") # so that we have another retain_lsn + workload.churn_rows(row_count, env.pageserver.id) + # compact 3 times if mode is before_restart + n_compactions = 3 if compaction_mode == "before_restart" else 1 + ps_http.timeline_compact( + tenant_id, timeline_id, force_l0_compaction=True, wait_until_uploaded=True ) + for _ in range(n_compactions): + # Force refresh gc info to have gc_cutoff generated + ps_http.timeline_gc(tenant_id, timeline_id, None) + ps_http.timeline_compact( + tenant_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + body={ + "scheduled": True, + "sub_compaction": True, + "sub_compaction_max_job_size_mb": 16, + }, + ) + wait_until(compaction_finished, timeout=60) + if compaction_mode == "after_restart": + env.pageserver.restart(True) + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + for _ in range(3): + ps_http.timeline_compact( + tenant_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + body={ + "scheduled": True, + "sub_compaction": True, + "sub_compaction_max_job_size_mb": 16, + }, + ) + wait_until(compaction_finished, timeout=60) + + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") + + # ensure we hit the duplicated layer key warning at least once: we did two compactions consecutively, + # and the second one should have hit the duplicated layer key warning. + if compaction_mode == "before_restart": + env.pageserver.assert_log_contains("duplicated layer key in the same generation") + else: + env.pageserver.assert_log_contains("same layer key at different generation") + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + for child_workload in child_workloads: + log.info(f"Validating at branch {child_workload.branch_name}") + child_workload.validate(env.pageserver.id) + + # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction. + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_gc(tenant_id, timeline_id, None) + + +@skip_in_debug_build("only run with release build") +def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder): + """ + Force interrupt a gc-compaction and see if anything breaks. + """ + SMOKE_CONF = { + # Run both gc and gc-compaction. 
+ "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": "1024", + "lsn_lease_length": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Only in testing mode: the warning is expected because we rewrite a layer file of different generations. + # We could potentially patch the sanity-check code to not emit the warning in the future. + env.pageserver.allowed_errors.append(".*was unlinked but was not dangling.*") + + row_count = 10000 + churn_rounds = 20 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + expected_compaction_time_seconds = 5.0 + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id) + ps_http.timeline_compact( + tenant_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + body={ + "scheduled": True, + "sub_compaction": True, + "compact_key_range": { + "start": "000000000000000000000000000000000000", + "end": "030000000000000000000000000000000000", + }, + "sub_compaction_max_job_size_mb": 16, + }, + ) + # sleep random seconds between 0 and max(compaction_time); if the result is 0, wait until the compaction is complete + # This would hopefully trigger the restart at different periods of the compaction: + # - while we are doing the compaction + # - while we finished the compaction but not yet uploaded the metadata + # - after we uploaded the metadata + time_to_sleep = random.randint(0, max(5, math.ceil(expected_compaction_time_seconds))) + if time_to_sleep == 0 or i == 1: + start = time.time() + wait_until(compaction_finished, timeout=60) + end = time.time() + expected_compaction_time_seconds = end - start + log.info( + f"expected_compaction_time_seconds updated to {expected_compaction_time_seconds} seconds" + ) + else: + time.sleep(time_to_sleep) + env.pageserver.restart(True) + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + ps_http.timeline_compact( + tenant_id, + timeline_id, + enhanced_gc_bottom_most_compaction=True, + body={ + "scheduled": True, + "sub_compaction": True, + "compact_key_range": { + "start": "000000000000000000000000000000000000", + "end": "030000000000000000000000000000000000", + }, + "sub_compaction_max_job_size_mb": 16, + }, + ) + workload.validate(env.pageserver.id) + + wait_until(compaction_finished, timeout=60) + + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") log.info("Validating at workload end ...") workload.validate(env.pageserver.id) @@ -183,6 +466,59 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): ps_http.timeline_gc(tenant_id, timeline_id, None) +@skip_in_debug_build("only run with release build") +def test_pageserver_gc_compaction_trigger(neon_env_builder: NeonEnvBuilder): + SMOKE_CONF = { + # Run both gc and 
gc-compaction. + "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": f"{1024 * 16}", + "lsn_lease_length": "0s", + "gc_compaction_enabled": "true", + "gc_compaction_initial_threshold_kb": "16", + "gc_compaction_ratio_percent": "50", + # Do not generate image layers with create_image_layers + "image_layer_creation_check_threshold": "100", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 20 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + ps_http.timeline_gc( + tenant_id, timeline_id, None + ) # Force refresh gc info to have gc_cutoff generated + + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id, upload=True) + wait_until(compaction_finished, timeout=60) + workload.validate(env.pageserver.id) + + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) + env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction") + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + # Stripe sizes in number of pages. TINY_STRIPES = 16 LARGE_STRIPES = 32768 @@ -223,7 +559,9 @@ def test_sharding_compaction( "pitr_interval": "0s", # disable background compaction and GC. We invoke it manually when we want it to happen. "gc_period": "0s", + "gc_horizon": f"{128 * 1024}", "compaction_period": "0s", + "lsn_lease_length": "0s", # create image layers eagerly: we want to exercise image layer creation in this test. "image_creation_threshold": "1", "image_layer_creation_check_threshold": 0, @@ -298,6 +636,8 @@ def test_sharding_compaction( for shard in env.storage_controller.locate(tenant_id): pageserver = env.get_pageserver(shard["node_id"]) tenant_shard_id = shard["shard_id"] + # Force refresh gc info to have gc_cutoff generated + pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None) pageserver.http_client().timeline_compact( tenant_shard_id, timeline_id, @@ -390,9 +730,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) env.pageserver.http_client().configure_failpoints((FAILPOINT, "return")) # Write some data to trigger compaction - workload.write_rows(1024, upload=False) - workload.write_rows(1024, upload=False) - workload.write_rows(1024, upload=False) + workload.write_rows(32768, upload=False) def assert_broken(): env.pageserver.assert_log_contains(BROKEN_LOG) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ba7305148f..823f2185e4 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -141,11 +141,18 @@ def test_create_snapshot( neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + # Miniature layers to enable generating non-trivial layer map without writing lots of data. 
+ "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + } + ) endpoint = env.endpoints.create_start("main") - pg_bin.run_capture(["pgbench", "--initialize", "--scale=10", endpoint.connstr()]) - pg_bin.run_capture(["pgbench", "--time=60", "--progress=2", endpoint.connstr()]) + pg_bin.run_capture(["pgbench", "--initialize", "--scale=1", endpoint.connstr()]) + pg_bin.run_capture(["pgbench", "--time=30", "--progress=2", endpoint.connstr()]) pg_bin.run_capture( ["pg_dumpall", f"--dbname={endpoint.connstr()}", f"--file={test_output_dir / 'dump.sql'}"] ) @@ -157,7 +164,9 @@ def test_create_snapshot( pageserver_http = env.pageserver.http_client() flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) - pageserver_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + pageserver_http.timeline_checkpoint( + tenant_id, timeline_id, wait_until_uploaded=True, force_image_layer_creation=True + ) env.endpoints.stop_all() for sk in env.safekeepers: @@ -242,6 +251,8 @@ def test_forward_compatibility( os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) + neon_env_builder.test_may_use_compatibility_snapshot_binaries = True + try: neon_env_builder.num_safekeepers = 3 @@ -303,7 +314,10 @@ def test_forward_compatibility( def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): - ep = env.endpoints.create_start("main") + ep = env.endpoints.create("main") + ep_env = {"LD_LIBRARY_PATH": str(env.pg_distrib_dir / f"v{env.pg_version}/lib")} + ep.start(env=ep_env) + connstr = ep.connstr() pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) @@ -352,7 +366,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r ) # Timeline exists again: restart the endpoint - ep.start() + ep.start(env=ep_env) pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] @@ -460,6 +474,14 @@ HISTORIC_DATA_SETS = [ PgVersion.V16, "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst", ), + # This dataset created on a pageserver running modern code at time of capture, but configured with no generation. This + # is our regression test that we can load data written without generations in layer file names & indices + HistoricDataSet( + "2025-02-07-nogenerations", + TenantId("e1411ca6562d6ff62419f693a5695d67"), + PgVersion.V17, + "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-02-07-pgv17-nogenerations.tar.zst", + ), ] diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index b3719a45ed..3a08671bbf 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -1,7 +1,9 @@ from __future__ import annotations +import logging + import requests -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync TEST_DB_NAMES = [ { @@ -80,7 +82,7 @@ def test_compute_catalog(neon_simple_env: NeonEnv): ddl = client.database_schema(database=test_db["name"]) # Check that it looks like a valid PostgreSQL dump - assert "-- PostgreSQL database dump" in ddl + assert "-- PostgreSQL database dump complete" in ddl # Check that it doesn't contain health_check and migration traces. 
     # They are only created in system `postgres` database, so by checking
@@ -136,3 +138,235 @@ def test_compute_create_databases(neon_simple_env: NeonEnv):
         assert curr_db is not None
         assert len(curr_db) == 1
         assert curr_db[0] == db["name"]
+
+
+def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
+    """
+    Test that compute_ctl can drop a database that has a logical replication subscription.
+    """
+    env = neon_simple_env
+
+    # Create and start endpoint so that neon_local puts all the generated
+    # stuff into the spec.json file.
+    endpoint = env.endpoints.create_start("main")
+
+    TEST_DB_NAMES = [
+        {
+            "name": "neondb",
+            "owner": "cloud_admin",
+        },
+        {
+            "name": "subscriber_db",
+            "owner": "cloud_admin",
+        },
+        {
+            "name": "publisher_db",
+            "owner": "cloud_admin",
+        },
+    ]
+
+    # Update the spec.json file to create the databases
+    # and reconfigure the endpoint to apply the changes.
+    endpoint.respec_deep(
+        **{
+            "skip_pg_catalog_updates": False,
+            "cluster": {
+                "databases": TEST_DB_NAMES,
+            },
+        }
+    )
+    endpoint.reconfigure()
+
+    # connect to the publisher_db and create a publication
+    with endpoint.cursor(dbname="publisher_db") as cursor:
+        cursor.execute("CREATE PUBLICATION mypub FOR ALL TABLES")
+        cursor.execute("select pg_catalog.pg_create_logical_replication_slot('mysub', 'pgoutput');")
+        cursor.execute("CREATE TABLE t(a int)")
+        cursor.execute("INSERT INTO t VALUES (1)")
+        cursor.execute("CHECKPOINT")
+
+    # connect to the subscriber_db and create a subscription
+    # Note that we need to create the subscription with create_slot = false,
+    # because the replication slot was already created on the publisher above.
+    connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''")
+    with endpoint.cursor(dbname="subscriber_db") as cursor:
+        cursor.execute("CREATE TABLE t(a int)")
+        cursor.execute(
+            f"CREATE SUBSCRIPTION mysub CONNECTION '{connstr}' PUBLICATION mypub WITH (create_slot = false) "
+        )
+
+    # wait for the subscription to be active
+    logical_replication_sync(
+        endpoint,
+        endpoint,
+        "mysub",
+        sub_dbname="subscriber_db",
+        pub_dbname="publisher_db",
+    )
+
+    # Check that replication is working
+    with endpoint.cursor(dbname="subscriber_db") as cursor:
+        cursor.execute("SELECT * FROM t")
+        rows = cursor.fetchall()
+        assert len(rows) == 1
+        assert rows[0][0] == 1
+
+    # drop the subscriber_db from the list
+    TEST_DB_NAMES_NEW = [
+        {
+            "name": "neondb",
+            "owner": "cloud_admin",
+        },
+        {
+            "name": "publisher_db",
+            "owner": "cloud_admin",
+        },
+    ]
+    # Update the spec.json file to drop the database
+    # and reconfigure the endpoint to apply the changes.
+ endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES_NEW, + }, + "delta_operations": [ + {"action": "delete_db", "name": "subscriber_db"}, + # also test the case when we try to delete a non-existent database + # shouldn't happen in normal operation, + # but can occur when failed operations are retried + {"action": "delete_db", "name": "nonexistent_db"}, + ], + } + ) + + logging.info("Reconfiguring the endpoint to drop the subscriber_db") + endpoint.reconfigure() + + # Check that the subscriber_db is dropped + with endpoint.cursor() as cursor: + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", ("subscriber_db",)) + catalog_db = cursor.fetchone() + assert catalog_db is None + + # Check that we can still connect to the publisher_db + with endpoint.cursor(dbname="publisher_db") as cursor: + cursor.execute("SELECT * FROM current_database()") + curr_db = cursor.fetchone() + assert curr_db is not None + assert len(curr_db) == 1 + assert curr_db[0] == "publisher_db" + + +def test_compute_drop_role(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can drop a role even if it has some depending objects + like permissions in one of the databases. + Reproduction test for https://github.com/neondatabase/cloud/issues/13582 + """ + env = neon_simple_env + TEST_DB_NAME = "db_with_permissions" + + endpoint = env.endpoints.create_start("main") + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [ + { + # We need to create role via compute_ctl, because in this case it will receive + # additional grants equivalent to our real environment, so we can repro some + # issues. + "name": "neon", + # Some autocomplete-suggested hash, no specific meaning. + "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", + "options": [], + }, + ], + "databases": [ + { + "name": TEST_DB_NAME, + "owner": "neon", + }, + ], + }, + } + ) + endpoint.reconfigure() + + with endpoint.cursor(dbname=TEST_DB_NAME) as cursor: + # Create table and view as `cloud_admin`. This is the case when, for example, + # PostGIS extensions creates tables in `public` schema. + cursor.execute("create table test_table (id int)") + cursor.execute("create view test_view as select * from test_table") + + with endpoint.cursor(dbname=TEST_DB_NAME, user="neon") as cursor: + cursor.execute("create role readonly") + # We (`compute_ctl`) make 'neon' the owner of schema 'public' in the owned database. + # Postgres has all sorts of permissions and grants that we may not handle well, + # but this is the shortest repro grant for the issue + # https://github.com/neondatabase/cloud/issues/13582 + cursor.execute("grant select on all tables in schema public to readonly") + + # Check that role was created + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + role = cursor.fetchone() + assert role is not None + + # Confirm that we actually have some permissions for 'readonly' role + # that may block our ability to drop the role. 
+ with endpoint.cursor(dbname=TEST_DB_NAME) as cursor: + cursor.execute( + "select grantor from information_schema.role_table_grants where grantee = 'readonly'" + ) + res = cursor.fetchall() + assert len(res) == 2, f"Expected 2 table grants, got {len(res)}" + for row in res: + assert row[0] == "neon_superuser" + + # Drop role via compute_ctl + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": "readonly", + }, + ], + } + ) + endpoint.reconfigure() + + # Check that role is dropped + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'") + role = cursor.fetchone() + assert role is None + + # + # Drop schema 'public' and check that we can still drop the role + # + with endpoint.cursor(dbname=TEST_DB_NAME) as cursor: + cursor.execute("create role readonly2") + cursor.execute("grant select on all tables in schema public to readonly2") + cursor.execute("drop schema public cascade") + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": "readonly2", + }, + ], + } + ) + endpoint.reconfigure() + + with endpoint.cursor() as cursor: + cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly2'") + role = cursor.fetchone() + assert role is None diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 787790103f..b360162dc1 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -3,19 +3,24 @@ from __future__ import annotations import enum import os import shutil +import sys from enum import StrEnum +from logging import debug from pathlib import Path from typing import TYPE_CHECKING, cast -# Docs are available at https://jsonnet.org/ref/bindings.html#python_api -import _jsonnet import pytest import requests import yaml +from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR +from fixtures.utils import wait_until +from prometheus_client.samples import Sample if TYPE_CHECKING: + from collections.abc import Callable from types import TracebackType from typing import Self, TypedDict @@ -87,6 +92,10 @@ def jsonnet_evaluate_file( ext_vars: str | dict[str, str] | None = None, tla_vars: str | dict[str, str] | None = None, ) -> str: + # Jsonnet doesn't support Python 3.13 yet + # Docs are available at https://jsonnet.org/ref/bindings.html#python_api + import _jsonnet + return cast( "str", _jsonnet.evaluate_file( @@ -121,6 +130,7 @@ class SqlExporterProcess(StrEnum): AUTOSCALING = "autoscaling" +@pytest.mark.xfail(sys.version_info >= (3, 13), reason="Jsonnet doesn't support Python 3.13 yet") @pytest.mark.parametrize( "collector_name", ["neon_collector", "neon_collector_autoscaling"], @@ -215,7 +225,7 @@ if SQL_EXPORTER is None: # # The "host" network mode allows sql_exporter to talk to the # endpoint which is running on the host. 
- super().__init__("docker.io/burningalchemist/sql_exporter:0.16.0", network_mode="host") + super().__init__("docker.io/burningalchemist/sql_exporter:0.17.0", network_mode="host") self.__logs_dir = logs_dir self.__port = port @@ -248,7 +258,7 @@ if SQL_EXPORTER is None: log.info("Waiting for sql_exporter to be ready") wait_for_logs( self, - rf'level=info msg="Listening on" address=\[::\]:{self.__port}', + rf'msg="Listening on" address=\[::\]:{self.__port}', timeout=5, ) @@ -340,10 +350,7 @@ else: time.sleep(0.5) continue - if ( - f'level=info msg="Listening on" address=[::]:{self._sql_exporter_port}' - in line - ): + if f'msg="Listening on" address=[::]:{self._sql_exporter_port}' in line: break @override @@ -352,6 +359,7 @@ else: self.__proc.wait() +@pytest.mark.xfail(sys.version_info >= (3, 13), reason="Jsonnet doesn't support Python 3.13 yet") @pytest.mark.parametrize( "exporter", [SqlExporterProcess.COMPUTE, SqlExporterProcess.AUTOSCALING], @@ -465,3 +473,100 @@ def test_perf_counters(neon_simple_env: NeonEnv): cur.execute("CREATE EXTENSION neon VERSION '1.5'") cur.execute("SELECT * FROM neon_perf_counters") cur.execute("SELECT * FROM neon_backend_perf_counters") + + +def collect_metric( + client: EndpointHttpClient, + name: str, + filter: dict[str, str], + predicate: Callable[[list[Sample]], bool], +) -> Callable[[], list[Sample]]: + """ + Call this function as the first argument to wait_until(). + """ + + def __collect_metric() -> list[Sample]: + resp = client.metrics() + debug("Metrics: %s", resp) + m = parse_metrics(resp) + samples = m.query_all(name, filter) + debug("Samples: %s", samples) + assert predicate(samples), "predicate failed" + return samples + + return __collect_metric + + +def test_compute_installed_extensions_metric(neon_simple_env: NeonEnv): + """ + Test that the compute_installed_extensions properly reports accurate + results. Important to note that currently this metric is only gathered on + compute start. We install the neon extension into a database other than + postgres because compute_ctl will run `ALTER EXTENSION neon UPDATE` during + Postgres startup in the postgres database, creating a race condition. + """ + DB_NAME = "test" + + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + endpoint.safe_psql(f"CREATE DATABASE {DB_NAME}") + + # The metric is only gathered on compute start, so restart to check that + # plpgsql is now in 3 databases, instead of its regular 2, template1 and + # postgres. + endpoint.stop() + endpoint.start() + + client = endpoint.http_client() + + def __has_plpgsql(samples: list[Sample]) -> bool: + """ + Check that plpgsql is installed in the template1, postgres, and test + databases + """ + return len(samples) == 1 and samples[0].value == 3 + + wait_until( + collect_metric( + client, + "compute_installed_extensions", + {"extension_name": "plpgsql", "version": "1.0", "owned_by_superuser": "1"}, + __has_plpgsql, + ), + name="compute_installed_extensions", + ) + + # Install the neon extension, so we can check for it on the restart. + endpoint.safe_psql("CREATE EXTENSION neon VERSION '1.0'", dbname=DB_NAME) + + # The metric is only gathered on compute start, so restart to check if the + # neon extension will now be there. 
+ endpoint.stop() + endpoint.start() + + client = endpoint.http_client() + + def __has_neon(samples: list[Sample]) -> bool: + return len(samples) == 1 and samples[0].value == 1 + + wait_until( + collect_metric( + client, + "compute_installed_extensions", + {"extension_name": "neon", "version": "1.0", "owned_by_superuser": "1"}, + __has_neon, + ), + name="compute_installed_extensions", + ) + + # Double check that we also still have plpgsql + wait_until( + collect_metric( + client, + "compute_installed_extensions", + {"extension_name": "plpgsql", "version": "1.0", "owned_by_superuser": "1"}, + __has_plpgsql, + ), + name="compute_installed_extensions", + ) diff --git a/test_runner/regress/test_compute_migrations.py b/test_runner/regress/test_compute_migrations.py new file mode 100644 index 0000000000..0dbb187c39 --- /dev/null +++ b/test_runner/regress/test_compute_migrations.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, cast + +import pytest +from fixtures.compute_migrations import COMPUTE_MIGRATIONS, NUM_COMPUTE_MIGRATIONS +from fixtures.metrics import parse_metrics +from fixtures.utils import wait_until + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_dir: Path): + """ + Test that compute_ctl can recover from migration failures next time it + starts, and that the persisted migration ID is correct in such cases. + """ + env = neon_simple_env + + endpoint = env.endpoints.create("main") + endpoint.respec(skip_pg_catalog_updates=False) + + for i in range(1, NUM_COMPUTE_MIGRATIONS + 1): + endpoint.start(env={"FAILPOINTS": f"compute-migration=return({i})"}) + + # Check that migration failure is properly recorded in the metrics + # + # N.B. wait_for_migrations() only waits till the last successful + # migration is applied. It doesn't wait till the migration failure due + # to the failpoint. This opens a race for checking the metrics. To avoid + # this, we first wait until the migration failure metric is seen. 
+        def check_migration_failure_metrics():
+            client = endpoint.http_client()
+            raw_metrics = client.metrics()
+            metrics = parse_metrics(raw_metrics)
+            failed_migration = metrics.query_all(
+                "compute_ctl_db_migration_failed_total",
+            )
+            assert len(failed_migration) == 1
+            for sample in failed_migration:
+                assert sample.value == 1
+
+        wait_until(check_migration_failure_metrics)
+
+        # Make sure that all migrations before the failed one are applied
+        endpoint.wait_for_migrations(wait_for=i - 1)
+
+        # Confirm that we correctly recorded that in the
+        # neon_migration.migration_id table
+        with endpoint.cursor() as cur:
+            cur.execute("SELECT id FROM neon_migration.migration_id")
+            migration_id = cast("int", cur.fetchall()[0][0])
+            assert migration_id == i - 1
+
+        endpoint.stop()
+
+    endpoint.start()
+
+    # Now wait for the rest of the migrations
+    endpoint.wait_for_migrations()
+
+    with endpoint.cursor() as cur:
+        cur.execute("SELECT id FROM neon_migration.migration_id")
+        migration_id = cast("int", cur.fetchall()[0][0])
+        assert migration_id == NUM_COMPUTE_MIGRATIONS
+
+    for i, m in enumerate(COMPUTE_MIGRATIONS, start=1):
+        migration_query = (compute_migrations_dir / m).read_text(encoding="utf-8")
+        if not migration_query.startswith("-- SKIP"):
+            pattern = rf"Running migration id={i}"
+        else:
+            pattern = rf"Skipping migration id={i}"
+
+        endpoint.log_contains(pattern)
+
+
+@pytest.mark.parametrize(
+    "migration",
+    (pytest.param((i, m), id=str(i)) for i, m in enumerate(COMPUTE_MIGRATIONS, start=1)),
+)
+def test_compute_migrations_e2e(
+    neon_simple_env: NeonEnv,
+    compute_migrations_dir: Path,
+    compute_migrations_test_dir: Path,
+    migration: tuple[int, str],
+):
+    """
+    Test that the migrations perform as advertised.
+    """
+    env = neon_simple_env
+
+    migration_id = migration[0]
+    migration_filename = migration[1]
+
+    migration_query = (compute_migrations_dir / migration_filename).read_text(encoding="utf-8")
+    if migration_query.startswith("-- SKIP"):
+        pytest.skip("The migration is marked as SKIP")
+
+    endpoint = env.endpoints.create("main")
+    endpoint.respec(skip_pg_catalog_updates=False)
+
+    # Stop applying migrations after the one we want to test, so that we can
+    # test the state of the cluster at the given migration ID
+    endpoint.start(env={"FAILPOINTS": f"compute-migration=return({migration_id + 1})"})
+
+    endpoint.wait_for_migrations(wait_for=migration_id)
+
+    check_query = (compute_migrations_test_dir / migration_filename).read_text(encoding="utf-8")
+    endpoint.safe_psql(check_query)
diff --git a/test_runner/regress/test_compute_reconfigure.py b/test_runner/regress/test_compute_reconfigure.py
new file mode 100644
index 0000000000..6619548811
--- /dev/null
+++ b/test_runner/regress/test_compute_reconfigure.py
@@ -0,0 +1,62 @@
+from __future__ import annotations
+
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.utils import wait_until
+
+
+def test_compute_reconfigure(neon_simple_env: NeonEnv):
+    """
+    Test that we can change postgresql.conf settings even if
+    skip_pg_catalog_updates=True is set.
+ """ + env = neon_simple_env + + TEST_LOG_LINE_PREFIX = "%m [%p] [test_compute_reconfigure]: " + + endpoint = env.endpoints.create_start("main") + + # Check that the log line prefix is not set + # or different from TEST_LOG_LINE_PREFIX + with endpoint.cursor() as cursor: + cursor.execute("SHOW log_line_prefix;") + row = cursor.fetchone() + assert row is not None + assert row[0] != TEST_LOG_LINE_PREFIX + + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": True, + "cluster": { + "settings": [ + { + "name": "log_line_prefix", + "vartype": "string", + "value": TEST_LOG_LINE_PREFIX, + } + ] + }, + } + ) + endpoint.reconfigure() + + # Check that in logs we see that it was actually reconfigured, + # not restarted or something else. + endpoint.log_contains("INFO request{method=POST uri=/configure") + + # In /configure we only send SIGHUP at the end, so in theory + # it doesn't necessarily mean that Postgres already reloaded + # the new config; and it may race in some envs. + # So we wait until we see the log line that the config was changed. + def check_logs(): + endpoint.log_contains( + f'[test_compute_reconfigure]: LOG: parameter "log_line_prefix" changed to "{TEST_LOG_LINE_PREFIX}"' + ) + + wait_until(check_logs) + + # Check that the log line prefix is set + with endpoint.cursor() as cursor: + cursor.execute("SHOW log_line_prefix;") + row = cursor.fetchone() + assert row is not None + assert row[0] == TEST_LOG_LINE_PREFIX diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 1c5554c379..b10e38885e 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -15,6 +15,8 @@ from werkzeug.wrappers.response import Response if TYPE_CHECKING: from typing import Any, Self + from fixtures.httpserver import ListenAddress + def handle_db(dbs, roles, operation): if operation["op"] == "set": @@ -58,14 +60,12 @@ def ddl_forward_handler( if request.json is None: log.info("Received invalid JSON") return Response(status=400) - json = request.json + json: dict[str, list[str]] = request.json # Handle roles first - if "roles" in json: - for operation in json["roles"]: - handle_role(dbs, roles, operation) - if "dbs" in json: - for operation in json["dbs"]: - handle_db(dbs, roles, operation) + for operation in json.get("roles", []): + handle_role(dbs, roles, operation) + for operation in json.get("dbs", []): + handle_db(dbs, roles, operation) return Response(status=200) @@ -120,7 +120,7 @@ class DdlForwardingContext: @pytest.fixture(scope="function") def ddl( - httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: tuple[str, int] + httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: ListenAddress ): (host, port) = httpserver_listen_address with DdlForwardingContext(httpserver, vanilla_pg, host, port) as ddl: @@ -205,6 +205,23 @@ def test_ddl_forwarding(ddl: DdlForwardingContext): ddl.wait() assert ddl.roles == {} + cur.execute("CREATE ROLE bork WITH PASSWORD 'newyork'") + cur.execute("BEGIN") + cur.execute("SAVEPOINT point") + cur.execute("DROP ROLE bork") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {} + + cur.execute("CREATE ROLE bork WITH PASSWORD 'oldyork'") + cur.execute("BEGIN") + cur.execute("SAVEPOINT point") + cur.execute("ALTER ROLE bork PASSWORD NULL") + cur.execute("COMMIT") + cur.execute("DROP ROLE bork") + ddl.wait() + assert ddl.roles == {} + cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'") cur.execute("CREATE DATABASE 
stork WITH OWNER=bork") cur.execute("ALTER ROLE bork RENAME TO cork") diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 954db914b9..7abcdb3838 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -460,10 +460,10 @@ def test_pageserver_respects_overridden_resident_size( assert ( du_by_timeline[large_tenant] > min_resident_size ), "ensure the larger tenant will get a haircut" - env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side( + env.neon_env.storage_controller.pageserver_api().update_tenant_config( small_tenant[0], {"min_resident_size_override": min_resident_size} ) - env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side( + env.neon_env.storage_controller.pageserver_api().update_tenant_config( large_tenant[0], {"min_resident_size_override": min_resident_size} ) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index b2e19ad713..7f12c14073 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( NeonEnvBuilder, ) @@ -20,6 +21,8 @@ from werkzeug.wrappers.response import Response if TYPE_CHECKING: from typing import Any + from fixtures.httpserver import ListenAddress + # use neon_env_builder_local fixture to override the default neon_env_builder fixture # and use a test-specific pg_install instead of shared one @@ -47,8 +50,8 @@ def neon_env_builder_local( def test_remote_extensions( httpserver: HTTPServer, neon_env_builder_local: NeonEnvBuilder, - httpserver_listen_address, - pg_version, + httpserver_listen_address: ListenAddress, + pg_version: PgVersion, ): # setup mock http server # that expects request for anon.tar.zst @@ -92,6 +95,8 @@ def test_remote_extensions( # mock remote_extensions spec spec: dict[str, Any] = { + "public_extensions": ["anon"], + "custom_extensions": None, "library_index": { "anon": "anon", }, @@ -126,6 +131,17 @@ def test_remote_extensions( httpserver.check() + # Check that we properly recorded downloads in the metrics + client = endpoint.http_client() + raw_metrics = client.metrics() + metrics = parse_metrics(raw_metrics) + remote_ext_requests = metrics.query_all( + "compute_ctl_remote_ext_requests_total", + ) + assert len(remote_ext_requests) == 1 + for sample in remote_ext_requests: + assert sample.value == 1 + # TODO # 1. Test downloading remote library. @@ -135,7 +151,7 @@ def test_remote_extensions( # # 3.Test that extension is downloaded after endpoint restart, # when the library is used in the query. -# Run the test with mutliple simultaneous connections to an endpoint. +# Run the test with multiple simultaneous connections to an endpoint. # to ensure that the extension is downloaded only once. # # 4. Test that private extensions are only downloaded when they are present in the spec. 
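+#
+# A possible shape for item 3 above, as a sketch only: it assumes the endpoint's start()
+# accepts a `remote_ext_config` argument and that the `anon` extension exposes an
+# `anon.version()` function; neither is shown in this change, so treat both as placeholders.
+#
+#     from concurrent.futures import ThreadPoolExecutor
+#
+#     endpoint.stop()
+#     endpoint.start(remote_ext_config=extensions_endpoint)
+#     # Issue the same extension-backed query from several connections at once ...
+#     with ThreadPoolExecutor(max_workers=4) as pool:
+#         list(pool.map(lambda _: endpoint.safe_psql("SELECT anon.version()"), range(4)))
+#     # ... then verify on the mock HTTP server that anon.tar.zst was fetched only once.
+#     httpserver.check()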
diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py
index 0217cd0d03..03bfd1cb8d 100644
--- a/test_runner/regress/test_endpoint_crash.py
+++ b/test_runner/regress/test_endpoint_crash.py
@@ -2,6 +2,8 @@ from __future__ import annotations
 
 import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.pg_version import PgVersion
+from fixtures.utils import WITH_SANITIZERS, run_only_on_postgres
 
 
 @pytest.mark.parametrize(
@@ -23,3 +25,20 @@ def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str):
     endpoint.safe_psql("CREATE EXTENSION neon_test_utils;")
     with pytest.raises(Exception, match="This probably means the server terminated abnormally"):
         endpoint.safe_psql(f"SELECT {sql_func}();")
+
+
+@run_only_on_postgres([PgVersion.V17], "Currently, build with sanitizers is possible with v17 only")
+def test_sanitizers(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that undefined behavior leads to endpoint abort with sanitizers enabled
+    """
+    env = neon_env_builder.init_start()
+    env.create_branch("test_ubsan")
+    endpoint = env.endpoints.create_start("test_ubsan")
+
+    # Test case based on https://www.postgresql.org/message-id/17167-028026e4ca333817@postgresql.org
+    if not WITH_SANITIZERS:
+        endpoint.safe_psql("SELECT 1::int4 << 128")
+    else:
+        with pytest.raises(Exception, match="This probably means the server terminated abnormally"):
+            endpoint.safe_psql("SELECT 1::int4 << 128")
diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py
index 29229b73c1..71e0d16edd 100644
--- a/test_runner/regress/test_import_pgdata.py
+++ b/test_runner/regress/test_import_pgdata.py
@@ -1,21 +1,27 @@
+import base64
 import json
 import re
 import time
 from enum import Enum
+from pathlib import Path
 
 import psycopg2
 import psycopg2.errors
 import pytest
 from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.fast_import import FastImport
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, VanillaPostgres
+from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PgProtocol, VanillaPostgres
 from fixtures.pageserver.http import (
     ImportPgdataIdemptencyKey,
     PageserverApiException,
 )
 from fixtures.pg_version import PgVersion
-from fixtures.remote_storage import RemoteStorageKind
-from fixtures.utils import run_only_on_postgres
+from fixtures.port_distributor import PortDistributor
+from fixtures.remote_storage import MockS3Server, RemoteStorageKind
+from mypy_boto3_kms import KMSClient
+from mypy_boto3_kms.type_defs import EncryptResponseTypeDef
+from mypy_boto3_s3 import S3Client
 from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
@@ -37,10 +43,6 @@ smoke_params = [
 ]
 
 
-@run_only_on_postgres(
-    [PgVersion.V14, PgVersion.V15, PgVersion.V16],
-    "newer control file catalog version and struct format isn't supported",
-)
 @pytest.mark.parametrize("shard_count,stripe_size,rel_block_size", smoke_params)
 def test_pgdata_import_smoke(
     vanilla_pg: VanillaPostgres,
@@ -63,6 +65,9 @@ def test_pgdata_import_smoke(
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
     env = neon_env_builder.init_start()
 
+    # The test needs LocalFs support, which is only built in testing mode.
+ env.pageserver.is_testing_enabled_or_skip() + env.pageserver.patch_config_toml_nonrecursive( { "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api" @@ -71,6 +76,12 @@ def test_pgdata_import_smoke( env.pageserver.stop() env.pageserver.start() + # By default our tests run with a tiny shared_buffers=1MB setting. That + # doesn't allow any prefetching on v17 and above, where the new streaming + # read machinery keeps buffers pinned while prefetching them. Use a higher + # setting to enable prefetching and speed up the tests + ep_config = ["shared_buffers=64MB"] + # # Put data in vanilla pg # @@ -84,6 +95,8 @@ def test_pgdata_import_smoke( elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD: target_relblock_size = (shard_count or 1) * stripe_size * 8192 * 2 elif rel_block_size == RelBlockSize.MULTIPLE_RELATION_SEGMENTS: + # Postgres uses a 1GiB segment size, fixed at compile time, so we must use >2GB of data + # to exercise multiple segments. target_relblock_size = int(((2.333 * 1024 * 1024 * 1024) // 8192) * 8192) else: raise ValueError @@ -96,13 +109,15 @@ def test_pgdata_import_smoke( while True: relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')") log.info( - f"relblock size: {relblock_size/8192} pages (target: {target_relblock_size//8192}) pages" + f"relblock size: {relblock_size / 8192} pages (target: {target_relblock_size // 8192}) pages" ) if relblock_size >= target_relblock_size: break addrows = int((target_relblock_size - relblock_size) // 8192) assert addrows >= 1, "forward progress" - vanilla_pg.safe_psql(f"insert into t select generate_series({nrows+1}, {nrows + addrows})") + vanilla_pg.safe_psql( + f"insert into t select generate_series({nrows + 1}, {nrows + addrows})" + ) nrows += addrows expect_nrows = nrows expect_sum = ( @@ -111,9 +126,17 @@ def test_pgdata_import_smoke( def validate_vanilla_equivalence(ep): # TODO: would be nicer to just compare pgdump - assert ep.safe_psql("select count(*), sum(data::bigint)::bigint from t") == [ - (expect_nrows, expect_sum) - ] + + # Enable IO concurrency for batching on large sequential scan, to avoid making + # this test unnecessarily onerous on CPU. Especially on debug mode, it's still + # pretty onerous though, so increase statement_timeout to avoid timeouts. 
+ assert ep.safe_psql_many( + [ + "set effective_io_concurrency=32;", + "SET statement_timeout='300s';", + "select count(*), sum(data::bigint)::bigint from t", + ] + ) == [[], [], [(expect_nrows, expect_sum)]] validate_vanilla_equivalence(vanilla_pg) @@ -216,14 +239,14 @@ def test_pgdata_import_smoke( shard_zero_http = shard_zero_ps.http_client() shard_zero_timeline_info = shard_zero_http.timeline_detail(shard_zero["shard_id"], timeline_id) initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"]) - latest_gc_cutoff_lsn = Lsn(shard_zero_timeline_info["latest_gc_cutoff_lsn"]) + min_readable_lsn = Lsn(shard_zero_timeline_info["min_readable_lsn"]) last_record_lsn = Lsn(shard_zero_timeline_info["last_record_lsn"]) disk_consistent_lsn = Lsn(shard_zero_timeline_info["disk_consistent_lsn"]) _remote_consistent_lsn = Lsn(shard_zero_timeline_info["remote_consistent_lsn"]) remote_consistent_lsn_visible = Lsn(shard_zero_timeline_info["remote_consistent_lsn_visible"]) # assert remote_consistent_lsn_visible == remote_consistent_lsn TODO: this fails initially and after restart, presumably because `UploadQueue::clean.1` is still `None` assert remote_consistent_lsn_visible == disk_consistent_lsn - assert initdb_lsn == latest_gc_cutoff_lsn + assert initdb_lsn == min_readable_lsn assert disk_consistent_lsn == initdb_lsn + 8 assert last_record_lsn == disk_consistent_lsn # TODO: assert these values are the same everywhere @@ -237,7 +260,11 @@ def test_pgdata_import_smoke( # ro_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, endpoint_id="ro", tenant_id=tenant_id, lsn=last_record_lsn + branch_name=import_branch_name, + endpoint_id="ro", + tenant_id=tenant_id, + lsn=last_record_lsn, + config_lines=ep_config, ) validate_vanilla_equivalence(ro_endpoint) @@ -267,7 +294,10 @@ def test_pgdata_import_smoke( # validate that we can write # rw_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, endpoint_id="rw", tenant_id=tenant_id + branch_name=import_branch_name, + endpoint_id="rw", + tenant_id=tenant_id, + config_lines=ep_config, ) rw_endpoint.safe_psql("create table othertable(values text)") rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()")) @@ -287,7 +317,7 @@ def test_pgdata_import_smoke( ancestor_start_lsn=rw_lsn, ) br_tip_endpoint = env.endpoints.create_start( - branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config ) validate_vanilla_equivalence(br_tip_endpoint) br_tip_endpoint.safe_psql("select * from othertable") @@ -300,8 +330,375 @@ def test_pgdata_import_smoke( ancestor_start_lsn=initdb_lsn, ) br_initdb_endpoint = env.endpoints.create_start( - branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + branch_name="br-initdb", + endpoint_id="br-initdb-ro", + tenant_id=tenant_id, + config_lines=ep_config, ) validate_vanilla_equivalence(br_initdb_endpoint) with pytest.raises(psycopg2.errors.UndefinedTable): br_initdb_endpoint.safe_psql("select * from othertable") + + +def test_fast_import_with_pageserver_ingest( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + mock_s3_server: MockS3Server, + mock_kms: KMSClient, + mock_s3_client: S3Client, + neon_env_builder: NeonEnvBuilder, + make_httpserver: HTTPServer, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + 
KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Start source postgres and ingest data + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + # Setup pageserver and fake cplane for import progress + def handler(request: Request) -> Response: + log.info(f"control plane request: {request.json}") + return Response(json.dumps({}), status=200) + + cplane_mgmt_api_server = make_httpserver + cplane_mgmt_api_server.expect_request(re.compile(".*")).respond_with_handler(handler) + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start() + + env.pageserver.patch_config_toml_nonrecursive( + { + "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api", + # because import_pgdata code uses this endpoint, not the one in common remote storage config + # TODO: maybe use common remote_storage config in pageserver? + "import_pgdata_aws_endpoint_url": env.s3_mock_server.endpoint(), + } + ) + env.pageserver.stop() + env.pageserver.start() + + # Encrypt connstrings and put spec into S3 + source_connstring_encrypted = encrypt(vanilla_pg.connstr()) + spec = { + "encryption_secret": {"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode( + source_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + "project_id": "someproject", + "branch_id": "somebranch", + } + + bucket = "test-bucket" + key_prefix = "test-prefix" + mock_s3_client.create_bucket(Bucket=bucket) + mock_s3_client.put_object(Bucket=bucket, Key=f"{key_prefix}/spec.json", Body=json.dumps(spec)) + + # Create timeline with import_pgdata + tenant_id = TenantId.generate() + env.storage_controller.tenant_create(tenant_id) + + timeline_id = TimelineId.generate() + log.info("starting import") + start = time.monotonic() + + idempotency = ImportPgdataIdemptencyKey.random() + log.info(f"idempotency key {idempotency}") + # TODO: teach neon_local CLI about the idempotency & 429 error so we can run inside the loop + # and check for 429 + + import_branch_name = "imported" + env.storage_controller.timeline_create( + tenant_id, + { + "new_timeline_id": str(timeline_id), + "import_pgdata": { + "idempotency_key": str(idempotency), + "location": { + "AwsS3": { + "region": env.s3_mock_server.region(), + "bucket": bucket, + "key": key_prefix, + } + }, + }, + }, + ) + env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) + + # Run fast_import + if fast_import.extra_env is None: + fast_import.extra_env = {} + fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() + fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() + fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() + fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() + fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() + fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" + pg_port = port_distributor.get_port() + fast_import.run_pgdata(pg_port=pg_port, s3prefix=f"s3://{bucket}/{key_prefix}") + vanilla_pg.stop() + + def validate_vanilla_equivalence(ep): + res = ep.safe_psql("SELECT count(*), sum(a) FROM foo;", dbname="neondb") + assert res[0] == (10, 55), f"got result: {res}" + + # Sanity check that data 
in pgdata is expected: + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + validate_vanilla_equivalence(conn) + + # Poll pageserver statuses in s3 + while True: + locations = env.storage_controller.locate(tenant_id) + active_count = 0 + for location in locations: + shard_id = TenantShardId.parse(location["shard_id"]) + ps = env.get_pageserver(location["node_id"]) + try: + detail = ps.http_client().timeline_detail(shard_id, timeline_id) + log.info(f"timeline {tenant_id}/{timeline_id} detail: {detail}") + state = detail["state"] + log.info(f"shard {shard_id} state: {state}") + if state == "Active": + active_count += 1 + except PageserverApiException as e: + if e.status_code == 404: + log.info("not found, import is in progress") + continue + elif e.status_code == 429: + log.info("import is in progress") + continue + else: + raise + + if state == "Active": + key = f"{key_prefix}/status/shard-{shard_id.shard_index}" + shard_status_file_contents = ( + mock_s3_client.get_object(Bucket=bucket, Key=key)["Body"].read().decode("utf-8") + ) + shard_status = json.loads(shard_status_file_contents) + assert shard_status["done"] is True + + if active_count == len(locations): + log.info("all shards are active") + break + time.sleep(0.5) + + import_duration = time.monotonic() - start + log.info(f"import complete; duration={import_duration:.2f}s") + + ep = env.endpoints.create_start(branch_name=import_branch_name, tenant_id=tenant_id) + + # check that data is there + validate_vanilla_equivalence(ep) + + # check that we can do basic ops + + ep.safe_psql("create table othertable(values text)", dbname="neondb") + rw_lsn = Lsn(ep.safe_psql_scalar("select pg_current_wal_flush_lsn()")) + ep.stop() + + # ... at the tip + _ = env.create_branch( + new_branch_name="br-tip", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=rw_lsn, + ) + br_tip_endpoint = env.endpoints.create_start( + branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_tip_endpoint) + br_tip_endpoint.safe_psql("select * from othertable", dbname="neondb") + br_tip_endpoint.stop() + + # ... 
at the initdb lsn + locations = env.storage_controller.locate(tenant_id) + [shard_zero] = [ + loc for loc in locations if TenantShardId.parse(loc["shard_id"]).shard_number == 0 + ] + shard_zero_ps = env.get_pageserver(shard_zero["node_id"]) + shard_zero_timeline_info = shard_zero_ps.http_client().timeline_detail( + shard_zero["shard_id"], timeline_id + ) + initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"]) + _ = env.create_branch( + new_branch_name="br-initdb", + ancestor_branch_name=import_branch_name, + tenant_id=tenant_id, + ancestor_start_lsn=initdb_lsn, + ) + br_initdb_endpoint = env.endpoints.create_start( + branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id + ) + validate_vanilla_equivalence(br_initdb_endpoint) + with pytest.raises(psycopg2.errors.UndefinedTable): + br_initdb_endpoint.safe_psql("select * from othertable", dbname="neondb") + br_initdb_endpoint.stop() + + env.pageserver.stop(immediate=True) + + +def test_fast_import_binary( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, +): + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + pg_port = port_distributor.get_port() + fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr()) + vanilla_pg.stop() + + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + res = conn.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + +def test_fast_import_restore_to_connstring( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, +): + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + pgdatadir = test_output_dir / "destination-pgdata" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg: + destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) + destination_vanilla_pg.start() + + # create another database & role and try to restore there + destination_vanilla_pg.safe_psql(""" + CREATE ROLE testrole WITH + LOGIN + PASSWORD 'testpassword' + NOSUPERUSER + NOCREATEDB + NOCREATEROLE; + """) + destination_vanilla_pg.safe_psql("CREATE DATABASE testdb OWNER testrole;") + + destination_connstring = destination_vanilla_pg.connstr( + dbname="testdb", user="testrole", password="testpassword" + ) + fast_import.run_dump_restore( + source_connection_string=vanilla_pg.connstr(), + destination_connection_string=destination_connstring, + ) + vanilla_pg.stop() + conn = PgProtocol(dsn=destination_connstring) + res = conn.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + +def test_fast_import_restore_to_connstring_from_s3_spec( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, + pg_distrib_dir: Path, + pg_version: PgVersion, + 
mock_s3_server: MockS3Server, + mock_kms: KMSClient, + mock_s3_client: S3Client, +): + # Prepare KMS and S3 + key_response = mock_kms.create_key( + Description="Test key", + KeyUsage="ENCRYPT_DECRYPT", + Origin="AWS_KMS", + ) + key_id = key_response["KeyMetadata"]["KeyId"] + + def encrypt(x: str) -> EncryptResponseTypeDef: + return mock_kms.encrypt(KeyId=key_id, Plaintext=x) + + # Start source postgres and ingest data + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);") + + # Start target postgres + pgdatadir = test_output_dir / "destination-pgdata" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg: + destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) + destination_vanilla_pg.start() + + # Encrypt connstrings and put spec into S3 + source_connstring_encrypted = encrypt(vanilla_pg.connstr()) + destination_connstring_encrypted = encrypt(destination_vanilla_pg.connstr()) + spec = { + "encryption_secret": {"KMS": {"key_id": key_id}}, + "source_connstring_ciphertext_base64": base64.b64encode( + source_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + "destination_connstring_ciphertext_base64": base64.b64encode( + destination_connstring_encrypted["CiphertextBlob"] + ).decode("utf-8"), + } + + mock_s3_client.create_bucket(Bucket="test-bucket") + mock_s3_client.put_object( + Bucket="test-bucket", Key="test-prefix/spec.json", Body=json.dumps(spec) + ) + + # Run fast_import + if fast_import.extra_env is None: + fast_import.extra_env = {} + fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key() + fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key() + fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token() + fast_import.extra_env["AWS_REGION"] = mock_s3_server.region() + fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint() + fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug" + fast_import.run_dump_restore(s3prefix="s3://test-bucket/test-prefix") + vanilla_pg.stop() + + res = destination_vanilla_pg.safe_psql("SELECT count(*) FROM foo;") + log.info(f"Result: {res}") + assert res[0][0] == 10 + + +# TODO: Maybe test with pageserver? +# 1. run whole neon env +# 2. create timeline with some s3 path??? +# 3. run fast_import with s3 prefix +# 4. ??? mock http where pageserver will report progress +# 5. 
run compute on this timeline and check if data is there diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py index 9c9bc5b519..7e99d4b2f2 100644 --- a/test_runner/regress/test_ingestion_layer_size.py +++ b/test_runner/regress/test_ingestion_layer_size.py @@ -74,7 +74,7 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): print_layer_size_histogram(post_ingest) # since all we have are L0s, we should be getting nice L1s and images out of them now - env.storage_controller.pageserver_api().patch_tenant_config_client_side( + env.storage_controller.pageserver_api().update_tenant_config( env.initial_tenant, { "compaction_threshold": 1, diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py deleted file mode 100644 index 04ccec5875..0000000000 --- a/test_runner/regress/test_installed_extensions.py +++ /dev/null @@ -1,152 +0,0 @@ -from __future__ import annotations - -import time -from logging import info -from typing import TYPE_CHECKING - -from fixtures.log_helper import log -from fixtures.metrics import parse_metrics - -if TYPE_CHECKING: - from fixtures.neon_fixtures import NeonEnv - - -def test_installed_extensions(neon_simple_env: NeonEnv): - """basic test for the endpoint that returns the list of installed extensions""" - - env = neon_simple_env - - env.create_branch("test_installed_extensions") - - endpoint = env.endpoints.create_start("test_installed_extensions") - - endpoint.safe_psql("CREATE DATABASE test_installed_extensions") - endpoint.safe_psql("CREATE DATABASE test_installed_extensions_2") - - client = endpoint.http_client() - res = client.installed_extensions() - - info("Extensions list: %s", res) - info("Extensions: %s", res["extensions"]) - # 'plpgsql' is a default extension that is always installed. 
- assert any( - ext["extname"] == "plpgsql" and ext["versions"] == ["1.0"] for ext in res["extensions"] - ), "The 'plpgsql' extension is missing" - - # check that the neon_test_utils extension is not installed - assert not any( - ext["extname"] == "neon_test_utils" for ext in res["extensions"] - ), "The 'neon_test_utils' extension is installed" - - pg_conn = endpoint.connect(dbname="test_installed_extensions") - with pg_conn.cursor() as cur: - cur.execute("CREATE EXTENSION neon_test_utils") - cur.execute( - "SELECT default_version FROM pg_available_extensions WHERE name = 'neon_test_utils'" - ) - res = cur.fetchone() - neon_test_utils_version = res[0] - - with pg_conn.cursor() as cur: - cur.execute("CREATE EXTENSION neon version '1.1'") - - pg_conn_2 = endpoint.connect(dbname="test_installed_extensions_2") - with pg_conn_2.cursor() as cur: - cur.execute("CREATE EXTENSION neon version '1.2'") - - res = client.installed_extensions() - - info("Extensions list: %s", res) - info("Extensions: %s", res["extensions"]) - - # check that the neon_test_utils extension is installed only in 1 database - # and has the expected version - assert any( - ext["extname"] == "neon_test_utils" - and ext["versions"] == [neon_test_utils_version] - and ext["n_databases"] == 1 - for ext in res["extensions"] - ) - - # check that the plpgsql extension is installed in all databases - # this is a default extension that is always installed - assert any(ext["extname"] == "plpgsql" and ext["n_databases"] == 4 for ext in res["extensions"]) - - # check that the neon extension is installed and has expected versions - for ext in res["extensions"]: - if ext["extname"] == "neon": - assert ext["n_databases"] == 2 - ext["versions"].sort() - assert ext["versions"] == ["1.1", "1.2"] - - with pg_conn.cursor() as cur: - cur.execute("ALTER EXTENSION neon UPDATE TO '1.3'") - - res = client.installed_extensions() - - info("Extensions list: %s", res) - info("Extensions: %s", res["extensions"]) - - # check that the neon_test_utils extension is updated - for ext in res["extensions"]: - if ext["extname"] == "neon": - assert ext["n_databases"] == 2 - ext["versions"].sort() - assert ext["versions"] == ["1.2", "1.3"] - - # check that /metrics endpoint is available - # ensure that we see the metric before and after restart - res = client.metrics() - info("Metrics: %s", res) - m = parse_metrics(res) - neon_m = m.query_all( - "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"} - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert sample.value == 2 - neon_m = m.query_all( - "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"} - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert sample.value == 1 - - endpoint.stop() - endpoint.start() - - timeout = 10 - while timeout > 0: - try: - res = client.metrics() - timeout = -1 - if len(parse_metrics(res).query_all("compute_installed_extensions")) < 4: - # Assume that not all metrics that are collected yet - time.sleep(1) - timeout -= 1 - continue - except Exception: - log.exception("failed to get metrics, assume they are not collected yet") - time.sleep(1) - timeout -= 1 - continue - - assert ( - len(parse_metrics(res).query_all("compute_installed_extensions")) >= 4 - ), "Not all metrics are collected" - - info("After restart metrics: %s", res) - m = parse_metrics(res) - neon_m = m.query_all( - "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"} - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert 
sample.value == 1 - - neon_m = m.query_all( - "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"} - ) - assert len(neon_m) == 1 - for sample in neon_m: - assert sample.value == 1 diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index d9043fef7f..0260704ebf 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -63,7 +63,7 @@ def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg): cur.execute("set statement_timeout=0") cur.execute("select create_snapshots(10000)") # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline) env.pageserver.http_client().timeline_checkpoint(env.initial_tenant, timeline, compact=False) diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 8818b40712..872d3dc4cf 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -132,7 +132,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): ), "sanity check for what above loop is supposed to do" # create the image layer from the future - env.storage_controller.pageserver_api().patch_tenant_config_client_side( + env.storage_controller.pageserver_api().update_tenant_config( tenant_id, {"image_creation_threshold": image_creation_threshold}, None ) assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1 @@ -172,7 +172,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): # force removal of layers from the future tenant_conf = ps_http.tenant_config(tenant_id) generation_before_detach = get_generation_number() - env.pageserver.tenant_detach(tenant_id) + env.pageserver.http_client().tenant_detach(tenant_id) failpoint_deletion_queue = "deletion-queue-before-execute-pause" ps_http.configure_failpoints((failpoint_deletion_queue, "pause")) diff --git a/test_runner/regress/test_lfc_prefetch.py b/test_runner/regress/test_lfc_prefetch.py new file mode 100644 index 0000000000..dd422d996e --- /dev/null +++ b/test_runner/regress/test_lfc_prefetch.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import time + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import USE_LFC + + +@pytest.mark.timeout(600) +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_lfc_prefetch(neon_simple_env: NeonEnv): + """ + Test resizing the Local File Cache + """ + env = neon_simple_env + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "effective_io_concurrency=100", + "shared_buffers=1MB", + "enable_bitmapscan=off", + "enable_seqscan=off", + "autovacuum=off", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon") + cur.execute("create table t(pk integer, sk integer, filler text default repeat('x',200))") + cur.execute("set statement_timeout=0") + cur.execute("select setseed(0.5)") + cur.execute("insert into t values (generate_series(1,1000000),random()*1000000)") + cur.execute("create index on t(sk)") + cur.execute("vacuum t") + + # reset LFC + cur.execute("alter system set neon.file_cache_size_limit=0") + cur.execute("select 
pg_reload_conf()") + time.sleep(1) + cur.execute("alter system set neon.file_cache_size_limit='1GB'") + cur.execute("select pg_reload_conf()") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 100000 and 200000 limit 100) s1" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 200000 and 300000 limit 100) s2" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 300000 and 400000 limit 100) s3" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 100000 and 200000 limit 100) s4" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + # if prefetch requests are not stored in LFC, we continue to sent unused prefetch request tyo PS + assert prefetch_expired > 0 + + cur.execute("set neon.store_prefetch_result_in_lfc=on") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 500000 and 600000 limit 100) s5" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 600000 and 700000 limit 100) s6" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 700000 and 800000 limit 100) s7" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + cur.execute( + "explain (analyze,prefetch,format json) select sum(pk) from (select pk from t where sk between 500000 and 600000 limit 100) s8" + ) + prefetch_expired = cur.fetchall()[0][0][0]["Plan"]["Prefetch Expired Requests"] + log.info(f"Unused prefetches: {prefetch_expired}") + + # No redundant prefethc requrests if prefetch results are stored in LFC + assert prefetch_expired == 0 diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 377b0fb4d4..ea7d38a3d9 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -30,7 +30,7 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): ], ) n_resize = 10 - scale = 100 + scale = 20 def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") @@ -46,17 +46,41 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): conn = endpoint.connect() cur = conn.cursor() + def get_lfc_size() -> tuple[int, int]: + lfc_file_path = endpoint.lfc_path() + lfc_file_size = lfc_file_path.stat().st_size + res = subprocess.run( + ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True + ) + lfc_file_blocks = 
re.findall("([0-9A-F]+)", res.stdout)[0] + log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + + return (lfc_file_size, lfc_file_blocks) + # For as long as pgbench is running, twiddle the LFC size once a second. # Note that we launch this immediately, already while the "pgbench -i" # initialization step is still running. That's quite a different workload # than the actual pgbench benchamark run, so this gives us coverage of both. while thread.is_alive(): - size = random.randint(1, 512) + # Vary the LFC size randomly within a range above what we will later + # decrease it to. This should ensure that the final size decrease + # is really doing something. + size = random.randint(192, 512) cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'") cur.execute("select pg_reload_conf()") time.sleep(1) + thread.join() + # Fill LFC: seqscan should fetch the whole table in cache. + # It is needed for further correct evaluation of LFC file size + # (a sparse chunk of LFC takes less than 1 MB on disk). + cur.execute("select sum(abalance) from pgbench_accounts") + + # Before shrinking the cache, check that it really is large now + (lfc_file_size, lfc_file_blocks) = get_lfc_size() + assert int(lfc_file_blocks) > 128 * 1024 + # At the end, set it at 100 MB, and perform a final check that the disk usage # of the file is in that ballbark. # @@ -66,13 +90,7 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): cur.execute("select pg_reload_conf()") nretries = 10 while True: - lfc_file_path = endpoint.lfc_path() - lfc_file_size = lfc_file_path.stat().st_size - res = subprocess.run( - ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True - ) - lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] - log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + (lfc_file_size, lfc_file_blocks) = get_lfc_size() assert lfc_file_size <= 512 * 1024 * 1024 if int(lfc_file_blocks) <= 128 * 1024 or nretries == 0: diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 94c630ffcf..52ee2f32a2 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -7,9 +7,78 @@ import threading import time import pytest -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.utils import USE_LFC, query_scalar +""" +Test whether LFC doesn't error out when the LRU is empty, but the LFC is +already at its maximum size. + +If we don't handle this safely, we might allocate more hash entries than +otherwise considered safe, thus causing ERRORs in hash_search(HASH_ENTER) once +we hit lfc->used >= lfc->limit. +""" + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_local_file_cache_all_pinned(neon_simple_env: NeonEnv): + env = neon_simple_env + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "neon.max_file_cache_size='1MB'", + "neon.file_cache_size_limit='1MB'", + ], + ) + top_cur = endpoint.connect().cursor() + + stop = threading.Event() + n_rows = 10000 + n_threads = 5 + n_updates_per_connection = 1000 + + top_cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)") + top_cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g") + + # Start threads that will perform random UPDATEs. 
Each UPDATE + # increments the counter on the row, so that we can check at the + # end that the sum of all the counters match the number of updates + # performed (plus the initial 1 on each row). + # + # Furthermore, each thread will reconnect between every 1000 updates. + def run_updates(n_updates_performed_q: queue.Queue[int]): + n_updates_performed = 0 + conn = endpoint.connect() + cur = conn.cursor() + while not stop.is_set(): + id = random.randint(1, n_rows) + cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}") + n_updates_performed += 1 + if n_updates_performed % n_updates_per_connection == 0: + cur.close() + conn.close() + conn = endpoint.connect() + cur = conn.cursor() + n_updates_performed_q.put(n_updates_performed) + + n_updates_performed_q: queue.Queue[int] = queue.Queue() + threads: list[threading.Thread] = [] + for _i in range(n_threads): + thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True) + thread.start() + threads.append(thread) + + time.sleep(15) + + stop.set() + + n_updates_performed = 0 + for thread in threads: + thread.join() + n_updates_performed += n_updates_performed_q.get() + + assert query_scalar(top_cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed + @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): @@ -29,8 +98,8 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder): cur = endpoint.connect().cursor() stop = threading.Event() - n_rows = 100000 - n_threads = 20 + n_rows = 10000 + n_threads = 5 n_updates_per_connection = 1000 cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)") diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index db18e1758c..3a92f0d1d1 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -55,13 +55,13 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgr vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") # insert some data cur.execute("insert into t values (generate_series(1,1000), 0)") # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 1000 # now stop subscriber... 
@@ -78,7 +78,7 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgr vanilla_pg.start() # Wait logical replication to sync - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") # Check that subscribers receives all data assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 2000 @@ -148,7 +148,7 @@ COMMIT; endpoint.start() vanilla_pg.start() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") eq_q = "select testcolumn1, testcolumn2, testcolumn3 from replication_example order by 1, 2, 3" assert vanilla_pg.safe_psql(eq_q) == endpoint.safe_psql(eq_q) log.info("rewriteheap synced") @@ -285,7 +285,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of vanilla_pg.safe_psql("create table t(a int)") connstr = endpoint.connstr().replace("'", "''") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") vanilla_pg.stop() @@ -321,13 +321,13 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of sk_http = sk.http_client() sk_http.configure_failpoints([("sk-pause-send", "off")]) - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2] # Check that local reads also work with endpoint.connect().cursor() as cur: cur.execute("insert into t values (3)") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2, 3] log_path = vanilla_pg.pgdatadir / "pg.log" @@ -365,7 +365,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres) log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") connstr = endpoint.connstr().replace("'", "''") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") vanilla_pg.stop() wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) @@ -375,7 +375,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres) # this should flush current wal page cur.execute("insert into replication_example values (3, 4)") vanilla_pg.start() - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql( "select sum(somedata) from replication_example" ) == endpoint.safe_psql("select sum(somedata) from replication_example") @@ -409,18 +409,18 @@ def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): # Test simple insert, update, delete. 
But with very large values value = random_string(10_000_000) cur.execute(f"INSERT INTO reptbl VALUES (1, '{value}')") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(1, value)] # Test delete, and reinsert another value cur.execute("DELETE FROM reptbl WHERE id = 1") cur.execute(f"INSERT INTO reptbl VALUES (2, '{value}')") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] value = random_string(10_000_000) cur.execute(f"UPDATE reptbl SET largeval='{value}'") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] endpoint.stop() @@ -428,7 +428,7 @@ def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): cur = endpoint.connect().cursor() value = random_string(10_000_000) cur.execute(f"UPDATE reptbl SET largeval='{value}'") - logical_replication_sync(vanilla_pg, endpoint) + logical_replication_sync(vanilla_pg, endpoint, "sub1") assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] @@ -573,17 +573,18 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van vanilla_pg.safe_psql("create extension neon;") env.create_branch("subscriber") - # We want all data to fit into shared_buffers because later we stop - # safekeeper and insert more; this shouldn't cause page requests as they - # will be stuck. + # We want all data to fit into shared_buffers or LFC cache because later we + # stop safekeeper and insert more; this shouldn't cause page requests as + # they will be stuck. + if USE_LFC: + config_lines = ["neon.max_file_cache_size = 32MB", "neon.file_cache_size_limit = 32MB"] + else: + config_lines = [ + "shared_buffers = 32MB", + ] sub = env.endpoints.create( "subscriber", - config_lines=[ - "neon.max_file_cache_size = 32MB", - "neon.file_cache_size_limit = 32MB", - ] - if USE_LFC - else [], + config_lines=config_lines, ) sub.start() @@ -607,7 +608,7 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van for i in range(0, 1000): pcur.execute("INSERT into t values (%s, random()*100000)", (i,)) # wait until sub receives all data - logical_replication_sync(sub, vanilla_pg) + logical_replication_sync(sub, vanilla_pg, "sub") # Update confirmed_flush_lsn of the slot. If subscriber ack'ed recevied data # as flushed we'll now lose it if subscriber restars. 
That's why # logical_replication_wait_flush_lsn_sync is expected to hang while diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py deleted file mode 100644 index 7211619a99..0000000000 --- a/test_runner/regress/test_migrations.py +++ /dev/null @@ -1,33 +0,0 @@ -from __future__ import annotations - -import time -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from fixtures.neon_fixtures import NeonEnv - - -def test_migrations(neon_simple_env: NeonEnv): - env = neon_simple_env - - endpoint = env.endpoints.create("main") - endpoint.respec(skip_pg_catalog_updates=False) - endpoint.start() - - num_migrations = 11 - endpoint.wait_for_migrations(num_migrations=num_migrations) - - with endpoint.cursor() as cur: - cur.execute("SELECT id FROM neon_migration.migration_id") - migration_id = cur.fetchall() - assert migration_id[0][0] == num_migrations - - endpoint.stop() - endpoint.start() - # We don't have a good way of knowing that the migrations code path finished executing - # in compute_ctl in the case that no migrations are being run - time.sleep(1) - with endpoint.cursor() as cur: - cur.execute("SELECT id FROM neon_migration.migration_id") - migration_id = cur.fetchall() - assert migration_id[0][0] == num_migrations diff --git a/test_runner/regress/test_nbtree_pagesplit_cycleid.py b/test_runner/regress/test_nbtree_pagesplit_cycleid.py new file mode 100644 index 0000000000..32ec6fcb92 --- /dev/null +++ b/test_runner/regress/test_nbtree_pagesplit_cycleid.py @@ -0,0 +1,130 @@ +import threading +import time + +from fixtures.neon_fixtures import NeonEnv + +BTREE_NUM_CYCLEID_PAGES = """ + WITH lsns AS ( + /* + * pg_switch_wal() ensures we have an LSN that + * 1. is after any previous modifications, but also, + * 2. (critically) is flushed, preventing any issues with waiting for + * unflushed WAL in PageServer. + */ + SELECT pg_switch_wal() as lsn + ), + raw_pages AS ( + SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, lsn, lsn) page + FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) AS blkno, + lsns l(lsn) + ), + parsed_pages AS ( + /* cycle ID is the last 2 bytes of the btree page */ + SELECT blkno, SUBSTRING(page FROM 8191 FOR 2) as cycle_id + FROM raw_pages + ) + SELECT count(*), + encode(cycle_id, 'hex') + FROM parsed_pages + WHERE encode(cycle_id, 'hex') != '0000' + GROUP BY encode(cycle_id, 'hex'); + """ + + +def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): + env = neon_simple_env + endpoint = env.endpoints.create_start("main") + + ses1 = endpoint.connect().cursor() + ses1.execute("ALTER SYSTEM SET autovacuum = off;") + ses1.execute("ALTER SYSTEM SET enable_seqscan = off;") + ses1.execute("ALTER SYSTEM SET full_page_writes = off;") + ses1.execute("SELECT pg_reload_conf();") + ses1.execute("CREATE EXTENSION neon_test_utils;") + # prepare a large index + ses1.execute("CREATE TABLE t(id integer GENERATED ALWAYS AS IDENTITY, txt text);") + ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);") + ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;") + + ses1.execute(BTREE_NUM_CYCLEID_PAGES) + pages = ses1.fetchall() + assert ( + len(pages) == 0 + ), f"0 back splits with cycle ID expected, real {len(pages)} first {pages[0]}" + # Delete enough tuples to clear the first index page. + # (there are up to 407 rows per 8KiB page; 406 for non-rightmost leafs. 
+ ses1.execute("DELETE FROM t WHERE id <= 406;") + # Make sure the page is cleaned up + ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;") + + # Do another delete-then-indexcleanup cycle, to move the pages from + # "dead" to "reusable" + ses1.execute("DELETE FROM t WHERE id <= 446;") + ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;") + + # Make sure the vacuum we're about to trigger in s3 has cleanup work to do + ses1.execute("DELETE FROM t WHERE id <= 610;") + + # Flush wal, for checking purposes + ses1.execute(BTREE_NUM_CYCLEID_PAGES) + pages = ses1.fetchall() + assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead" + + ses2 = endpoint.connect().cursor() + ses3 = endpoint.connect().cursor() + + # Session 2 pins a btree page, which prevents vacuum from processing that + # page, thus allowing us to reliably split pages while a concurrent vacuum + # is running. + ses2.execute("BEGIN;") + ses2.execute( + "DECLARE foo NO SCROLL CURSOR FOR SELECT row_number() over () FROM t ORDER BY id ASC" + ) + ses2.execute("FETCH FROM foo;") # pins the leaf page with id 611 + wait_evt = threading.Event() + + # Session 3 runs the VACUUM command. Note that this will block, and + # therefore must run on another thread. + # We rely on this running quickly enough to hit the pinned page from + # session 2 by the time we start other work again in session 1, but + # technically there is a race where the thread (and/or PostgreSQL process) + # don't get to that pinned page with vacuum until >2s after evt.set() was + # called, and session 1 thus might already have split pages. + def vacuum_freeze_t(ses3, evt: threading.Event): + # Begin parallel vacuum that should hit the index + evt.set() + # this'll hang until s2 fetches enough new data from its cursor. + # this is technically a race with the time.sleep(2) below, but if this + # command doesn't hit + ses3.execute("VACUUM (FREEZE, INDEX_CLEANUP on, DISABLE_PAGE_SKIPPING on) t;") + + ses3t = threading.Thread(target=vacuum_freeze_t, args=(ses3, wait_evt)) + ses3t.start() + wait_evt.wait() + # Make extra sure we got the thread started and vacuum is stuck, by waiting + # some time even after wait_evt got set. This isn't truly reliable (it is + # possible + time.sleep(2) + + # Insert 2 pages worth of new data. + # This should reuse the one empty page, plus another page at the end of + # the index relation; with split ordering + # old_blk -> blkno=1 -> old_blk + 1. + # As this is run while vacuum in session 3 is happening, these splits + # should receive cycle IDs where applicable. 
+ ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 812) i;") + # unpin the btree page, allowing s3's vacuum to complete + ses2.execute("FETCH ALL FROM foo;") + ses2.execute("ROLLBACK;") + # check that our expectations are correct + ses1.execute(BTREE_NUM_CYCLEID_PAGES) + pages = ses1.fetchall() + assert ( + len(pages) == 1 and pages[0][0] == 3 + ), f"3 page splits with cycle ID expected; actual {pages}" + + # final cleanup + ses3t.join() + ses1.close() + ses2.close() + ses3.close() diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 80e26d9432..8d9aab6848 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -17,11 +17,13 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por main_branch_name = "main" pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() env.neon_cli.endpoint_create( main_branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id="ep-basic-main", tenant_id=env.initial_tenant, pg_version=env.pg_version, @@ -35,11 +37,13 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por new_branch_name=branch_name, ) pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() env.neon_cli.endpoint_create( branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id=f"ep-{branch_name}", tenant_id=env.initial_tenant, pg_version=env.pg_version, @@ -59,23 +63,27 @@ def test_neon_two_primary_endpoints_fail( branch_name = "main" pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() env.neon_cli.endpoint_create( branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id="ep1", tenant_id=env.initial_tenant, pg_version=env.pg_version, ) pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + internal_http_port = port_distributor.get_port() # ep1 is not running so create will succeed env.neon_cli.endpoint_create( branch_name, pg_port, - http_port, + external_http_port, + internal_http_port, endpoint_id="ep2", tenant_id=env.initial_tenant, pg_version=env.pg_version, diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py index ae2d171058..c8458b963e 100644 --- a/test_runner/regress/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -6,9 +6,14 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.pageserver.http import PageserverHttpClient -def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): +def check_tenant( + env: NeonEnv, pageserver_http: PageserverHttpClient, safekeeper_proto_version: int +): tenant_id, timeline_id = env.create_tenant() - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + config_lines = [ + f"neon.safekeeper_proto_version = {safekeeper_proto_version}", + ] + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines) # we rely upon autocommit after each statement res_1 = endpoint.safe_psql_many( 
queries=[ @@ -33,7 +38,14 @@ def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): @pytest.mark.parametrize("num_timelines,num_safekeepers", [(3, 1)]) -def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int): +# Test both proto versions until we fully migrate. +@pytest.mark.parametrize("safekeeper_proto_version", [2, 3]) +def test_normal_work( + neon_env_builder: NeonEnvBuilder, + num_timelines: int, + num_safekeepers: int, + safekeeper_proto_version: int, +): """ Basic test: * create new tenant with a timeline @@ -52,4 +64,4 @@ def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_s pageserver_http = env.pageserver.http_client() for _ in range(num_timelines): - check_tenant(env, pageserver_http) + check_tenant(env, pageserver_http, safekeeper_proto_version) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 028d1c2e49..c344f30f4d 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -27,6 +27,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import query_scalar, wait_until +from urllib3 import Retry if TYPE_CHECKING: from typing import Any @@ -676,16 +677,14 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu "compaction_period": "0s", } ) - client = env.pageserver.http_client() + + # Disable retries, because we'll hit code paths that can give us + # 503 and want to see that directly + client = env.pageserver.http_client(retries=Retry(status=0)) + failpoint = "before-downloading-layer-stream-pausable" client.configure_failpoints((failpoint, "pause")) - env.pageserver.allowed_errors.extend( - [ - ".*downloading failed, possibly for shutdown.*", - ] - ) - info = client.layer_map_info(env.initial_tenant, env.initial_timeline) assert len(info.delta_layers()) == 1 @@ -720,13 +719,9 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu client.configure_failpoints((failpoint, "off")) - with pytest.raises( - PageserverApiException, match="downloading failed, possibly for shutdown" - ): + with pytest.raises(PageserverApiException, match="Shutting down"): download.result() - env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*") - detach.result() client.configure_failpoints((failpoint, "pause")) diff --git a/test_runner/regress/test_page_service_batching_regressions.py b/test_runner/regress/test_page_service_batching_regressions.py new file mode 100644 index 0000000000..fa85e1210b --- /dev/null +++ b/test_runner/regress/test_page_service_batching_regressions.py @@ -0,0 +1,60 @@ +# NB: there are benchmarks that double-serve as tests inside the `performance` directory. 
+ +import subprocess +from pathlib import Path + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.timeout(30) # test takes <20s if pageserver impl is correct +@pytest.mark.parametrize("kind", ["pageserver-stop", "tenant-detach"]) +def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path, kind: str): + def patch_pageserver_toml(config): + config["page_service_pipelining"] = { + "mode": "pipelined", + "max_batch_size": 32, + "execution": "concurrent-futures", + } + + neon_env_builder.pageserver_config_override = patch_pageserver_toml + env = neon_env_builder.init_start() + + log.info("make flush appear slow") + + log.info("sending requests until pageserver accepts no more") + # TODO: extract this into a helper, like subprocess_capture, + # so that we capture the stderr from the helper somewhere. + child = subprocess.Popen( + [ + neon_binpath / "test_helper_slow_client_reads", + env.pageserver.connstr(), + str(env.initial_tenant), + str(env.initial_timeline), + ], + bufsize=0, # unbuffered + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + assert child.stdout is not None + buf = child.stdout.read(1) + if len(buf) != 1: + raise Exception("unexpected EOF") + if buf != b"R": + raise Exception(f"unexpected data: {buf!r}") + log.info("helper reports pageserver accepts no more requests") + log.info( + "assuming pageserver connection handle is in a state where TCP has backpressured pageserver=>client response flush() into userspace" + ) + + if kind == "pageserver-stop": + log.info("try to shut down the pageserver cleanly") + env.pageserver.stop() + elif kind == "tenant-detach": + log.info("try to shut down the tenant") + env.pageserver.tenant_detach(env.initial_tenant) + else: + raise ValueError(f"unexpected kind: {kind}") + + log.info("shutdown did not time out, test passed") diff --git a/test_runner/regress/test_pageserver_crash_consistency.py b/test_runner/regress/test_pageserver_crash_consistency.py index fcae7983f4..e9eee2760e 100644 --- a/test_runner/regress/test_pageserver_crash_consistency.py +++ b/test_runner/regress/test_pageserver_crash_consistency.py @@ -46,7 +46,7 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: for sk in env.safekeepers: sk.stop() - env.storage_controller.pageserver_api().patch_tenant_config_client_side( + env.storage_controller.pageserver_api().update_tenant_config( tenant_id, {"compaction_threshold": 3} ) # hit the exit failpoint diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 7e5bb45242..fa1cd61206 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -12,7 +12,6 @@ of the pageserver are: from __future__ import annotations import os -import re import time from enum import StrEnum @@ -29,7 +28,6 @@ from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( assert_tenant_state, - list_prefix, wait_for_last_record_lsn, wait_for_upload, ) @@ -124,109 +122,6 @@ def assert_deletion_queue(ps_http, size_fn) -> None: assert size_fn(v) is True -def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): - """ - Validate behavior when a pageserver is run without generation support enabled, - then started again after activating it: - - Before upgrade, no objects should have generation suffixes - - 
After upgrade, the bucket should contain a mixture. - - In both cases, postgres I/O should work. - """ - neon_env_builder.enable_pageserver_remote_storage( - RemoteStorageKind.MOCK_S3, - ) - - env = neon_env_builder.init_configs() - env.broker.start() - for sk in env.safekeepers: - sk.start() - env.storage_controller.start() - - # We will start a pageserver with no control_plane_api set, so it won't be able to self-register - env.storage_controller.node_register(env.pageserver) - - def remove_control_plane_api_field(config): - return config.pop("control_plane_api") - - control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field) - env.pageserver.start() - env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) - - env.create_tenant( - tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline - ) - - generate_uploads_and_deletions(env, pageserver=env.pageserver) - - def parse_generation_suffix(key): - m = re.match(".+-([0-9a-zA-Z]{8})$", key) - if m is None: - return None - else: - log.info(f"match: {m}") - log.info(f"group: {m.group(1)}") - return int(m.group(1), 16) - - assert neon_env_builder.pageserver_remote_storage is not None - pre_upgrade_keys = list( - [ - o["Key"] - for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[ - "Contents" - ] - ] - ) - for key in pre_upgrade_keys: - assert parse_generation_suffix(key) is None - - env.pageserver.stop() - # Starting without the override that disabled control_plane_api - env.pageserver.patch_config_toml_nonrecursive( - { - "control_plane_api": control_plane_api, - } - ) - env.pageserver.start() - - generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) - - legacy_objects: list[str] = [] - suffixed_objects = [] - post_upgrade_keys = list( - [ - o["Key"] - for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[ - "Contents" - ] - ] - ) - for key in post_upgrade_keys: - log.info(f"post-upgrade key: {key}") - if parse_generation_suffix(key) is not None: - suffixed_objects.append(key) - else: - legacy_objects.append(key) - - # Bucket now contains a mixture of suffixed and non-suffixed objects - assert len(suffixed_objects) > 0 - assert len(legacy_objects) > 0 - - # Flush through deletions to get a clean state for scrub: we are implicitly validating - # that our generations-enabled pageserver was able to do deletions of layers - # from earlier which don't have a generation. - env.pageserver.http_client().deletion_queue_flush(execute=True) - - assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0 - - # Having written a mixture of generation-aware and legacy index_part.json, - # ensure the scrubber handles the situation as expected. 
- healthy, metadata_summary = env.storage_scrubber.scan_metadata() - assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline - assert metadata_summary["timeline_count"] == 1 - assert metadata_summary["timeline_shard_count"] == 1 - assert healthy - - def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index 706da1e35e..fcc465f90a 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -22,7 +22,10 @@ CHECKPOINT_TIMEOUT_SECONDS = 60 async def run_worker_for_tenant( - env: NeonEnv, entries: int, tenant: TenantId, offset: int | None = None + env: NeonEnv, + entries: int, + tenant: TenantId, + offset: int | None = None, ) -> Lsn: if offset is None: offset = 0 @@ -37,12 +40,20 @@ async def run_worker_for_tenant( finally: await conn.close(timeout=10) - last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + loop = asyncio.get_running_loop() + sql = await loop.run_in_executor( + None, lambda ep: ep.safe_psql("SELECT pg_current_wal_flush_lsn()"), ep + ) + last_flush_lsn = Lsn(sql[0][0]) return last_flush_lsn async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.create_tenant(conf=tenant_conf) + loop = asyncio.get_running_loop() + # capture tenant_conf by specifying `tenant_conf=tenant_conf`, otherwise it will be evaluated to some random value + tenant, timeline = await loop.run_in_executor( + None, lambda tenant_conf, env: env.create_tenant(conf=tenant_conf), tenant_conf, env + ) last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) return tenant, timeline, last_flush_lsn diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index 5ec8357597..aedfdbd210 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -27,6 +27,8 @@ from werkzeug.wrappers.response import Response if TYPE_CHECKING: from typing import Any + from fixtures.httpserver import ListenAddress + # TODO: collect all of the env setup *AFTER* removal of RemoteStorageKind.NOOP @@ -34,7 +36,7 @@ if TYPE_CHECKING: def test_metric_collection( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, ): (host, port) = httpserver_listen_address metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" @@ -195,7 +197,7 @@ def test_metric_collection( def test_metric_collection_cleans_up_tempfile( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, ): (host, port) = httpserver_listen_address metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 835ccbd5d4..21cb780c06 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -3,6 +3,7 @@ from __future__ import annotations import random from contextlib import closing +import psycopg2.errors as pgerr import pytest from fixtures.log_helper import log from 
fixtures.neon_fixtures import NeonEnvBuilder @@ -226,3 +227,43 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: int | N # so instead, do a fast shutdown for this one test. # See https://github.com/neondatabase/neon/issues/8709 env.stop(immediate=True) + + +def test_pageserver_lost_and_transaction_aborted(neon_env_builder: NeonEnvBuilder): + """ + If the pageserver is unavailable during a transaction abort and the target relation is + not present in the cache, we abort the transaction in ABORT state, which triggers a sigabrt. + This is _expected_ behaviour. + """ + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main", config_lines=["neon.relsize_hash_size=0"]) + with closing(endpoint.connect()) as conn, conn.cursor() as cur: + cur.execute("CREATE DATABASE test") + with ( + pytest.raises((pgerr.InterfaceError, pgerr.InternalError)), + endpoint.connect(dbname="test") as conn, + conn.cursor() as cur, + ): + cur.execute("create table t(b box)") + env.pageserver.stop() + cur.execute("create index ti on t using gist(b)") + + +def test_pageserver_lost_and_transaction_committed(neon_env_builder: NeonEnvBuilder): + """ + If the pageserver is unavailable during a transaction commit and the target relation is + not present in the cache, we abort the transaction in COMMIT state, which triggers a sigabrt. + This is _expected_ behaviour. + """ + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main", config_lines=["neon.relsize_hash_size=0"]) + with closing(endpoint.connect()) as conn, conn.cursor() as cur: + cur.execute("CREATE DATABASE test") + with ( + pytest.raises((pgerr.InterfaceError, pgerr.InternalError)), + endpoint.connect(dbname="test") as conn, + conn.cursor() as cur, + ): + cur.execute("create table t(t boolean)") + env.pageserver.stop() + cur.execute("drop table t") diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 1292682f9e..a9b897b741 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -8,16 +8,21 @@ from pathlib import Path from typing import TYPE_CHECKING import pytest -from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver +from fixtures.neon_fixtures import ( + DEFAULT_BRANCH_NAME, + NeonEnvBuilder, + NeonPageserver, + StorageControllerMigrationConfig, +) from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage -from fixtures.utils import skip_in_debug_build, wait_until +from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build, wait_until from fixtures.workload import Workload from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -443,7 +448,7 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): workload.write_rows(256, env.pageservers[0].id) env.pageserver.http_client().tenant_heatmap_upload(tenant_id) - def validate_heatmap(heatmap): + def validate_heatmap(heatmap, on_disk_heatmap): assert len(heatmap["timelines"]) == 1 assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id) assert 
len(heatmap["timelines"][0]["layers"]) > 0 @@ -452,10 +457,13 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): # Each layer appears at most once assert len(set(layer["name"] for layer in layers)) == len(layers) + assert heatmap == on_disk_heatmap + # Download and inspect the heatmap that the pageserver uploaded heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_first_on_disk = env.pageserver.heatmap_content(tenant_id) log.info(f"Read back heatmap: {heatmap_first}") - validate_heatmap(heatmap_first) + validate_heatmap(heatmap_first, heatmap_first_on_disk) # Do some more I/O to generate more layers workload.churn_rows(64, env.pageservers[0].id) @@ -463,9 +471,10 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): # Ensure that another heatmap upload includes the new layers heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_second_on_disk = env.pageserver.heatmap_content(tenant_id) log.info(f"Read back heatmap: {heatmap_second}") assert heatmap_second != heatmap_first - validate_heatmap(heatmap_second) + validate_heatmap(heatmap_second, heatmap_second_on_disk) def list_elegible_layers( @@ -885,3 +894,176 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll assert progress_3["heatmap_mtime"] is not None assert progress_3["layers_total"] == progress_3["layers_downloaded"] assert progress_3["bytes_total"] == progress_3["bytes_downloaded"] + + +@skip_in_debug_build("only run with release build") +@run_only_on_default_postgres("PG version is not interesting here") +def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + + tenant_conf = TENANT_CONF.copy() + tenant_conf["heatmap_period"] = "0s" + + env = neon_env_builder.init_configs() + env.start() + + assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.create_tenant(tenant_id, timeline_id, conf=tenant_conf, placement_policy='{"Attached":1}') + + env.storage_controller.reconcile_until_idle() + + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + # Generate a bunch of small layers (we will apply a slowdown failpoint that works on a per-layer basis) + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(128, upload=True) + workload.write_rows(128, upload=True) + workload.write_rows(128, upload=True) + + child_timeline_id = env.create_branch( + "foo", tenant_id, ancestor_branch_name=DEFAULT_BRANCH_NAME + ) + + workload.write_rows(128, upload=True) + + # Expect lots of layers + assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 + + # Simulate large data by making layer downloads artificially slow + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + + def timeline_heatmap(tlid): + assert env.pageserver_remote_storage is not None + + heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) + for htl in heatmap["timelines"]: + if htl["timeline_id"] == str(tlid): + return htl + + raise RuntimeError(f"No heatmap for timeline: {tlid}") + + # Upload a heatmap, so that secondaries have something to download 
+ ps_attached.http_client().tenant_heatmap_upload(tenant_id) + heatmap_before_migration = timeline_heatmap(timeline_id) + + # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms. + # However, it pulls the heatmap, which will be important later. + http_client = env.storage_controller.pageserver_api() + (status, progress) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000) + assert status == 202 + assert progress["heatmap_mtime"] is not None + assert progress["layers_downloaded"] > 0 + assert progress["bytes_downloaded"] > 0 + assert progress["layers_total"] > progress["layers_downloaded"] + assert progress["bytes_total"] > progress["bytes_downloaded"] + + env.storage_controller.allowed_errors.extend( + [ + ".*Timed out.*downloading layers.*", + ] + ) + + # Use a custom configuration that gives up earlier than usual. + # We can't hydrate everything anyway because of the failpoints. + config = StorageControllerMigrationConfig( + secondary_warmup_timeout="5s", secondary_download_request_timeout="2s" + ) + env.storage_controller.tenant_shard_migrate( + TenantShardId(tenant_id, shard_number=0, shard_count=0), ps_secondary.id, config + ) + + env.storage_controller.reconcile_until_idle() + assert env.storage_controller.locate(tenant_id)[0]["node_id"] == ps_secondary.id + + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + heatmap_after_migration = timeline_heatmap(timeline_id) + + assert len(heatmap_before_migration["layers"]) > 0 + + after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"]) + assert len(heatmap_before_migration["layers"]) <= after_migration_heatmap_layers_count + + log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}") + + env.storage_controller.download_heatmap_layers( + TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id + ) + + # Now simulate the case where a child timeline is archived, parent layers + # are evicted and the child is unarchived. When the child is unarchived, + # itself and the parent update their heatmaps to contain layers needed by the + # child. One can warm up the timeline hierarchy since the heatmaps are ready. 
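# (Editor's note, illustrative only and not part of the patch.) The archive/evict/unarchive
# scenario described above boils down to roughly the following sequence of calls against the
# now-attached ps_secondary, using only names that already appear in this test:
#
#     client = ps_secondary.http_client()
#     client.timeline_archival_config(tenant_id, child_timeline_id, TimelineArchivalState.ARCHIVED)
#     client.timeline_offload(tenant_id, child_timeline_id)
#     client.evict_all_layers(tenant_id, timeline_id)          # parent's local layers are dropped
#     client.timeline_archival_config(tenant_id, child_timeline_id, TimelineArchivalState.UNARCHIVED)
#     client.tenant_heatmap_upload(tenant_id)                  # heatmap again lists the layers the child needs
#     env.storage_controller.download_heatmap_layers(
#         TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id
#     )
#
# after which the test waits, via the all_layers_downloaded helper defined below, for the
# parent timeline's local layer count to catch up with its heatmap again.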
+ + def all_layers_downloaded(expected_layer_count: int): + local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id)) + + log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}") + assert local_layers_count >= expected_layer_count + + wait_until(lambda: all_layers_downloaded(after_migration_heatmap_layers_count)) + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + + before = ( + ps_secondary.http_client() + .get_metrics() + .query_one("pageserver_remote_ondemand_downloaded_layers_total") + .value + ) + workload.validate() + after = ( + ps_secondary.http_client() + .get_metrics() + .query_one("pageserver_remote_ondemand_downloaded_layers_total") + .value + ) + + workload.stop() + assert before == after + + def check_archival_state(state: TimelineArchivalState, tline): + timelines = ( + timeline["timeline_id"] + for timeline in ps_secondary.http_client().timeline_list(tenant_id=tenant_id) + ) + + if state == TimelineArchivalState.ARCHIVED: + assert str(tline) not in timelines + elif state == TimelineArchivalState.UNARCHIVED: + assert str(tline) in timelines + + ps_secondary.http_client().timeline_archival_config( + tenant_id, child_timeline_id, TimelineArchivalState.ARCHIVED + ) + ps_secondary.http_client().timeline_offload(tenant_id, child_timeline_id) + wait_until(lambda: check_archival_state(TimelineArchivalState.ARCHIVED, child_timeline_id)) + + ps_secondary.http_client().evict_all_layers(tenant_id, timeline_id) + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + assert len(timeline_heatmap(timeline_id)["layers"]) == 0 + + ps_secondary.http_client().timeline_archival_config( + tenant_id, child_timeline_id, TimelineArchivalState.UNARCHIVED + ) + wait_until(lambda: check_archival_state(TimelineArchivalState.UNARCHIVED, child_timeline_id)) + + ps_secondary.http_client().tenant_heatmap_upload(tenant_id) + log.info(f"Parent timeline heatmap size: {len(timeline_heatmap(timeline_id)['layers'])}") + log.info(f"Child timeline heatmap size: {len(timeline_heatmap(child_timeline_id)['layers'])}") + + expected_locally = len(timeline_heatmap(timeline_id)["layers"]) + assert expected_locally > 0 + + env.storage_controller.download_heatmap_layers( + TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id + ) + wait_until(lambda: all_layers_downloaded(expected_locally)) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 2877f14e0e..afc7ef3e01 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -120,7 +120,7 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End # Run the main PostgreSQL regression tests, in src/test/regress. # -@pytest.mark.timeout(900) # Contains many sub-tests, is slow in debug builds +@pytest.mark.timeout(3000) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( neon_env_builder: NeonEnvBuilder, @@ -194,7 +194,7 @@ def test_pg_regress( # Run the PostgreSQL "isolation" tests, in src/test/isolation. # -@pytest.mark.timeout(600) # Contains many sub-tests, is slow in debug builds +@pytest.mark.timeout(1500) # Contains many sub-tests, is slow in debug builds @pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( neon_env_builder: NeonEnvBuilder, @@ -222,6 +222,8 @@ def test_isolation( "max_prepared_transactions=100", # Enable the test mode, so that we don't need to patch the test cases. 
"neon.regress_test_mode = true", + # Stack size should be increased for tests to pass with asan. + "max_stack_depth = 4MB", ], ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") @@ -259,7 +261,7 @@ def test_isolation( pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) # This fails with a mismatch on `pg_multixact/offsets/0000` - # post_checks(env, test_output_dir, DBNAME, endpoint) + post_checks(env, test_output_dir, DBNAME, endpoint) # Run extra Neon-specific pg_regress-based tests. The tests and their @@ -330,8 +332,10 @@ def test_sql_regress( @skip_in_debug_build("only run with release build") +@pytest.mark.parametrize("reldir_type", ["v1", "v2"]) def test_tx_abort_with_many_relations( neon_env_builder: NeonEnvBuilder, + reldir_type: str, ): """ This is not a pg_regress test as such, but perhaps it should be -- this test exercises postgres @@ -340,7 +344,11 @@ def test_tx_abort_with_many_relations( Reproducer for https://github.com/neondatabase/neon/issues/9505 """ - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "rel_size_v2_enabled": "true" if reldir_type == "v2" else "false", + } + ) ep = env.endpoints.create_start( "main", tenant_id=env.initial_tenant, @@ -352,48 +360,65 @@ def test_tx_abort_with_many_relations( # How many relations: this number is tuned to be long enough to take tens of seconds # if the rollback code path is buggy, tripping the test's timeout. - n = 4000 + if reldir_type == "v1": + n = 4000 + step = 4000 + else: + n = 100000 + step = 5000 def create(): # Create many relations log.info(f"Creating {n} relations...") - ep.safe_psql_many( - [ - "BEGIN", - f"""DO $$ - DECLARE - i INT; - table_name TEXT; - BEGIN - FOR i IN 1..{n} LOOP - table_name := 'table_' || i; - EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; - END LOOP; - END $$; - """, - "COMMIT", - ] - ) + begin = 0 + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; + END LOOP; + END $$; + """, + "COMMIT", + ] + ) + begin = end + if begin >= n: + break def truncate(): # Truncate relations, then roll back the transaction containing the truncations log.info(f"Truncating {n} relations...") - ep.safe_psql_many( - [ - "BEGIN", - f"""DO $$ - DECLARE - i INT; - table_name TEXT; - BEGIN - FOR i IN 1..{n} LOOP - table_name := 'table_' || i; - EXECUTE 'TRUNCATE ' || table_name ; - END LOOP; - END $$; - """, - ] - ) + begin = 0 + while True: + end = begin + step + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN {begin}..{end} LOOP + table_name := 'table_' || i; + EXECUTE 'TRUNCATE ' || table_name ; + END LOOP; + END $$; + """, + ] + ) + begin = end + if begin >= n: + break def rollback_and_wait(): log.info(f"Rolling back after truncating {n} relations...") @@ -417,7 +442,7 @@ def test_tx_abort_with_many_relations( try: # Rollback phase should be fast: this is one WAL record that we should process efficiently fut = exec.submit(rollback_and_wait) - fut.result(timeout=5) + fut.result(timeout=15) except: exec.shutdown(wait=False, cancel_futures=True) raise diff --git a/test_runner/regress/test_pgstat.py b/test_runner/regress/test_pgstat.py new file mode 100644 index 0000000000..bf9b982e14 --- /dev/null +++ 
b/test_runner/regress/test_pgstat.py @@ -0,0 +1,83 @@ +import pytest +from fixtures.neon_fixtures import NeonEnv +from fixtures.pg_version import PgVersion + + +# +# Test that pgstat statistics are preserved across sessions +# +def test_pgstat(neon_simple_env: NeonEnv): + env = neon_simple_env + if env.pg_version == PgVersion.V14: + pytest.skip("PG14 doesn't support pgstat statistic persistence") + + n = 10000 + endpoint = env.endpoints.create_start( + "main", config_lines=["neon.pgstat_file_size_limit=100kB", "autovacuum=off"] + ) + + con = endpoint.connect() + cur = con.cursor() + + cur.execute("create table t(x integer)") + cur.execute(f"insert into t values (generate_series(1,{n}))") + cur.execute("vacuum analyze t") + cur.execute("select sum(x) from t") + cur.execute("update t set x=x+1") + + cur.execute("select pg_stat_force_next_flush()") + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + assert rec == (2, n * 2, n, n, n * 2, n, 1, 1) + + endpoint.stop() + endpoint.start() + + con = endpoint.connect() + cur = con.cursor() + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + assert rec == (2, n * 2, n, n, n * 2, n, 1, 1) + + cur.execute("update t set x=x+1") + + # stop without checkpoint + endpoint.stop(mode="immediate") + endpoint.start() + + con = endpoint.connect() + cur = con.cursor() + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + # pgstat information should be discarded in case of abnormal termination + assert rec == (0, 0, 0, 0, 0, 0, 0, 0) + + cur.execute("select sum(x) from t") + + # create more relations to increase size of statistics + for i in range(1, 1000): + cur.execute(f"create table t{i}(pk integer primary key)") + + cur.execute("select pg_stat_force_next_flush()") + + endpoint.stop() + endpoint.start() + + con = endpoint.connect() + cur = con.cursor() + + cur.execute( + "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables" + ) + rec = cur.fetchall()[0] + # pgstat information is not restored because its size exceeds the 100kB threshold + assert rec == (0, 0, 0, 0, 0, 0, 0, 0) diff --git a/test_runner/regress/test_physical_and_logical_replicaiton.py b/test_runner/regress/test_physical_and_logical_replicaiton.py index ad2d0871b8..229439106b 100644 --- a/test_runner/regress/test_physical_and_logical_replicaiton.py +++ b/test_runner/regress/test_physical_and_logical_replicaiton.py @@ -2,7 +2,7 @@ from __future__ import annotations import time -from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync, wait_replica_caughtup def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonEnv, vanilla_pg): @@ -38,10 +38,12 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE for pk in range(n_records): p_cur.execute("insert into t (pk) values (%s)", (pk,)) + wait_replica_caughtup(primary, secondary) + s_cur.execute("select count(*) from t") assert s_cur.fetchall()[0][0] == n_records - logical_replication_sync(vanilla_pg, primary) + logical_replication_sync(vanilla_pg, primary, "sub1") assert 
vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records # Check that LR slot is not copied to replica @@ -85,7 +87,7 @@ def test_aux_not_logged_at_replica(neon_simple_env: NeonEnv, vanilla_pg): s_con = secondary.connect() s_cur = s_con.cursor() - logical_replication_sync(vanilla_pg, primary) + logical_replication_sync(vanilla_pg, primary, "sub1") assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records s_cur.execute("select count(*) from t") diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index 6cb11b825d..17819fd367 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -187,7 +187,7 @@ def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_en origin=primary, endpoint_id="secondary", config_lines=[ - "max_connections=2", + "max_connections=5", "autovacuum_max_workers=1", "max_worker_processes=5", "max_wal_senders=1", diff --git a/test_runner/regress/test_prefetch_buffer_resize.py b/test_runner/regress/test_prefetch_buffer_resize.py index 7676b78b0e..99fe80e621 100644 --- a/test_runner/regress/test_prefetch_buffer_resize.py +++ b/test_runner/regress/test_prefetch_buffer_resize.py @@ -7,7 +7,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder @pytest.mark.parametrize("shard_count", [None, 4]) -@pytest.mark.timeout(600) def test_prefetch(neon_env_builder: NeonEnvBuilder, shard_count: int | None): if shard_count is not None: neon_env_builder.num_pageservers = shard_count diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index d8df2efc78..3c7fd0b897 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -57,7 +57,7 @@ def test_proxy_select_1(static_proxy: NeonProxy): assert out[0][0] == 1 # with SNI - out = static_proxy.safe_psql("select 42", host="generic-project-name.localtest.me") + out = static_proxy.safe_psql("select 42", host="generic-project-name.local.neon.build") assert out[0][0] == 42 @@ -234,7 +234,7 @@ def test_sql_over_http_serverless_driver(static_proxy: NeonProxy): connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" response = requests.post( - f"https://api.localtest.me:{static_proxy.external_http_port}/sql", + f"https://api.local.neon.build:{static_proxy.external_http_port}/sql", data=json.dumps({"query": "select 42 as answer", "params": []}), headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, verify=str(static_proxy.test_output_dir / "proxy.crt"), diff --git a/test_runner/regress/test_proxy_allowed_ips.py b/test_runner/regress/test_proxy_allowed_ips.py index 902da1942e..c59da8c6b0 100644 --- a/test_runner/regress/test_proxy_allowed_ips.py +++ b/test_runner/regress/test_proxy_allowed_ips.py @@ -35,7 +35,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil check_cannot_connect(query="select 1", sslsni=0, options="endpoint=private-project") # with SNI - check_cannot_connect(query="select 1", host="private-project.localtest.me") + check_cannot_connect(query="select 1", host="private-project.local.neon.build") # no SNI, deprecated `options=project` syntax (before we had several endpoint in project) out = static_proxy.safe_psql(query="select 1", sslsni=0, options="project=generic-project") @@ -46,7 +46,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil assert out[0][0] == 1 # with SNI - out = 
static_proxy.safe_psql(query="select 1", host="generic-project.localtest.me") + out = static_proxy.safe_psql(query="select 1", host="generic-project.local.neon.build") assert out[0][0] == 1 diff --git a/test_runner/regress/test_proxy_metric_collection.py b/test_runner/regress/test_proxy_metric_collection.py index dd63256388..5ff4a99c51 100644 --- a/test_runner/regress/test_proxy_metric_collection.py +++ b/test_runner/regress/test_proxy_metric_collection.py @@ -2,6 +2,7 @@ from __future__ import annotations from collections.abc import Iterator from pathlib import Path +from typing import TYPE_CHECKING import pytest from fixtures.log_helper import log @@ -15,6 +16,9 @@ from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response +if TYPE_CHECKING: + from fixtures.httpserver import ListenAddress + def proxy_metrics_handler(request: Request) -> Response: if request.json is None: @@ -38,7 +42,7 @@ def proxy_metrics_handler(request: Request) -> Response: def proxy_with_metric_collector( port_distributor: PortDistributor, neon_binpath: Path, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, test_output_dir: Path, ) -> Iterator[NeonProxy]: """Neon proxy that routes through link auth and has metric collection enabled.""" diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py index ea01252ce4..f14317a39f 100644 --- a/test_runner/regress/test_proxy_websockets.py +++ b/test_runner/regress/test_proxy_websockets.py @@ -1,10 +1,15 @@ from __future__ import annotations +import asyncio import ssl +import asyncpg import pytest +import websocket_tunnel import websockets +from fixtures.log_helper import log from fixtures.neon_fixtures import NeonProxy +from fixtures.port_distributor import PortDistributor @pytest.mark.asyncio @@ -196,3 +201,53 @@ async def test_websockets_pipelined(static_proxy: NeonProxy): # close await websocket.send(b"X\x00\x00\x00\x04") await websocket.wait_closed() + + +@pytest.mark.asyncio +async def test_websockets_tunneled(static_proxy: NeonProxy, port_distributor: PortDistributor): + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + # Launch a tunnel service so that we can speak the websockets protocol to + # the proxy + tunnel_port = port_distributor.get_port() + tunnel_server = await websocket_tunnel.start_server( + "127.0.0.1", + tunnel_port, + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl_context, + ) + log.info(f"websockets tunnel listening for connections on port {tunnel_port}") + + async with tunnel_server: + + async def run_tunnel(): + try: + async with tunnel_server: + await tunnel_server.serve_forever() + except Exception as e: + log.error(f"Error in tunnel task: {e}") + + tunnel_task = asyncio.create_task(run_tunnel()) + + # Ok, the tunnel is now running. 
Check that we can connect to the proxy's + # websocket interface, through the tunnel + tunnel_connstring = f"postgres://{user}:{password}@127.0.0.1:{tunnel_port}/postgres" + + log.info(f"connecting to {tunnel_connstring}") + conn = await asyncpg.connect(tunnel_connstring) + res = await conn.fetchval("SELECT 123") + assert res == 123 + await conn.close() + log.info("Ran a query successfully through the tunnel") + + tunnel_server.close() + try: + await tunnel_task + except asyncio.CancelledError: + pass diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index c13bea7ee1..fe970a868c 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -287,7 +287,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): offset=offset, ) - # Do some update so we can increment latest_gc_cutoff + # Do some update so we can increment gc_cutoff generate_updates_on_main(env, ep_main, i, end=100) # Wait for the existing lease to expire. diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index b43a443149..dab01fcd1a 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -11,10 +11,13 @@ from fixtures.neon_fixtures import NeonEnvBuilder # Test pageserver recovery after crash # def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): - # Override default checkpointer settings to run it more often + # Override default checkpointer settings to run it more often. + # This also creates a bunch more L0 layers, so disable backpressure. env = neon_env_builder.init_start( initial_tenant_conf={ "checkpoint_distance": "1048576", + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", } ) env.pageserver.is_testing_enabled_or_skip() diff --git a/test_runner/regress/test_relations.py b/test_runner/regress/test_relations.py new file mode 100644 index 0000000000..3e29c92a96 --- /dev/null +++ b/test_runner/regress/test_relations.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) + + +def test_pageserver_reldir_v2( + neon_env_builder: NeonEnvBuilder, +): + env = neon_env_builder.init_start( + initial_tenant_conf={ + "rel_size_v2_enabled": "false", + } + ) + + endpoint = env.endpoints.create_start("main") + # Create a relation in v1 + endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)") + endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)") + + # Switch to v2 + env.pageserver.http_client().update_tenant_config( + env.initial_tenant, + { + "rel_size_v2_enabled": True, + }, + ) + + # Check if both relations are still accessible + endpoint.safe_psql("SELECT * FROM foo1") + endpoint.safe_psql("SELECT * FROM foo2") + + # Restart the endpoint + endpoint.stop() + endpoint.start() + + # Check if both relations are still accessible again after restart + endpoint.safe_psql("SELECT * FROM foo1") + endpoint.safe_psql("SELECT * FROM foo2") + + # Create a relation in v2 + endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)") + # Delete a relation in v1 + endpoint.safe_psql("DROP TABLE foo1") + + # Check if both relations are still accessible + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo3") + + # Restart the endpoint + endpoint.stop() + # This will acquire a basebackup, which lists all relations. 
+ endpoint.start() + + # Check if both relations are still accessible + endpoint.safe_psql("DROP TABLE IF EXISTS foo1") + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo3") + + endpoint.safe_psql("DROP TABLE foo3") + endpoint.stop() + endpoint.start() + + # Check if relations are still accessible + endpoint.safe_psql("DROP TABLE IF EXISTS foo1") + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("DROP TABLE IF EXISTS foo3") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 76a42ef4a2..c39c74fa2a 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -539,6 +539,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( # small checkpointing and compaction targets to ensure we generate many operations "checkpoint_distance": f"{64 * 1024}", "compaction_threshold": "1", + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", "compaction_target_size": f"{64 * 1024}", # large horizon to avoid automatic GC (our assert on gc_result below relies on that) "gc_horizon": f"{1024 ** 4}", diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 30abf91d3a..f58bbcd3c0 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -3,7 +3,7 @@ from __future__ import annotations import os import time from collections import defaultdict -from typing import Any +from typing import TYPE_CHECKING, Any import pytest import requests @@ -11,6 +11,7 @@ from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchival from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( + DEFAULT_AZ_ID, NeonEnv, NeonEnvBuilder, StorageControllerApiException, @@ -27,6 +28,9 @@ from typing_extensions import override from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response +if TYPE_CHECKING: + from fixtures.httpserver import ListenAddress + def test_sharding_smoke( neon_env_builder: NeonEnvBuilder, @@ -87,7 +91,7 @@ def test_sharding_smoke( workload.init() sizes_before = get_sizes() - workload.write_rows(256) + workload.write_rows(65536) # Test that we can read data back from a sharded tenant workload.validate() @@ -516,14 +520,18 @@ def test_sharding_split_smoke( shard_count = 2 # Shard count we split into split_shard_count = 4 - # We will have 2 shards per pageserver once done (including secondaries) - neon_env_builder.num_pageservers = split_shard_count + # In preferred AZ & other AZ we will end up with one shard per pageserver + neon_env_builder.num_pageservers = split_shard_count * 2 # Two AZs def assign_az(ps_cfg): az = f"az-{(ps_cfg['id'] - 1) % 2}" ps_cfg["availability_zone"] = az + # We will run more pageservers than tests usually do, so give them tiny page caches + # in case we're on a test node under memory pressure. 
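# (Editor's note, not part of the patch: page_cache_size is a count of cache pages, so assuming
# the pageserver's usual 8 KiB cache page size, 128 pages keeps each page cache at roughly 1 MiB.)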
+ ps_cfg["page_cache_size"] = 128 + neon_env_builder.pageserver_config_override = assign_az # 1MiB stripes: enable getting some meaningful data distribution without @@ -557,11 +565,17 @@ def test_sharding_split_smoke( workload.write_rows(256) # Note which pageservers initially hold a shard after tenant creation - pre_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] - log.info("Pre-split pageservers: {pre_split_pageserver_ids}") + pre_split_pageserver_ids = dict() + for loc in env.storage_controller.locate(tenant_id): + shard_no = TenantShardId.parse(loc["shard_id"]).shard_number + pre_split_pageserver_ids[loc["node_id"]] = shard_no + log.info(f"Pre-split pageservers: {pre_split_pageserver_ids}") # For pageservers holding a shard, validate their ingest statistics # reflect a proper splitting of the WAL. + + observed_on_shard_zero = 0 + received_on_non_zero_shard = 0 for pageserver in env.pageservers: if pageserver.id not in pre_split_pageserver_ids: continue @@ -569,28 +583,38 @@ def test_sharding_split_smoke( metrics = pageserver.http_client().get_metrics_values( [ "pageserver_wal_ingest_records_received_total", - "pageserver_wal_ingest_records_committed_total", - "pageserver_wal_ingest_records_filtered_total", + "pageserver_wal_ingest_records_observed_total", ] ) log.info(f"Pageserver {pageserver.id} metrics: {metrics}") - # Not everything received was committed - assert ( - metrics["pageserver_wal_ingest_records_received_total"] - > metrics["pageserver_wal_ingest_records_committed_total"] - ) + received = metrics["pageserver_wal_ingest_records_received_total"] + observed = metrics["pageserver_wal_ingest_records_observed_total"] - # Something was committed - assert metrics["pageserver_wal_ingest_records_committed_total"] > 0 + shard_number: int | None = pre_split_pageserver_ids.get(pageserver.id, None) + if shard_number is None: + assert received == 0 + assert observed == 0 + elif shard_number == 0: + # Shard 0 receives its own records and observes records of other shards + # for relation size tracking. + assert observed > 0 + assert received > 0 + observed_on_shard_zero = int(observed) + else: + # Non zero shards do not observe any records, but only receive their own. + assert observed == 0 + assert received > 0 + received_on_non_zero_shard += int(received) - # Counts are self consistent - assert ( - metrics["pageserver_wal_ingest_records_received_total"] - == metrics["pageserver_wal_ingest_records_committed_total"] - + metrics["pageserver_wal_ingest_records_filtered_total"] - ) + # Some records are sent to multiple shards and some shard 0 records include both value observations + # and other metadata. Hence, we do a sanity check below that shard 0 observes the majority of records + # received by other shards. + assert ( + observed_on_shard_zero <= received_on_non_zero_shard + and observed_on_shard_zero >= received_on_non_zero_shard // 2 + ) # TODO: validate that shards have different sizes @@ -629,7 +653,7 @@ def test_sharding_split_smoke( # We should have split into 8 shards, on the same 4 pageservers we started on. 
assert len(post_split_pageserver_ids) == split_shard_count assert len(set(post_split_pageserver_ids)) == shard_count - assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids) + assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids.keys()) # The old parent shards should no longer exist on disk assert not shards_on_disk(old_shard_ids) @@ -659,8 +683,8 @@ def test_sharding_split_smoke( # - shard_count reconciles for the original setup of the tenant # - shard_count reconciles for detaching the original secondary locations during split # - split_shard_count reconciles during shard splitting, for setting up secondaries. - # - split_shard_count/2 of the child shards will need to fail over to their secondaries (since we have 8 shards and 4 pageservers, only 4 will move) - expect_reconciles = shard_count * 2 + split_shard_count + split_shard_count / 2 + # - split_shard_count/2 reconciles to migrate shards to their temporary secondaries + expect_reconciles = shard_count * 2 + split_shard_count + 3 * (split_shard_count / 2) reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} @@ -725,17 +749,21 @@ def test_sharding_split_smoke( # dominated by shard count. log.info(f"total: {total}") assert total == { - 1: 2, - 2: 2, - 3: 2, - 4: 2, + 1: 1, + 2: 1, + 3: 1, + 4: 1, + 5: 1, + 6: 1, + 7: 1, + 8: 1, } # The controller is not required to lay out the attached locations in any particular way, but # all the pageservers that originally held an attached shard should still hold one, otherwise # it would indicate that we had done some unnecessary migration. log.info(f"attached: {attached}") - for ps_id in pre_split_pageserver_ids: + for ps_id in pre_split_pageserver_ids.keys(): log.info("Pre-split pageserver {ps_id} should still hold an attached location") assert ps_id in attached @@ -759,7 +787,7 @@ def test_sharding_split_smoke( def test_sharding_split_stripe_size( neon_env_builder: NeonEnvBuilder, httpserver: HTTPServer, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, initial_stripe_size: int, ): """ @@ -790,6 +818,7 @@ def test_sharding_split_stripe_size( "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } assert notifications[0] == expect @@ -809,6 +838,7 @@ def test_sharding_split_stripe_size( {"node_id": int(env.pageservers[0].id), "shard_number": 0}, {"node_id": int(env.pageservers[0].id), "shard_number": 1}, ], + "preferred_az": DEFAULT_AZ_ID, } log.info(f"Got notification: {notifications[1]}") assert notifications[1] == expect_after @@ -1338,6 +1368,7 @@ def test_sharding_split_failures( workload = Workload(env, tenant_id, timeline_id) workload.init() workload.write_rows(100) + compute_reconfigure_listener.register_workload(workload) # Put the environment into a failing state (exact meaning depends on `failure`) failure.apply(env) @@ -1365,13 +1396,7 @@ def test_sharding_split_failures( else: attached_count += 1 - if exclude_ps_id is not None: - # For a node failure case, we expect there to be a secondary location - # scheduled on the offline node, so expect one fewer secondary in total - assert secondary_count == initial_shard_count - 1 - else: - assert secondary_count == initial_shard_count - + assert secondary_count == initial_shard_count assert attached_count == initial_shard_count def assert_split_done(exclude_ps_id: int | None = None) -> None: @@ -1522,6 
+1547,9 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): # Tip: set to 100MB to make the test fail "max_replication_write_lag=1MB", ], + # We need `neon` extension for calling backpressure functions, + # this flag instructs `compute_ctl` to pre-install it. + "update_catalog": True, }, ) workload.init() @@ -1786,3 +1814,14 @@ def test_sharding_gc( shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"]) log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}") assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn + + for ps in env.pageservers: + # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by + # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. + # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed + ps.allowed_errors.extend( + [ + ".*could not find data for key.*", + ".*could not ingest record.*", + ] + ) diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index 2a26fef59a..3487542d6e 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -116,7 +116,7 @@ def test_pg_sni_router( test_output_dir: Path, ): generate_tls_cert( - "endpoint.namespace.localtest.me", + "endpoint.namespace.local.neon.build", test_output_dir / "router.crt", test_output_dir / "router.key", ) @@ -130,7 +130,7 @@ def test_pg_sni_router( with PgSniRouter( neon_binpath=neon_binpath, port=router_port, - destination="localtest.me", + destination="local.neon.build", tls_cert=test_output_dir / "router.crt", tls_key=test_output_dir / "router.key", test_output_dir=test_output_dir, @@ -141,7 +141,7 @@ def test_pg_sni_router( "select 1", dbname="postgres", sslmode="require", - host=f"endpoint--namespace--{pg_port}.localtest.me", + host=f"endpoint--namespace--{pg_port}.local.neon.build", hostaddr="127.0.0.1", ) assert out[0][0] == 1 diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 9f74dcccb9..d5acc257b2 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -16,6 +16,8 @@ from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( + DEFAULT_AZ_ID, + LogCursor, NeonEnv, NeonEnvBuilder, NeonPageserver, @@ -58,6 +60,8 @@ from werkzeug.wrappers.response import Response if TYPE_CHECKING: from typing import Any + from fixtures.httpserver import ListenAddress + def get_node_shard_counts(env: NeonEnv, tenant_ids): counts: defaultdict[int, int] = defaultdict(int) @@ -109,6 +113,19 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination) for tid in tenant_ids: env.create_tenant(tid, shard_count=shards_per_tenant) + # Tenant listing API should work + listed_tenants = env.storage_controller.tenant_list() + log.info(f"listed_tenants: {listed_tenants}") + assert set(t["tenant_id"] for t in listed_tenants) == set(str(t) for t in tenant_ids) + paged = env.storage_controller.tenant_list(limit=2, start_after=listed_tenants[0]["tenant_id"]) + assert len(paged) == 2 + assert paged[0] == listed_tenants[1] + assert paged[1] == listed_tenants[2] + paged = env.storage_controller.tenant_list( + limit=1000, start_after="ffffffffffffffffffffffffffffffff" + ) + assert paged == [] + # Validate high level metrics 
assert ( env.storage_controller.get_metric_value("storage_controller_tenant_shards") @@ -165,6 +182,13 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination) time.sleep(1) assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0 + # Exercise live migration of a tenant back to the original pageserver + migrate_tenant = env.pageservers[1].http_client().tenant_list_locations()["tenant_shards"][0][0] + env.storage_controller.tenant_shard_migrate( + TenantShardId.parse(migrate_tenant), env.pageservers[0].id + ) + assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 1 + # Restarting a pageserver should not detach any tenants (i.e. /re-attach works) before_restart = env.pageservers[1].http_client().tenant_list_locations() env.pageservers[1].stop() @@ -356,6 +380,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up but imports the generation number. """ + neon_env_builder.num_azs = 3 env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder) virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -392,6 +417,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up "node_secondary" ][0] + # Check that the secondary's scheduling is stable + assert env.storage_controller.reconcile_all() == 0 + # Call into storage controller to onboard the tenant generation += 1 r = virtual_ps_http.tenant_location_conf( @@ -443,6 +471,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up ) assert len(r["shards"]) == 1 + # Check that onboarding did not result in an unstable scheduling state + assert env.storage_controller.reconcile_all() == 0 + # We should see the tenant is now attached to the pageserver managed # by the sharding service origin_tenants = origin_ps.http_client().tenant_list() @@ -563,7 +594,7 @@ def test_storage_controller_onboard_detached(neon_env_builder: NeonEnvBuilder): def test_storage_controller_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, ): """ Test that the sharding service calls out to the configured HTTP endpoint on attachment changes @@ -597,6 +628,7 @@ def test_storage_controller_compute_hook( "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } assert notifications[0] == expect @@ -614,6 +646,7 @@ def test_storage_controller_compute_hook( "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } def received_migration_notification(): @@ -641,6 +674,7 @@ def test_storage_controller_compute_hook( {"node_id": int(env.pageservers[1].id), "shard_number": 0}, {"node_id": int(env.pageservers[1].id), "shard_number": 1}, ], + "preferred_az": DEFAULT_AZ_ID, } def received_split_notification(): @@ -681,7 +715,7 @@ NOTIFY_FAILURE_LOGS = [ def test_storage_controller_stuck_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, ): """ Test the migration process's behavior when the compute hook does not enable it to proceed @@ -712,6 +746,7 @@ def test_storage_controller_stuck_compute_hook( "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": 
int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } assert notifications[0] == expect @@ -814,11 +849,127 @@ def test_storage_controller_stuck_compute_hook( env.storage_controller.consistency_check() +@run_only_on_default_postgres("postgres behavior is not relevant") +def test_storage_controller_compute_hook_retry( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address: ListenAddress, +): + """ + Test that when a reconciler can't do its compute hook notification, it will keep + trying until it succeeds. + + Reproducer for https://github.com/neondatabase/cloud/issues/22612 + """ + + neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + handle_params = {"status": 200} + + notifications = [] + + def handler(request: Request): + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") + notifications.append(request.json) + return Response(status=status) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.create_tenant(tenant_id, placement_policy='{"Attached": 1}') + + # Initial notification from tenant creation + assert len(notifications) == 1 + expect: dict[str, list[dict[str, int]] | str | None | int] = { + "tenant_id": str(tenant_id), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, + } + assert notifications[0] == expect + + # Block notifications, and fail a node + handle_params["status"] = 423 + env.pageservers[0].stop() + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) + + # Avoid waiting for heartbeats + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + + # Make reconciler run and fail: it should leave itself in a state where the shard will retry notification later, + # and we will check that that happens + notifications = [] + try: + assert env.storage_controller.reconcile_all() == 1 + except StorageControllerApiException as e: + assert "Control plane tenant busy" in str(e) + assert len(notifications) == 1 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # Try reconciling again, it should try notifying again + notifications = [] + try: + assert env.storage_controller.reconcile_all() == 1 + except StorageControllerApiException as e: + assert "Control plane tenant busy" in str(e) + assert len(notifications) == 1 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # The describe API should indicate that a notification is pending + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # Unblock notifications: reconcile should work now + handle_params["status"] = 200 + notifications = [] + assert env.storage_controller.reconcile_all() == 1 + assert len(notifications) == 1 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is False + ) + + # Reconciler should be idle now that it succeeded in its compute notification + 
notifications = [] + assert env.storage_controller.reconcile_all() == 0 + assert len(notifications) == 0 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is False + ) + + @run_only_on_default_postgres("this test doesn't start an endpoint") def test_storage_controller_compute_hook_revert( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, ): """ 'revert' in the sense of a migration which gets reversed shortly after, as may happen during @@ -928,7 +1079,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): that just hits the endpoints to check that they don't bitrot. """ - neon_env_builder.num_pageservers = 2 + neon_env_builder.num_pageservers = 3 env = neon_env_builder.init_start() tenant_id = TenantId.generate() @@ -953,7 +1104,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): "GET", f"{env.storage_controller_api}/debug/v1/scheduler" ) # Two nodes, in a dict of node_id->node - assert len(response.json()["nodes"]) == 2 + assert len(response.json()["nodes"]) == 3 assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 assert all(v["may_schedule"] for v in response.json()["nodes"].values()) @@ -964,13 +1115,25 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): headers=env.storage_controller.headers(TokenScope.ADMIN), ) + # Secondary migration API: superficial check that it migrates + secondary_dest = env.pageservers[2].id + env.storage_controller.request( + "PUT", + f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0002/migrate_secondary", + headers=env.storage_controller.headers(TokenScope.ADMIN), + json={"tenant_shard_id": f"{tenant_id}-0002", "node_id": secondary_dest}, + ) + assert env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_secondary"] == [ + secondary_dest + ] + # Node unclean drop API response = env.storage_controller.request( "POST", f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop", headers=env.storage_controller.headers(TokenScope.ADMIN), ) - assert len(env.storage_controller.node_list()) == 1 + assert len(env.storage_controller.node_list()) == 2 # Tenant unclean drop API response = env.storage_controller.request( @@ -1370,7 +1533,7 @@ class PageserverFailpoint(Failure): def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: - tenants = env.storage_controller.tenant_list() + tenants = env.storage_controller.tenant_shard_dump() node_to_tenants: dict[int, list[TenantId]] = {} for t in tenants: @@ -1688,7 +1851,13 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): """ output_dir = neon_env_builder.test_output_dir shard_count = 4 - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.create_tenant(tenant_id, placement_policy='{"Attached":1}', shard_count=shard_count) + base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api] def storcon_cli(args): @@ -1717,7 +1886,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): # List nodes node_lines = storcon_cli(["nodes"]) # Table header, footer, and one line of data - assert len(node_lines) == 5 + assert len(node_lines) == 7 assert "localhost" in node_lines[3] # Pause scheduling onto a node @@ -1735,10 +1904,21 @@ def 
test_storcon_cli(neon_env_builder: NeonEnvBuilder): storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"]) assert "Offline" in storcon_cli(["nodes"])[3] + # Restore node, verify status changes in CLI output + env.pageservers[0].start() + + def is_online(): + assert "Offline" not in storcon_cli(["nodes"]) + + wait_until(is_online) + + # Let everything stabilize after node failure to avoid interfering with subsequent steps + env.storage_controller.reconcile_until_idle(timeout_secs=10) + # List tenants tenant_lines = storcon_cli(["tenants"]) assert len(tenant_lines) == 5 - assert str(env.initial_tenant) in tenant_lines[3] + assert str(tenant_id) in tenant_lines[3] # Setting scheduling policies intentionally result in warnings, they're for rare use. env.storage_controller.allowed_errors.extend( @@ -1746,31 +1926,66 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): ) # Describe a tenant - tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) + tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(tenant_id)]) assert len(tenant_lines) >= 3 + shard_count * 2 - assert str(env.initial_tenant) in tenant_lines[0] + assert str(tenant_id) in tenant_lines[0] + + # Migrate an attached location + def other_ps_id(current_ps_id): + return ( + env.pageservers[0].id + if current_ps_id == env.pageservers[1].id + else env.pageservers[1].id + ) + + storcon_cli( + [ + "tenant-shard-migrate", + "--tenant-shard-id", + f"{tenant_id}-0004", + "--node", + str( + other_ps_id( + env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_attached"] + ) + ), + ] + ) + + # Migrate a secondary location + storcon_cli( + [ + "tenant-shard-migrate-secondary", + "--tenant-shard-id", + f"{tenant_id}-0004", + "--node", + str( + other_ps_id( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "node_secondary" + ][0] + ) + ), + ] + ) # Pause changes on a tenant - storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) + storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--scheduling", "stop"]) assert "Stop" in storcon_cli(["tenants"])[3] # Cancel ongoing reconcile on a tenant - storcon_cli( - ["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"] - ) + storcon_cli(["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{tenant_id}-0104"]) # Change a tenant's placement - storcon_cli( - ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] - ) + storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--placement", "secondary"]) assert "Secondary" in storcon_cli(["tenants"])[3] # Modify a tenant's config storcon_cli( [ - "tenant-config", + "patch-tenant-config", "--tenant-id", - str(env.initial_tenant), + str(tenant_id), "--config", json.dumps({"pitr_interval": "1m"}), ] @@ -1931,12 +2146,19 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload.validate() -def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) +@pytest.mark.parametrize("num_azs", [1, 2]) +def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder, num_azs: int, combination): """ Graceful reststart of storage controller clusters use the drain and fill hooks in order to migrate attachments away from pageservers before restarting. In practice, Ansible will drive this process. 
+ + Test is parametrized on the number of AZs to exercise the AZ-driven behavior + of reliably moving shards back to their home AZ, and the behavior for AZ-agnostic + tenants where we fill based on a target shard count. """ + neon_env_builder.num_azs = num_azs neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() env.start() @@ -1966,8 +2188,15 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): min_shard_count = min(shard_counts.values()) max_shard_count = max(shard_counts.values()) - flake_factor = 5 / 100 - assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + if num_azs == 1: + # AZ-agnostic case: we expect all nodes to have the same number of shards, within some bound + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + else: + # AZ-driven case: we expect tenants to have been round-robin allocated to AZs, + # and after the restart they should all be back in their home AZ, so difference + # should be at most a single shard's tenants + assert max_shard_count - min_shard_count <= shard_count_per_tenant # Perform a graceful rolling restart for ps in env.pageservers: @@ -2136,7 +2365,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): env.start() tenant_count = 10 - shard_count_per_tenant = 8 + shard_count_per_tenant = 16 tenant_ids = [] for _ in range(0, tenant_count): @@ -2193,6 +2422,7 @@ def test_storage_controller_node_deletion( Test that deleting a node works & properly reschedules everything that was on the node. """ neon_env_builder.num_pageservers = 3 + neon_env_builder.num_azs = 3 env = neon_env_builder.init_configs() env.start() @@ -2206,6 +2436,9 @@ def test_storage_controller_node_deletion( tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant ) + # Sanity check: initial creations should not leave the system in an unstable scheduling state + assert env.storage_controller.reconcile_all() == 0 + victim = env.pageservers[-1] # The procedure a human would follow is: @@ -2399,11 +2632,18 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): env.storage_controller.tenant_create(tid) env.storage_controller.reconcile_until_idle() - env.storage_controller.configure_failpoints(("sleep-on-reconcile-epilogue", "return(10000)")) + env.storage_controller.configure_failpoints(("reconciler-epilogue", "pause")) + + def unpause_failpoint(): + time.sleep(2) + env.storage_controller.configure_failpoints(("reconciler-epilogue", "off")) + + thread = threading.Thread(target=unpause_failpoint) + thread.start() # Make a change to the tenant config to trigger a slow reconcile virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) - virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None) + virtual_ps_http.update_tenant_config(tid, {"compaction_threshold": 5}, None) env.storage_controller.allowed_errors.extend( [ ".*Accepted configuration update but reconciliation failed.*", @@ -2414,6 +2654,8 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): observed_state = env.storage_controller.step_down() log.info(f"Storage controller stepped down with {observed_state=}") + thread.join() + # Validate that we waited for the slow reconcile to complete # and updated the observed state in the storcon before stepping down. 
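
The background unpause thread above is what eventually releases the paused `reconciler-epilogue` failpoint so that step-down can observe a completed reconcile. A minimal sketch of that pattern as a reusable helper (the helper name is hypothetical; the test inlines the thread), assuming only the `configure_failpoints` API used throughout this file:

import threading
import time
from contextlib import contextmanager


@contextmanager
def failpoint_paused(storage_controller, name: str, hold_secs: float = 2.0):
    # Pause the failpoint, let the caller trigger the code path that hits it,
    # and release it from a background thread after a short hold.
    storage_controller.configure_failpoints((name, "pause"))

    def unpause():
        time.sleep(hold_secs)
        storage_controller.configure_failpoints((name, "off"))

    releaser = threading.Thread(target=unpause)
    releaser.start()
    try:
        yield
    finally:
        releaser.join()

With such a helper, the tenant config change and the `step_down()` call could run inside a single `with failpoint_paused(env.storage_controller, "reconciler-epilogue"):` block.
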
node_id = str(env.pageserver.id) @@ -2434,7 +2676,7 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): # Validate that the storcon attempts to forward the request, but stops. # when it realises it is still the current leader. with pytest.raises(StorageControllerApiException, match="Leader is stepped down instance"): - env.storage_controller.tenant_list() + env.storage_controller.tenant_shard_dump() # Validate that we can step down multiple times and the observed state # doesn't change. @@ -2584,7 +2826,7 @@ def test_storage_controller_leadership_transfer( # Check that the stepped down instance forwards requests # to the new leader while it's still running. storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") - env.storage_controller.tenant_list() + env.storage_controller.tenant_shard_dump() env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) status = env.storage_controller.node_status(env.pageservers[0].id) assert status["scheduling"] == "Pause" @@ -2953,15 +3195,19 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert target.get_safekeeper(fake_id) is None + assert len(target.get_safekeepers()) == 0 + + sk_0 = env.safekeepers[0] + body = { "active": True, "id": fake_id, "created_at": "2023-10-25T09:11:25Z", "updated_at": "2024-08-28T11:32:43Z", "region_id": "aws-us-east-2", - "host": "safekeeper-333.us-east-2.aws.neon.build", - "port": 6401, - "http_port": 7676, + "host": "localhost", + "port": sk_0.port.pg, + "http_port": sk_0.port.http, "version": 5957, "availability_zone_id": "us-east-2b", } @@ -2970,6 +3216,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): inserted = target.get_safekeeper(fake_id) assert inserted is not None + assert target.get_safekeepers() == [inserted] assert eq_safekeeper_records(body, inserted) # error out if pk is changed (unexpected) @@ -2981,6 +3228,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert exc.value.status_code == 400 inserted_again = target.get_safekeeper(fake_id) + assert target.get_safekeepers() == [inserted_again] assert inserted_again is not None assert eq_safekeeper_records(inserted, inserted_again) @@ -2989,15 +3237,42 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): body["version"] += 1 target.on_safekeeper_deploy(fake_id, body) inserted_now = target.get_safekeeper(fake_id) + assert target.get_safekeepers() == [inserted_now] assert inserted_now is not None assert eq_safekeeper_records(body, inserted_now) + # some small tests for the scheduling policy querying and returning APIs + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Pause" + target.safekeeper_scheduling_policy(inserted["id"], "Active") + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Active" + # Ensure idempotency + target.safekeeper_scheduling_policy(inserted["id"], "Active") + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Active" + # change back to paused again + target.safekeeper_scheduling_policy(inserted["id"], "Pause") + + def storcon_heartbeat(): + assert env.storage_controller.log_contains( + "Heartbeat round complete for 1 safekeepers, 0 offline" + ) + + wait_until(storcon_heartbeat) + + # Now decomission it + 
target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] - masked_keys = ["created_at", "updated_at"] + masked_keys = ["created_at", "updated_at", "active", "scheduling_policy"] for d in compared: # keep deleting these in case we are comparing the body as it will be uploaded by real scripts @@ -3011,11 +3286,12 @@ def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: @run_only_on_default_postgres("this is like a 'unit test' against storcon db") def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): def assign_az(ps_cfg): - az = f"az-{ps_cfg['id']}" + az = f"az-{ps_cfg['id'] % 2}" + log.info("Assigned AZ {az}") ps_cfg["availability_zone"] = az neon_env_builder.pageserver_config_override = assign_az - neon_env_builder.num_pageservers = 2 + neon_env_builder.num_pageservers = 4 env = neon_env_builder.init_configs() env.start() @@ -3030,8 +3306,14 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): assert shards[0]["preferred_az_id"] == expected_az + # When all other schedule scoring parameters are equal, tenants should round-robin on AZs + assert env.storage_controller.tenant_describe(tids[0])["shards"][0]["preferred_az_id"] == "az-0" + assert env.storage_controller.tenant_describe(tids[1])["shards"][0]["preferred_az_id"] == "az-1" + assert env.storage_controller.tenant_describe(tids[2])["shards"][0]["preferred_az_id"] == "az-0" + + # Try modifying preferred AZ updated = env.storage_controller.set_preferred_azs( - {TenantShardId(tid, 0, 0): "foo" for tid in tids} + {TenantShardId(tid, 0, 0): "az-0" for tid in tids} ) assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids]) @@ -3039,29 +3321,24 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): for tid in tids: shards = env.storage_controller.tenant_describe(tid)["shards"] assert len(shards) == 1 - assert shards[0]["preferred_az_id"] == "foo" + assert shards[0]["preferred_az_id"] == "az-0" - # Generate a layer to avoid shard split handling on ps from tripping - # up on debug assert. - timeline_id = TimelineId.generate() - env.create_timeline("bar", tids[0], timeline_id) - - workload = Workload(env, tids[0], timeline_id, branch_name="bar") - workload.init() - workload.write_rows(256) - workload.validate() + # Having modified preferred AZ, we should get moved there + env.storage_controller.reconcile_until_idle(max_interval=0.1) + for tid in tids: + shard = env.storage_controller.tenant_describe(tid)["shards"][0] + attached_to = shard["node_attached"] + attached_in_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == attached_in_az == "az-0" env.storage_controller.tenant_shard_split(tids[0], shard_count=2) + env.storage_controller.reconcile_until_idle(max_interval=0.1) shards = env.storage_controller.tenant_describe(tids[0])["shards"] assert len(shards) == 2 for shard in shards: attached_to = shard["node_attached"] - expected_az = env.get_pageserver(attached_to).az_id - - # The scheduling optimization logic is not yet AZ-aware, so doesn't succeed - # in putting the tenant shards in the preferred AZ. 
- # To be fixed in https://github.com/neondatabase/neon/pull/9916 - # assert shard["preferred_az_id"] == expected_az + attached_in_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == attached_in_az == "az-0" @run_only_on_default_postgres("Postgres version makes no difference here") @@ -3277,8 +3554,314 @@ def test_storage_controller_detached_stopped( "generation": None, }, ) - + env.storage_controller.reconcile_until_idle() env.storage_controller.consistency_check() # Confirm the detach happened assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == [] + + +@run_only_on_default_postgres("Postgres version makes no difference here") +def test_storage_controller_detach_lifecycle( + neon_env_builder: NeonEnvBuilder, +): + """ + Test that detached tenants are handled properly through their lifecycle: getting dropped + from memory when detached, then getting loaded back on-demand. + """ + + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + neon_env_builder.num_pageservers = 1 + + env = neon_env_builder.init_configs() + env.start() + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.storage_controller.tenant_create( + tenant_id, + shard_count=1, + ) + virtual_ps_http.timeline_create(PgVersion.NOT_SET, tenant_id, timeline_id) + + remote_prefix = "/".join( + ( + "tenants", + str(tenant_id), + ) + ) + # We will later check data is gone after deletion, so as a control check that it is present to begin with + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=remote_prefix, + ) + + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + assert len(env.storage_controller.tenant_list()) == 1 + + # Detach the tenant + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + # Ensure reconciles are done (the one we do inline in location_conf is advisory and if it takes too long that API just succeeds anyway) + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() + + # Confirm the detach happened on pageserver + assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == [] + # Confirm the tenant is not in memory on the controller + assert env.storage_controller.tenant_list() == [] + + # The detached tenant does not get loaded into memory across a controller restart + env.storage_controller.stop() + env.storage_controller.start() + assert env.storage_controller.tenant_list() == [] + env.storage_controller.consistency_check() + + # The detached tenant can be re-attached + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + assert len(env.storage_controller.tenant_list()) == 1 + env.storage_controller.consistency_check() + + # Detach it again before doing deletion + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() + + # A detached tenant can be deleted + 
virtual_ps_http.tenant_delete(tenant_id)
+
+    # Such deletions really work (empty remote storage)
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix=remote_prefix,
+    )
+
+
+@run_only_on_default_postgres("Postgres version makes no difference here")
+def test_storage_controller_node_flap_detach_race(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Reproducer for https://github.com/neondatabase/neon/issues/10253.
+
+    When a node's availability flaps, the reconciliations spawned by the node
+    going offline may race with the reconciliation done when the node comes
+    back online.
+    """
+    neon_env_builder.num_pageservers = 4
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    env.storage_controller.tenant_create(
+        tenant_id,
+        shard_count=2,
+    )
+    env.storage_controller.reconcile_until_idle()
+
+    stopped_nodes = [s["node_id"] for s in env.storage_controller.locate(tenant_id)]
+
+    def has_hit_failpoint(failpoint: str, offset: LogCursor | None = None) -> LogCursor:
+        res = env.storage_controller.log_contains(f"at failpoint {failpoint}", offset=offset)
+        assert res
+        return res[1]
+
+    # Stop the nodes which host attached shards.
+    # This will trigger reconciliations which pause before incrementing the generation,
+    # and, more importantly, updating the `generation_pageserver` of the shards.
+    env.storage_controller.configure_failpoints(("reconciler-pre-increment-generation", "pause"))
+    for node_id in stopped_nodes:
+        env.get_pageserver(node_id).stop(immediate=True)
+
+    def failure_handled() -> LogCursor:
+        stop_offset = None
+
+        for node_id in stopped_nodes:
+            res = env.storage_controller.log_contains(f"node {node_id} going offline")
+            assert res
+            stop_offset = res[1]
+
+        assert stop_offset
+        return stop_offset
+
+    offset = wait_until(failure_handled)
+
+    # Now restart the nodes and make them pause before marking themselves as available
+    # or running the activation reconciliation.
+    env.storage_controller.configure_failpoints(("heartbeat-pre-node-state-configure", "pause"))
+
+    for node_id in stopped_nodes:
+        env.get_pageserver(node_id).start(await_active=False)
+
+    offset = wait_until(
+        lambda: has_hit_failpoint("heartbeat-pre-node-state-configure", offset=offset)
+    )
+
+    # The nodes have restarted and are waiting to perform activation reconciliation.
+    # Unpause the initial reconciliation triggered by the nodes going offline.
+    # It will attempt to detach from the old location, but notice that the old location
+    # is not yet available, and then stop before processing the results of the reconciliation.
+    env.storage_controller.configure_failpoints(("reconciler-epilogue", "pause"))
+    env.storage_controller.configure_failpoints(("reconciler-pre-increment-generation", "off"))
+
+    offset = wait_until(lambda: has_hit_failpoint("reconciler-epilogue", offset=offset))
+
+    # Let the nodes perform activation reconciliation while still holding up processing the result
+    # from the initial reconcile triggered by going offline.
+    env.storage_controller.configure_failpoints(("heartbeat-pre-node-state-configure", "off"))
+
+    def activate_reconciliation_done():
+        for node_id in stopped_nodes:
+            assert env.storage_controller.log_contains(
+                f"Node {node_id} transition to active", offset=offset
+            )
+
+    wait_until(activate_reconciliation_done)
+
+    # Finally, allow the initial reconcile to finish up.
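
The race reproduction above sequences its steps by threading a `LogCursor` through successive log searches, so each `wait_until` only matches lines emitted after the previous step. The same idiom as a standalone helper, shown here as an illustrative sketch built only on the `log_contains` and `wait_until` helpers already used in this file:

def wait_for_log(storage_controller, needle: str, offset=None):
    # log_contains returns a truthy (line, LogCursor) pair once `needle` appears
    # at or after `offset`; wait_until propagates the closure's return value.
    def check():
        res = storage_controller.log_contains(needle, offset=offset)
        assert res
        return res[1]

    return wait_until(check)

Each step then reads like `offset = wait_for_log(env.storage_controller, "at failpoint reconciler-epilogue", offset)`, with the needle strings taken from the failpoints and log lines above.
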
+ env.storage_controller.configure_failpoints(("reconciler-epilogue", "off")) + + # Give things a chance to settle and validate that no stale locations exist + env.storage_controller.reconcile_until_idle() + + def validate_locations(): + shard_locations = defaultdict(list) + for ps in env.pageservers: + locations = ps.http_client().tenant_list_locations()["tenant_shards"] + for loc in locations: + shard_locations[loc[0]].append( + {"generation": loc[1]["generation"], "mode": loc[1]["mode"], "node": ps.id} + ) + + log.info(f"Shard locations: {shard_locations}") + + attached_locations = { + k: list(filter(lambda loc: loc["mode"] == "AttachedSingle", v)) + for k, v in shard_locations.items() + } + + for shard, locs in attached_locations.items(): + assert len(locs) == 1, f"{shard} has {len(locs)} attached locations" + + wait_until(validate_locations, timeout=10) + + +def test_update_node_on_registration(neon_env_builder: NeonEnvBuilder): + """ + Check that storage controller handles node_register requests with updated fields correctly. + 1. Run storage controller and register 1 pageserver without https port. + 2. Register the same pageserver with https port. Check that port has been updated. + 3. Restart the storage controller. Check that https port is persistent. + 4. Register the same pageserver without https port again (rollback). Check that port has been removed. + """ + neon_env_builder.num_pageservers = 1 + env = neon_env_builder.init_configs() + + env.storage_controller.start() + env.storage_controller.wait_until_ready() + + pageserver = env.pageservers[0] + + # Step 1. Register pageserver without https port. + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] is None + + # Step 2. Register pageserver with https port. + pageserver.service_port.https = 1234 + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] == 1234 + + # Step 3. Restart storage controller. + env.storage_controller.stop() + env.storage_controller.start() + env.storage_controller.wait_until_ready() + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] == 1234 + + # Step 4. Register pageserver with no https port again. + pageserver.service_port.https = None + env.storage_controller.node_register(pageserver) + env.storage_controller.consistency_check() + + nodes = env.storage_controller.node_list() + assert len(nodes) == 1 + assert nodes[0]["listen_https_port"] is None + + +def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvBuilder): + """ + Validate that a storage controller restart with no shards in a transient state + performs zero reconciliations at start-up. Implicitly, this means that the location + configs returned by the pageserver are identical to the persisted state in the + storage controller database. 
+ """ + neon_env_builder.num_pageservers = 1 + neon_env_builder.storage_controller_config = { + "start_as_candidate": False, + } + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, shard_count=2, tenant_config={"pitr_interval": "1h2m3s"} + ) + + env.storage_controller.reconcile_until_idle() + + reconciles_before_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + + assert reconciles_before_restart != 0 + + env.storage_controller.stop() + env.storage_controller.start() + + env.storage_controller.reconcile_until_idle() + + reconciles_after_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + + assert reconciles_after_restart == 0 diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 198e4f0460..d44c176b35 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -32,6 +32,12 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 env = neon_env_builder.init_start() + # We restart pageserver(s), which will cause storage storage controller + # requests to fail and warn. + env.storage_controller.allowed_errors.append(".*management API still failed.*") + env.storage_controller.allowed_errors.append( + ".*Reconcile error.*error sending request for url.*" + ) tenant_id = env.initial_tenant timeline_id = env.initial_timeline branch = "main" @@ -65,6 +71,10 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: else: tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] + # Let shards finish rescheduling to other pageservers: this makes the rest of the test more stable + # is it won't overlap with migrations + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + output_path = neon_env_builder.test_output_dir / "snapshot" os.makedirs(output_path) @@ -227,7 +237,9 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ new_shard_count = 4 assert shard_count is None or new_shard_count > shard_count shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) - env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately + env.storage_controller.reconcile_until_idle( + timeout_secs=120 + ) # Move shards to their final locations immediately # Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors env.storage_controller.pageserver_api().timeline_create( @@ -266,7 +278,17 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ for shard in shards: ps = env.get_tenant_pageserver(shard) assert ps is not None - ps.http_client().timeline_compact(shard, timeline_id, force_image_layer_creation=True) + ps.http_client().timeline_compact( + shard, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True + ) + + # Add some WAL so that we don't gc at the latest remote consistent lsn + workload.churn_rows(10) + + # Now gc the old stuff away + for shard in shards: + ps = env.get_tenant_pageserver(shard) + assert ps is not None ps.http_client().timeline_gc(shard, timeline_id, 0) # We will use a min_age_secs=1 
threshold for deletion, let it pass @@ -290,6 +312,17 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_ drop_local_state(env, tenant_id) workload.validate() + for ps in env.pageservers: + # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by + # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does. + # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed + ps.allowed_errors.extend( + [ + ".*could not find data for key.*", + ".*could not ingest record.*", + ] + ) + def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py new file mode 100644 index 0000000000..6175643389 --- /dev/null +++ b/test_runner/regress/test_subscriber_branching.py @@ -0,0 +1,412 @@ +from __future__ import annotations + +import threading +import time + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +from fixtures.utils import query_scalar, wait_until + + +# This test checks that branching of timeline with logical subscriptions +# does not affect logical replication for parent. +# Endpoint on a new branch will drop all existing subscriptions at the start, +# so it will not receive any changes. +# If needed, user can create new subscriptions on the child branch. +def test_subscriber_branching(neon_simple_env: NeonEnv): + env = neon_simple_env + env.create_branch("publisher") + pub = env.endpoints.create("publisher") + pub.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + ) + pub.start(create_test_user=True) + + env.create_branch("subscriber") + sub = env.endpoints.create("subscriber") + # Pass create_test_user flag to get properly filled spec.users and spec.databases fields. + # + # This test checks the per-database operations that happen at compute start + # and these operations are applied to the databases that are present in the spec. 
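
Before the subscriber endpoint is configured below, note that the test repeats the same pair of checks for every child branch it creates: no `pg_subscription` rows may survive the branch start, and `neon.drop_subscriptions_done` must record the child's timeline. A hedged sketch of those assertions as one helper (the helper is hypothetical; the test spells the queries out inline):

def assert_subscriptions_dropped(endpoint, timeline_id, dbname="neondb", user="test", password="testpwd"):
    # No logical replication subscriptions may survive the branch start.
    with endpoint.cursor(dbname=dbname, user=user, password=password) as cur:
        cur.execute("SELECT count(*) FROM pg_catalog.pg_subscription")
        assert cur.fetchone()[0] == 0
    # compute_ctl records the timeline on which it dropped the subscriptions.
    with endpoint.cursor() as cur:
        cur.execute("SELECT timeline_id FROM neon.drop_subscriptions_done")
        rows = cur.fetchall()
        assert len(rows) == 1
        assert rows[0][0] == str(timeline_id)
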
+ sub.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + ) + sub.start(create_test_user=True) + + pub.wait_for_migrations() + sub.wait_for_migrations() + + n_records = 1000 + + def check_that_changes_propagated(): + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + def insert_data(pub, start): + with pub.cursor(dbname="neondb", user="test", password="pubtestpwd") as pcur: + for i in range(start, start + n_records): + pcur.execute("INSERT into t values (%s,random()*100000)", (i,)) + + # create_test_user creates a user without password + # but psycopg2 execute() requires a password + with sub.cursor() as scur: + scur.execute("ALTER USER test WITH PASSWORD 'testpwd'") + with pub.cursor() as pcur: + # Create a test user to avoid using superuser + pcur.execute("ALTER USER test WITH PASSWORD 'pubtestpwd'") + # If we don't do this, creating the subscription will fail + pub.edit_hba(["host all test 0.0.0.0/0 md5"]) + + with pub.cursor(dbname="neondb", user="test", password="pubtestpwd") as pcur: + pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pcur.execute("CREATE PUBLICATION pub FOR TABLE t") + + with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur: + scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pub_conn = ( + f"host=localhost port={pub.pg_port} dbname=neondb user=test password=pubtestpwd" + ) + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + scur.execute(query) + time.sleep(2) # let initial table sync complete + + insert_data(pub, 0) + + with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur: + wait_until(check_that_changes_propagated) + latest_end_lsn = query_scalar( + scur, "select latest_end_lsn from pg_catalog.pg_stat_subscription; " + ) + last_insert_lsn = query_scalar(scur, "select pg_current_wal_insert_lsn();") + + log.info(f"latest_end_lsn = {latest_end_lsn}") + log.info(f"last_insert_lsn = {last_insert_lsn}") + + # stop the parent subscriber so that it doesn't interfere with the test + sub.stop() + + # 1. 
good scenario: + # create subscriber_child_1 + # it will not get changes from publisher, because drop_subscriptions_before_start is set to True + sub_child_1_timeline_id = env.create_branch( + "subscriber_child_1", + ancestor_branch_name="subscriber", + ancestor_start_lsn=last_insert_lsn, + ) + sub_child_1 = env.endpoints.create("subscriber_child_1") + # Pass drop_subscriptions_before_start flag + sub_child_1.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + drop_subscriptions_before_start=True, + ) + sub_child_1.start(create_test_user=True) + + # ensure that subscriber_child_1 sees all the data + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + # ensure that there are no subscriptions in this database + scur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub'") + assert len(scur.fetchall()) == 0 + + # ensure that drop_subscriptions_done happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + old_n_records = n_records + # insert more data on publisher + insert_data(pub, n_records) + n_records += n_records + + pcur.execute("SELECT count(*) FROM t") + res = pcur.fetchall() + assert res[0][0] == n_records + + # ensure that subscriber_child_1 doesn't see the new data + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == old_n_records + + # reenable logical replication on subscriber_child_1 + # using new publication + # ensure that new publication works as expected + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + scur.execute("TRUNCATE t") + + # create new subscription + # with new pub name + pcur.execute("CREATE PUBLICATION pub_new FOR TABLE t") + query = f"CREATE SUBSCRIPTION sub_new CONNECTION '{pub_conn}' PUBLICATION pub_new" + scur.execute(query) + + wait_until(check_that_changes_propagated) + + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + # ensure that new publication works as expected after compute restart + # first restart with drop_subscriptions_before_start=True + # to emulate the case when compute restarts within the VM with stale spec + sub_child_1.stop() + sub_child_1.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + drop_subscriptions_before_start=True, + ) + sub_child_1.start(create_test_user=True) + + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + # ensure that even though the flag is set, we didn't drop new subscription + scur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub_new'") + assert len(scur.fetchall()) == 1 + + # ensure that drop_subscriptions_done happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + sub_child_1.stop() + sub_child_1.respec( + skip_pg_catalog_updates=False, + create_test_user=True, + drop_subscriptions_before_start=False, + ) + sub_child_1.start(create_test_user=True) + + # insert more data 
on publisher + insert_data(pub, n_records) + n_records += n_records + with sub_child_1.cursor(dbname="neondb", user="test", password="testpwd") as scur: + # ensure that there is a subscriptions in this database + scur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub_new'") + assert len(scur.fetchall()) == 1 + + wait_until(check_that_changes_propagated) + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + # ensure that drop_subscriptions_done happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + # wake the sub and ensure that it catches up with the new data + sub.start(create_test_user=True) + with sub.cursor(dbname="neondb", user="test", password="testpwd") as scur: + wait_until(check_that_changes_propagated) + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + # test that we can create a branch of a branch + sub_child_2_timeline_id = env.create_branch( + "subscriber_child_2", + ancestor_branch_name="subscriber_child_1", + ) + sub_child_2 = env.endpoints.create("subscriber_child_2") + # Pass drop_subscriptions_before_start flag + sub_child_2.respec( + skip_pg_catalog_updates=False, + drop_subscriptions_before_start=True, + ) + sub_child_2.start(create_test_user=True) + + # ensure that subscriber_child_2 does not inherit subscription from child_1 + with sub_child_2.cursor(dbname="neondb", user="test", password="testpwd") as scur: + # ensure that there are no subscriptions in this database + scur.execute("SELECT count(*) FROM pg_catalog.pg_subscription") + res = scur.fetchall() + assert res[0][0] == 0 + + # ensure that drop_subscriptions_done happened on this timeline + with sub_child_2.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + assert len(res) == 1 + assert str(sub_child_2_timeline_id) == res[0][0] + + +def test_multiple_subscription_branching(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can handle concurrent deletion of subscriptions in a multiple databases + """ + env = neon_simple_env + + NUMBER_OF_DBS = 5 + + # Create and start endpoint so that neon_local put all the generated + # stuff into the spec.json file. + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "max_replication_slots = 10", + "max_logical_replication_workers=10", + "max_worker_processes=10", + ], + ) + + TEST_DB_NAMES = [ + { + "name": "neondb", + "owner": "cloud_admin", + }, + { + "name": "publisher_db", + "owner": "cloud_admin", + }, + ] + + for i in range(NUMBER_OF_DBS): + TEST_DB_NAMES.append( + { + "name": f"db{i}", + "owner": "cloud_admin", + } + ) + + # Update the spec.json file to create the databases + # and reconfigure the endpoint to apply the changes. 
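
A note on the `config_lines` passed to `create_start` above: each of the NUMBER_OF_DBS databases gets its own subscription, which needs a replication slot on the publishing side and an apply worker (a background worker) on the subscribing side, so all three limits must exceed the database count. The sizing below is illustrative only; the test simply sets all three to 10. The spec update and `reconfigure()` that actually create the databases follow next.

NUMBER_OF_DBS = 5
config_lines = [
    # one slot per per-database subscription (created up front with pg_create_logical_replication_slot)
    f"max_replication_slots = {NUMBER_OF_DBS * 2}",
    # one apply worker per active subscription
    f"max_logical_replication_workers = {NUMBER_OF_DBS * 2}",
    # apply workers are drawn from the background worker pool
    f"max_worker_processes = {NUMBER_OF_DBS * 2}",
]
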
+ endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''") + + # create table, replication and subscription for each of the databases + with endpoint.cursor(dbname="publisher_db") as publisher_cursor: + for i in range(NUMBER_OF_DBS): + publisher_cursor.execute(f"CREATE TABLE t{i}(a int)") + publisher_cursor.execute(f"CREATE PUBLICATION mypub{i} FOR TABLE t{i}") + publisher_cursor.execute( + f"select pg_catalog.pg_create_logical_replication_slot('mysub{i}', 'pgoutput');" + ) + publisher_cursor.execute(f"INSERT INTO t{i} VALUES ({i})") + + with endpoint.cursor(dbname=f"db{i}") as cursor: + cursor.execute(f"CREATE TABLE t{i}(a int)") + cursor.execute( + f"CREATE SUBSCRIPTION mysub{i} CONNECTION '{connstr}' PUBLICATION mypub{i} WITH (create_slot = false) " + ) + + # wait for the subscription to be active + for i in range(NUMBER_OF_DBS): + logical_replication_sync( + endpoint, + endpoint, + f"mysub{i}", + sub_dbname=f"db{i}", + pub_dbname="publisher_db", + ) + + # Check that replication is working + for i in range(NUMBER_OF_DBS): + with endpoint.cursor(dbname=f"db{i}") as cursor: + cursor.execute(f"SELECT * FROM t{i}") + rows = cursor.fetchall() + assert len(rows) == 1 + assert rows[0][0] == i + + last_insert_lsn = query_scalar(cursor, "select pg_current_wal_insert_lsn();") + + def start_publisher_workload(table_num: int, duration: int): + start = time.time() + with endpoint.cursor(dbname="publisher_db") as cur: + while time.time() - start < duration: + cur.execute(f"INSERT INTO t{i} SELECT FROM generate_series(1,1000)") + + LOAD_DURATION = 5 + threads = [ + threading.Thread(target=start_publisher_workload, args=(i, LOAD_DURATION)) + for i in range(NUMBER_OF_DBS) + ] + + for thread in threads: + thread.start() + + sub_child_1_timeline_id = env.create_branch( + "subscriber_child_1", + ancestor_branch_name="main", + ancestor_start_lsn=last_insert_lsn, + ) + + sub_child_1 = env.endpoints.create("subscriber_child_1") + + sub_child_1.respec( + skip_pg_catalog_updates=False, + reconfigure_concurrency=5, + drop_subscriptions_before_start=True, + cluster={ + "databases": TEST_DB_NAMES, + "roles": [], + }, + ) + + sub_child_1.start() + + # ensure that subscription deletion happened on this timeline + with sub_child_1.cursor() as scur_postgres: + scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done") + res = scur_postgres.fetchall() + log.info(f"res = {res}") + assert len(res) == 1 + assert str(sub_child_1_timeline_id) == res[0][0] + + # ensure that there are no subscriptions in the databases + for i in range(NUMBER_OF_DBS): + with sub_child_1.cursor(dbname=f"db{i}") as cursor: + cursor.execute("SELECT * FROM pg_catalog.pg_subscription") + res = cursor.fetchall() + assert len(res) == 0 + + # ensure that there are no unexpected rows in the tables + cursor.execute(f"SELECT * FROM t{i}") + rows = cursor.fetchall() + assert len(rows) == 1 + assert rows[0][0] == i + + for thread in threads: + thread.join() + + # ensure that logical replication is still working in main endpoint + # wait for it to catch up + for i in range(NUMBER_OF_DBS): + logical_replication_sync( + endpoint, + endpoint, + f"mysub{i}", + sub_dbname=f"db{i}", + pub_dbname="publisher_db", + ) + + # verify that the data is the same in publisher and subscriber tables + with endpoint.cursor(dbname="publisher_db") as publisher_cursor: + for i in 
range(NUMBER_OF_DBS): + with endpoint.cursor(dbname=f"db{i}") as cursor: + publisher_cursor.execute(f"SELECT count(*) FROM t{i}") + cursor.execute(f"SELECT count(*) FROM t{i}") + pub_res = publisher_cursor.fetchone() + sub_res = cursor.fetchone() + log.info(f"for table t{i}: pub_res = {pub_res}, sub_res = {sub_res}") + assert pub_res == sub_res diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index 7d4f66d044..8ad7282ea2 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -3,12 +3,14 @@ from __future__ import annotations import threading import time +import pytest from fixtures.neon_fixtures import NeonEnv from fixtures.utils import wait_until # This test checks of logical replication subscriber is able to correctly restart replication without receiving duplicates. # It requires tracking information about replication origins at page server side +@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM def test_subscriber_restart(neon_simple_env: NeonEnv): env = neon_simple_env env.create_branch("publisher") diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index f8f240cfdc..0c2d535af4 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -3,13 +3,14 @@ from __future__ import annotations import json from typing import TYPE_CHECKING +import pytest from fixtures.common_types import Lsn from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.utils import wait_until +from fixtures.utils import run_only_on_default_postgres, wait_until from fixtures.workload import Workload if TYPE_CHECKING: @@ -330,3 +331,83 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 24 * 60 * 60, "label resets to default" assert int(metric.value) == 0, "value resets to default" + + +@run_only_on_default_postgres("Test does not start a compute") +@pytest.mark.parametrize("ps_managed_by", ["storcon", "cplane"]) +def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: str): + """ + Test tenant config patching (i.e. additive updates) + + The flow is different for storage controller and cplane managed pageserver. + 1. Storcon managed: /v1/tenant/config request lands on storcon, which generates + location_config calls containing the update to the pageserver + 2. Cplane managed: /v1/tenant/config is called directly on the pageserver + """ + + def assert_tenant_conf_semantically_equal(lhs, rhs): + """ + Storcon returns None for fields that are not set while the pageserver does not. + Compare two tenant's config overrides semantically, by dropping the None values. + """ + lhs = {k: v for k, v in lhs.items() if v is not None} + rhs = {k: v for k, v in rhs.items() if v is not None} + + assert lhs == rhs + + env = neon_env_builder.init_start() + + if ps_managed_by == "storcon": + api = env.storage_controller.pageserver_api() + elif ps_managed_by == "cplane": + # Disallow storcon from sending location_configs to the pageserver. + # These would overwrite the manually set tenant configs. 
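
Before the cplane-managed setup continues below, the semantics the later assertions lean on are worth spelling out: `patch_tenant_config` merges the request into the current overrides, with a null value clearing a key, while `set_tenant_config` replaces the whole override set. A small plain-Python sketch of that merge, mirroring the `crnt_tenant_conf | patch` comparisons in the test (the function name is invented for illustration):

def expected_overrides_after_patch(current: dict, patch: dict) -> dict:
    # PATCH semantics: keys in the request win, null (None) unsets, the rest is kept.
    merged = {**current, **patch}
    return {k: v for k, v in merged.items() if v is not None}


assert expected_overrides_after_patch(
    {"gc_period": "3h", "wal_receiver_protocol_override": {"type": "interpreted"}},
    {"gc_period": "5h", "wal_receiver_protocol_override": None},
) == {"gc_period": "5h"}
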
+ env.storage_controller.reconcile_until_idle() + env.storage_controller.tenant_policy_update(env.initial_tenant, {"scheduling": "Stop"}) + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*") + + api = env.pageserver.http_client() + else: + raise Exception(f"Unexpected value of ps_managed_by param: {ps_managed_by}") + + crnt_tenant_conf = api.tenant_config(env.initial_tenant).tenant_specific_overrides + + patch: dict[str, Any | None] = { + "gc_period": "3h", + "wal_receiver_protocol_override": { + "type": "interpreted", + "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, + }, + } + api.patch_tenant_config(env.initial_tenant, patch) + tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides + if ps_managed_by == "storcon": + # Check that the config was propagated to the PS. + overrides_on_ps = ( + env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides + ) + assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_patch) + assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch) + crnt_tenant_conf = tenant_conf_after_patch + + patch = {"gc_period": "5h", "wal_receiver_protocol_override": None} + api.patch_tenant_config(env.initial_tenant, patch) + tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides + if ps_managed_by == "storcon": + overrides_on_ps = ( + env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides + ) + assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_patch) + assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch) + crnt_tenant_conf = tenant_conf_after_patch + + put = {"pitr_interval": "1m 1s"} + api.set_tenant_config(env.initial_tenant, put) + tenant_conf_after_put = api.tenant_config(env.initial_tenant).tenant_specific_overrides + if ps_managed_by == "storcon": + overrides_on_ps = ( + env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides + ) + assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_put) + assert_tenant_conf_semantically_equal(tenant_conf_after_put, put) + crnt_tenant_conf = tenant_conf_after_put diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 48e55c1ab1..3720f653c5 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +from concurrent.futures import ThreadPoolExecutor from threading import Thread import pytest @@ -253,29 +254,8 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "pause")) def timeline_create(): - try: - ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1) - raise RuntimeError("creation succeeded even though it shouldn't") - except ReadTimeout: - pass - - Thread(target=timeline_create).start() - - def hit_initdb_upload_failpoint(): - env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - - wait_until(hit_initdb_upload_failpoint) - - def creation_connection_timed_out(): - env.pageserver.assert_log_contains( - "POST.*/timeline.* request was dropped before completing" - ) - - # Wait so that we hit the timeout and the connection is dropped - # (But timeline creation still 
continues) - wait_until(creation_connection_timed_out) - - ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) + ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1) + raise RuntimeError("creation succeeded even though it shouldn't") def tenant_delete(): def tenant_delete_inner(): @@ -283,21 +263,46 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) wait_until(tenant_delete_inner) - Thread(target=tenant_delete).start() + # We will spawn background threads for timeline creation and tenant deletion. They will both + # get blocked on our failpoint. + with ThreadPoolExecutor(max_workers=1) as executor: + create_fut = executor.submit(timeline_create) - def deletion_arrived(): - env.pageserver.assert_log_contains( - f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" - ) + def hit_initdb_upload_failpoint(): + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - wait_until(deletion_arrived) + wait_until(hit_initdb_upload_failpoint) - ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) + def creation_connection_timed_out(): + env.pageserver.assert_log_contains( + "POST.*/timeline.* request was dropped before completing" + ) - # Disable the failpoint and wait for deletion to finish - ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) + # Wait so that we hit the timeout and the connection is dropped + # (But timeline creation still continues) + wait_until(creation_connection_timed_out) - ps_http.tenant_delete(tenant_id) + with pytest.raises(ReadTimeout): + # Our creation failed from the client's point of view. + create_fut.result() + + ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) + + delete_fut = executor.submit(tenant_delete) + + def deletion_arrived(): + env.pageserver.assert_log_contains( + f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" + ) + + wait_until(deletion_arrived) + + ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) + + # Disable the failpoint and wait for deletion to finish + ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) + + delete_fut.result() # Physical deletion should have happened assert_prefix_empty( diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index d31901b384..afe444f227 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -194,7 +194,7 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): io_metrics = query_all_safekeepers( "safekeeper_pg_io_bytes_total", { - "app_name": "pageserver", + "app_name": f"pageserver-{env.pageserver.id}", "client_az": "test_ps_az", "dir": io_direction, "same_az": "false", @@ -481,7 +481,8 @@ def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): counts = timeline_detail["directory_entries_counts"] assert counts log.info(f"directory counts: {counts}") - assert counts[2] > COUNT_AT_LEAST_EXPECTED + # We need to add up reldir v1 + v2 counts + assert counts[2] + counts[7] > COUNT_AT_LEAST_EXPECTED def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 68e9385035..c87b520366 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -2,6 +2,7 @@ from __future__ import annotations import time 
from dataclasses import dataclass +from typing import TYPE_CHECKING from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -13,12 +14,15 @@ from fixtures.pageserver.http import LayerMapInfo from fixtures.remote_storage import RemoteStorageKind from pytest_httpserver import HTTPServer +if TYPE_CHECKING: + from fixtures.httpserver import ListenAddress + # NB: basic config change tests are in test_tenant_conf.py def test_threshold_based_eviction( httpserver: HTTPServer, - httpserver_listen_address, + httpserver_listen_address: ListenAddress, pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, ): @@ -81,7 +85,7 @@ def test_threshold_based_eviction( # create a bunch of L1s, only the least of which will need to be resident compaction_threshold = 3 # create L1 layers quickly - vps_http.patch_tenant_config_client_side( + vps_http.update_tenant_config( tenant_id, inserts={ # Disable gc and compaction to avoid on-demand downloads from their side. diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index e808dd1396..c17840d31c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -139,9 +139,9 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): @pytest.mark.parametrize("manual_offload", [False, True]) def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): - if not manual_offload: - # (automatic) timeline offloading defaults to false for now - neon_env_builder.pageserver_config_override = "timeline_offloading = true" + if manual_offload: + # (automatic) timeline offloading defaults to true + neon_env_builder.pageserver_config_override = "timeline_offloading = false" env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() @@ -396,8 +396,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): with tenant migrations and timeline deletions. 
""" - # Offloading is off by default at time of writing: remove this line when it's on by default - neon_env_builder.pageserver_config_override = "timeline_offloading = true" + neon_env_builder.storage_controller_config = {"heartbeat_interval": "100msec"} neon_env_builder.enable_pageserver_remote_storage(s3_storage()) # We will exercise migrations, so need multiple pageservers @@ -426,6 +425,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): [ ".*removing local file.*because it has unexpected length.*", ".*__temp.*", + ".*method=POST path=\\S+/timeline .*: Not activating a Stopping timeline.*", # FIXME: there are still anyhow::Error paths in timeline creation/deletion which # generate 500 results when called during shutdown (https://github.com/neondatabase/neon/issues/9768) ".*InternalServerError.*", @@ -435,6 +435,14 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): ] ) + env.storage_scrubber.allowed_errors.extend( + [ + # Unclcean shutdowns of pageserver can legitimately result in orphan layers + # (https://github.com/neondatabase/neon/issues/9988#issuecomment-2520558211) + f".*Orphan layer detected: tenants/{tenant_id}/.*" + ] + ) + class TimelineState: def __init__(self): self.timeline_id = TimelineId.generate() @@ -544,8 +552,33 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): log.info(f"Timeline {state.timeline_id} is still active") shutdown.wait(0.5) elif state.timeline_id in offloaded_ids: - log.info(f"Timeline {state.timeline_id} is now offloaded") - state.offloaded = True + log.info(f"Timeline {state.timeline_id} is now offloaded in memory") + + # Hack: when we see something offloaded in the API, it doesn't guarantee that the offload + # is persistent (it is marked offloaded first, then that is persisted to the tenant manifest). + # So we wait until we see the manifest update before considering it offloaded, that way + # subsequent checks that it doesn't revert to active on a restart will pass reliably. + time.sleep(0.1) + assert isinstance(env.pageserver_remote_storage, S3Storage) + manifest = env.pageserver_remote_storage.download_tenant_manifest( + tenant_id + ) + if manifest is None: + log.info( + f"Timeline {state.timeline_id} is not yet offloaded persistently (no manifest)" + ) + elif str(state.timeline_id) in [ + t["timeline_id"] for t in manifest["offloaded_timelines"] + ]: + log.info( + f"Timeline {state.timeline_id} is now offloaded persistently" + ) + state.offloaded = True + else: + log.info( + f"Timeline {state.timeline_id} is not yet offloaded persistently (manifest: {manifest})" + ) + break else: # Timeline is neither offloaded nor active, this is unexpected: the pageserver @@ -572,12 +605,12 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): # This is expected: we are injecting chaos, API calls will sometimes fail. # TODO: can we narrow this to assert we are getting friendly 503s? 
log.info(f"Iteration error, will retry: {e}") - shutdown.wait(random.random()) + shutdown.wait(random.random() * 0.5) except requests.exceptions.RetryError as e: # Retryable error repeated more times than `requests` is configured to tolerate, this # is expected when a pageserver remains unavailable for a couple seconds log.info(f"Iteration error, will retry: {e}") - shutdown.wait(random.random()) + shutdown.wait(random.random() * 0.5) except Exception as e: log.warning( f"Unexpected worker exception (current timeline {state.timeline_id}): {e}" @@ -622,7 +655,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): # Make sure we're up for as long as we spent restarting, to ensure operations can make progress log.info(f"Staying alive for {restart_duration}s") - time.sleep(restart_duration) + time.sleep(restart_duration * 2) else: # Migrate our tenant between pageservers origin_ps = env.get_tenant_pageserver(tenant_shard_id) @@ -641,7 +674,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): # Sanity check that during our run we did exercise some full timeline lifecycles, in case # one of our workers got stuck - assert len(timelines_deleted) > 10 + assert len(timelines_deleted) > 5 # That no invariant-violations were reported by workers assert violations == [] @@ -790,6 +823,8 @@ def test_timeline_retain_lsn( [ ".*initial size calculation failed: PageRead.MissingKey.could not find data for key.*", ".*page_service_conn_main.*could not find data for key.*", + ".*failed to get checkpoint bytes.*", + ".*failed to get control bytes.*", ] ) if offload_child is None or "no-restart" not in offload_child: @@ -949,3 +984,101 @@ def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder): assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] > 0 assert gc_summary["tenant_manifests_deleted"] > 0 + + +@pytest.mark.parametrize("end_with_offloaded", [False, True]) +def test_timeline_offload_race_unarchive( + neon_env_builder: NeonEnvBuilder, end_with_offloaded: bool +): + """ + Ensure that unarchive and timeline offload don't race each other + """ + # Regression test for issue https://github.com/neondatabase/neon/issues/10220 + + failpoint = "before-timeline-auto-offload" + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, initial_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "1s", + } + ) + + # Create a branch + leaf_timeline_id = env.create_branch("test_ancestor_branch_archive", tenant_id) + + # write some stuff to the leaf + with env.endpoints.create_start( + "test_ancestor_branch_archive", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,1000)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key % 7 = 1") + + ps_http.configure_failpoints((failpoint, "pause")) + + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + # The actual race: get the compaction task to right before + # offloading the timeline and attempt to unarchive it + wait_until(lambda: env.pageserver.assert_log_contains(f"at failpoint 
{failpoint}")) + + # This unarchival should go through + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + def timeline_offloaded_api(timeline_id: TimelineId) -> bool: + # TODO add a proper API to check if a timeline has been offloaded or not + return not any( + timeline["timeline_id"] == str(timeline_id) + for timeline in ps_http.timeline_list(tenant_id=tenant_id) + ) + + def leaf_offloaded(): + assert timeline_offloaded_api(leaf_timeline_id) + + # Ensure that we've hit the failed offload attempt + ps_http.configure_failpoints((failpoint, "off")) + wait_until( + lambda: env.pageserver.assert_log_contains( + f".*compaction_loop.*offload_timeline.*{leaf_timeline_id}.*can't shut down timeline.*" + ) + ) + + with env.endpoints.create_start( + "test_ancestor_branch_archive", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key % 7 = 1") + assert sum == sum_again + + if end_with_offloaded: + # Ensure that offloading still works after all of this + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + wait_until(leaf_offloaded) + else: + # Test that deletion of leaf timeline works + ps_http.timeline_delete(tenant_id, leaf_timeline_id) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 2c3ee38bae..612a767480 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -514,7 +514,7 @@ def test_compaction_induced_by_detaches_in_history( assert len(delta_layers(branch_timeline_id)) == 5 - env.storage_controller.pageserver_api().patch_tenant_config_client_side( + env.storage_controller.pageserver_api().update_tenant_config( env.initial_tenant, {"compaction_threshold": 5}, None ) @@ -607,7 +607,7 @@ def test_timeline_ancestor_detach_idempotent_success( if shards_after > 1: # FIXME: should this be in the neon_env_builder.init_start? 
- env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(timeout_secs=120) client = env.storage_controller.pageserver_api() else: client = env.pageserver.http_client() @@ -636,7 +636,7 @@ def test_timeline_ancestor_detach_idempotent_success( # Do a shard split # This is a reproducer for https://github.com/neondatabase/neon/issues/9667 env.storage_controller.tenant_shard_split(env.initial_tenant, shards_after) - env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(timeout_secs=120) first_reparenting_response = client.detach_ancestor(env.initial_tenant, first_branch) assert set(first_reparenting_response) == {reparented1, reparented2} diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 95bf9106cd..e2fdacdbfc 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -440,7 +440,7 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder env = neon_env_builder.init_start( initial_tenant_conf={ "checkpoint_distance": "100000", - "compaction_period": "10m", + "compaction_period": "0s", } ) pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 46e90852a6..4865178ca8 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -3,6 +3,7 @@ from __future__ import annotations import time from contextlib import closing +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, fork_at_current_lsn from fixtures.utils import query_scalar @@ -202,6 +203,9 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder): "checkpoint_distance": f"{128 * 1024}", "compaction_target_size": f"{128 * 1024}", "compaction_threshold": "1", + # disable L0 backpressure + "l0_flush_delay_threshold": "0", + "l0_flush_stall_threshold": "0", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", # set PITR interval to be small, so we can do GC @@ -294,6 +298,7 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder): cur.execute("commit transaction") +@pytest.mark.timeout(600) # slow in debug builds def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): """ Runs pgbench across a few databases on a sharded tenant, then performs a visibility map diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 23d4f23cdb..c5045fe4a4 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -48,7 +48,12 @@ from fixtures.remote_storage import ( default_remote_storage, s3_storage, ) -from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.http import ( + Configuration, + SafekeeperHttpClient, + SafekeeperId, + TimelineCreateRequest, +) from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( PropagatingThread, @@ -561,10 +566,14 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder): assert_prefix_empty(neon_env_builder.safekeepers_remote_storage, prefix) -def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): +# This test is flaky, probably because PUTs of local fs storage are not atomic. +# Let's keep both remote storage kinds for a while to see if this is the case. 
+# https://github.com/neondatabase/neon/issues/10761 +@pytest.mark.parametrize("remote_storage_kind", [s3_storage(), RemoteStorageKind.LOCAL_FS]) +def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): neon_env_builder.num_safekeepers = 3 - neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() tenant_id = env.initial_tenant @@ -658,7 +667,13 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): for sk in env.safekeepers: sk.start() cli = sk.http_client() - cli.timeline_create(tenant_id, timeline_id, pg_version, last_lsn) + mconf = Configuration(generation=0, members=[], new_members=None) + # Set start_lsn to the beginning of the first segment to allow reading + # WAL from there (the initdb LSN could be used as well). + r = TimelineCreateRequest( + tenant_id, timeline_id, mconf, pg_version, Lsn("0/1000000"), commit_lsn=last_lsn + ) + cli.timeline_create(r) f_partial_path = ( Path(sk.data_dir) / str(tenant_id) / str(timeline_id) / f_partial_saved.name ) @@ -1090,6 +1105,62 @@ def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder): endpoint.safe_psql("SELECT 'works'") +# Test restarting compute at WAL page boundary. +def test_restart_endpoint_wal_page_boundary(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + ep = env.endpoints.create_start("main") + ep.safe_psql("create table t (i int)") + + with ep.cursor() as cur: + # Measure how much space a logical message takes. Sometimes the first attempt + # creates a huge message and then it stabilizes; no idea why. + for _ in range(3): + lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + log.info(f"current_lsn={lsn_before}") + # A non-transactional logical message doesn't write WAL, it's only XLogInsert'ed, + # so use a transactional one. That's a bit problematic, as a transactional message + # necessitates a commit record. Alternatively we could do something like + # select neon_xlogflush(pg_current_wal_insert_lsn()); + # but that isn't much better: that particular call complains on 'xlog flush + # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips + # page headers.
+ payload = "blahblah" + cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')") + lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before + logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload) + log.info( + f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}" + ) + + # and write logical message spanning exactly as we want + lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + log.info(f"current_lsn={lsn_before}") + curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + offs = int(curr_lsn) % 8192 + till_page = 8192 - offs + target_lsn = curr_lsn + till_page + payload_len = ( + till_page - logical_message_base - 8 + ) # not sure why 8 is here, it is deduced from experiments + log.info( + f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}, target_lsn {target_lsn}" + ) + + cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')") + supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + log.info(f"supposedly_page_boundary={supposedly_contrecord_end}") + # The calculations to hit the page boundary are very fuzzy, so just + # ignore test if we fail to reach it. + if not (int(supposedly_contrecord_end) % 8192 == 0): + pytest.skip(f"missed page boundary, bad luck: lsn is {supposedly_contrecord_end}") + + ep.stop(mode="immediate") + ep = env.endpoints.create_start("main") + ep.safe_psql("insert into t values (42)") # should be ok + + # Context manager which logs passed time on exit. class DurationLogger: def __init__(self, desc): @@ -1374,6 +1445,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): # roughly fills one segment endpoint.safe_psql("insert into t select generate_series(1,250000), 'payload'") + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) endpoint.stop() # stop compute @@ -1402,7 +1474,15 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): "flush_lsn to get aligned", ) - cmp_sk_wal([sk1, sk2], tenant_id, timeline_id) + sk1_digest = sk1.http_client().timeline_digest( + tenant_id, timeline_id, sk1.get_timeline_start_lsn(tenant_id, timeline_id), lsn + ) + + sk2_digest = sk1.http_client().timeline_digest( + tenant_id, timeline_id, sk2.get_timeline_start_lsn(tenant_id, timeline_id), lsn + ) + + assert sk1_digest == sk2_digest # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit env.safekeepers[2].stop() @@ -2181,6 +2261,63 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): wait_until(unevicted_on_dest, interval=0.1, timeout=1.0) +# Basic test for http API membership related calls: create timeline and switch +# configuration. Normally these are called by storage controller, but this +# allows to test them separately. +@run_only_on_default_postgres("tests only safekeeper API") +def test_membership_api(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + sk = env.safekeepers[0] + http_cli = sk.http_client() + + sk_id_1 = SafekeeperId(env.safekeepers[0].id, "localhost", sk.port.pg_tenant_only) + sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock + + # Request to switch before timeline creation should fail. 
+ init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None) + with pytest.raises(requests.exceptions.HTTPError): + http_cli.membership_switch(tenant_id, timeline_id, init_conf) + + # Create timeline. + create_r = TimelineCreateRequest( + tenant_id, timeline_id, init_conf, 150002, Lsn("0/1000000"), commit_lsn=None + ) + log.info(f"sending {create_r.to_json()}") + http_cli.timeline_create(create_r) + + # Switch into some conf. + joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2]) + resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf) + log.info(f"joint switch resp: {resp}") + assert resp.previous_conf.generation == 1 + assert resp.current_conf.generation == 4 + + # Restart sk, conf should be preserved. + sk.stop().start() + after_restart = http_cli.get_membership(tenant_id, timeline_id) + log.info(f"conf after restart: {after_restart}") + assert after_restart.generation == 4 + + # Switch into disjoint conf. + non_joint = Configuration(generation=5, members=[sk_id_2], new_members=None) + resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint) + log.info(f"non joint switch resp: {resp}") + assert resp.previous_conf.generation == 4 + assert resp.current_conf.generation == 5 + + # Switch request to lower conf should be ignored. + lower_conf = Configuration(generation=3, members=[], new_members=None) + resp = http_cli.membership_switch(tenant_id, timeline_id, lower_conf) + log.info(f"lower switch resp: {resp}") + assert resp.previous_conf.generation == 5 + assert resp.current_conf.generation == 5 + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. In that case # pageserver should maintain a single connection to safekeeper and don't attempt diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index b32b028fa1..56539a0a08 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -268,7 +268,8 @@ def endpoint_create_start( env, tenant_id=env.initial_tenant, pg_port=env.port_distributor.get_port(), - http_port=env.port_distributor.get_port(), + external_http_port=env.port_distributor.get_port(), + internal_http_port=env.port_distributor.get_port(), # In these tests compute has high probability of terminating on its own # before our stop() due to lost consensus leadership. 
check_stop_result=False, @@ -538,13 +539,16 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): asyncio.run(run_recovery_uncommitted(env)) -async def run_wal_truncation(env: NeonEnv): +async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int): tenant_id = env.initial_tenant timeline_id = env.initial_timeline (sk1, sk2, sk3) = env.safekeepers - ep = env.endpoints.create_start("main") + config_lines = [ + f"neon.safekeeper_proto_version = {safekeeper_proto_version}", + ] + ep = env.endpoints.create_start("main", config_lines=config_lines) ep.safe_psql("create table t (key int, value text)") ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") @@ -571,6 +575,7 @@ async def run_wal_truncation(env: NeonEnv): sk2.start() ep = env.endpoints.create_start( "main", + config_lines=config_lines, ) ep.safe_psql("insert into t select generate_series(1, 200), 'payload'") @@ -589,11 +594,13 @@ async def run_wal_truncation(env: NeonEnv): # Simple deterministic test creating tail of WAL on safekeeper which is # truncated when majority without this sk elects walproposer starting earlier. -def test_wal_truncation(neon_env_builder: NeonEnvBuilder): +# Test both proto versions until we fully migrate. +@pytest.mark.parametrize("safekeeper_proto_version", [2, 3]) +def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_version: int): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - asyncio.run(run_wal_truncation(env)) + asyncio.run(run_wal_truncation(env, safekeeper_proto_version)) async def run_segment_init_failure(env: NeonEnv): diff --git a/test_runner/stubs/h2/__init__.pyi b/test_runner/stubs/h2/__init__.pyi index e69de29bb2..bda5b5a7f4 100644 --- a/test_runner/stubs/h2/__init__.pyi +++ b/test_runner/stubs/h2/__init__.pyi @@ -0,0 +1 @@ +__version__: str diff --git a/test_runner/stubs/h2/config.pyi b/test_runner/stubs/h2/config.pyi index 710005db69..422344b981 100644 --- a/test_runner/stubs/h2/config.pyi +++ b/test_runner/stubs/h2/config.pyi @@ -1,11 +1,12 @@ from _typeshed import Incomplete +from typing import Any class _BooleanConfigOption: name: Incomplete attr_name: Incomplete - def __init__(self, name) -> None: ... - def __get__(self, instance, owner): ... - def __set__(self, instance, value) -> None: ... + def __init__(self, name: str) -> None: ... + def __get__(self, instance: Any, owner: Any) -> bool: ... + def __set__(self, instance: Any, value: bool) -> None: ... class DummyLogger: def __init__(self, *vargs) -> None: ... @@ -15,7 +16,7 @@ class DummyLogger: class OutputLogger: file: Incomplete trace_level: Incomplete - def __init__(self, file: Incomplete | None = ..., trace_level: bool = ...) -> None: ... + def __init__(self, file: Incomplete | None = None, trace_level: bool = False) -> None: ... def debug(self, fmtstr, *args) -> None: ... def trace(self, fmtstr, *args) -> None: ... @@ -23,20 +24,12 @@ class H2Configuration: client_side: Incomplete validate_outbound_headers: Incomplete normalize_outbound_headers: Incomplete + split_outbound_cookies: Incomplete validate_inbound_headers: Incomplete normalize_inbound_headers: Incomplete logger: Incomplete - def __init__( - self, - client_side: bool = ..., - header_encoding: Incomplete | None = ..., - validate_outbound_headers: bool = ..., - normalize_outbound_headers: bool = ..., - validate_inbound_headers: bool = ..., - normalize_inbound_headers: bool = ..., - logger: Incomplete | None = ..., - ) -> None: ... 
+ def __init__(self, client_side: bool = True, header_encoding: bool | str | None = None, validate_outbound_headers: bool = True, normalize_outbound_headers: bool = True, split_outbound_cookies: bool = False, validate_inbound_headers: bool = True, normalize_inbound_headers: bool = True, logger: DummyLogger | OutputLogger | None = None) -> None: ... @property - def header_encoding(self): ... + def header_encoding(self) -> bool | str | None: ... @header_encoding.setter - def header_encoding(self, value) -> None: ... + def header_encoding(self, value: bool | str | None) -> None: ... diff --git a/test_runner/stubs/h2/connection.pyi b/test_runner/stubs/h2/connection.pyi index 04be18ca74..f7ec78a997 100644 --- a/test_runner/stubs/h2/connection.pyi +++ b/test_runner/stubs/h2/connection.pyi @@ -1,72 +1,55 @@ -from enum import Enum, IntEnum - -from _typeshed import Incomplete - from .config import H2Configuration as H2Configuration from .errors import ErrorCodes as ErrorCodes -from .events import AlternativeServiceAvailable as AlternativeServiceAvailable -from .events import ConnectionTerminated as ConnectionTerminated -from .events import PingAckReceived as PingAckReceived -from .events import PingReceived as PingReceived -from .events import PriorityUpdated as PriorityUpdated -from .events import RemoteSettingsChanged as RemoteSettingsChanged -from .events import SettingsAcknowledged as SettingsAcknowledged -from .events import UnknownFrameReceived as UnknownFrameReceived -from .events import WindowUpdated as WindowUpdated -from .exceptions import DenialOfServiceError as DenialOfServiceError -from .exceptions import FlowControlError as FlowControlError -from .exceptions import FrameTooLargeError as FrameTooLargeError -from .exceptions import NoAvailableStreamIDError as NoAvailableStreamIDError -from .exceptions import NoSuchStreamError as NoSuchStreamError -from .exceptions import ProtocolError as ProtocolError -from .exceptions import RFC1122Error as RFC1122Error -from .exceptions import StreamClosedError as StreamClosedError -from .exceptions import StreamIDTooLowError as StreamIDTooLowError -from .exceptions import TooManyStreamsError as TooManyStreamsError +from .events import AlternativeServiceAvailable as AlternativeServiceAvailable, ConnectionTerminated as ConnectionTerminated, Event as Event, InformationalResponseReceived as InformationalResponseReceived, PingAckReceived as PingAckReceived, PingReceived as PingReceived, PriorityUpdated as PriorityUpdated, RemoteSettingsChanged as RemoteSettingsChanged, RequestReceived as RequestReceived, ResponseReceived as ResponseReceived, SettingsAcknowledged as SettingsAcknowledged, TrailersReceived as TrailersReceived, UnknownFrameReceived as UnknownFrameReceived, WindowUpdated as WindowUpdated +from .exceptions import DenialOfServiceError as DenialOfServiceError, FlowControlError as FlowControlError, FrameTooLargeError as FrameTooLargeError, NoAvailableStreamIDError as NoAvailableStreamIDError, NoSuchStreamError as NoSuchStreamError, ProtocolError as ProtocolError, RFC1122Error as RFC1122Error, StreamClosedError as StreamClosedError, StreamIDTooLowError as StreamIDTooLowError, TooManyStreamsError as TooManyStreamsError from .frame_buffer import FrameBuffer as FrameBuffer -from .settings import SettingCodes as SettingCodes -from .settings import Settings as Settings -from .stream import H2Stream as H2Stream -from .stream import StreamClosedBy as StreamClosedBy -from .utilities import guard_increment_window as guard_increment_window +from .settings 
import ChangedSetting as ChangedSetting, SettingCodes as SettingCodes, Settings as Settings +from .stream import H2Stream as H2Stream, StreamClosedBy as StreamClosedBy +from .utilities import SizeLimitDict as SizeLimitDict, guard_increment_window as guard_increment_window from .windows import WindowManager as WindowManager +from _typeshed import Incomplete +from collections.abc import Iterable +from enum import Enum, IntEnum +from hpack.struct import Header as Header, HeaderWeaklyTyped as HeaderWeaklyTyped +from hyperframe.frame import Frame as Frame +from typing import Any class ConnectionState(Enum): - IDLE: int - CLIENT_OPEN: int - SERVER_OPEN: int - CLOSED: int + IDLE = 0 + CLIENT_OPEN = 1 + SERVER_OPEN = 2 + CLOSED = 3 class ConnectionInputs(Enum): - SEND_HEADERS: int - SEND_PUSH_PROMISE: int - SEND_DATA: int - SEND_GOAWAY: int - SEND_WINDOW_UPDATE: int - SEND_PING: int - SEND_SETTINGS: int - SEND_RST_STREAM: int - SEND_PRIORITY: int - RECV_HEADERS: int - RECV_PUSH_PROMISE: int - RECV_DATA: int - RECV_GOAWAY: int - RECV_WINDOW_UPDATE: int - RECV_PING: int - RECV_SETTINGS: int - RECV_RST_STREAM: int - RECV_PRIORITY: int - SEND_ALTERNATIVE_SERVICE: int - RECV_ALTERNATIVE_SERVICE: int + SEND_HEADERS = 0 + SEND_PUSH_PROMISE = 1 + SEND_DATA = 2 + SEND_GOAWAY = 3 + SEND_WINDOW_UPDATE = 4 + SEND_PING = 5 + SEND_SETTINGS = 6 + SEND_RST_STREAM = 7 + SEND_PRIORITY = 8 + RECV_HEADERS = 9 + RECV_PUSH_PROMISE = 10 + RECV_DATA = 11 + RECV_GOAWAY = 12 + RECV_WINDOW_UPDATE = 13 + RECV_PING = 14 + RECV_SETTINGS = 15 + RECV_RST_STREAM = 16 + RECV_PRIORITY = 17 + SEND_ALTERNATIVE_SERVICE = 18 + RECV_ALTERNATIVE_SERVICE = 19 class AllowedStreamIDs(IntEnum): - EVEN: int - ODD: int + EVEN = 0 + ODD = 1 class H2ConnectionStateMachine: state: Incomplete def __init__(self) -> None: ... - def process_input(self, input_): ... + def process_input(self, input_: ConnectionInputs) -> list[Event]: ... class H2Connection: DEFAULT_MAX_OUTBOUND_FRAME_SIZE: int @@ -88,55 +71,30 @@ class H2Connection: max_outbound_frame_size: Incomplete max_inbound_frame_size: Incomplete incoming_buffer: Incomplete - def __init__(self, config: Incomplete | None = ...) -> None: ... + def __init__(self, config: H2Configuration | None = None) -> None: ... @property - def open_outbound_streams(self): ... + def open_outbound_streams(self) -> int: ... @property - def open_inbound_streams(self): ... + def open_inbound_streams(self) -> int: ... @property - def inbound_flow_control_window(self): ... + def inbound_flow_control_window(self) -> int: ... def initiate_connection(self) -> None: ... - def initiate_upgrade_connection(self, settings_header: Incomplete | None = ...): ... - def get_next_available_stream_id(self): ... - def send_headers( - self, - stream_id, - headers, - end_stream: bool = ..., - priority_weight: Incomplete | None = ..., - priority_depends_on: Incomplete | None = ..., - priority_exclusive: Incomplete | None = ..., - ) -> None: ... - def send_data( - self, stream_id, data, end_stream: bool = ..., pad_length: Incomplete | None = ... - ) -> None: ... - def end_stream(self, stream_id) -> None: ... - def increment_flow_control_window( - self, increment, stream_id: Incomplete | None = ... - ) -> None: ... - def push_stream(self, stream_id, promised_stream_id, request_headers) -> None: ... - def ping(self, opaque_data) -> None: ... - def reset_stream(self, stream_id, error_code: int = ...) -> None: ... 
- def close_connection( - self, - error_code: int = ..., - additional_data: Incomplete | None = ..., - last_stream_id: Incomplete | None = ..., - ) -> None: ... - def update_settings(self, new_settings) -> None: ... - def advertise_alternative_service( - self, field_value, origin: Incomplete | None = ..., stream_id: Incomplete | None = ... - ) -> None: ... - def prioritize( - self, - stream_id, - weight: Incomplete | None = ..., - depends_on: Incomplete | None = ..., - exclusive: Incomplete | None = ..., - ) -> None: ... - def local_flow_control_window(self, stream_id): ... - def remote_flow_control_window(self, stream_id): ... - def acknowledge_received_data(self, acknowledged_size, stream_id) -> None: ... - def data_to_send(self, amount: Incomplete | None = ...): ... + def initiate_upgrade_connection(self, settings_header: bytes | None = None) -> bytes | None: ... + def get_next_available_stream_id(self) -> int: ... + def send_headers(self, stream_id: int, headers: Iterable[HeaderWeaklyTyped], end_stream: bool = False, priority_weight: int | None = None, priority_depends_on: int | None = None, priority_exclusive: bool | None = None) -> None: ... + def send_data(self, stream_id: int, data: bytes | memoryview, end_stream: bool = False, pad_length: Any = None) -> None: ... + def end_stream(self, stream_id: int) -> None: ... + def increment_flow_control_window(self, increment: int, stream_id: int | None = None) -> None: ... + def push_stream(self, stream_id: int, promised_stream_id: int, request_headers: Iterable[HeaderWeaklyTyped]) -> None: ... + def ping(self, opaque_data: bytes | str) -> None: ... + def reset_stream(self, stream_id: int, error_code: ErrorCodes | int = 0) -> None: ... + def close_connection(self, error_code: ErrorCodes | int = 0, additional_data: bytes | None = None, last_stream_id: int | None = None) -> None: ... + def update_settings(self, new_settings: dict[SettingCodes | int, int]) -> None: ... + def advertise_alternative_service(self, field_value: bytes | str, origin: bytes | None = None, stream_id: int | None = None) -> None: ... + def prioritize(self, stream_id: int, weight: int | None = None, depends_on: int | None = None, exclusive: bool | None = None) -> None: ... + def local_flow_control_window(self, stream_id: int) -> int: ... + def remote_flow_control_window(self, stream_id: int) -> int: ... + def acknowledge_received_data(self, acknowledged_size: int, stream_id: int) -> None: ... + def data_to_send(self, amount: int | None = None) -> bytes: ... def clear_outbound_data_buffer(self) -> None: ... - def receive_data(self, data): ... + def receive_data(self, data: bytes) -> list[Event]: ... 
diff --git a/test_runner/stubs/h2/errors.pyi b/test_runner/stubs/h2/errors.pyi index b70c632f8c..7cf77bd833 100644 --- a/test_runner/stubs/h2/errors.pyi +++ b/test_runner/stubs/h2/errors.pyi @@ -1,17 +1,19 @@ import enum +__all__ = ['ErrorCodes'] + class ErrorCodes(enum.IntEnum): - NO_ERROR: int - PROTOCOL_ERROR: int - INTERNAL_ERROR: int - FLOW_CONTROL_ERROR: int - SETTINGS_TIMEOUT: int - STREAM_CLOSED: int - FRAME_SIZE_ERROR: int - REFUSED_STREAM: int - CANCEL: int - COMPRESSION_ERROR: int - CONNECT_ERROR: int - ENHANCE_YOUR_CALM: int - INADEQUATE_SECURITY: int - HTTP_1_1_REQUIRED: int + NO_ERROR = 0 + PROTOCOL_ERROR = 1 + INTERNAL_ERROR = 2 + FLOW_CONTROL_ERROR = 3 + SETTINGS_TIMEOUT = 4 + STREAM_CLOSED = 5 + FRAME_SIZE_ERROR = 6 + REFUSED_STREAM = 7 + CANCEL = 8 + COMPRESSION_ERROR = 9 + CONNECT_ERROR = 10 + ENHANCE_YOUR_CALM = 11 + INADEQUATE_SECURITY = 12 + HTTP_1_1_REQUIRED = 13 diff --git a/test_runner/stubs/h2/events.pyi b/test_runner/stubs/h2/events.pyi index 75d0a9e53b..a086db38b3 100644 --- a/test_runner/stubs/h2/events.pyi +++ b/test_runner/stubs/h2/events.pyi @@ -1,6 +1,8 @@ +from .errors import ErrorCodes as ErrorCodes +from .settings import ChangedSetting as ChangedSetting, SettingCodes as SettingCodes, Settings as Settings from _typeshed import Incomplete - -from .settings import ChangedSetting as ChangedSetting +from hpack import HeaderTuple as HeaderTuple +from hyperframe.frame import Frame as Frame class Event: ... @@ -53,7 +55,7 @@ class RemoteSettingsChanged(Event): changed_settings: Incomplete def __init__(self) -> None: ... @classmethod - def from_settings(cls, old_settings, new_settings): ... + def from_settings(cls, old_settings: Settings | dict[int, int], new_settings: dict[int, int]) -> RemoteSettingsChanged: ... class PingReceived(Event): ping_data: Incomplete diff --git a/test_runner/stubs/h2/exceptions.pyi b/test_runner/stubs/h2/exceptions.pyi index 82019d5ec1..7149b46521 100644 --- a/test_runner/stubs/h2/exceptions.pyi +++ b/test_runner/stubs/h2/exceptions.pyi @@ -1,3 +1,4 @@ +from .errors import ErrorCodes as ErrorCodes from _typeshed import Incomplete class H2Error(Exception): ... @@ -19,27 +20,27 @@ class FlowControlError(ProtocolError): class StreamIDTooLowError(ProtocolError): stream_id: Incomplete max_stream_id: Incomplete - def __init__(self, stream_id, max_stream_id) -> None: ... + def __init__(self, stream_id: int, max_stream_id: int) -> None: ... class NoAvailableStreamIDError(ProtocolError): ... class NoSuchStreamError(ProtocolError): stream_id: Incomplete - def __init__(self, stream_id) -> None: ... + def __init__(self, stream_id: int) -> None: ... class StreamClosedError(NoSuchStreamError): stream_id: Incomplete error_code: Incomplete - def __init__(self, stream_id) -> None: ... + def __init__(self, stream_id: int) -> None: ... class InvalidSettingsValueError(ProtocolError, ValueError): error_code: Incomplete - def __init__(self, msg, error_code) -> None: ... + def __init__(self, msg: str, error_code: ErrorCodes) -> None: ... class InvalidBodyLengthError(ProtocolError): expected_length: Incomplete actual_length: Incomplete - def __init__(self, expected, actual) -> None: ... + def __init__(self, expected: int, actual: int) -> None: ... class UnsupportedFrameError(ProtocolError): ... class RFC1122Error(H2Error): ... 
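The stub updates for config.pyi, connection.pyi, errors.pyi, events.pyi and exceptions.pyi above give the vendored `h2` type stubs concrete signatures: enum members get explicit values, `receive_data()` is declared to return `list[Event]`, and `data_to_send()` returns `bytes`. Here is a minimal illustrative sketch (not code from this change) of the client-side call sequence those signatures describe, assuming the real `h2` package is available:

import h2.config
import h2.connection
import h2.events

# Connection setup: H2Configuration's keyword arguments now carry concrete types
# in the stub (client_side: bool, header_encoding: bool | str | None).
config = h2.config.H2Configuration(client_side=True, header_encoding="utf-8")
conn = h2.connection.H2Connection(config=config)
conn.initiate_connection()
preamble = conn.data_to_send()  # bytes to write to the transport

# Feeding received bytes back yields list[Event], matching the receive_data() stub.
for event in conn.receive_data(b""):
    if isinstance(event, h2.events.RemoteSettingsChanged):
        pass  # settings ACKs are handled internally by h2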
diff --git a/test_runner/stubs/h2/frame_buffer.pyi b/test_runner/stubs/h2/frame_buffer.pyi index f47adab704..90746f63c1 100644 --- a/test_runner/stubs/h2/frame_buffer.pyi +++ b/test_runner/stubs/h2/frame_buffer.pyi @@ -1,19 +1,12 @@ -from .exceptions import ( - FrameDataMissingError as FrameDataMissingError, -) -from .exceptions import ( - FrameTooLargeError as FrameTooLargeError, -) -from .exceptions import ( - ProtocolError as ProtocolError, -) +from .exceptions import FrameDataMissingError as FrameDataMissingError, FrameTooLargeError as FrameTooLargeError, ProtocolError as ProtocolError +from hyperframe.frame import Frame CONTINUATION_BACKLOG: int class FrameBuffer: data: bytes max_frame_size: int - def __init__(self, server: bool = ...) -> None: ... - def add_data(self, data) -> None: ... - def __iter__(self): ... - def __next__(self): ... + def __init__(self, server: bool = False) -> None: ... + def add_data(self, data: bytes) -> None: ... + def __iter__(self) -> FrameBuffer: ... + def __next__(self) -> Frame: ... diff --git a/test_runner/stubs/h2/settings.pyi b/test_runner/stubs/h2/settings.pyi index a352abe53e..c3920f9969 100644 --- a/test_runner/stubs/h2/settings.pyi +++ b/test_runner/stubs/h2/settings.pyi @@ -1,61 +1,59 @@ import enum -from collections.abc import MutableMapping -from typing import Any - +from .errors import ErrorCodes as ErrorCodes +from .exceptions import InvalidSettingsValueError as InvalidSettingsValueError from _typeshed import Incomplete -from h2.errors import ErrorCodes as ErrorCodes -from h2.exceptions import InvalidSettingsValueError as InvalidSettingsValueError +from collections.abc import Iterator, MutableMapping class SettingCodes(enum.IntEnum): - HEADER_TABLE_SIZE: Incomplete - ENABLE_PUSH: Incomplete - MAX_CONCURRENT_STREAMS: Incomplete - INITIAL_WINDOW_SIZE: Incomplete - MAX_FRAME_SIZE: Incomplete - MAX_HEADER_LIST_SIZE: Incomplete - ENABLE_CONNECT_PROTOCOL: Incomplete + HEADER_TABLE_SIZE = ... + ENABLE_PUSH = ... + MAX_CONCURRENT_STREAMS = ... + INITIAL_WINDOW_SIZE = ... + MAX_FRAME_SIZE = ... + MAX_HEADER_LIST_SIZE = ... + ENABLE_CONNECT_PROTOCOL = ... class ChangedSetting: setting: Incomplete original_value: Incomplete new_value: Incomplete - def __init__(self, setting, original_value, new_value) -> None: ... + def __init__(self, setting: SettingCodes | int, original_value: int | None, new_value: int) -> None: ... -class Settings(MutableMapping[str, Any]): - def __init__(self, client: bool = ..., initial_values: Incomplete | None = ...) -> None: ... - def acknowledge(self): ... +class Settings(MutableMapping[SettingCodes | int, int]): + def __init__(self, client: bool = True, initial_values: dict[SettingCodes, int] | None = None) -> None: ... + def acknowledge(self) -> dict[SettingCodes | int, ChangedSetting]: ... @property - def header_table_size(self): ... + def header_table_size(self) -> int: ... @header_table_size.setter - def header_table_size(self, value) -> None: ... + def header_table_size(self, value: int) -> None: ... @property - def enable_push(self): ... + def enable_push(self) -> int: ... @enable_push.setter - def enable_push(self, value) -> None: ... + def enable_push(self, value: int) -> None: ... @property - def initial_window_size(self): ... + def initial_window_size(self) -> int: ... @initial_window_size.setter - def initial_window_size(self, value) -> None: ... + def initial_window_size(self, value: int) -> None: ... @property - def max_frame_size(self): ... + def max_frame_size(self) -> int: ... 
@max_frame_size.setter - def max_frame_size(self, value) -> None: ... + def max_frame_size(self, value: int) -> None: ... @property - def max_concurrent_streams(self): ... + def max_concurrent_streams(self) -> int: ... @max_concurrent_streams.setter - def max_concurrent_streams(self, value) -> None: ... + def max_concurrent_streams(self, value: int) -> None: ... @property - def max_header_list_size(self): ... + def max_header_list_size(self) -> int | None: ... @max_header_list_size.setter - def max_header_list_size(self, value) -> None: ... + def max_header_list_size(self, value: int) -> None: ... @property - def enable_connect_protocol(self): ... + def enable_connect_protocol(self) -> int: ... @enable_connect_protocol.setter - def enable_connect_protocol(self, value) -> None: ... - def __getitem__(self, key): ... - def __setitem__(self, key, value) -> None: ... - def __delitem__(self, key) -> None: ... - def __iter__(self): ... + def enable_connect_protocol(self, value: int) -> None: ... + def __getitem__(self, key: SettingCodes | int) -> int: ... + def __setitem__(self, key: SettingCodes | int, value: int) -> None: ... + def __delitem__(self, key: SettingCodes | int) -> None: ... + def __iter__(self) -> Iterator[SettingCodes | int]: ... def __len__(self) -> int: ... - def __eq__(self, other): ... - def __ne__(self, other): ... + def __eq__(self, other: object) -> bool: ... + def __ne__(self, other: object) -> bool: ... diff --git a/test_runner/stubs/h2/stream.pyi b/test_runner/stubs/h2/stream.pyi index d52ab8e72b..89171da981 100644 --- a/test_runner/stubs/h2/stream.pyi +++ b/test_runner/stubs/h2/stream.pyi @@ -1,114 +1,52 @@ -from enum import Enum, IntEnum - -from _typeshed import Incomplete - +from .config import H2Configuration as H2Configuration from .errors import ErrorCodes as ErrorCodes -from .events import ( - AlternativeServiceAvailable as AlternativeServiceAvailable, -) -from .events import ( - DataReceived as DataReceived, -) -from .events import ( - InformationalResponseReceived as InformationalResponseReceived, -) -from .events import ( - PushedStreamReceived as PushedStreamReceived, -) -from .events import ( - RequestReceived as RequestReceived, -) -from .events import ( - ResponseReceived as ResponseReceived, -) -from .events import ( - StreamEnded as StreamEnded, -) -from .events import ( - StreamReset as StreamReset, -) -from .events import ( - TrailersReceived as TrailersReceived, -) -from .events import ( - WindowUpdated as WindowUpdated, -) -from .exceptions import ( - FlowControlError as FlowControlError, -) -from .exceptions import ( - InvalidBodyLengthError as InvalidBodyLengthError, -) -from .exceptions import ( - ProtocolError as ProtocolError, -) -from .exceptions import ( - StreamClosedError as StreamClosedError, -) -from .utilities import ( - HeaderValidationFlags as HeaderValidationFlags, -) -from .utilities import ( - authority_from_headers as authority_from_headers, -) -from .utilities import ( - extract_method_header as extract_method_header, -) -from .utilities import ( - guard_increment_window as guard_increment_window, -) -from .utilities import ( - is_informational_response as is_informational_response, -) -from .utilities import ( - normalize_inbound_headers as normalize_inbound_headers, -) -from .utilities import ( - normalize_outbound_headers as normalize_outbound_headers, -) -from .utilities import ( - validate_headers as validate_headers, -) -from .utilities import ( - validate_outbound_headers as validate_outbound_headers, -) +from .events import 
AlternativeServiceAvailable as AlternativeServiceAvailable, DataReceived as DataReceived, Event as Event, InformationalResponseReceived as InformationalResponseReceived, PushedStreamReceived as PushedStreamReceived, RequestReceived as RequestReceived, ResponseReceived as ResponseReceived, StreamEnded as StreamEnded, StreamReset as StreamReset, TrailersReceived as TrailersReceived, WindowUpdated as WindowUpdated +from .exceptions import FlowControlError as FlowControlError, InvalidBodyLengthError as InvalidBodyLengthError, ProtocolError as ProtocolError, StreamClosedError as StreamClosedError +from .utilities import HeaderValidationFlags as HeaderValidationFlags, authority_from_headers as authority_from_headers, extract_method_header as extract_method_header, guard_increment_window as guard_increment_window, is_informational_response as is_informational_response, normalize_inbound_headers as normalize_inbound_headers, normalize_outbound_headers as normalize_outbound_headers, utf8_encode_headers as utf8_encode_headers, validate_headers as validate_headers, validate_outbound_headers as validate_outbound_headers from .windows import WindowManager as WindowManager +from _typeshed import Incomplete +from collections.abc import Iterable +from enum import Enum, IntEnum +from hpack.hpack import Encoder as Encoder +from hpack.struct import Header as Header, HeaderWeaklyTyped as HeaderWeaklyTyped +from hyperframe.frame import AltSvcFrame, ContinuationFrame, Frame as Frame, HeadersFrame, PushPromiseFrame, RstStreamFrame +from typing import Any class StreamState(IntEnum): - IDLE: int - RESERVED_REMOTE: int - RESERVED_LOCAL: int - OPEN: int - HALF_CLOSED_REMOTE: int - HALF_CLOSED_LOCAL: int - CLOSED: int + IDLE = 0 + RESERVED_REMOTE = 1 + RESERVED_LOCAL = 2 + OPEN = 3 + HALF_CLOSED_REMOTE = 4 + HALF_CLOSED_LOCAL = 5 + CLOSED = 6 class StreamInputs(Enum): - SEND_HEADERS: int - SEND_PUSH_PROMISE: int - SEND_RST_STREAM: int - SEND_DATA: int - SEND_WINDOW_UPDATE: int - SEND_END_STREAM: int - RECV_HEADERS: int - RECV_PUSH_PROMISE: int - RECV_RST_STREAM: int - RECV_DATA: int - RECV_WINDOW_UPDATE: int - RECV_END_STREAM: int - RECV_CONTINUATION: int - SEND_INFORMATIONAL_HEADERS: int - RECV_INFORMATIONAL_HEADERS: int - SEND_ALTERNATIVE_SERVICE: int - RECV_ALTERNATIVE_SERVICE: int - UPGRADE_CLIENT: int - UPGRADE_SERVER: int + SEND_HEADERS = 0 + SEND_PUSH_PROMISE = 1 + SEND_RST_STREAM = 2 + SEND_DATA = 3 + SEND_WINDOW_UPDATE = 4 + SEND_END_STREAM = 5 + RECV_HEADERS = 6 + RECV_PUSH_PROMISE = 7 + RECV_RST_STREAM = 8 + RECV_DATA = 9 + RECV_WINDOW_UPDATE = 10 + RECV_END_STREAM = 11 + RECV_CONTINUATION = 12 + SEND_INFORMATIONAL_HEADERS = 13 + RECV_INFORMATIONAL_HEADERS = 14 + SEND_ALTERNATIVE_SERVICE = 15 + RECV_ALTERNATIVE_SERVICE = 16 + UPGRADE_CLIENT = 17 + UPGRADE_SERVER = 18 class StreamClosedBy(Enum): - SEND_END_STREAM: int - RECV_END_STREAM: int - SEND_RST_STREAM: int - RECV_RST_STREAM: int + SEND_END_STREAM = 0 + RECV_END_STREAM = 1 + SEND_RST_STREAM = 2 + RECV_RST_STREAM = 3 STREAM_OPEN: Incomplete @@ -121,32 +59,32 @@ class H2StreamStateMachine: headers_received: Incomplete trailers_received: Incomplete stream_closed_by: Incomplete - def __init__(self, stream_id) -> None: ... - def process_input(self, input_): ... - def request_sent(self, previous_state): ... - def response_sent(self, previous_state): ... - def request_received(self, previous_state): ... - def response_received(self, previous_state): ... - def data_received(self, previous_state): ... - def window_updated(self, previous_state): ... 
- def stream_half_closed(self, previous_state): ... - def stream_ended(self, previous_state): ... - def stream_reset(self, previous_state): ... - def send_new_pushed_stream(self, previous_state): ... - def recv_new_pushed_stream(self, previous_state): ... - def send_push_promise(self, previous_state): ... - def recv_push_promise(self, previous_state): ... - def send_end_stream(self, previous_state) -> None: ... - def send_reset_stream(self, previous_state) -> None: ... - def reset_stream_on_error(self, previous_state) -> None: ... - def recv_on_closed_stream(self, previous_state) -> None: ... - def send_on_closed_stream(self, previous_state) -> None: ... - def recv_push_on_closed_stream(self, previous_state) -> None: ... - def send_push_on_closed_stream(self, previous_state) -> None: ... - def send_informational_response(self, previous_state): ... - def recv_informational_response(self, previous_state): ... - def recv_alt_svc(self, previous_state): ... - def send_alt_svc(self, previous_state) -> None: ... + def __init__(self, stream_id: int) -> None: ... + def process_input(self, input_: StreamInputs) -> Any: ... + def request_sent(self, previous_state: StreamState) -> list[Event]: ... + def response_sent(self, previous_state: StreamState) -> list[Event]: ... + def request_received(self, previous_state: StreamState) -> list[Event]: ... + def response_received(self, previous_state: StreamState) -> list[Event]: ... + def data_received(self, previous_state: StreamState) -> list[Event]: ... + def window_updated(self, previous_state: StreamState) -> list[Event]: ... + def stream_half_closed(self, previous_state: StreamState) -> list[Event]: ... + def stream_ended(self, previous_state: StreamState) -> list[Event]: ... + def stream_reset(self, previous_state: StreamState) -> list[Event]: ... + def send_new_pushed_stream(self, previous_state: StreamState) -> list[Event]: ... + def recv_new_pushed_stream(self, previous_state: StreamState) -> list[Event]: ... + def send_push_promise(self, previous_state: StreamState) -> list[Event]: ... + def recv_push_promise(self, previous_state: StreamState) -> list[Event]: ... + def send_end_stream(self, previous_state: StreamState) -> None: ... + def send_reset_stream(self, previous_state: StreamState) -> None: ... + def reset_stream_on_error(self, previous_state: StreamState) -> None: ... + def recv_on_closed_stream(self, previous_state: StreamState) -> None: ... + def send_on_closed_stream(self, previous_state: StreamState) -> None: ... + def recv_push_on_closed_stream(self, previous_state: StreamState) -> None: ... + def send_push_on_closed_stream(self, previous_state: StreamState) -> None: ... + def send_informational_response(self, previous_state: StreamState) -> list[Event]: ... + def recv_informational_response(self, previous_state: StreamState) -> list[Event]: ... + def recv_alt_svc(self, previous_state: StreamState) -> list[Event]: ... + def send_alt_svc(self, previous_state: StreamState) -> None: ... class H2Stream: state_machine: Incomplete @@ -155,30 +93,30 @@ class H2Stream: request_method: Incomplete outbound_flow_control_window: Incomplete config: Incomplete - def __init__(self, stream_id, config, inbound_window_size, outbound_window_size) -> None: ... + def __init__(self, stream_id: int, config: H2Configuration, inbound_window_size: int, outbound_window_size: int) -> None: ... @property - def inbound_flow_control_window(self): ... + def inbound_flow_control_window(self) -> int: ... @property - def open(self): ... + def open(self) -> bool: ... 
@property - def closed(self): ... + def closed(self) -> bool: ... @property - def closed_by(self): ... - def upgrade(self, client_side) -> None: ... - def send_headers(self, headers, encoder, end_stream: bool = ...): ... - def push_stream_in_band(self, related_stream_id, headers, encoder): ... - def locally_pushed(self): ... - def send_data(self, data, end_stream: bool = ..., pad_length: Incomplete | None = ...): ... - def end_stream(self): ... - def advertise_alternative_service(self, field_value): ... - def increase_flow_control_window(self, increment): ... - def receive_push_promise_in_band(self, promised_stream_id, headers, header_encoding): ... - def remotely_pushed(self, pushed_headers): ... - def receive_headers(self, headers, end_stream, header_encoding): ... - def receive_data(self, data, end_stream, flow_control_len): ... - def receive_window_update(self, increment): ... + def closed_by(self) -> StreamClosedBy | None: ... + def upgrade(self, client_side: bool) -> None: ... + def send_headers(self, headers: Iterable[HeaderWeaklyTyped], encoder: Encoder, end_stream: bool = False) -> list[HeadersFrame | ContinuationFrame | PushPromiseFrame]: ... + def push_stream_in_band(self, related_stream_id: int, headers: Iterable[HeaderWeaklyTyped], encoder: Encoder) -> list[HeadersFrame | ContinuationFrame | PushPromiseFrame]: ... + def locally_pushed(self) -> list[Frame]: ... + def send_data(self, data: bytes | memoryview, end_stream: bool = False, pad_length: int | None = None) -> list[Frame]: ... + def end_stream(self) -> list[Frame]: ... + def advertise_alternative_service(self, field_value: bytes) -> list[Frame]: ... + def increase_flow_control_window(self, increment: int) -> list[Frame]: ... + def receive_push_promise_in_band(self, promised_stream_id: int, headers: Iterable[Header], header_encoding: bool | str | None) -> tuple[list[Frame], list[Event]]: ... + def remotely_pushed(self, pushed_headers: Iterable[Header]) -> tuple[list[Frame], list[Event]]: ... + def receive_headers(self, headers: Iterable[Header], end_stream: bool, header_encoding: bool | str | None) -> tuple[list[Frame], list[Event]]: ... + def receive_data(self, data: bytes, end_stream: bool, flow_control_len: int) -> tuple[list[Frame], list[Event]]: ... + def receive_window_update(self, increment: int) -> tuple[list[Frame], list[Event]]: ... def receive_continuation(self) -> None: ... - def receive_alt_svc(self, frame): ... - def reset_stream(self, error_code: int = ...): ... - def stream_reset(self, frame): ... - def acknowledge_received_data(self, acknowledged_size): ... + def receive_alt_svc(self, frame: AltSvcFrame) -> tuple[list[Frame], list[Event]]: ... + def reset_stream(self, error_code: ErrorCodes | int = 0) -> list[Frame]: ... + def stream_reset(self, frame: RstStreamFrame) -> tuple[list[Frame], list[Event]]: ... + def acknowledge_received_data(self, acknowledged_size: int) -> list[Frame]: ... 
diff --git a/test_runner/stubs/h2/utilities.pyi b/test_runner/stubs/h2/utilities.pyi index e0a8d55d1d..8802087e4c 100644 --- a/test_runner/stubs/h2/utilities.pyi +++ b/test_runner/stubs/h2/utilities.pyi @@ -1,25 +1,32 @@ -from typing import NamedTuple - +import collections +from .exceptions import FlowControlError as FlowControlError, ProtocolError as ProtocolError from _typeshed import Incomplete - -from .exceptions import FlowControlError as FlowControlError -from .exceptions import ProtocolError as ProtocolError +from collections.abc import Generator, Iterable +from hpack.struct import Header as Header, HeaderWeaklyTyped as HeaderWeaklyTyped +from typing import Any, NamedTuple UPPER_RE: Incomplete +SIGIL: Incomplete +INFORMATIONAL_START: Incomplete CONNECTION_HEADERS: Incomplete -def extract_method_header(headers): ... -def is_informational_response(headers): ... -def guard_increment_window(current, increment): ... -def authority_from_headers(headers): ... +def extract_method_header(headers: Iterable[Header]) -> bytes | None: ... +def is_informational_response(headers: Iterable[Header]) -> bool: ... +def guard_increment_window(current: int, increment: int) -> int: ... +def authority_from_headers(headers: Iterable[Header]) -> bytes | None: ... class HeaderValidationFlags(NamedTuple): - is_client: Incomplete - is_trailer: Incomplete - is_response_header: Incomplete - is_push_promise: Incomplete + is_client: bool + is_trailer: bool + is_response_header: bool + is_push_promise: bool -def validate_headers(headers, hdr_validation_flags): ... -def normalize_outbound_headers(headers, hdr_validation_flags): ... -def normalize_inbound_headers(headers, hdr_validation_flags): ... -def validate_outbound_headers(headers, hdr_validation_flags): ... +def validate_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags) -> Iterable[Header]: ... +def utf8_encode_headers(headers: Iterable[HeaderWeaklyTyped]) -> list[Header]: ... +def normalize_outbound_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags | None, should_split_outbound_cookies: bool = False) -> Generator[Header, None, None]: ... +def normalize_inbound_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags) -> Generator[Header, None, None]: ... +def validate_outbound_headers(headers: Iterable[Header], hdr_validation_flags: HeaderValidationFlags) -> Generator[Header, None, None]: ... + +class SizeLimitDict(collections.OrderedDict[int, Any]): + def __init__(self, *args: dict[int, int], **kwargs: Any) -> None: ... + def __setitem__(self, key: int, value: Any | int) -> None: ... diff --git a/test_runner/stubs/h2/windows.pyi b/test_runner/stubs/h2/windows.pyi index 7dc78e431c..b132ee610c 100644 --- a/test_runner/stubs/h2/windows.pyi +++ b/test_runner/stubs/h2/windows.pyi @@ -1,13 +1,12 @@ -from _typeshed import Incomplete - from .exceptions import FlowControlError as FlowControlError +from _typeshed import Incomplete LARGEST_FLOW_CONTROL_WINDOW: Incomplete class WindowManager: max_window_size: Incomplete current_window_size: Incomplete - def __init__(self, max_window_size) -> None: ... - def window_consumed(self, size) -> None: ... - def window_opened(self, size) -> None: ... - def process_bytes(self, size): ... + def __init__(self, max_window_size: int) -> None: ... + def window_consumed(self, size: int) -> None: ... + def window_opened(self, size: int) -> None: ... + def process_bytes(self, size: int) -> int | None: ... 
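The stream.pyi, utilities.pyi and windows.pyi stubs above spell out the flow-control plumbing: DataReceived events carry a flow_controlled_length, acknowledge_received_data() takes that size plus a stream id, and WindowManager.process_bytes() may return a window increment. The following is a minimal illustrative sketch (not code from this change) of a server-side receive loop exercising those calls; it assumes the real `h2` package is installed, `serve_once` and `sock` are hypothetical names, and the blocking-socket handling is simplified:

import socket

import h2.config
import h2.connection
import h2.events


def serve_once(sock: socket.socket) -> None:
    # `sock` is an already-accepted TCP socket speaking prior-knowledge HTTP/2.
    conn = h2.connection.H2Connection(config=h2.config.H2Configuration(client_side=False))
    conn.initiate_connection()
    sock.sendall(conn.data_to_send())
    while True:
        data = sock.recv(65535)
        if not data:
            break
        for event in conn.receive_data(data):
            if isinstance(event, h2.events.DataReceived):
                # Acknowledge flow-controlled bytes so the peer's send window reopens.
                conn.acknowledge_received_data(event.flow_controlled_length, event.stream_id)
            elif isinstance(event, h2.events.RequestReceived):
                conn.send_headers(event.stream_id, [(":status", "200")], end_stream=True)
        sock.sendall(conn.data_to_send())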
diff --git a/test_runner/websocket_tunnel.py b/test_runner/websocket_tunnel.py new file mode 100755 index 0000000000..069852468d --- /dev/null +++ b/test_runner/websocket_tunnel.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# +# This program helps test the WebSocket tunneling in the proxy. It listens for a TCP +# connection on a port, and when you connect to it, it opens a websocket connection, +# and forwards all the traffic over that websocket connection, wrapped in WebSocket binary +# frames. +# +# This is used in the test_proxy::test_websockets test, but it is handy for manual testing too. +# +# Usage for manual testing: +# +# ## Launch Postgres on port 3000: +# postgres -D data -p3000 +# +# ## Launch proxy with WSS enabled: +# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.local.neon.build' +# ./target/debug/proxy --wss 127.0.0.1:40433 --http 127.0.0.1:28080 --mgmt 127.0.0.1:9099 --proxy 127.0.0.1:4433 --tls-key server.key --tls-cert server.crt --auth-backend postgres +# +# ## Launch the tunnel: +# +# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.local.neon.build" +# +# ## Now you can connect with psql: +# psql "postgresql://heikki@localhost:40433/postgres" +# + +import argparse +import asyncio +import logging +import ssl +from ssl import Purpose + +import websockets +from fixtures.log_helper import log + + +# Enable verbose logging of all the traffic +def enable_verbose_logging(): + logger = logging.getLogger("websockets") + logger.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler()) + + +async def start_server(tcp_listen_host, tcp_listen_port, ws_url, ctx): + server = await asyncio.start_server( + lambda r, w: handle_client(r, w, ws_url, ctx), tcp_listen_host, tcp_listen_port + ) + return server + + +async def handle_tcp_to_websocket(tcp_reader, ws): + try: + while not tcp_reader.at_eof(): + data = await tcp_reader.read(1024) + + await ws.send(data) + except websockets.exceptions.ConnectionClosedError as e: + log.debug(f"connection closed: {e}") + except websockets.exceptions.ConnectionClosedOK: + log.debug("connection closed") + except Exception as e: + log.error(e) + + +async def handle_websocket_to_tcp(ws, tcp_writer): + try: + async for message in ws: + tcp_writer.write(message) + await tcp_writer.drain() + except websockets.exceptions.ConnectionClosedError as e: + log.debug(f"connection closed: {e}") + except websockets.exceptions.ConnectionClosedOK: + log.debug("connection closed") + except Exception as e: + log.error(e) + + +async def handle_client(tcp_reader, tcp_writer, ws_url: str, ctx: ssl.SSLContext): + try: + log.info("Received TCP connection. 
Connecting to websockets proxy.") + + async with websockets.connect(ws_url, ssl=ctx) as ws: + try: + log.info("Connected to websockets proxy") + + async with asyncio.TaskGroup() as tg: + task1 = tg.create_task(handle_tcp_to_websocket(tcp_reader, ws)) + task2 = tg.create_task(handle_websocket_to_tcp(ws, tcp_writer)) + + done, pending = await asyncio.wait( + [task1, task2], return_when=asyncio.FIRST_COMPLETED + ) + tcp_writer.close() + await ws.close() + + except* Exception as ex: + log.error(ex.exceptions) + except Exception as e: + log.error(e) + + +async def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--tcp-listen-addr", + default="localhost", + help="TCP addr to listen on", + ) + parser.add_argument( + "--tcp-listen-port", + default="40444", + help="TCP port to listen on", + ) + + parser.add_argument( + "--ws-url", + default="wss://localhost/", + help="websocket URL to connect to. This determines the Host header sent to the server", + ) + parser.add_argument( + "--ws-host", + default="127.0.0.1", + help="websockets host to connect to", + ) + parser.add_argument( + "--ws-port", + type=int, + default=443, + help="websockets port to connect to", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="enable verbose logging", + ) + args = parser.parse_args() + + if args.verbose: + enable_verbose_logging() + + ctx = ssl.create_default_context(Purpose.SERVER_AUTH) + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + server = await start_server(args.tcp_listen_addr, args.tcp_listen_port, args.ws_url, ctx) + print( + f"Listening for connections at {args.tcp_listen_addr}:{args.tcp_listen_port}, forwarding them to {args.ws_host}:{args.ws_port}" + ) + async with server: + await server.serve_forever() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 373f9decad..6254ab9b44 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 373f9decad933d2d46f321231032ae8b0da81acd +Subproject commit 6254ab9b4496c3e481bc037ae69d859bbc2bdd7d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 972e325e62..9b118b1cff 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 972e325e62b455957adbbdd8580e31275bb5b8c9 +Subproject commit 9b118b1cffa6e4ca0d63389b57b54d11e207e9a8 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index dff6615a8e..799e7a08dd 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit dff6615a8e48a10bb17a03fa3c00635f1ace7a92 +Subproject commit 799e7a08dd171aa06a7395dd326f4243aaeb9f93 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index a10d95be67..517b8dc244 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit a10d95be67265e0f10a422ba0457f5a7af01de71 +Subproject commit 517b8dc244abf3e56f0089849e464af76f70b94e diff --git a/vendor/revisions.json b/vendor/revisions.json index 8a73e14dcf..8dde46a01e 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.2", - "a10d95be67265e0f10a422ba0457f5a7af01de71" + "17.4", + "517b8dc244abf3e56f0089849e464af76f70b94e" ], "v16": [ - "16.6", - "dff6615a8e48a10bb17a03fa3c00635f1ace7a92" + "16.8", + "799e7a08dd171aa06a7395dd326f4243aaeb9f93" ], "v15": [ - "15.10", - "972e325e62b455957adbbdd8580e31275bb5b8c9" + "15.12", + "9b118b1cffa6e4ca0d63389b57b54d11e207e9a8" ], "v14": [ - "14.15", - "373f9decad933d2d46f321231032ae8b0da81acd" + "14.17", + 
"6254ab9b4496c3e481bc037ae69d859bbc2bdd7d" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 33bdc25785..1b7c376560 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -17,8 +17,6 @@ license.workspace = true [dependencies] ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } -axum = { version = "0.7", features = ["ws"] } -axum-core = { version = "0.4", default-features = false, features = ["tracing"] } base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] } base64-647d43efb71741da = { package = "base64", version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } @@ -44,9 +42,9 @@ half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } -hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } +hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["client", "http1", "http2", "runtime", "server", "stream"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } -hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } +hyper-util = { version = "0.1", features = ["client-legacy", "http1", "http2", "server", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } @@ -87,14 +85,16 @@ tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unpref time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } -tokio-stream = { version = "0.1", features = ["net"] } +tokio-stream = { version = "0.1" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } -tonic = { version = "0.12", features = ["tls-roots"] } -tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "util"] } +tonic = { version = "0.12", default-features = false, features = ["codegen", "prost", "tls-roots"] } +tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } +tracing-log = { version = "0.2" } url = { version = "2", features = ["serde"] } +uuid = { version = "1", features = ["serde", "v4", "v7"] } zerocopy = { version = "0.7", features = ["derive", "simd"] } zeroize = { version = "1", features = ["derive", "serde"] } zstd = { version = "0.13" }