diff --git a/.github/actionlint.yml b/.github/actionlint.yml index a5282876d0..4ad8a7b460 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -1,7 +1,6 @@ self-hosted-runner: labels: - arm64 - - gen3 - large - large-arm64 - small diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 9d39ab6ad7..4ccf190c6a 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -83,7 +83,6 @@ runs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 - name: Cache poetry deps uses: actions/cache@v4 diff --git a/.github/actions/set-docker-config-dir/action.yml b/.github/actions/set-docker-config-dir/action.yml new file mode 100644 index 0000000000..3ee8bec8c6 --- /dev/null +++ b/.github/actions/set-docker-config-dir/action.yml @@ -0,0 +1,36 @@ +name: "Set custom docker config directory" +description: "Create a directory for docker config and set DOCKER_CONFIG" + +# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings +runs: + using: "composite" + steps: + - name: Show warning on GitHub-hosted runners + if: runner.environment == 'github-hosted' + shell: bash -euo pipefail {0} + run: | + # Using the following environment variables to find a path to the workflow file + # ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch + # ${GITHUB_REPOSITORY} - octocat/hello-world + # ${GITHUB_REF} - refs/heads/my_branch + # From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables + + filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"} + filename=${filename_with_ref%"@$GITHUB_REF"} + + # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message + title='Unnecessary usage of `.github/actions/set-docker-config-dir`' + message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners' + echo "::warning file=${filename},title=${title}::${message}" + + - uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7 + env: + DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }} + with: + main: | + mkdir -p "${DOCKER_CONFIG}" + echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV + post: | + if [ -d "${DOCKER_CONFIG}" ]; then + rm -r "${DOCKER_CONFIG}" + fi diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml new file mode 100644 index 0000000000..7229776cd6 --- /dev/null +++ b/.github/workflows/_benchmarking_preparation.yml @@ -0,0 +1,152 @@ +name: Prepare benchmarking databases by restoring dumps + +on: + workflow_call: + # no inputs needed + +defaults: + run: + shell: bash -euxo pipefail {0} + +jobs: + setup-databases: + strategy: + fail-fast: false + matrix: + platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] + database: [ clickbench, tpch, userexample ] + + env: + LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib + PLATFORM: ${{ matrix.platform }} + PG_BINARIES: /tmp/neon/pg_install/v16/bin + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: --init + + steps: + - name: Set up Connection String + id: set-up-prep-connstr + run: | + case "${PLATFORM}" in + neon) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} + ;; + aws-rds-postgres) + CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} + ;; + aws-aurora-serverless-v2-postgres) + CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} + ;; + *) + echo >&2 "Unknown PLATFORM=${PLATFORM}" + exit 1 + ;; + esac + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + # we create a table that has one row for each database that we want to restore with the status whether the restore is done + - name: Create benchmark_restore_status table if it does not exist + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + # to avoid a race condition of multiple jobs trying to create the table at the same time, + # we use an advisory lock + run: | + ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c " + SELECT pg_advisory_lock(4711); + CREATE TABLE IF NOT EXISTS benchmark_restore_status ( + databasename text primary key, + restore_done boolean + ); + SELECT pg_advisory_unlock(4711); + " + + - name: Check if restore is already done + id: check-restore-done + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + run: | + skip=false + if ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM benchmark_restore_status WHERE databasename='${{ env.DATABASE_NAME }}' AND restore_done=true;" | grep -q 1; then + echo "Restore already done for database ${{ env.DATABASE_NAME }} on platform ${{ env.PLATFORM }}. Skipping this database." + skip=true + fi + echo "skip=${skip}" | tee -a $GITHUB_OUTPUT + + - name: Check and create database if it does not exist + if: steps.check-restore-done.outputs.skip != 'true' + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + run: | + DB_EXISTS=$(${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM pg_database WHERE datname='${{ env.DATABASE_NAME }}'") + if [ "$DB_EXISTS" != "1" ]; then + echo "Database ${{ env.DATABASE_NAME }} does not exist. Creating it..." + ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "CREATE DATABASE \"${{ env.DATABASE_NAME }}\";" + else + echo "Database ${{ env.DATABASE_NAME }} already exists." + fi + + - name: Download dump from S3 to /tmp/dumps + if: steps.check-restore-done.outputs.skip != 'true' + env: + DATABASE_NAME: ${{ matrix.database }} + run: | + mkdir -p /tmp/dumps + aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/ + + - name: Replace database name in connection string + if: steps.check-restore-done.outputs.skip != 'true' + id: replace-dbname + env: + DATABASE_NAME: ${{ matrix.database }} + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + run: | + # Extract the part before the database name + base_connstr="${BENCHMARK_CONNSTR%/*}" + # Extract the query parameters (if any) after the database name + query_params="${BENCHMARK_CONNSTR#*\?}" + # Reconstruct the new connection string + if [ "$query_params" != "$BENCHMARK_CONNSTR" ]; then + new_connstr="${base_connstr}/${DATABASE_NAME}?${query_params}" + else + new_connstr="${base_connstr}/${DATABASE_NAME}" + fi + echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT + + - name: Restore dump + if: steps.check-restore-done.outputs.skip != 'true' + env: + DATABASE_NAME: ${{ matrix.database }} + DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }} + # the following works only with larger computes: + # PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7" + # we add the || true because: + # the dumps were created with Neon and contain neon extensions that are not + # available in RDS, so we will always report an error, but we can ignore it + run: | + ${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \ + -d "${DATABASE_CONNSTR}" /tmp/dumps/${DATABASE_NAME}.pg_dump || true + + - name: Update benchmark_restore_status table + if: steps.check-restore-done.outputs.skip != 'true' + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + run: | + ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c " + INSERT INTO benchmark_restore_status (databasename, restore_done) VALUES ('${{ env.DATABASE_NAME }}', true) + ON CONFLICT (databasename) DO UPDATE SET restore_done = true; + " diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index a0ed169024..af76e51ebc 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -70,7 +70,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 - name: Set pg 14 revision for caching id: pg_v14_rev @@ -208,7 +207,7 @@ jobs: export LD_LIBRARY_PATH #nextest does not yet support running doctests - cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES for io_engine in std-fs tokio-epoll-uring ; do NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES @@ -263,7 +262,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 - name: Pytest regression tests uses: ./.github/actions/run-python-test-set diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 34fd8b1d15..85cfe7446e 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -44,7 +44,7 @@ jobs: grep -ERl $PAT .github/workflows |\ while read -r f do - l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1) + l=$(grep -nE $PAT $f | awk -F: '{print $1}' | head -1) echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'" done exit 1 diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 6f80d6e431..a4a597acde 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -96,7 +96,7 @@ jobs: uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact @@ -146,6 +146,7 @@ jobs: api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -154,7 +155,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic perf testing: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -176,7 +180,7 @@ jobs: steps: - uses: actions/checkout@v4 - + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -215,15 +219,23 @@ jobs: NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@v1 with: - channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream + slack-message: | + Periodic replication testing: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -280,8 +292,9 @@ jobs: { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' - if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]') + if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -321,9 +334,13 @@ jobs: echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT + prepare_AWS_RDS_databases: + uses: ./.github/workflows/_benchmarking_preparation.yml + secrets: inherit + pgbench-compare: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} - needs: [ generate-matrices ] + needs: [ generate-matrices, prepare_AWS_RDS_databases ] permissions: contents: write statuses: write @@ -360,7 +377,7 @@ jobs: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} role-duration-seconds: 18000 # 5 hours - + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -455,6 +472,7 @@ jobs: api_key: ${{ secrets.NEON_STAGING_API_KEY }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -463,7 +481,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -537,7 +558,7 @@ jobs: esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - + - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3 uses: aws-actions/configure-aws-credentials@v4 with: @@ -572,8 +593,9 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - + - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -582,7 +604,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -595,7 +620,7 @@ jobs: # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - needs: [ generate-matrices, pgbench-compare ] + needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -603,7 +628,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }} TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }} @@ -655,6 +680,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_clickbench + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -664,6 +690,7 @@ jobs: TEST_OLAP_SCALE: 10 - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -672,7 +699,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -684,7 +714,7 @@ jobs: # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - needs: [ generate-matrices, clickbench-compare ] + needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -692,7 +722,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -724,7 +754,7 @@ jobs: ENV_PLATFORM=RDS_AURORA_TPCH ;; rds-postgres) - ENV_PLATFORM=RDS_AURORA_TPCH + ENV_PLATFORM=RDS_POSTGRES_TPCH ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" @@ -750,6 +780,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -757,6 +788,7 @@ jobs: TEST_OLAP_SCALE: ${{ matrix.scale }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -765,13 +797,16 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - needs: [ generate-matrices, tpch-compare ] + needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -779,7 +814,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -836,6 +871,7 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -844,6 +880,10 @@ jobs: uses: slackapi/slack-github-action@v1 with: channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + slack-message: | + Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> + env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 76fc58151a..ca5ff573e1 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -38,7 +38,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} env: IMAGE_TAG: ${{ inputs.image-tag }} @@ -56,13 +56,7 @@ jobs: - uses: actions/checkout@v4 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p /tmp/.docker-custom - echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV - + - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -89,11 +83,6 @@ jobs: cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }} tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf /tmp/.docker-custom - merge-images: needs: [ build-image ] runs-on: ubuntu-22.04 diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c7ae2aedd4..ee6d3ba005 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -48,7 +48,7 @@ jobs: tag: needs: [ check-permissions ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -90,7 +90,7 @@ jobs: check-codestyle-python: needs: [ check-permissions, build-build-tools-image ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -101,9 +101,6 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 - with: - submodules: false - fetch-depth: 1 - name: Cache poetry deps uses: actions/cache@v4 @@ -142,7 +139,6 @@ jobs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 # Disabled for now # - name: Restore cargo deps cache @@ -204,7 +200,7 @@ jobs: matrix: arch: [ x64 ] # Do not build or run tests in debug for release branches - build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }} + build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }} include: - build-type: release arch: arm64 @@ -224,7 +220,7 @@ jobs: outputs: json: ${{ steps.get-benchmark-durations.outputs.json }} needs: [ check-permissions, build-build-tools-image ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -257,7 +253,7 @@ jobs: benchmarks: if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -302,9 +298,8 @@ jobs: with: channel-id: C060CNA47S9 # on-call-staging-storage-stream slack-message: | - Benchmarks failed on main: ${{ github.event.head_commit.url }} - - Allure report: ${{ needs.create-test-report.outputs.report-url }} + Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}> + <${{ needs.create-test-report.outputs.report-url }}|Allure report> env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} @@ -314,7 +309,7 @@ jobs: outputs: report-url: ${{ steps.create-allure-report.outputs.report-url }} - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -361,7 +356,7 @@ jobs: coverage-report: needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: @@ -475,7 +470,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Checkout @@ -484,12 +479,7 @@ jobs: submodules: true fetch-depth: 0 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -508,7 +498,10 @@ jobs: - uses: docker/build-push-action@v6 with: context: . + # ARM-specific flags are recommended for Graviton ≥ 2, these flags are also supported by Ampere Altra (Azure) + # https://github.com/aws/aws-graviton-getting-started/blob/57dc813626d0266f1cc12ef83474745bb1f31fb4/rust.md build-args: | + ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }} @@ -521,11 +514,6 @@ jobs: tags: | neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - neon-image: needs: [ neon-image-arch, tag ] runs-on: ubuntu-22.04 @@ -561,7 +549,7 @@ jobs: version: [ v14, v15, v16 ] arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Checkout @@ -570,12 +558,7 @@ jobs: submodules: true fetch-depth: 0 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -658,11 +641,6 @@ jobs: tags: | neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - compute-node-image: needs: [ compute-node-image-arch, tag ] runs-on: ubuntu-22.04 @@ -716,7 +694,7 @@ jobs: vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] - runs-on: [ self-hosted, gen3, large ] + runs-on: [ self-hosted, large ] strategy: fail-fast: false matrix: @@ -735,13 +713,7 @@ jobs: curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - + - uses: ./.github/actions/set-docker-config-dir - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} @@ -764,11 +736,6 @@ jobs: run: | docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - test-images: needs: [ check-permissions, tag, neon-image, compute-node-image ] strategy: @@ -776,7 +743,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - name: Checkout @@ -784,13 +751,7 @@ jobs: with: fetch-depth: 0 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - + - uses: ./.github/actions/set-docker-config-dir - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} @@ -830,11 +791,6 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml logs || 0 docker compose -f ./docker-compose/docker-compose.yml down - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - promote-images: permissions: contents: read # This is required for actions/checkout @@ -1002,7 +958,7 @@ jobs: needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ] if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest steps: - name: Fix git ownership @@ -1022,7 +978,6 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: - submodules: false fetch-depth: 0 - name: Trigger deploy workflow @@ -1103,7 +1058,7 @@ jobs: needs: [ check-permissions, promote-images, tag, build-and-test-locally ] if: github.ref_name == 'release' - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init @@ -1159,10 +1114,12 @@ jobs: # Format `needs` differently to make the list more readable. # Usually we do `needs: [...]` needs: + - build-and-test-locally - check-codestyle-python - check-codestyle-rust - - build-and-test-locally + - promote-images - test-images + - trigger-custom-extensions-build-and-wait runs-on: ubuntu-22.04 steps: # The list of possible results: diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml new file mode 100644 index 0000000000..585d118dfb --- /dev/null +++ b/.github/workflows/label-for-external-users.yml @@ -0,0 +1,54 @@ +name: Add `external` label to issues and PRs created by external users + +on: + issues: + types: + - opened + pull_request_target: + types: + - opened + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. +permissions: {} + +env: + LABEL: external + +jobs: + check-user: + runs-on: ubuntu-22.04 + + outputs: + is-member: ${{ steps.check-user.outputs.is-member }} + + steps: + - name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}` + id: check-user + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then + is_member=true + else + is_member=false + fi + + echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT} + + add-label: + if: needs.check-user.outputs.is-member == 'false' + needs: [ check-user ] + + runs-on: ubuntu-22.04 + permissions: + pull-requests: write # for `gh pr edit` + issues: write # for `gh issue edit` + + steps: + - name: Add `${{ env.LABEL }}` label + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request_target' && 'pull_request' || 'issue'].number }} + GH_CLI_COMMAND: ${{ github.event_name == 'pull_request_target' && 'pr' || 'issue' }} + run: | + gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 2ee66cfdc1..7fecdbde8c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -56,7 +56,6 @@ jobs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 - name: Install macOS postgres dependencies run: brew install flex bison openssl protobuf icu4c pkg-config @@ -158,7 +157,6 @@ jobs: uses: actions/checkout@v4 with: submodules: true - fetch-depth: 1 # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index ed4e6be712..615937b5a1 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -27,7 +27,7 @@ concurrency: jobs: trigger_bench_on_ec2_machine_in_eu_central_1: - runs-on: [ self-hosted, gen3, small ] + runs-on: [ self-hosted, small ] container: image: neondatabase/build-tools:pinned credentials: diff --git a/Cargo.lock b/Cargo.lock index 031fae0f37..dee15b6aa7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -484,7 +484,7 @@ dependencies = [ "http 0.2.9", "http 1.1.0", "once_cell", - "p256", + "p256 0.11.1", "percent-encoding", "ring 0.17.6", "sha2", @@ -848,6 +848,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" +[[package]] +name = "base16ct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" + [[package]] name = "base64" version = "0.13.1" @@ -971,9 +977,9 @@ checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" [[package]] name = "bytemuck" -version = "1.16.0" +version = "1.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5" +checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" [[package]] name = "byteorder" @@ -1526,8 +1532,10 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" dependencies = [ + "generic-array", "rand_core 0.6.4", "subtle", + "zeroize", ] [[package]] @@ -1621,6 +1629,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" dependencies = [ "const-oid", + "pem-rfc7468", "zeroize", ] @@ -1720,6 +1729,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", + "const-oid", "crypto-common", "subtle", ] @@ -1771,11 +1781,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" dependencies = [ "der 0.6.1", - "elliptic-curve", - "rfc6979", + "elliptic-curve 0.12.3", + "rfc6979 0.3.1", "signature 1.6.4", ] +[[package]] +name = "ecdsa" +version = "0.16.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" +dependencies = [ + "der 0.7.8", + "digest", + "elliptic-curve 0.13.8", + "rfc6979 0.4.0", + "signature 2.2.0", + "spki 0.7.3", +] + [[package]] name = "either" version = "1.8.1" @@ -1788,16 +1812,36 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" dependencies = [ - "base16ct", + "base16ct 0.1.1", "crypto-bigint 0.4.9", "der 0.6.1", "digest", - "ff", + "ff 0.12.1", "generic-array", - "group", - "pkcs8", + "group 0.12.1", + "pkcs8 0.9.0", "rand_core 0.6.4", - "sec1", + "sec1 0.3.0", + "subtle", + "zeroize", +] + +[[package]] +name = "elliptic-curve" +version = "0.13.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" +dependencies = [ + "base16ct 0.2.0", + "crypto-bigint 0.5.5", + "digest", + "ff 0.13.0", + "generic-array", + "group 0.13.0", + "pem-rfc7468", + "pkcs8 0.10.2", + "rand_core 0.6.4", + "sec1 0.7.3", "subtle", "zeroize", ] @@ -1951,6 +1995,16 @@ dependencies = [ "subtle", ] +[[package]] +name = "ff" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ded41244b729663b1e574f1b4fb731469f69f79c17667b5d776b16cda0479449" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + [[package]] name = "filetime" version = "0.2.22" @@ -2148,6 +2202,7 @@ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", + "zeroize", ] [[package]] @@ -2214,7 +2269,18 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" dependencies = [ - "ff", + "ff 0.12.1", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "group" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" +dependencies = [ + "ff 0.13.0", "rand_core 0.6.4", "subtle", ] @@ -2776,6 +2842,42 @@ dependencies = [ "libc", ] +[[package]] +name = "jose-b64" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec69375368709666b21c76965ce67549f2d2db7605f1f8707d17c9656801b56" +dependencies = [ + "base64ct", + "serde", + "subtle", + "zeroize", +] + +[[package]] +name = "jose-jwa" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ab78e053fe886a351d67cf0d194c000f9d0dcb92906eb34d853d7e758a4b3a7" +dependencies = [ + "serde", +] + +[[package]] +name = "jose-jwk" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280fa263807fe0782ecb6f2baadc28dffc04e00558a58e33bfdb801d11fd58e7" +dependencies = [ + "jose-b64", + "jose-jwa", + "p256 0.13.2", + "p384", + "rsa", + "serde", + "zeroize", +] + [[package]] name = "js-sys" version = "0.3.69" @@ -2835,6 +2937,9 @@ name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +dependencies = [ + "spin 0.5.2", +] [[package]] name = "lazycell" @@ -3204,6 +3309,23 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand 0.8.5", + "smallvec", + "zeroize", +] + [[package]] name = "num-complex" version = "0.4.4" @@ -3481,11 +3603,33 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" dependencies = [ - "ecdsa", - "elliptic-curve", + "ecdsa 0.14.8", + "elliptic-curve 0.12.3", "sha2", ] +[[package]] +name = "p256" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b" +dependencies = [ + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", + "primeorder", + "sha2", +] + +[[package]] +name = "p384" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70786f51bcc69f6a4c0360e063a4cac5419ef7c5cd5b3c99ad70f3be5ba79209" +dependencies = [ + "elliptic-curve 0.13.8", + "primeorder", +] + [[package]] name = "pagebench" version = "0.1.0" @@ -3847,6 +3991,15 @@ dependencies = [ "serde", ] +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.2.0" @@ -3913,6 +4066,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der 0.7.8", + "pkcs8 0.10.2", + "spki 0.7.3", +] + [[package]] name = "pkcs8" version = "0.9.0" @@ -3923,6 +4087,16 @@ dependencies = [ "spki 0.6.0", ] +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der 0.7.8", + "spki 0.7.3", +] + [[package]] name = "pkg-config" version = "0.3.27" @@ -4116,6 +4290,15 @@ dependencies = [ "syn 2.0.52", ] +[[package]] +name = "primeorder" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6" +dependencies = [ + "elliptic-curve 0.13.8", +] + [[package]] name = "proc-macro-hack" version = "0.5.20+deprecated" @@ -4233,6 +4416,7 @@ version = "0.1.0" dependencies = [ "ahash", "anyhow", + "arc-swap", "async-compression", "async-trait", "atomic-take", @@ -4250,6 +4434,7 @@ dependencies = [ "consumption_metrics", "crossbeam-deque", "dashmap", + "ecdsa 0.16.9", "env_logger", "fallible-iterator", "framed-websockets", @@ -4270,12 +4455,15 @@ dependencies = [ "indexmap 2.0.1", "ipnet", "itertools 0.10.5", + "jose-jwa", + "jose-jwk", "lasso", "md5", "measured", "metrics", "once_cell", "opentelemetry", + "p256 0.13.2", "parking_lot 0.12.1", "parquet", "parquet_derive", @@ -4296,6 +4484,7 @@ dependencies = [ "reqwest-retry", "reqwest-tracing", "routerify", + "rsa", "rstest", "rustc-hash", "rustls 0.22.4", @@ -4305,6 +4494,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "signature 2.2.0", "smallvec", "smol_str", "socket2 0.5.5", @@ -4807,6 +4997,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rfc6979" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" +dependencies = [ + "hmac", + "subtle", +] + [[package]] name = "ring" version = "0.16.20" @@ -4867,6 +5067,26 @@ dependencies = [ "archery", ] +[[package]] +name = "rsa" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8 0.10.2", + "rand_core 0.6.4", + "signature 2.2.0", + "spki 0.7.3", + "subtle", + "zeroize", +] + [[package]] name = "rstest" version = "0.18.2" @@ -5195,10 +5415,24 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" dependencies = [ - "base16ct", + "base16ct 0.1.1", "der 0.6.1", "generic-array", - "pkcs8", + "pkcs8 0.9.0", + "subtle", + "zeroize", +] + +[[package]] +name = "sec1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" +dependencies = [ + "base16ct 0.2.0", + "der 0.7.8", + "generic-array", + "pkcs8 0.10.2", "subtle", "zeroize", ] @@ -5545,6 +5779,7 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" dependencies = [ + "digest", "rand_core 0.6.4", ] @@ -7379,13 +7614,17 @@ dependencies = [ "clap", "clap_builder", "crossbeam-utils", + "crypto-bigint 0.5.5", + "der 0.7.8", "deranged", + "digest", "either", "fail", "futures-channel", "futures-executor", "futures-io", "futures-util", + "generic-array", "getrandom 0.2.11", "hashbrown 0.14.5", "hex", @@ -7393,6 +7632,7 @@ dependencies = [ "hyper 0.14.26", "indexmap 1.9.3", "itertools 0.10.5", + "lazy_static", "libc", "log", "memchr", @@ -7416,7 +7656,9 @@ dependencies = [ "serde", "serde_json", "sha2", + "signature 2.2.0", "smallvec", + "spki 0.7.3", "subtle", "syn 1.0.109", "syn 2.0.52", @@ -7527,6 +7769,7 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" dependencies = [ + "serde", "zeroize_derive", ] diff --git a/Dockerfile b/Dockerfile index ceb1c7cb55..d3d12330c6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,8 +35,9 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --chown=nonroot . . +ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index bf8a27e550..619c5bce3e 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -379,7 +379,7 @@ where } } -fn process_has_stopped(pid: Pid) -> anyhow::Result { +pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result { match kill(pid, None) { // Process exists, keep waiting Ok(_) => Ok(false), diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 51e9a51a57..edd88dc71c 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -15,7 +15,9 @@ use control_plane::local_env::{ }; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; -use control_plane::storage_controller::StorageController; +use control_plane::storage_controller::{ + NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, +}; use control_plane::{broker, local_env}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, @@ -1052,6 +1054,36 @@ fn get_start_timeout(args: &ArgMatches) -> &Duration { humantime_duration.as_ref() } +fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs { + let maybe_instance_id = args.get_one::("instance-id"); + + let base_port = args.get_one::("base-port"); + + if maybe_instance_id.is_some() && base_port.is_none() { + panic!("storage-controller start specificied instance-id but did not provide base-port"); + } + + let start_timeout = args + .get_one::("start-timeout") + .expect("invalid value for start-timeout"); + + NeonStorageControllerStartArgs { + instance_id: maybe_instance_id.copied().unwrap_or(1), + base_port: base_port.copied(), + start_timeout: *start_timeout, + } +} + +fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs { + let maybe_instance_id = args.get_one::("instance-id"); + let immediate = args.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); + + NeonStorageControllerStopArgs { + instance_id: maybe_instance_id.copied().unwrap_or(1), + immediate, + } +} + async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { match sub_match.subcommand() { Some(("start", subcommand_args)) => { @@ -1113,19 +1145,14 @@ async fn handle_storage_controller( let svc = StorageController::from_env(env); match sub_match.subcommand() { Some(("start", start_match)) => { - if let Err(e) = svc.start(get_start_timeout(start_match)).await { + if let Err(e) = svc.start(storage_controller_start_args(start_match)).await { eprintln!("start failed: {e}"); exit(1); } } Some(("stop", stop_match)) => { - let immediate = stop_match - .get_one::("stop-mode") - .map(|s| s.as_str()) - == Some("immediate"); - - if let Err(e) = svc.stop(immediate).await { + if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await { eprintln!("stop failed: {}", e); exit(1); } @@ -1228,7 +1255,12 @@ async fn handle_start_all( // Only start the storage controller if the pageserver is configured to need it if env.control_plane_api.is_some() { let storage_controller = StorageController::from_env(env); - if let Err(e) = storage_controller.start(retry_timeout).await { + if let Err(e) = storage_controller + .start(NeonStorageControllerStartArgs::with_default_instance_id( + (*retry_timeout).into(), + )) + .await + { eprintln!("storage_controller start failed: {:#}", e); try_stop_all(env, true).await; exit(1); @@ -1358,10 +1390,21 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { eprintln!("neon broker stop failed: {e:#}"); } - if env.control_plane_api.is_some() { + // Stop all storage controller instances. In the most common case there's only one, + // but iterate though the base data directory in order to discover the instances. + let storcon_instances = env + .storage_controller_instances() + .await + .expect("Must inspect data dir"); + for (instance_id, _instance_dir_path) in storcon_instances { let storage_controller = StorageController::from_env(env); - if let Err(e) = storage_controller.stop(immediate).await { - eprintln!("storage controller stop failed: {e:#}"); + let stop_args = NeonStorageControllerStopArgs { + instance_id, + immediate, + }; + + if let Err(e) = storage_controller.stop(stop_args).await { + eprintln!("Storage controller instance {instance_id} stop failed: {e:#}"); } } } @@ -1501,6 +1544,18 @@ fn cli() -> Command { .action(ArgAction::SetTrue) .required(false); + let instance_id = Arg::new("instance-id") + .long("instance-id") + .help("Identifier used to distinguish storage controller instances (default 1)") + .value_parser(value_parser!(u8)) + .required(false); + + let base_port = Arg::new("base-port") + .long("base-port") + .help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)") + .value_parser(value_parser!(u16)) + .required(false); + Command::new("Neon CLI") .arg_required_else_help(true) .version(GIT_VERSION) @@ -1609,9 +1664,12 @@ fn cli() -> Command { .arg_required_else_help(true) .about("Manage storage_controller") .subcommand(Command::new("start").about("Start storage controller") - .arg(timeout_arg.clone())) + .arg(timeout_arg.clone()) + .arg(instance_id.clone()) + .arg(base_port)) .subcommand(Command::new("stop").about("Stop storage controller") - .arg(stop_mode_arg.clone())) + .arg(stop_mode_arg.clone()) + .arg(instance_id)) ) .subcommand( Command::new("safekeeper") diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index f9bb2da7e7..9f879c4b08 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -824,11 +824,12 @@ impl Endpoint { // cleanup work to do after postgres stops, like syncing safekeepers, // etc. // - // If destroying, send it SIGTERM before waiting. Sometimes we do *not* - // want this cleanup: tests intentionally do stop when majority of - // safekeepers is down, so sync-safekeepers would hang otherwise. This - // could be a separate flag though. - self.wait_for_compute_ctl_to_exit(destroy)?; + // If destroying or stop mode is immediate, send it SIGTERM before + // waiting. Sometimes we do *not* want this cleanup: tests intentionally + // do stop when majority of safekeepers is down, so sync-safekeepers + // would hang otherwise. This could be a separate flag though. + let send_sigterm = destroy || mode == "immediate"; + self.wait_for_compute_ctl_to_exit(send_sigterm)?; if destroy { println!( "Destroying postgres data directory '{}'", diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 15bbac702f..807519c88d 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -156,6 +156,11 @@ pub struct NeonStorageControllerConf { #[serde(with = "humantime_serde")] pub max_warming_up: Duration, + pub start_as_candidate: bool, + + /// Database url used when running multiple storage controller instances + pub database_url: Option, + /// Threshold for auto-splitting a tenant into shards pub split_threshold: Option, @@ -174,6 +179,8 @@ impl Default for NeonStorageControllerConf { Self { max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL, max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL, + start_as_candidate: false, + database_url: None, split_threshold: None, max_secondary_lag_bytes: None, } @@ -392,6 +399,36 @@ impl LocalEnv { } } + /// Inspect the base data directory and extract the instance id and instance directory path + /// for all storage controller instances + pub async fn storage_controller_instances(&self) -> std::io::Result> { + let mut instances = Vec::default(); + + let dir = std::fs::read_dir(self.base_data_dir.clone())?; + for dentry in dir { + let dentry = dentry?; + let is_dir = dentry.metadata()?.is_dir(); + let filename = dentry.file_name().into_string().unwrap(); + let parsed_instance_id = match filename.strip_prefix("storage_controller_") { + Some(suffix) => suffix.parse::().ok(), + None => None, + }; + + let is_instance_dir = is_dir && parsed_instance_id.is_some(); + + if !is_instance_dir { + continue; + } + + instances.push(( + parsed_instance_id.expect("Checked previously"), + dentry.path(), + )); + } + + Ok(instances) + } + pub fn register_branch_mapping( &mut self, branch_name: String, diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index f180e922e8..2c077595a1 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -3,6 +3,8 @@ use crate::{ local_env::{LocalEnv, NeonStorageControllerConf}, }; use camino::{Utf8Path, Utf8PathBuf}; +use hyper::Uri; +use nix::unistd::Pid; use pageserver_api::{ controller_api::{ NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, @@ -18,7 +20,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; use reqwest::Method; use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{fs, str::FromStr, time::Duration}; +use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock}; use tokio::process::Command; use tracing::instrument; use url::Url; @@ -29,12 +31,14 @@ use utils::{ pub struct StorageController { env: LocalEnv, - listen: String, private_key: Option>, public_key: Option, - postgres_port: u16, client: reqwest::Client, config: NeonStorageControllerConf, + + // The listen addresses is learned when starting the storage controller, + // hence the use of OnceLock to init it at the right time. + listen: OnceLock, } const COMMAND: &str = "storage_controller"; @@ -43,6 +47,36 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; const DB_NAME: &str = "storage_controller"; +pub struct NeonStorageControllerStartArgs { + pub instance_id: u8, + pub base_port: Option, + pub start_timeout: humantime::Duration, +} + +impl NeonStorageControllerStartArgs { + pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self { + Self { + instance_id: 1, + base_port: None, + start_timeout, + } + } +} + +pub struct NeonStorageControllerStopArgs { + pub instance_id: u8, + pub immediate: bool, +} + +impl NeonStorageControllerStopArgs { + pub fn with_default_instance_id(immediate: bool) -> Self { + Self { + instance_id: 1, + immediate, + } + } +} + #[derive(Serialize, Deserialize)] pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, @@ -67,23 +101,6 @@ pub struct InspectResponse { impl StorageController { pub fn from_env(env: &LocalEnv) -> Self { - // Makes no sense to construct this if pageservers aren't going to use it: assume - // pageservers have control plane API set - let listen_url = env.control_plane_api.clone().unwrap(); - - let listen = format!( - "{}:{}", - listen_url.host_str().unwrap(), - listen_url.port().unwrap() - ); - - // Convention: NeonEnv in python tests reserves the next port after the control_plane_api - // port, for use by our captive postgres. - let postgres_port = listen_url - .port() - .expect("Control plane API setting should always have a port") - + 1; - // Assume all pageservers have symmetric auth configuration: this service // expects to use one JWT token to talk to all of them. let ps_conf = env @@ -126,20 +143,28 @@ impl StorageController { Self { env: env.clone(), - listen, private_key, public_key, - postgres_port, client: reqwest::ClientBuilder::new() .build() .expect("Failed to construct http client"), config: env.storage_controller.clone(), + listen: OnceLock::default(), } } - fn pid_file(&self) -> Utf8PathBuf { - Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid")) - .expect("non-Unicode path") + fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf { + self.env + .base_data_dir + .join(format!("storage_controller_{}", instance_id)) + } + + fn pid_file(&self, instance_id: u8) -> Utf8PathBuf { + Utf8PathBuf::from_path_buf( + self.storage_controller_instance_dir(instance_id) + .join("storage_controller.pid"), + ) + .expect("non-Unicode path") } /// PIDFile for the postgres instance used to store storage controller state @@ -184,9 +209,9 @@ impl StorageController { } /// Readiness check for our postgres process - async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result { + async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result { let bin_path = pg_bin_dir.join("pg_isready"); - let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)]; + let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)]; let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; Ok(exitcode.success()) @@ -199,8 +224,8 @@ impl StorageController { /// who just want to run `cargo neon_local` without knowing about diesel. /// /// Returns the database url - pub async fn setup_database(&self) -> anyhow::Result { - let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port); + pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result { + let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); let pg_bin_dir = self.get_pg_bin_dir().await?; let createdb_path = pg_bin_dir.join("createdb"); @@ -209,7 +234,7 @@ impl StorageController { "-h", "localhost", "-p", - &format!("{}", self.postgres_port), + &format!("{}", postgres_port), DB_NAME, ]) .output() @@ -230,13 +255,14 @@ impl StorageController { pub async fn connect_to_database( &self, + postgres_port: u16, ) -> anyhow::Result<( tokio_postgres::Client, tokio_postgres::Connection, )> { tokio_postgres::Config::new() .host("localhost") - .port(self.postgres_port) + .port(postgres_port) // The user is the ambient operating system user name. // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400 // @@ -252,72 +278,115 @@ impl StorageController { .map_err(anyhow::Error::new) } - pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> { - // Start a vanilla Postgres process used by the storage controller for persistence. - let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) - .unwrap() - .join("storage_controller_db"); - let pg_bin_dir = self.get_pg_bin_dir().await?; - let pg_lib_dir = self.get_pg_lib_dir().await?; - let pg_log_path = pg_data_path.join("postgres.log"); + pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> { + let instance_dir = self.storage_controller_instance_dir(start_args.instance_id); + if let Err(err) = tokio::fs::create_dir(&instance_dir).await { + if err.kind() != std::io::ErrorKind::AlreadyExists { + panic!("Failed to create instance dir {instance_dir:?}"); + } + } - if !tokio::fs::try_exists(&pg_data_path).await? { - // Initialize empty database - let initdb_path = pg_bin_dir.join("initdb"); - let mut child = Command::new(&initdb_path) - .envs(vec![ - ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ]) - .args(["-D", pg_data_path.as_ref()]) - .spawn() - .expect("Failed to spawn initdb"); - let status = child.wait().await?; - if !status.success() { - anyhow::bail!("initdb failed with status {status}"); + let (listen, postgres_port) = { + if let Some(base_port) = start_args.base_port { + ( + format!("127.0.0.1:{base_port}"), + self.config + .database_url + .expect("--base-port requires NeonStorageControllerConf::database_url") + .port(), + ) + } else { + let listen_url = self.env.control_plane_api.clone().unwrap(); + + let listen = format!( + "{}:{}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + ); + + (listen, listen_url.port().unwrap() + 1) } }; - // Write a minimal config file: - // - Specify the port, since this is chosen dynamically - // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing - // the storage controller we don't want a slow local disk to interfere with that. - // - // NB: it's important that we rewrite this file on each start command so we propagate changes - // from `LocalEnv`'s config file (`.neon/config`). - tokio::fs::write( - &pg_data_path.join("postgresql.conf"), - format!("port = {}\nfsync=off\n", self.postgres_port), - ) - .await?; + let socket_addr = listen + .parse() + .expect("listen address is a valid socket address"); + self.listen + .set(socket_addr) + .expect("StorageController::listen is only set here"); - println!("Starting storage controller database..."); - let db_start_args = [ - "-w", - "-D", - pg_data_path.as_ref(), - "-l", - pg_log_path.as_ref(), - "start", - ]; + // Do we remove the pid file on stop? + let pg_started = self.is_postgres_running().await?; + let pg_lib_dir = self.get_pg_lib_dir().await?; - background_process::start_process( - "storage_controller_db", - &self.env.base_data_dir, - pg_bin_dir.join("pg_ctl").as_std_path(), - db_start_args, - vec![ - ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ], - background_process::InitialPidFile::Create(self.postgres_pid_file()), - retry_timeout, - || self.pg_isready(&pg_bin_dir), - ) - .await?; + if !pg_started { + // Start a vanilla Postgres process used by the storage controller for persistence. + let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) + .unwrap() + .join("storage_controller_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + let pg_log_path = pg_data_path.join("postgres.log"); - // Run migrations on every startup, in case something changed. - let database_url = self.setup_database().await?; + if !tokio::fs::try_exists(&pg_data_path).await? { + // Initialize empty database + let initdb_path = pg_bin_dir.join("initdb"); + let mut child = Command::new(&initdb_path) + .envs(vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]) + .args(["-D", pg_data_path.as_ref()]) + .spawn() + .expect("Failed to spawn initdb"); + let status = child.wait().await?; + if !status.success() { + anyhow::bail!("initdb failed with status {status}"); + } + }; + + // Write a minimal config file: + // - Specify the port, since this is chosen dynamically + // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing + // the storage controller we don't want a slow local disk to interfere with that. + // + // NB: it's important that we rewrite this file on each start command so we propagate changes + // from `LocalEnv`'s config file (`.neon/config`). + tokio::fs::write( + &pg_data_path.join("postgresql.conf"), + format!("port = {}\nfsync=off\n", postgres_port), + ) + .await?; + + println!("Starting storage controller database..."); + let db_start_args = [ + "-w", + "-D", + pg_data_path.as_ref(), + "-l", + pg_log_path.as_ref(), + "start", + ]; + + background_process::start_process( + "storage_controller_db", + &self.env.base_data_dir, + pg_bin_dir.join("pg_ctl").as_std_path(), + db_start_args, + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], + background_process::InitialPidFile::Create(self.postgres_pid_file()), + &start_args.start_timeout, + || self.pg_isready(&pg_bin_dir, postgres_port), + ) + .await?; + + // Run migrations on every startup, in case something changed. + self.setup_database(postgres_port).await?; + } + + let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); // We support running a startup SQL script to fiddle with the database before we launch storcon. // This is used by the test suite. @@ -339,7 +408,7 @@ impl StorageController { } } }; - let (mut client, conn) = self.connect_to_database().await?; + let (mut client, conn) = self.connect_to_database(postgres_port).await?; let conn = tokio::spawn(conn); let tx = client.build_transaction(); let tx = tx.start().await?; @@ -348,9 +417,20 @@ impl StorageController { drop(client); conn.await??; + let listen = self + .listen + .get() + .expect("cell is set earlier in this function"); + let address_for_peers = Uri::builder() + .scheme("http") + .authority(format!("{}:{}", listen.ip(), listen.port())) + .path_and_query("") + .build() + .unwrap(); + let mut args = vec![ "-l", - &self.listen, + &listen.to_string(), "--dev", "--database-url", &database_url, @@ -358,10 +438,17 @@ impl StorageController { &humantime::Duration::from(self.config.max_offline).to_string(), "--max-warming-up-interval", &humantime::Duration::from(self.config.max_warming_up).to_string(), + "--address-for-peers", + &address_for_peers.to_string(), ] .into_iter() .map(|s| s.to_string()) .collect::>(); + + if self.config.start_as_candidate { + args.push("--start-as-candidate".to_string()); + } + if let Some(private_key) = &self.private_key { let claims = Claims::new(None, Scope::PageServerApi); let jwt_token = @@ -394,15 +481,15 @@ impl StorageController { background_process::start_process( COMMAND, - &self.env.base_data_dir, + &instance_dir, &self.env.storage_controller_bin(), args, vec![ ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ], - background_process::InitialPidFile::Create(self.pid_file()), - retry_timeout, + background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)), + &start_args.start_timeout, || async { match self.ready().await { Ok(_) => Ok(true), @@ -415,8 +502,35 @@ impl StorageController { Ok(()) } - pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> { - background_process::stop_process(immediate, COMMAND, &self.pid_file())?; + pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> { + background_process::stop_process( + stop_args.immediate, + COMMAND, + &self.pid_file(stop_args.instance_id), + )?; + + let storcon_instances = self.env.storage_controller_instances().await?; + for (instance_id, instanced_dir_path) in storcon_instances { + if instance_id == stop_args.instance_id { + continue; + } + + let pid_file = instanced_dir_path.join("storage_controller.pid"); + let pid = tokio::fs::read_to_string(&pid_file) + .await + .map_err(|err| { + anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}") + })? + .parse::() + .expect("pid is valid i32"); + + let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?; + if other_proc_alive { + // There is another storage controller instance running, so we return + // and leave the database running. + return Ok(()); + } + } let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); let pg_bin_dir = self.get_pg_bin_dir().await?; @@ -429,27 +543,51 @@ impl StorageController { .wait() .await?; if !stop_status.success() { - let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; - let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) - .args(pg_status_args) - .spawn()? - .wait() - .await?; - - // pg_ctl status returns this exit code if postgres is not running: in this case it is - // fine that stop failed. Otherwise it is an error that stop failed. - const PG_STATUS_NOT_RUNNING: i32 = 3; - if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() { - println!("Storage controller database is already stopped"); - return Ok(()); - } else { - anyhow::bail!("Failed to stop storage controller database: {stop_status}") + match self.is_postgres_running().await { + Ok(false) => { + println!("Storage controller database is already stopped"); + return Ok(()); + } + Ok(true) => { + anyhow::bail!("Failed to stop storage controller database"); + } + Err(err) => { + anyhow::bail!("Failed to stop storage controller database: {err}"); + } } } Ok(()) } + async fn is_postgres_running(&self) -> anyhow::Result { + let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + + let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; + let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) + .args(pg_status_args) + .spawn()? + .wait() + .await?; + + // pg_ctl status returns this exit code if postgres is not running: in this case it is + // fine that stop failed. Otherwise it is an error that stop failed. + const PG_STATUS_NOT_RUNNING: i32 = 3; + const PG_NO_DATA_DIR: i32 = 4; + const PG_STATUS_RUNNING: i32 = 0; + match status_exitcode.code() { + Some(PG_STATUS_NOT_RUNNING) => Ok(false), + Some(PG_NO_DATA_DIR) => Ok(false), + Some(PG_STATUS_RUNNING) => Ok(true), + Some(code) => Err(anyhow::anyhow!( + "pg_ctl status returned unexpected status code: {:?}", + code + )), + None => Err(anyhow::anyhow!("pg_ctl status returned no status code")), + } + } + fn get_claims_for_path(path: &str) -> anyhow::Result> { let category = match path.find('/') { Some(idx) => &path[..idx], @@ -475,15 +613,31 @@ impl StorageController { RQ: Serialize + Sized, RS: DeserializeOwned + Sized, { - // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out - // for general purpose API access. - let listen_url = self.env.control_plane_api.clone().unwrap(); - let url = Url::from_str(&format!( - "http://{}:{}/{path}", - listen_url.host_str().unwrap(), - listen_url.port().unwrap() - )) - .unwrap(); + // In the special case of the `storage_controller start` subcommand, we wish + // to use the API endpoint of the newly started storage controller in order + // to pass the readiness check. In this scenario [`Self::listen`] will be set + // (see [`Self::start`]). + // + // Otherwise, we infer the storage controller api endpoint from the configured + // control plane API. + let url = if let Some(socket_addr) = self.listen.get() { + Url::from_str(&format!( + "http://{}:{}/{path}", + socket_addr.ip().to_canonical(), + socket_addr.port() + )) + .unwrap() + } else { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. + let listen_url = self.env.control_plane_api.clone().unwrap(); + Url::from_str(&format!( + "http://{}:{}/{path}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + )) + .unwrap() + }; let mut builder = self.client.request(method, url); if let Some(body) = body { diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 5c1add070a..e27491c1c8 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -622,6 +622,7 @@ async fn main() -> anyhow::Result<()> { threshold: threshold.into(), }, )), + heatmap_period: Some("300s".to_string()), ..Default::default() }, }) diff --git a/deny.toml b/deny.toml index dc985138e6..327ac58db7 100644 --- a/deny.toml +++ b/deny.toml @@ -22,7 +22,10 @@ feature-depth = 1 [advisories] db-urls = ["https://github.com/rustsec/advisory-db"] yanked = "warn" -ignore = [] + +[[advisories.ignore]] +id = "RUSTSEC-2023-0071" +reason = "the marvin attack only affects private key decryption, not public key signature verification" # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md new file mode 100644 index 0000000000..239ec58186 --- /dev/null +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -0,0 +1,495 @@ +# Safekeeper dynamic membership change + +To quickly recover from safekeeper node failures and do rebalancing we need to +be able to change set of safekeepers the timeline resides on. The procedure must +be safe (not lose committed log) regardless of safekeepers and compute state. It +should be able to progress if any majority of old safekeeper set, any majority +of new safekeeper set and compute are up and connected. This is known as a +consensus membership change. It always involves two phases: 1) switch old +majority to old + new configuration, preventing commits without acknowledge from +the new set 2) bootstrap the new set by ensuring majority of the new set has all +data which ever could have been committed before the first phase completed; +after that switch is safe to finish. Without two phases switch to the new set +which quorum might not intersect with quorum of the old set (and typical case of +ABC -> ABD switch is an example of that, because quorums AC and BD don't +intersect). Furthermore, procedure is typically carried out by the consensus +leader, and so enumeration of configurations which establishes order between +them is done through consensus log. + +In our case consensus leader is compute (walproposer), and we don't want to wake +up all computes for the change. Neither we want to fully reimplement the leader +logic second time outside compute. Because of that the proposed algorithm relies +for issuing configurations on the external fault tolerant (distributed) strongly +consisent storage with simple API: CAS (compare-and-swap) on the single key. +Properly configured postgres suits this. + +In the system consensus is implemented at the timeline level, so algorithm below +applies to the single timeline. + +## Algorithm + +### Definitions + +A configuration is + +``` +struct Configuration { + generation: Generation, // a number uniquely identifying configuration + sk_set: Vec, // current safekeeper set + new_sk_set: Optional>, +} +``` + +Configuration with `new_set` present is used for the intermediate step during +the change and called joint configuration. Generations establish order of +generations: we say `c1` is higher than `c2` if `c1.generation` > +`c2.generation`. + +### Persistently stored data changes + +Safekeeper starts storing its current configuration in the control file. Update +of is atomic, so in-memory value always matches the persistent one. + +External CAS providing storage (let's call it configuration storage here) also +stores configuration for each timeline. It is initialized with generation 1 and +initial set of safekeepers during timeline creation. Executed CAS on it must +never be lost. + +### Compute <-> safekeeper protocol changes + +`ProposerGreeting` message carries walproposer's configuration if it is already +established (see below), else null. `AcceptorGreeting` message carries +safekeeper's current `Configuration`. All further messages (`VoteRequest`, +`VoteResponse`, `ProposerElected`, `AppendRequest`, `AppendResponse`) carry +generation number, of walproposer in case of wp->sk message or of safekeeper in +case of sk->wp message. + +### Safekeeper changes + +Basic rule: once safekeeper observes configuration higher than his own it +immediately switches to it. It must refuse all messages with lower generation +that his. It also refuses messages if it is not member of the current generation +(that is, of either `sk_set` of `sk_new_set`), though it is likely not unsafe to +process them (walproposer should ignore result anyway). + +If there is non null configuration in `ProposerGreeting` and it is higher than +current safekeeper one, safekeeper switches to it. + +Safekeeper sends its current configuration in its first message to walproposer +`AcceptorGreeting`. It refuses all other walproposer messages if the +configuration generation in them is less than its current one. Namely, it +refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In +response it sends its current configuration generation to let walproposer know. + +Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` +accepting `Configuration`. Safekeeper switches to the given conf it is higher than its +current one and ignores it otherwise. In any case it replies with +``` +struct ConfigurationSwitchResponse { + conf: Configuration, + term: Term, + last_log_term: Term, + flush_lsn: Lsn, +} +``` + +### Compute (walproposer) changes + +Basic rule is that joint configuration requires votes from majorities in the +both `set` and `new_sk_set`. + +Compute receives list of safekeepers to connect to from the control plane as +currently and tries to communicate with all of them. However, the list does not +define consensus members. Instead, on start walproposer tracks highest +configuration it receives from `AcceptorGreeting`s. Once it assembles greetings +from majority of `sk_set` and majority of `new_sk_set` (if it is present), it +establishes this configuration as its own and moves to voting. + +It should stop talking to safekeepers not listed in the configuration at this +point, though it is not unsafe to continue doing so. + +To be elected it must receive votes from both majorites if `new_sk_set` is present. +Similarly, to commit WAL it must receive flush acknowledge from both majorities. + +If walproposer hears from safekeeper configuration higher than his own (i.e. +refusal to accept due to configuration change) it simply restarts. + +### Change algorithm + +The following algorithm can be executed anywhere having access to configuration +storage and safekeepers. It is safe to interrupt / restart it and run multiple +instances of it concurrently, though likely one of them won't make +progress then. It accepts `desired_set: Vec` as input. + +Algorithm will refuse to make the change if it encounters previous interrupted +change attempt, but in this case it will try to finish it. + +It will eventually converge if old majority, new majority and configuration +storage are reachable. + +1) Fetch current timeline configuration from the configuration storage. +2) If it is already joint one and `new_set` is different from `desired_set` + refuse to change. However, assign join conf to (in memory) var + `join_conf` and proceed to step 4 to finish the ongoing change. +3) Else, create joint `joint_conf: Configuration`: increment current conf number + `n` and put `desired_set` to `new_sk_set`. Persist it in the configuration + storage by doing CAS on the current generation: change happens only if + current configuration number is still `n`. Apart from guaranteeing uniqueness + of configurations, CAS linearizes them, ensuring that new configuration is + created only following the previous one when we know that the transition is + safe. Failed CAS aborts the procedure. +4) Call `PUT` `configuration` on safekeepers from the current set, + delivering them `joint_conf`. Collecting responses from majority is required + to proceed. If any response returned generation higher than + `joint_conf.generation`, abort (another switch raced us). Otherwise, choose + max `` among responses and establish it as + (in memory) `sync_position`. Also choose max `term` and establish it as (in + memory) `sync_term`. We can't finish the switch until majority of the new set + catches up to this `sync_position` because data before it could be committed + without ack from the new set. Similarly, we'll bump term on new majority + to `sync_term` so that two computes with the same term are never elected. +4) Initialize timeline on safekeeper(s) from `new_sk_set` where it + doesn't exist yet by doing `pull_timeline` from the majority of the + current set. Doing that on majority of `new_sk_set` is enough to + proceed, but it is reasonable to ensure that all `new_sk_set` members + are initialized -- if some of them are down why are we migrating there? +5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. + Success on majority is enough. +6) Repeatedly call `PUT` `configuration` on safekeepers from the new set, + delivering them `joint_conf` and collecting their positions. This will + switch them to the `joint_conf` which generally won't be needed + because `pull_timeline` already includes it and plus additionally would be + broadcast by compute. More importantly, we may proceed to the next step + only when `` on the majority of the new set reached + `sync_position`. Similarly, on the happy path no waiting is not needed because + `pull_timeline` already includes it. However, we should double + check to be safe. For example, timeline could have been created earlier e.g. + manually or after try-to-migrate, abort, try-to-migrate-again sequence. +7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new + safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration + storage under one more CAS. +8) Call `PUT` `configuration` on safekeepers from the new set, + delivering them `new_conf`. It is enough to deliver it to the majority + of the new set; the rest can be updated by compute. + +I haven't put huge effort to make the description above very precise, because it +is natural language prone to interpretations anyway. Instead I'd like to make TLA+ +spec of it. + +Description above focuses on safety. To make the flow practical and live, here a few more +considerations. +1) It makes sense to ping new set to ensure it we are migrating to live node(s) before + step 3. +2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed + it is safe to rollback to the old conf with one more CAS. +3) On step 4 timeline might be already created on members of the new set for various reasons; + the simplest is the procedure restart. There are more complicated scenarious like mentioned + in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving + generations, so seems simpler to treat existing timeline as success. However, this also + has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in + the step 5 is never reached until compute is (re)awaken up to synchronize new member(s). + I don't think we'll observe this in practice, but can add waking up compute if needed. +4) In the end timeline should be locally deleted on the safekeeper(s) which are + in the old set but not in the new one, unless they are unreachable. To be + safe this also should be done under generation number (deletion proceeds only if + current configuration is <= than one in request and safekeeper is not memeber of it). +5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`, + jump to step 7, using it as `new_conf`. + +## Implementation + +The procedure ought to be driven from somewhere. Obvious candidates are control +plane and storage_controller; and as each of them already has db we don't want +yet another storage. I propose to manage safekeepers in storage_controller +because 1) since it is in rust it simplifies simulation testing (more on this +below) 2) it already manages pageservers. + +This assumes that migration will be fully usable only after we migrate all +tenants/timelines to storage_controller. It is discussible whether we want also +to manage pageserver attachments for all of these, but likely we do. + +This requires us to define storcon <-> cplane interface. + +### storage_controller <-> control plane interface + +First of all, control plane should +[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829) +storing safekeepers per timeline instead of per tenant because we can't migrate +tenants atomically. + +The important question is how updated configuration is delivered from +storage_controller to control plane to provide it to computes. As always, there +are two options, pull and push. Let's do it the same push as with pageserver +`/notify-attach` because 1) it keeps storage_controller out of critical compute +start path 2) provides easier upgrade: there won't be such a thing as 'timeline +managed by control plane / storcon', cplane just takes the value out of its db +when needed 3) uniformity. It makes storage_controller responsible for retrying notifying +control plane until it succeeds. + +So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and +updates it in the db if the provided conf generation is higher (the cplane db +should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it +should update db which makes the call successful, and then try to schedule +`apply_config` if possible, it is ok if not. storage_controller +should rate limit calling the endpoint, but likely this won't be needed, as migration +throughput is limited by `pull_timeline`. + +Timeline (branch) creation in cplane should call storage_controller POST +`tenant/:tenant_id/timeline` like it currently does for sharded tenants. +Response should be augmented with `safekeeper_conf: Configuration`. The call +should be retried until succeeds. + +Timeline deletion and tenant deletion in cplane should call appropriate +storage_controller endpoints like it currently does for sharded tenants. The +calls should be retried until they succeed. + +### storage_controller implementation + +Current 'load everything on startup and keep in memory' easy design is fine. +Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16 +byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so +10^6 of timelines shouldn't take more than 100MB. + +Similar to pageserver attachment Intents storage_controller would have in-memory +`MigrationRequest` (or its absense) for each timeline and pool of tasks trying +to make these request reality; this ensures one instance of storage_controller +won't do several migrations on the same timeline concurrently. In the first +version it is simpler to have more manual control and no retries, i.e. migration +failure removes the request. Later we can build retries and automatic +scheduling/migration. `MigrationRequest` is +``` +enum MigrationRequest { + To(Vec), + FinishPending, +} +``` + +`FinishPending` requests to run the procedure to ensure state is clean: current +configuration is not joint and majority of safekeepers are aware of it, but do +not attempt to migrate anywhere. If current configuration fetched on step 1 is +not joint it jumps to step 7. It should be run at startup for all timelines (but +similarly, in the first version it is ok to trigger it manually). + +#### Schema + +`safekeepers` table mirroring current `nodes` should be added, except that for +`scheduling_policy` field (seems like `status` is a better name for it): it is enough +to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3) +`decomissioned`. + +`timelines` table: +``` +table! { + // timeline_id is primary key + timelines (tenant_id, timeline_id) { + timeline_id -> Varchar, + tenant_id -> Varchar, + generation -> Int4, + sk_set -> Array, // list of safekeeper ids + new_sk_set -> Nullable>, // list of safekeeper ids, null if not joint conf + cplane_notified_generation -> Int4, + } +} +``` + +#### API + +Node management is similar to pageserver: +1) POST `/control/v1/safekeepers` upserts safekeeper. +2) GET `/control/v1/safekeepers` lists safekeepers. +3) GET `/control/v1/safekeepers/:node_id` gets safekeeper. +4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g. + `offline` or `decomissioned`. Initially it is simpler not to schedule any + migrations here. + +Safekeeper deploy scripts should register safekeeper at storage_contorller as +they currently do with cplane, under the same id. + +Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline` +would 1) choose initial set of safekeepers; 2) write to the db initial +`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in +case of conflict; 3) create timeline on the majority of safekeepers (already +created is ok). + +We don't want to block timeline creation when one safekeeper is down. Currently +this is solved by compute implicitly creating timeline on any safekeeper it is +connected to. This creates ugly timeline state on safekeeper when timeline is +created, but start LSN is not defined yet. It would be nice to remove this; to +do that, controller can in the background retry to create timeline on +safekeeper(s) which missed that during initial creation call. It can do that +through `pull_timeline` from majority so it doesn't need to remember +`parent_lsn` in its db. + +Timeline deletion removes the row from the db and forwards deletion to the +current configuration members. Without additional actions deletions might leak, +see below on this; initially let's ignore these, reporting to cplane success if +at least one safekeeper deleted the timeline (this will remove s3 data). + +Tenant deletion repeats timeline deletion for all timelines. + +Migration API: the first version is the simplest and the most imperative: +1) PUT `/control/v1/safekeepers/migrate` schedules `MigrationRequest`s to move +all timelines from one safekeeper to another. It accepts json +``` +{ + "src_sk": u32, + "dst_sk": u32, + "limit": Optional, +} +``` + +Returns list of scheduled requests. + +2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest` + to move single timeline to given set of safekeepers: +``` +{ + "desired_set": Vec, +} +``` + +Returns scheduled request. + +Similar call should be added for the tenant. + +It would be great to have some way of subscribing to the results (apart from +looking at logs/metrics). + +Migration is executed as described above. One subtlety is that (local) deletion on +source safekeeper might fail, which is not a problem if we are going to +decomission the node but leaves garbage otherwise. I'd propose in the first version +1) Don't attempt deletion at all if node status is `offline`. +2) If it failed, just issue warning. +And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and +remove garbage timelines for manual use. It will 1) list all timelines on the +safekeeper 2) compare each one against configuration storage: if timeline +doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can +be deleted under generation number if node is not member of current generation. + +Automating this is untrivial; we'd need to register all potential missing +deletions in the same transaction +which switches configurations. Similarly when timeline is fully deleted to +prevent cplane operation from blocking when some safekeeper is not available +deletion should be also registered. + +One more task pool should infinitely retry notifying control plane about changed +safekeeper sets. + +3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return + current in memory state of the timeline and pending `MigrationRequest`, + if any. + +4) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort` tries to abort the + migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS + (incrementing generation as always). + +#### Dealing with multiple instances of storage_controller + +Operations described above executed concurrently might create some errors but do +not prevent progress, so while we normally don't want to run multiple instances +of storage_controller it is fine to have it temporarily, e.g. during redeploy. + +Any interactions with db update in-memory controller state, e.g. if migration +request failed because different one is in progress, controller remembers that +and tries to finish it. + +## Testing + +`neon_local` should be switched to use storage_controller, playing role of +control plane. + +There should be following layers of tests: +1) Model checked TLA+ spec specifies the algorithm and verifies its basic safety. + +2) To cover real code and at the same time test many schedules we should have + simulation tests. For that, configuration storage, storage_controller <-> + safekeeper communication and pull_timeline need to be mocked and main switch + procedure wrapped to as a node (thread) in simulation tests, using these + mocks. Test would inject migrations like it currently injects + safekeeper/walproposer restars. Main assert is the same -- committed WAL must + not be lost. + +3) Since simulation testing injects at relatively high level points (not + syscalls), it omits some code, in particular `pull_timeline`. Thus it is + better to have basic tests covering whole system as well. Extended version of + `test_restarts_under_load` would do: start background load and do migration + under it, then restart endpoint and check that no reported commits + had been lost. I'd also add one more creating classic network split scenario, with + one compute talking to AC and another to BD while migration from nodes ABC to ABD + happens. + +4) Simple e2e test should ensure that full flow including cplane notification works. + +## Order of implementation and rollout + +Note that +- Control plane parts and integration with it is fully independent from everything else + (tests would use simulation and neon_local). +- There is a lot of infra work making storage_controller aware of timelines and safekeepers + and its impl/rollout should be separate from migration itself. +- Initially walproposer can just stop working while it observers joint configuration. + Such window would be typically very short anyway. + +To rollout smoothly, both walproposer and safekeeper should have flag +`configurations_enabled`; when set to false, they would work as currently, i.e. +walproposer is able to commit on whatever safekeeper set it is provided. Until +all timelines are managed by storcon we'd need to use current script to migrate +and update/drop entries in the storage_controller database if it has any. + +Safekeepers would need to be able to talk both current and new protocol version +with compute to reduce number of computes restarted in prod once v2 protocol is +deployed (though before completely switching we'd need to force this). + +Let's have the following rollout order: +- storage_controller becomes aware of safekeepers; +- storage_controller gets timeline creation for new timelines and deletion requests, but + doesn't manage all timelines yet. Migration can be tested on these new timelines. + To keep control plane and storage_controller databases in sync while control + plane still chooses the safekeepers initially (until all timelines are imported + it can choose better), `TimelineCreateRequest` can get optional safekeepers + field with safekeepers chosen by cplane. +- Then we can import all existing timelines from control plane to + storage_controller and gradually enable configurations region by region. + + +Very rough implementation order: +- Add concept of configurations to safekeepers (including control file), + implement v3 protocol. +- Implement walproposer changes, including protocol. +- Implement storconn part. Use it in neon_local (and pytest). +- Make cplane store safekeepers per timeline instead of per tenant. +- Implement cplane/storcon integration. Route branch creation/deletion + through storcon. Then we can test migration of new branches. +- Finally import existing branches. Then we can drop cplane + safekeeper selection code. Gradually enable configurations at + computes and safekeepers. Before that, all computes must talk only + v3 protocol version. + +## Integration with evicted timelines + +Currently, `pull_timeline` doesn't work correctly with evicted timelines because +copy would point to original partial file. To fix let's just do s3 copy of the +file. It is a bit stupid as generally unnecessary work, but it makes sense to +implement proper migration before doing smarter timeline archival. [Issue](https://github.com/neondatabase/neon/issues/8542) + +## Possible optimizations + +Steps above suggest walproposer restart (with re-election) and thus reconnection +to safekeepers. Since by bumping term on new majority we ensure that leader +terms are unique even across generation switches it is possible to preserve +connections. However, it is more complicated, reconnection is very fast and it +is much more important to avoid compute restart than millisecond order of write +stall. + +Multiple joint consensus: algorithm above rejects attempt to change membership +while another attempt is in progress. It is possible to overlay them and AFAIK +Aurora does this but similarly I don't think this is needed. + +## Misc + +We should use Compute <-> safekeeper protocol change to include other (long +yearned) modifications: +- send data in network order to make arm work. +- remove term_start_lsn from AppendRequest +- add horizon to TermHistory +- add to ProposerGreeting number of connection from this wp to sk diff --git a/docs/rfcs/036-physical-replication.md b/docs/rfcs/036-physical-replication.md new file mode 100644 index 0000000000..41aced0545 --- /dev/null +++ b/docs/rfcs/036-physical-replication.md @@ -0,0 +1,265 @@ +# Physical Replication + +This RFC is a bit special in that we have already implemented physical +replication a long time ago. However, we never properly wrote down all +the decisions and assumptions, and in the last months when more users +have started to use the feature, numerous issues have surfaced. + +This RFC documents the design decisions that have been made. + +## Summary + +PostgreSQL has a feature called streaming replication, where a replica +streams WAL from the primary and continuously applies it. It is also +known as "physical replication", to distinguish it from logical +replication. In PostgreSQL, a replica is initialized by taking a +physical backup of the primary. In Neon, the replica is initialized +from a slim "base backup" from the pageserver, just like a primary, +and the primary and the replicas connect to the same pageserver, +sharing the storage. + +There are two kinds of read-only replicas in Neon: +- replicas that follow the primary, and +- "static" replicas that are pinned at a particular LSN. + +A static replica is useful e.g. for performing time-travel queries and +running one-off slow queries without affecting the primary. A replica +that follows the primary can be used e.g. to scale out read-only +workloads. + +## Motivation + +Read-only replicas allow offloading read-only queries. It's useful for +isolation, if you want to make sure that read-only queries don't +affect the primary, and it's also an easy way to provide guaranteed +read-only access to an application, without having to mess with access +controls. + +## Non Goals (if relevant) + +This RFC is all about WAL-based *physical* replication. Logical +replication is a different feature. + +Neon also has the capability to launch "static" read-only nodes which +do not follow the primary, but are pinned to a particular LSN. They +can be used for long-running one-off queries, or for Point-in-time +queries. They work similarly to read replicas that follow the primary, +but some things are simpler: there are no concerns about cache +invalidation when the data changes on the primary, or worrying about +transactions that are in-progress on the primary. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +- Control plane launches the replica +- Replica Postgres instance connects to the safekeepers, to stream the WAL +- The primary does not know about the standby, except for the hot standby feedback +- The primary and replicas all connect to the same pageservers + + +# Context + +Some useful things to know about hot standby and replicas in +PostgreSQL. + +## PostgreSQL startup sequence + +"Running" and "start up" terms are little imprecise. PostgreSQL +replica startup goes through several stages: + +1. First, the process is started up, and various initialization steps + are performed, like initializing shared memory. If you try to + connect to the server in this stage, you get an error: ERROR: the + database system is starting up. This stage happens very quickly, no + +2. Then the server reads the checpoint record from the WAL and starts + the WAL replay starting from the checkpoint. This works differently + in Neon: we start the WAL replay at the basebackup LSN, not from a + checkpoint! If you connect to the server in this state, you get an + error: ERROR: the database system is not yet accepting + connections. We proceed to the next stage, when the WAL replay sees + a running-xacts record. Or in Neon, the "CLOG scanning" mechanism + can allow us to move directly to next stage, with all the caveats + listed in this RFC. + +3. When the running-xacts information is established, the server + starts to accept connections normally. + +From PostgreSQL's point of view, the server is already running in +stage 2, even though it's not accepting connections yet. Our +`compute_ctl` does not consider it as running until stage 3. If the +transition from stage 2 to 3 doesn't happen fast enough, the control +plane will mark the start operation as failed. + + +## Decisions, Issues + +### Cache invalidation in replica + +When a read replica follows the primary in PostgreSQL, it needs to +stream all the WAL from the primary and apply all the records, to keep +the local copy of the data consistent with the primary. In Neon, the +replica can fetch the updated page versions from the pageserver, so +it's not necessary to apply all the WAL. However, it needs to ensure +that any pages that are currently in the Postgres buffer cache, or the +Local File Cache, are either updated, or thrown away so that the next +read of the page will fetch the latest version. + +We choose to apply the WAL records for pages that are already in the +buffer cache, and skip records for other pages. Somewhat arbitrarily, +we also apply records affecting catalog relations, fetching the old +page version from the pageserver if necessary first. See +`neon_redo_read_buffer_filter()` function. + +The replica wouldn't necessarily need to see all the WAL records, only +the records that apply to cached pages. For simplicity, we do stream +all the WAL to the replica, and the replica simply ignores WAL records +that require no action. + +Like in PostgreSQL, the read replica maintains a "replay LSN", which +is the LSN up to which the replica has received and replayed the +WAL. The replica can lag behind the primary, if it cannot quite keep +up with the primary, or if a long-running query conflicts with changes +that are about to be applied, or even intentionally if the user wishes +to see delayed data (see recovery_min_apply_delay). It's important +that the replica sees a consistent view of the whole cluster at the +replay LSN, when it's lagging behind. + +In Neon, the replica connects to a safekeeper to get the WAL +stream. That means that the safekeepers must be able to regurgitate +the original WAL as far back as the replay LSN of any running read +replica. (A static read-only node that does not follow the primary +does not require a WAL stream however). The primary does not need to +be running, and when it is, the replicas don't incur any extra +overhead to the primary (see hot standby feedback though). + +### In-progress transactions + +In PostgreSQL, when a hot standby server starts up, it cannot +immediately open up for queries (see [PostgreSQL startup +sequence]). It first needs to establish a complete list of in-progress +transactions, including subtransactions, that are running at the +primary, at the current replay LSN. Normally that happens quickly, +when the replica sees a "running-xacts" WAL record, because the +primary writes a running-xacts WAL record at every checkpoint, and in +PostgreSQL the replica always starts the WAL replay from a checkpoint +REDO point. (A shutdown checkpoint WAL record also implies that all +the non-prepared transactions have ended.) If there are a lot of +subtransactions in progress, however, the standby might need to wait +for old transactions to complete before it can open up for queries. + +In Neon that problem is worse: a replica can start at any LSN, so +there's no guarantee that it will see a running-xacts record any time +soon. In particular, if the primary is not running when the replica is +started, it might never see a running-xacts record. + +To make things worse, we initially missed this issue, and always +started accepting queries at replica startup, even if it didn't have +the transaction information. That could lead to incorrect query +results and data corruption later. However, as we fixed that, we +introduced a new problem compared to what we had before: previously +the replica would always start up, but after fixing that bug, it might +not. In a superficial way, the old behavior was better (but could lead +to serious issues later!). That made fixing that bug was very hard, +because as we fixed it, we made things (superficially) worse for +others. + +See https://github.com/neondatabase/neon/pull/7288 which fixed the +bug, and follow-up PRs https://github.com/neondatabase/neon/pull/8323 +and https://github.com/neondatabase/neon/pull/8484 to try to claw back +the cases that started to cause trouble as fixing it. As of this +writing, there are still cases where a replica might not immediately +start up, causing the control plane operation to fail, the remaining +issues are tracked in https://github.com/neondatabase/neon/issues/6211. + +One long-term fix for this is to switch to using so-called CSN +snapshots in read replica. That would make it unnecessary to have the +full in-progress transaction list in the replica at startup time. See +https://commitfest.postgresql.org/48/4912/ for a work-in-progress +patch to upstream to implement that. + +Another thing we could do is to teach the control plane about that +distinction between "starting up" and "running but haven't received +running-xacts information yet", so that we could keep the replica +waiting longer in that stage, and also give any client connections the +same `ERROR: the database system is not yet accepting connections` +error that you get in standalone PostgreSQL in that state. + + +### Recovery conflicts and Hot standby feedback + +It's possible that a tuple version is vacuumed away in the primary, +even though it is still needed by a running transactions in the +replica. This is called a "recovery conflict", and PostgreSQL provides +various options for dealing with it. By default, the WAL replay will +wait up to 30 s for the conflicting query to finish. After that, it +will kill the running query, so that the WAL replay can proceed. + +Another way to avoid the situation is to enable the +[`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK) +option. When it is enabled, the primary will refrain from vacuuming +tuples that are still needed in the primary. That means potentially +bloating the primary, which violates the usual rule that read replicas +don't affect the operations on the primary, which is why it's off by +default. We leave it to users to decide if they want to turn it on, +same as PostgreSQL. + +Neon supports `hot_standby_feedback` by passing the feedback messages +from the replica to the safekeepers, and from safekeepers to the +primary. + +### Relationship of settings between primary and replica + +In order to enter hot standby mode, some configuration options need to +be set to the same or larger values in the standby, compared to the +primary. See [explanation in the PostgreSQL +docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN) + +In Neon, we have this problem too. To prevent customers from hitting +it, the control plane automatically adjusts the settings of a replica, +so that they match or exceed the primary's settings (see +https://github.com/neondatabase/cloud/issues/14903). However, you +can still hit the issue if the primary is restarted with larger +settings, while the replica is running. + + +### Interaction with Pageserver GC + +The read replica can lag behind the primary. If there are recovery +conflicts or the replica cannot keep up for some reason, the lag can +in principle grow indefinitely. The replica will issue all GetPage +requests to the pageservers at the current replay LSN, and needs to +see the old page versions. + +If the retention period in the pageserver is set to be small, it may +have already garbage collected away the old page versions. That will +cause read errors in the compute, and can mean that the replica cannot +make progress with the replication anymore. + +There is a mechanism for replica to pass information about its replay +LSN to the pageserver, so that the pageserver refrains from GC'ing +data that is still needed by the standby. It's called +'standby_horizon' in the pageserver code, see +https://github.com/neondatabase/neon/pull/7368. A separate "lease" +mechanism also is in the works, where the replica could hold a lease +on the old LSN, preventing the pageserver from advancing the GC +horizon past that point. The difference is that the standby_horizon +mechanism relies on a feedback message from replica to safekeeper, +while the least API is exposed directly from the pageserver. A static +read-only node is not connected to safekeepers, so it cannot use the +standby_horizon mechanism. + + +### Synchronous replication + +We haven't put any effort into synchronous replication yet. + +PostgreSQL provides multiple levels of synchronicity. In the weaker +levels, a transaction is not acknowledged as committed to the client +in the primary until the WAL has been streamed to a replica or flushed +to disk there. Those modes don't make senses in Neon, because the +safekeepers handle durability. + +`synchronous_commit=remote_apply` mode would make sense. In that mode, +the commit is not acknowledged to the client until it has been +replayed in the replica. That ensures that after commit, you can see +the commit in the replica too (aka. read-your-write consistency). diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index a5b452da83..a50707a1b8 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -313,20 +313,17 @@ pub struct MetadataHealthUpdateRequest { pub struct MetadataHealthUpdateResponse {} #[derive(Serialize, Deserialize, Debug)] - pub struct MetadataHealthListUnhealthyResponse { pub unhealthy_tenant_shards: Vec, } #[derive(Serialize, Deserialize, Debug)] - pub struct MetadataHealthListOutdatedRequest { #[serde(with = "humantime_serde")] pub not_scrubbed_for: Duration, } #[derive(Serialize, Deserialize, Debug)] - pub struct MetadataHealthListOutdatedResponse { pub health_records: Vec, } diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 3af3f74e9c..2fdd7de38f 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -22,6 +22,11 @@ pub struct Key { pub field6: u32, } +/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as +/// a struct of fields. +#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)] +pub struct CompactKey(i128); + /// The storage key size. pub const KEY_SIZE: usize = 18; @@ -130,6 +135,14 @@ impl Key { } } + pub fn to_compact(&self) -> CompactKey { + CompactKey(self.to_i128()) + } + + pub fn from_compact(k: CompactKey) -> Self { + Self::from_i128(k.0) + } + pub const fn next(&self) -> Key { self.add(1) } @@ -199,6 +212,13 @@ impl fmt::Display for Key { } } +impl fmt::Display for CompactKey { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let k = Key::from_compact(*self); + k.fmt(f) + } +} + impl Key { pub const MIN: Key = Key { field1: u8::MIN, diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index e88cab5d6a..0fec221276 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,4 +1,5 @@ -use utils::serde_system_time::SystemTime; +use std::time::SystemTime; +use utils::{serde_percent::Percent, serde_system_time}; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. @@ -9,19 +10,88 @@ use utils::serde_system_time::SystemTime; /// not handle full u64 values properly. #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)] pub struct PageserverUtilization { - /// Used disk space + /// Used disk space (physical, ground truth from statfs()) #[serde(serialize_with = "ser_saturating_u63")] pub disk_usage_bytes: u64, /// Free disk space #[serde(serialize_with = "ser_saturating_u63")] pub free_space_bytes: u64, - /// Lower is better score for how good candidate for a next tenant would this pageserver be. - #[serde(serialize_with = "ser_saturating_u63")] + + /// Wanted disk space, based on the tenant shards currently present on this pageserver: this + /// is like disk_usage_bytes, but it is stable and does not change with the cache state of + /// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay + /// there, or may be unrealistically low if the pageserver has attached tenants which haven't + /// downloaded layers yet. + #[serde(serialize_with = "ser_saturating_u63", default)] + pub disk_wanted_bytes: u64, + + // What proportion of total disk space will this pageserver use before it starts evicting data? + #[serde(default = "unity_percent")] + pub disk_usable_pct: Percent, + + // How many shards are currently on this node? + #[serde(default)] + pub shard_count: u32, + + // How many shards should this node be able to handle at most? + #[serde(default)] + pub max_shard_count: u32, + + /// Cached result of [`Self::score`] pub utilization_score: u64, + /// When was this snapshot captured, pageserver local time. /// /// Use millis to give confidence that the value is regenerated often enough. - pub captured_at: SystemTime, + pub captured_at: serde_system_time::SystemTime, +} + +fn unity_percent() -> Percent { + Percent::new(0).unwrap() +} + +impl PageserverUtilization { + const UTILIZATION_FULL: u64 = 1000000; + + /// Calculate a utilization score. The result is to be inrepreted as a fraction of + /// Self::UTILIZATION_FULL. + /// + /// Lower values are more affine to scheduling more work on this node. + /// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work. + /// - 0.0 represents an empty node. + /// - Negative values are forbidden + /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to + /// layer eviction. + pub fn score(&self) -> u64 { + let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes) + * self.disk_usable_pct.get() as u64) + / 100; + let disk_utilization_score = + self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity; + + let shard_utilization_score = + self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64; + std::cmp::max(disk_utilization_score, shard_utilization_score) + } + + pub fn refresh_score(&mut self) { + self.utilization_score = self.score(); + } + + /// A utilization structure that has a full utilization score: use this as a placeholder when + /// you need a utilization but don't have real values yet. + pub fn full() -> Self { + Self { + disk_usage_bytes: 1, + free_space_bytes: 0, + disk_wanted_bytes: 1, + disk_usable_pct: Percent::new(100).unwrap(), + shard_count: 1, + max_shard_count: 1, + utilization_score: Self::UTILIZATION_FULL, + captured_at: serde_system_time::SystemTime(SystemTime::now()), + } + } } /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. @@ -49,15 +119,19 @@ mod tests { let doc = PageserverUtilization { disk_usage_bytes: u64::MAX, free_space_bytes: 0, - utilization_score: u64::MAX, - captured_at: SystemTime( + disk_wanted_bytes: u64::MAX, + utilization_score: 13, + disk_usable_pct: Percent::new(90).unwrap(), + shard_count: 100, + max_shard_count: 200, + captured_at: serde_system_time::SystemTime( std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), ), }; let s = serde_json::to_string(&doc).unwrap(); - let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#; + let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}"; assert_eq!(s, expected); } diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 729f57f829..0940ad207f 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -143,8 +143,8 @@ pub use v14::xlog_utils::XLogFileName; pub use v14::bindings::DBState_DB_SHUTDOWNED; -pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result { - dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info))) +pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { + dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info)) } pub fn generate_wal_segment( diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs index 66422853e1..5fd0eaabc7 100644 --- a/libs/remote_storage/src/error.rs +++ b/libs/remote_storage/src/error.rs @@ -42,6 +42,10 @@ impl DownloadError { Timeout | Other(_) => false, } } + + pub fn is_cancelled(&self) -> bool { + matches!(self, DownloadError::Cancelled) + } } impl From for DownloadError { diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index 2fef8d35df..f65c080ad4 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -5,13 +5,40 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; /// Can be cloned, moved and kept around in futures as "guard objects". #[derive(Clone)] pub struct Completion { - _token: TaskTrackerToken, + token: TaskTrackerToken, +} + +impl std::fmt::Debug for Completion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Completion") + .field("siblings", &self.token.task_tracker().len()) + .finish() + } +} + +impl Completion { + /// Returns true if this completion is associated with the given barrier. + pub fn blocks(&self, barrier: &Barrier) -> bool { + TaskTracker::ptr_eq(self.token.task_tracker(), &barrier.0) + } + + pub fn barrier(&self) -> Barrier { + Barrier(self.token.task_tracker().clone()) + } } /// Barrier will wait until all clones of [`Completion`] have been dropped. #[derive(Clone)] pub struct Barrier(TaskTracker); +impl std::fmt::Debug for Barrier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Barrier") + .field("remaining", &self.0.len()) + .finish() + } +} + impl Default for Barrier { fn default() -> Self { let (_, rx) = channel(); @@ -51,5 +78,5 @@ pub fn channel() -> (Completion, Barrier) { tracker.close(); let token = tracker.token(); - (Completion { _token: token }, Barrier(tracker)) + (Completion { token }, Barrier(tracker)) } diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 9bab02e46c..0336302de0 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -95,7 +95,7 @@ async fn ingest( } } - layer.put_value(key, lsn, &data, &ctx).await?; + layer.put_value(key.to_compact(), lsn, &data, &ctx).await?; } layer.freeze(lsn + 1).await; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 932918410c..da0c11d9bf 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -124,8 +124,6 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); - info!(?conf.get_impl, "starting with get page implementation"); - info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); let tenants_path = conf.tenants_path(); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f4c367bd4d..0ebaf78840 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -29,12 +29,12 @@ use utils::{ logging::LogFormat, }; +use crate::l0_flush::L0FlushConfig; +use crate::tenant::config::TenantConfOpt; use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; -use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; -use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl}; use crate::{tenant::config::TenantConf, virtual_file}; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; @@ -50,7 +50,6 @@ pub mod defaults { DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; - use pageserver_api::models::ImageCompressionAlgorithm; pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; @@ -90,8 +89,7 @@ pub mod defaults { pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB - pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = - ImageCompressionAlgorithm::Disabled; + pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)"; pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; @@ -133,14 +131,8 @@ pub mod defaults { #virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' -#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' - -#get_impl = '{DEFAULT_GET_IMPL}' - #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' -#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' - [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -278,14 +270,8 @@ pub struct PageServerConf { pub virtual_file_io_engine: virtual_file::IoEngineKind, - pub get_vectored_impl: GetVectoredImpl, - - pub get_impl: GetImpl, - pub max_vectored_read_bytes: MaxVectoredReadBytes, - pub validate_vectored_get: bool, - pub image_compression: ImageCompressionAlgorithm, /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this @@ -396,14 +382,8 @@ struct PageServerConfigBuilder { virtual_file_io_engine: BuilderValue, - get_vectored_impl: BuilderValue, - - get_impl: BuilderValue, - max_vectored_read_bytes: BuilderValue, - validate_vectored_get: BuilderValue, - image_compression: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, @@ -493,13 +473,10 @@ impl PageServerConfigBuilder { virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), - get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), - get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()), max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), - image_compression: Set(DEFAULT_IMAGE_COMPRESSION), - validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), + image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: Set(L0FlushConfig::default()), compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), @@ -659,22 +636,10 @@ impl PageServerConfigBuilder { self.virtual_file_io_engine = BuilderValue::Set(value); } - pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) { - self.get_vectored_impl = BuilderValue::Set(value); - } - - pub fn get_impl(&mut self, value: GetImpl) { - self.get_impl = BuilderValue::Set(value); - } - pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { self.max_vectored_read_bytes = BuilderValue::Set(value); } - pub fn get_validate_vectored_get(&mut self, value: bool) { - self.validate_vectored_get = BuilderValue::Set(value); - } - pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) { self.image_compression = BuilderValue::Set(value); } @@ -745,10 +710,7 @@ impl PageServerConfigBuilder { heatmap_upload_concurrency, secondary_download_concurrency, ingest_batch_size, - get_vectored_impl, - get_impl, max_vectored_read_bytes, - validate_vectored_get, image_compression, ephemeral_bytes_per_memory_kb, l0_flush, @@ -1002,21 +964,12 @@ impl PageServerConf { "virtual_file_io_engine" => { builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) } - "get_vectored_impl" => { - builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) - } - "get_impl" => { - builder.get_impl(parse_toml_from_str("get_impl", item)?) - } "max_vectored_read_bytes" => { let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; builder.get_max_vectored_read_bytes( MaxVectoredReadBytes( NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0"))) } - "validate_vectored_get" => { - builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) - } "image_compression" => { builder.get_image_compression(parse_toml_from_str("image_compression", item)?) } @@ -1106,14 +1059,11 @@ impl PageServerConf { secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), - get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, - validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), @@ -1349,14 +1299,11 @@ background_task_maximum_delay = '334 s' secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), - get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), - validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), @@ -1425,14 +1372,11 @@ background_task_maximum_delay = '334 s' secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), - get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), - validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 92dcf6ee61..5e4a49bc56 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -64,7 +64,7 @@ use crate::{ mgr::TenantManager, remote_timeline_client::LayerFileMetadata, secondary::SecondaryTenant, - storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName}, + storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint}, }, CancellableTask, DiskUsageEvictionTask, }; @@ -114,7 +114,7 @@ fn default_highest_layer_count_loses_first() -> bool { } impl EvictionOrder { - fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) { + fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) { use EvictionOrder::*; match self { @@ -644,6 +644,7 @@ pub(crate) struct EvictionCandidate { pub(crate) layer: EvictionLayer, pub(crate) last_activity_ts: SystemTime, pub(crate) relative_last_activity: finite_f32::FiniteF32, + pub(crate) visibility: LayerVisibilityHint, } impl std::fmt::Display for EvictionLayer { @@ -685,14 +686,22 @@ impl std::fmt::Debug for EvictionCandidate { } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -enum MinResidentSizePartition { +enum EvictionPartition { + // A layer that is un-wanted by the tenant: evict all these first, before considering + // any other layers + EvictNow, + + // Above the minimum size threshold: this layer is a candidate for eviction. Above, + + // Below the minimum size threshold: this layer should only be evicted if all the + // tenants' layers above the minimum size threshold have already been considered. Below, } enum EvictionCandidates { Cancelled, - Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>), + Finished(Vec<(EvictionPartition, EvictionCandidate)>), } /// Gather the eviction candidates. @@ -890,8 +899,10 @@ async fn collect_eviction_candidates( max_layer_size }; - // Sort layers most-recently-used first, then partition by - // cumsum above/below min_resident_size. + // Sort layers most-recently-used first, then calculate [`EvictionPartition`] for each layer, + // where the inputs are: + // - whether the layer is visible + // - whether the layer is above/below the min_resident_size cutline tenant_candidates .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); let mut cumsum: i128 = 0; @@ -908,12 +919,23 @@ async fn collect_eviction_candidates( candidate.relative_last_activity = eviction_order.relative_last_activity(total, i); - let partition = if cumsum > min_resident_size as i128 { - MinResidentSizePartition::Above - } else { - MinResidentSizePartition::Below + let partition = match candidate.visibility { + LayerVisibilityHint::Covered => { + // Covered layers are evicted first + EvictionPartition::EvictNow + } + LayerVisibilityHint::Visible => { + cumsum += i128::from(candidate.layer.get_file_size()); + + if cumsum > min_resident_size as i128 { + EvictionPartition::Above + } else { + // The most recent layers below the min_resident_size threshold + // are the last to be evicted. + EvictionPartition::Below + } + } }; - cumsum += i128::from(candidate.layer.get_file_size()); (partition, candidate) }); @@ -981,7 +1003,7 @@ async fn collect_eviction_candidates( // Secondary locations' layers are always considered above the min resident size, // i.e. secondary locations are permitted to be trimmed to zero layers if all // the layers have sufficiently old access times. - MinResidentSizePartition::Above, + EvictionPartition::Above, candidate, ) }); @@ -1009,7 +1031,9 @@ async fn collect_eviction_candidates( } } - debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, + debug_assert!(EvictionPartition::Above < EvictionPartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); + debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above, "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); eviction_order.sort(&mut candidates); @@ -1022,7 +1046,7 @@ async fn collect_eviction_candidates( /// /// Returns the amount of candidates selected, with the planned usage. fn select_victims( - candidates: &[(MinResidentSizePartition, EvictionCandidate)], + candidates: &[(EvictionPartition, EvictionCandidate)], usage_pre: U, ) -> VictimSelection { let mut usage_when_switched = None; @@ -1034,7 +1058,7 @@ fn select_victims( break; } - if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() { + if partition == &EvictionPartition::Below && usage_when_switched.is_none() { usage_when_switched = Some((usage_planned, i)); } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a983d8c4c2..a4da8506d6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -178,10 +178,8 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { - PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), - PageReconstructError::MissingKey(e) => { - ApiError::InternalServerError(anyhow::anyhow!("{e}")) - } + PageReconstructError::Other(other) => ApiError::InternalServerError(other), + PageReconstructError::MissingKey(e) => ApiError::InternalServerError(e.into()), PageReconstructError::Cancelled => ApiError::Cancelled, PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), @@ -1787,9 +1785,11 @@ async fn timeline_checkpoint_handler( } if wait_until_uploaded { + tracing::info!("Waiting for uploads to complete..."); timeline.remote_client.wait_completion().await // XXX map to correct ApiError for the cases where it's due to shutdown .context("wait completion").map_err(ApiError::InternalServerError)?; + tracing::info!("Uploads completed up to {}", timeline.get_remote_consistent_lsn_projected().unwrap_or(Lsn(0))); } json_response(StatusCode::OK, ()) @@ -1887,7 +1887,7 @@ async fn timeline_detach_ancestor_handler( // drop(tenant); let resp = match progress { - detach_ancestor::Progress::Prepared(_guard, prepared) => { + detach_ancestor::Progress::Prepared(attempt, prepared) => { // it would be great to tag the guard on to the tenant activation future let reparented_timelines = state .tenant_manager @@ -1895,11 +1895,10 @@ async fn timeline_detach_ancestor_handler( tenant_shard_id, timeline_id, prepared, + attempt, ctx, ) - .await - .context("timeline detach ancestor completion") - .map_err(ApiError::InternalServerError)?; + .await?; AncestorDetached { reparented_timelines, @@ -2357,8 +2356,9 @@ async fn get_utilization( // regenerate at most 1Hz to allow polling at any rate. if !still_valid { let path = state.conf.tenants_path(); - let doc = crate::utilization::regenerate(path.as_std_path()) - .map_err(ApiError::InternalServerError)?; + let doc = + crate::utilization::regenerate(state.conf, path.as_std_path(), &state.tenant_manager) + .map_err(ApiError::InternalServerError)?; let mut buf = Vec::new(); serde_json::to_writer(&mut buf, &doc) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 85f3a6e0fb..4f7eb1a00c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -287,10 +287,7 @@ impl Timeline { // then check if the database was already initialized. // get_rel_exists can be called before dbdir is created. let buf = version.get(self, DBDIR_KEY, ctx).await?; - let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.dbdirs), - Err(e) => Err(PageReconstructError::from(e)), - }?; + let dbdirs = DbDirectory::des(&buf)?.dbdirs; if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) { return Ok(false); } @@ -298,13 +295,8 @@ impl Timeline { let key = rel_dir_to_key(tag.spcnode, tag.dbnode); let buf = version.get(self, key, ctx).await?; - match RelDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => { - let exists = dir.rels.contains(&(tag.relnode, tag.forknum)); - Ok(exists) - } - Err(e) => Err(PageReconstructError::from(e)), - } + let dir = RelDirectory::des(&buf)?; + Ok(dir.rels.contains(&(tag.relnode, tag.forknum))) } /// Get a list of all existing relations in given tablespace and database. @@ -323,20 +315,16 @@ impl Timeline { let key = rel_dir_to_key(spcnode, dbnode); let buf = version.get(self, key, ctx).await?; - match RelDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => { - let rels: HashSet = - HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { - spcnode, - dbnode, - relnode: *relnode, - forknum: *forknum, - })); + let dir = RelDirectory::des(&buf)?; + let rels: HashSet = + HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { + spcnode, + dbnode, + relnode: *relnode, + forknum: *forknum, + })); - Ok(rels) - } - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(rels) } /// Get the whole SLRU segment @@ -398,13 +386,8 @@ impl Timeline { let key = slru_dir_to_key(kind); let buf = version.get(self, key, ctx).await?; - match SlruSegmentDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => { - let exists = dir.segments.contains(&segno); - Ok(exists) - } - Err(e) => Err(PageReconstructError::from(e)), - } + let dir = SlruSegmentDirectory::des(&buf)?; + Ok(dir.segments.contains(&segno)) } /// Locate LSN, such that all transactions that committed before @@ -620,10 +603,7 @@ impl Timeline { let key = slru_dir_to_key(kind); let buf = version.get(self, key, ctx).await?; - match SlruSegmentDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.segments), - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(SlruSegmentDirectory::des(&buf)?.segments) } pub(crate) async fn get_relmap_file( @@ -647,10 +627,7 @@ impl Timeline { // fetch directory entry let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - match DbDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.dbdirs), - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(DbDirectory::des(&buf)?.dbdirs) } pub(crate) async fn get_twophase_file( @@ -672,10 +649,7 @@ impl Timeline { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - match TwoPhaseDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.xids), - Err(e) => Err(PageReconstructError::from(e)), - } + Ok(TwoPhaseDirectory::des(&buf)?.xids) } pub(crate) async fn get_control_file( @@ -700,10 +674,7 @@ impl Timeline { ctx: &RequestContext, ) -> Result, PageReconstructError> { match self.get(AUX_FILES_KEY, lsn, ctx).await { - Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") { - Ok(dir) => Ok(dir.files), - Err(e) => Err(PageReconstructError::from(e)), - }, + Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files), Err(e) => { // This is expected: historical databases do not have the key. debug!("Failed to get info about AUX files: {}", e); @@ -719,13 +690,14 @@ impl Timeline { ) -> Result, PageReconstructError> { let kv = self .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx) - .await - .context("scan")?; + .await?; let mut result = HashMap::new(); let mut sz = 0; for (_, v) in kv { - let v = v.context("get value")?; - let v = aux_file::decode_file_value_bytes(&v).context("value decode")?; + let v = v?; + let v = aux_file::decode_file_value_bytes(&v) + .context("value decode") + .map_err(PageReconstructError::Other)?; for (fname, content) in v { sz += fname.len(); sz += content.len(); @@ -793,11 +765,10 @@ impl Timeline { ) -> Result, PageReconstructError> { let kv = self .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx) - .await - .context("scan")?; + .await?; let mut result = HashMap::new(); for (k, v) in kv { - let v = v.context("get value")?; + let v = v?; let origin_id = k.field6 as RepOriginId; let origin_lsn = Lsn::des(&v).unwrap(); if origin_lsn != Lsn::INVALID { @@ -1733,12 +1704,17 @@ impl<'a> DatadirModification<'a> { // the original code assumes all other errors are missing keys. Therefore, we keep the code path // the same for now, though in theory, we should only match the `MissingKey` variant. Err( - PageReconstructError::Other(_) + e @ (PageReconstructError::Other(_) | PageReconstructError::WalRedo(_) - | PageReconstructError::MissingKey { .. }, + | PageReconstructError::MissingKey(_)), ) => { // Key is missing, we must insert an image as the basis for subsequent deltas. + if !matches!(e, PageReconstructError::MissingKey(_)) { + let e = utils::error::report_compact_sources(&e); + tracing::warn!("treating error as if it was a missing key: {}", e); + } + let mut dir = AuxFilesDirectory { files: HashMap::new(), }; @@ -1893,7 +1869,7 @@ impl<'a> DatadirModification<'a> { // work directly with Images, and we never need to read actual // data pages. We could handle this if we had to, by calling // the walredo manager, but let's keep it simple for now. - Err(PageReconstructError::from(anyhow::anyhow!( + Err(PageReconstructError::Other(anyhow::anyhow!( "unexpected pending WAL record" ))) }; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 90c0e28bc4..8ab8d08ce1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -41,6 +41,7 @@ use tokio::sync::watch; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; +use upload_queue::NotInitialized; use utils::backoff; use utils::circuit_breaker::CircuitBreaker; use utils::completion; @@ -301,7 +302,11 @@ pub struct Tenant { pub(crate) timeline_get_throttle: Arc>, - /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. + /// An ongoing timeline detach concurrency limiter. + /// + /// As a tenant will likely be restarted as part of timeline detach ancestor it makes no sense + /// to have two running at the same time. A different one can be started if an earlier one + /// has failed for whatever reason. ongoing_timeline_detach: std::sync::Mutex>, /// `index_part.json` based gc blocking reason tracking. @@ -601,6 +606,15 @@ impl From for GcError { } } +impl From for GcError { + fn from(value: NotInitialized) -> Self { + match value { + NotInitialized::Uninitialized => GcError::Remote(value.into()), + NotInitialized::Stopped | NotInitialized::ShuttingDown => GcError::TimelineCancelled, + } + } +} + impl From for GcError { fn from(_: timeline::layer_manager::Shutdown) -> Self { GcError::TimelineCancelled @@ -823,9 +837,9 @@ impl Tenant { // The Stopping case is for when we have passed control on to DeleteTenantFlow: // if it errors, we will call make_broken when tenant is already in Stopping. assert!( - matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), - "the attach task owns the tenant state until activation is complete" - ); + matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), + "the attach task owns the tenant state until activation is complete" + ); *state = TenantState::broken_from_reason(err.to_string()); }); @@ -3722,6 +3736,19 @@ impl Tenant { pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt { self.tenant_conf.load().tenant_conf.clone() } + + /// How much local storage would this tenant like to have? It can cope with + /// less than this (via eviction and on-demand downloads), but this function enables + /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O + /// by keeping important things on local disk. + pub(crate) fn local_storage_wanted(&self) -> u64 { + let mut wanted = 0; + let timelines = self.timelines.lock().unwrap(); + for timeline in timelines.values() { + wanted += timeline.metrics.visible_physical_size_gauge.get(); + } + wanted + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository @@ -4464,10 +4491,13 @@ mod tests { // This needs to traverse to the parent, and fails. let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err(); - assert!(err.to_string().starts_with(&format!( - "Bad state on timeline {}: Broken", - tline.timeline_id - ))); + assert!( + err.to_string().starts_with(&format!( + "bad state on timeline {}: Broken", + tline.timeline_id + )), + "{err}" + ); Ok(()) } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 8e9d349ca8..a245c99a88 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -24,6 +24,7 @@ use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; +use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::VirtualFile; use std::cmp::min; use std::io::{Error, ErrorKind}; @@ -186,11 +187,11 @@ impl BlobWriter { /// You need to make sure that the internal buffer is empty, otherwise /// data will be written in wrong order. #[inline(always)] - async fn write_all_unbuffered, Buf: IoBuf + Send>( + async fn write_all_unbuffered( &mut self, - src_buf: B, + src_buf: FullSlice, ctx: &RequestContext, - ) -> (B::Buf, Result<(), Error>) { + ) -> (FullSlice, Result<(), Error>) { let (src_buf, res) = self.inner.write_all(src_buf, ctx).await; let nbytes = match res { Ok(nbytes) => nbytes, @@ -204,8 +205,9 @@ impl BlobWriter { /// Flushes the internal buffer to the underlying `VirtualFile`. pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> { let buf = std::mem::take(&mut self.buf); - let (mut buf, res) = self.inner.write_all(buf, ctx).await; + let (slice, res) = self.inner.write_all(buf.slice_len(), ctx).await; res?; + let mut buf = slice.into_raw_slice().into_inner(); buf.clear(); self.buf = buf; Ok(()) @@ -222,19 +224,30 @@ impl BlobWriter { } /// Internal, possibly buffered, write function - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - src_buf: B, + src_buf: FullSlice, ctx: &RequestContext, - ) -> (B::Buf, Result<(), Error>) { + ) -> (FullSlice, Result<(), Error>) { + let src_buf = src_buf.into_raw_slice(); + let src_buf_bounds = src_buf.bounds(); + let restore = move |src_buf_slice: Slice<_>| { + FullSlice::must_new(Slice::from_buf_bounds( + src_buf_slice.into_inner(), + src_buf_bounds, + )) + }; + if !BUFFERED { assert!(self.buf.is_empty()); - return self.write_all_unbuffered(src_buf, ctx).await; + return self + .write_all_unbuffered(FullSlice::must_new(src_buf), ctx) + .await; } let remaining = Self::CAPACITY - self.buf.len(); let src_buf_len = src_buf.bytes_init(); if src_buf_len == 0 { - return (Slice::into_inner(src_buf.slice_full()), Ok(())); + return (restore(src_buf), Ok(())); } let mut src_buf = src_buf.slice(0..src_buf_len); // First try to copy as much as we can into the buffer @@ -245,7 +258,7 @@ impl BlobWriter { // Then, if the buffer is full, flush it out if self.buf.len() == Self::CAPACITY { if let Err(e) = self.flush_buffer(ctx).await { - return (Slice::into_inner(src_buf), Err(e)); + return (restore(src_buf), Err(e)); } } // Finally, write the tail of src_buf: @@ -258,27 +271,29 @@ impl BlobWriter { let copied = self.write_into_buffer(&src_buf); // We just verified above that src_buf fits into our internal buffer. assert_eq!(copied, src_buf.len()); - Slice::into_inner(src_buf) + restore(src_buf) } else { - let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await; + let (src_buf, res) = self + .write_all_unbuffered(FullSlice::must_new(src_buf), ctx) + .await; if let Err(e) = res { return (src_buf, Err(e)); } src_buf } } else { - Slice::into_inner(src_buf) + restore(src_buf) }; (src_buf, Ok(())) } /// Write a blob of data. Returns the offset that it was written to, /// which can be used to retrieve the data later. - pub async fn write_blob, Buf: IoBuf + Send>( + pub async fn write_blob( &mut self, - srcbuf: B, + srcbuf: FullSlice, ctx: &RequestContext, - ) -> (B::Buf, Result) { + ) -> (FullSlice, Result) { let (buf, res) = self .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) .await; @@ -287,43 +302,40 @@ impl BlobWriter { /// Write a blob of data. Returns the offset that it was written to, /// which can be used to retrieve the data later. - pub async fn write_blob_maybe_compressed, Buf: IoBuf + Send>( + pub(crate) async fn write_blob_maybe_compressed( &mut self, - srcbuf: B, + srcbuf: FullSlice, ctx: &RequestContext, algorithm: ImageCompressionAlgorithm, - ) -> (B::Buf, Result<(u64, CompressionInfo), Error>) { + ) -> (FullSlice, Result<(u64, CompressionInfo), Error>) { let offset = self.offset; let mut compression_info = CompressionInfo { written_compressed: false, compressed_size: None, }; - let len = srcbuf.bytes_init(); + let len = srcbuf.len(); let mut io_buf = self.io_buf.take().expect("we always put it back below"); io_buf.clear(); let mut compressed_buf = None; - let ((io_buf, hdr_res), srcbuf) = async { + let ((io_buf_slice, hdr_res), srcbuf) = async { if len < 128 { // Short blob. Write a 1-byte length header io_buf.put_u8(len as u8); - ( - self.write_all(io_buf, ctx).await, - srcbuf.slice_full().into_inner(), - ) + (self.write_all(io_buf.slice_len(), ctx).await, srcbuf) } else { // Write a 4-byte length header if len > MAX_SUPPORTED_LEN { return ( ( - io_buf, + io_buf.slice_len(), Err(Error::new( ErrorKind::Other, format!("blob too large ({len} bytes)"), )), ), - srcbuf.slice_full().into_inner(), + srcbuf, ); } let (high_bit_mask, len_written, srcbuf) = match algorithm { @@ -336,8 +348,7 @@ impl BlobWriter { } else { async_compression::tokio::write::ZstdEncoder::new(Vec::new()) }; - let slice = srcbuf.slice_full(); - encoder.write_all(&slice[..]).await.unwrap(); + encoder.write_all(&srcbuf[..]).await.unwrap(); encoder.shutdown().await.unwrap(); let compressed = encoder.into_inner(); compression_info.compressed_size = Some(compressed.len()); @@ -345,31 +356,29 @@ impl BlobWriter { compression_info.written_compressed = true; let compressed_len = compressed.len(); compressed_buf = Some(compressed); - (BYTE_ZSTD, compressed_len, slice.into_inner()) + (BYTE_ZSTD, compressed_len, srcbuf) } else { - (BYTE_UNCOMPRESSED, len, slice.into_inner()) + (BYTE_UNCOMPRESSED, len, srcbuf) } } - ImageCompressionAlgorithm::Disabled => { - (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner()) - } + ImageCompressionAlgorithm::Disabled => (BYTE_UNCOMPRESSED, len, srcbuf), }; let mut len_buf = (len_written as u32).to_be_bytes(); assert_eq!(len_buf[0] & 0xf0, 0); len_buf[0] |= high_bit_mask; io_buf.extend_from_slice(&len_buf[..]); - (self.write_all(io_buf, ctx).await, srcbuf) + (self.write_all(io_buf.slice_len(), ctx).await, srcbuf) } } .await; - self.io_buf = Some(io_buf); + self.io_buf = Some(io_buf_slice.into_raw_slice().into_inner()); match hdr_res { Ok(_) => (), - Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), + Err(e) => return (srcbuf, Err(e)), } let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf { - let (_buf, res) = self.write_all(compressed_buf, ctx).await; - (Slice::into_inner(srcbuf.slice(..)), res) + let (_buf, res) = self.write_all(compressed_buf.slice_len(), ctx).await; + (srcbuf, res) } else { self.write_all(srcbuf, ctx).await }; @@ -432,21 +441,21 @@ pub(crate) mod tests { let (_, res) = if compression { let res = wtr .write_blob_maybe_compressed( - blob.clone(), + blob.clone().slice_len(), ctx, ImageCompressionAlgorithm::Zstd { level: Some(1) }, ) .await; (res.0, res.1.map(|(off, _)| off)) } else { - wtr.write_blob(blob.clone(), ctx).await + wtr.write_blob(blob.clone().slice_len(), ctx).await }; let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ].slice_len(), ctx).await; let offs = res?; println!("Writing final blob at offs={offs}"); wtr.flush_buffer(ctx).await?; diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 0a12b64a7c..7355b3b5a3 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -4,6 +4,7 @@ use crate::context::RequestContext; use crate::page_cache::{self, PAGE_SZ}; use crate::tenant::block_io::BlockLease; +use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; use crate::virtual_file::VirtualFile; use once_cell::sync::Lazy; @@ -208,21 +209,11 @@ impl PreWarmingWriter { } impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { - async fn write_all< - B: tokio_epoll_uring::BoundedBuf, - Buf: tokio_epoll_uring::IoBuf + Send, - >( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { - let buf = buf.slice(..); - let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done - let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) { - Some(buf.to_vec()) - } else { - None - }; + ) -> std::io::Result<(usize, FullSlice)> { let buflen = buf.len(); assert_eq!( buflen % PAGE_SZ, @@ -231,10 +222,10 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi ); // Do the IO. - let iobuf = match self.file.write_all(buf, ctx).await { - (iobuf, Ok(nwritten)) => { + let buf = match self.file.write_all(buf, ctx).await { + (buf, Ok(nwritten)) => { assert_eq!(nwritten, buflen); - iobuf + buf } (_, Err(e)) => { return Err(std::io::Error::new( @@ -248,12 +239,6 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi } }; - // Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf) - let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds); - if let Some(check_bounds_stuff_works) = check_bounds_stuff_works { - assert_eq!(&check_bounds_stuff_works, &*buf); - } - let nblocks = buflen / PAGE_SZ; let nblocks32 = u32::try_from(nblocks).unwrap(); @@ -300,6 +285,6 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi } self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); - Ok((buflen, buf.into_inner())) + Ok((buflen, buf)) } } diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs index f90291bbf8..2dc0277638 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs @@ -5,6 +5,8 @@ use std::mem::MaybeUninit; +use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; + /// See module-level comment. pub struct Buffer { allocation: Box<[u8; N]>, @@ -60,10 +62,10 @@ impl crate::virtual_file::owned_buffers_io::write::Buffer for Bu self.written } - fn flush(self) -> tokio_epoll_uring::Slice { + fn flush(self) -> FullSlice { self.invariants(); let written = self.written; - tokio_epoll_uring::BoundedBuf::slice(self, 0..written) + FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written)) } fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index bbc070a81b..6073abc8c3 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -285,12 +285,15 @@ impl TimelineMetadata { } /// When reparenting, the `ancestor_lsn` does not change. + /// + /// Returns true if anything was changed. pub fn reparent(&mut self, timeline: &TimelineId) { assert!(self.body.ancestor_timeline.is_some()); // no assertion for redoing this: it's fine, we may have to repeat this multiple times over self.body.ancestor_timeline = Some(*timeline); } + /// Returns true if anything was changed pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { if let Some(ancestor) = self.body.ancestor_timeline { assert_eq!(ancestor, branchpoint.0); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 3316627540..4e6ea0c8f9 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -54,7 +54,7 @@ use utils::id::{TenantId, TimelineId}; use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; -use super::timeline::detach_ancestor::PreparedTimelineDetach; +use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; use super::{GlobalShutDown, TenantSharedResources}; /// For a tenant that appears in TenantsMap, it may either be @@ -1927,93 +1927,149 @@ impl TenantManager { tenant_shard_id: TenantShardId, timeline_id: TimelineId, prepared: PreparedTimelineDetach, + mut attempt: detach_ancestor::Attempt, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - // FIXME: this is unnecessary, slotguard already has these semantics - struct RevertOnDropSlot(Option); + ) -> Result, detach_ancestor::Error> { + use detach_ancestor::Error; - impl Drop for RevertOnDropSlot { - fn drop(&mut self) { - if let Some(taken) = self.0.take() { - taken.revert(); - } - } - } + let slot_guard = + tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist).map_err( + |e| { + use TenantSlotError::*; - impl RevertOnDropSlot { - fn into_inner(mut self) -> SlotGuard { - self.0.take().unwrap() - } - } - - impl std::ops::Deref for RevertOnDropSlot { - type Target = SlotGuard; - - fn deref(&self) -> &Self::Target { - self.0.as_ref().unwrap() - } - } - - let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; - let slot_guard = RevertOnDropSlot(Some(slot_guard)); + match e { + MapState(TenantMapError::ShuttingDown) => Error::ShuttingDown, + NotFound(_) | InProgress | MapState(_) => Error::DetachReparent(e.into()), + } + }, + )?; let tenant = { - let Some(old_slot) = slot_guard.get_old_value() else { - anyhow::bail!( - "Tenant not found when trying to complete detaching timeline ancestor" - ); - }; + let old_slot = slot_guard + .get_old_value() + .as_ref() + .expect("requested MustExist"); let Some(tenant) = old_slot.get_attached() else { - anyhow::bail!("Tenant is not in attached state"); + return Err(Error::DetachReparent(anyhow::anyhow!( + "Tenant is not in attached state" + ))); }; if !tenant.is_active() { - anyhow::bail!("Tenant is not active"); + return Err(Error::DetachReparent(anyhow::anyhow!( + "Tenant is not active" + ))); } tenant.clone() }; - let timeline = tenant.get_timeline(timeline_id, true)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(Error::NotFound)?; - let reparented = timeline - .complete_detaching_timeline_ancestor(&tenant, prepared, ctx) + let resp = timeline + .detach_from_ancestor_and_reparent(&tenant, prepared, ctx) .await?; - let mut slot_guard = slot_guard.into_inner(); + let mut slot_guard = slot_guard; - let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, ShutdownMode::Hard).await { - Ok(()) => { - slot_guard.drop_old_value()?; + let tenant = if resp.reset_tenant_required() { + attempt.before_reset_tenant(); + + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => { + slot_guard.drop_old_value().expect("it was just shutdown"); + } + Err(_barrier) => { + slot_guard.revert(); + // this really should not happen, at all, unless a shutdown without acquiring + // tenant slot was already going? regardless, on restart the attempt tracking + // will reset to retryable. + return Err(Error::ShuttingDown); + } } - Err(_barrier) => { - slot_guard.revert(); - // this really should not happen, at all, unless shutdown was already going? - anyhow::bail!("Cannot restart Tenant, already shutting down"); + + let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id) + .map_err(|e| Error::DetachReparent(e.into()))?; + + let shard_identity = config.shard; + let tenant = tenant_spawn( + self.conf, + tenant_shard_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(config).map_err(Error::DetachReparent)?, + shard_identity, + None, + SpawnMode::Eager, + ctx, + ) + .map_err(|_| Error::ShuttingDown)?; + + { + let mut g = tenant.ongoing_timeline_detach.lock().unwrap(); + assert!( + g.is_none(), + "there cannot be any new timeline detach ancestor on newly created tenant" + ); + *g = Some((attempt.timeline_id, attempt.new_barrier())); } + + // if we bail out here, we will not allow a new attempt, which should be fine. + // pageserver should be shutting down regardless? tenant_reset would help, unless it + // runs into the same problem. + slot_guard + .upsert(TenantSlot::Attached(tenant.clone())) + .map_err(|e| match e { + TenantSlotUpsertError::ShuttingDown(_) => Error::ShuttingDown, + other => Error::DetachReparent(other.into()), + })?; + tenant + } else { + tracing::info!("skipping tenant_reset as no changes made required it"); + tenant + }; + + if let Some(reparented) = resp.completed() { + // finally ask the restarted tenant to complete the detach + // + // rationale for 9999s: we don't really have a timetable here; if retried, the caller + // will get an 503. + tenant + .wait_to_become_active(std::time::Duration::from_secs(9999)) + .await + .map_err(|e| { + use pageserver_api::models::TenantState; + use GetActiveTenantError::{Cancelled, WillNotBecomeActive}; + match e { + Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => { + Error::ShuttingDown + } + other => Error::Complete(other.into()), + } + })?; + + utils::pausable_failpoint!( + "timeline-detach-ancestor::after_activating_before_finding-pausable" + ); + + let timeline = tenant + .get_timeline(attempt.timeline_id, true) + .map_err(Error::NotFound)?; + + timeline + .complete_detaching_timeline_ancestor(&tenant, attempt, ctx) + .await + .map(|()| reparented) + } else { + // at least the latest versions have now been downloaded and refreshed; be ready to + // retry another time. + Err(Error::FailedToReparentAll) } - - let tenant_path = self.conf.tenant_path(&tenant_shard_id); - let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; - - let shard_identity = config.shard; - let tenant = tenant_spawn( - self.conf, - tenant_shard_id, - &tenant_path, - self.resources.clone(), - AttachedTenantConf::try_from(config)?, - shard_identity, - None, - SpawnMode::Eager, - ctx, - )?; - - slot_guard.upsert(TenantSlot::Attached(tenant))?; - - Ok(reparented) } /// A page service client sends a TenantId, and to look up the correct Tenant we must @@ -2085,6 +2141,57 @@ impl TenantManager { } } } + + /// Calculate the tenant shards' contributions to this pageserver's utilization metrics. The + /// returned values are: + /// - the number of bytes of local disk space this pageserver's shards are requesting, i.e. + /// how much space they would use if not impacted by disk usage eviction. + /// - the number of tenant shards currently on this pageserver, including attached + /// and secondary. + /// + /// This function is quite expensive: callers are expected to cache the result and + /// limit how often they call it. + pub(crate) fn calculate_utilization(&self) -> Result<(u64, u32), TenantMapListError> { + let tenants = self.tenants.read().unwrap(); + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + let shard_count = m.len(); + let mut wanted_bytes = 0; + + for tenant_slot in m.values() { + match tenant_slot { + TenantSlot::InProgress(_barrier) => { + // While a slot is being changed, we can't know how much storage it wants. This + // means this function's output can fluctuate if a lot of changes are going on + // (such as transitions from secondary to attached). + // + // We could wait for the barrier and retry, but it's important that the utilization + // API is responsive, and the data quality impact is not very significant. + continue; + } + TenantSlot::Attached(tenant) => { + wanted_bytes += tenant.local_storage_wanted(); + } + TenantSlot::Secondary(secondary) => { + let progress = secondary.progress.lock().unwrap(); + wanted_bytes += if progress.heatmap_mtime.is_some() { + // If we have heatmap info, then we will 'want' the sum + // of the size of layers in the heatmap: this is how much space + // we would use if not doing any eviction. + progress.bytes_total + } else { + // In the absence of heatmap info, assume that the secondary location simply + // needs as much space as it is currently using. + secondary.resident_size_metric.get() + } + } + } + } + + Ok((wanted_bytes, shard_count as u32)) + } } #[derive(Debug, thiserror::Error)] @@ -2284,6 +2391,9 @@ impl SlotGuard { /// Get any value that was present in the slot before we acquired ownership /// of it: in state transitions, this will be the old state. + /// + // FIXME: get_ prefix + // FIXME: this should be .as_ref() -- unsure why no clippy fn get_old_value(&self) -> &Option { &self.old_value } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1344fe4192..b4d7ad1e97 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -736,12 +736,13 @@ impl RemoteTimelineClient { Ok(()) } + /// Reparent this timeline to a new parent. + /// + /// A retryable step of timeline ancestor detach. pub(crate) async fn schedule_reparenting_and_wait( self: &Arc, new_parent: &TimelineId, ) -> anyhow::Result<()> { - // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing - // and reads the in-memory part we cannot do the detaching like this let receiver = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -752,17 +753,25 @@ impl RemoteTimelineClient { )); }; - upload_queue.dirty.metadata.reparent(new_parent); - upload_queue.dirty.lineage.record_previous_ancestor(&prev); + let uploaded = &upload_queue.clean.0.metadata; - self.schedule_index_upload(upload_queue)?; + if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() { + // nothing to do + None + } else { + upload_queue.dirty.metadata.reparent(new_parent); + upload_queue.dirty.lineage.record_previous_ancestor(&prev); - self.schedule_barrier0(upload_queue) + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) + } }; - Self::wait_completion0(receiver) - .await - .context("wait completion") + if let Some(receiver) = receiver { + Self::wait_completion0(receiver).await?; + } + Ok(()) } /// Schedules uploading a new version of `index_part.json` with the given layers added, @@ -778,26 +787,30 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue.dirty.metadata.detach_from_ancestor(&adopted); - upload_queue.dirty.lineage.record_detaching(&adopted); + if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) { + None + } else { + upload_queue.dirty.metadata.detach_from_ancestor(&adopted); + upload_queue.dirty.lineage.record_detaching(&adopted); - for layer in layers { - upload_queue - .dirty - .layer_metadata - .insert(layer.layer_desc().layer_name(), layer.metadata()); + for layer in layers { + let prev = upload_queue + .dirty + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + assert!(prev.is_none(), "copied layer existed already {layer}"); + } + + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) } - - self.schedule_index_upload(upload_queue)?; - - let barrier = self.schedule_barrier0(upload_queue); - self.launch_queued_tasks(upload_queue); - barrier }; - Self::wait_completion0(barrier) - .await - .context("wait completion") + if let Some(barrier) = barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) } /// Adds a gc blocking reason for this timeline if one does not exist already. @@ -873,12 +886,7 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if let index::GcBlockingReason::DetachAncestor = reason { - if !upload_queue - .clean - .0 - .lineage - .is_detached_from_original_ancestor() - { + if !upload_queue.clean.0.lineage.is_detached_from_ancestor() { drop(guard); panic!("cannot complete timeline_ancestor_detach while not detached"); } @@ -985,7 +993,10 @@ impl RemoteTimelineClient { /// /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`] /// is invoked on them. - pub(crate) fn schedule_gc_update(self: &Arc, gc_layers: &[Layer]) -> anyhow::Result<()> { + pub(crate) fn schedule_gc_update( + self: &Arc, + gc_layers: &[Layer], + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index a17b32c983..d9725ad756 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -23,6 +23,8 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; +#[cfg_attr(target_os = "macos", allow(unused_imports))] +use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; @@ -219,9 +221,7 @@ async fn download_object<'a>( Ok(chunk) => chunk, Err(e) => return Err(e), }; - buffered - .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx) - .await?; + buffered.write_buffered(chunk.slice_len(), ctx).await?; } let size_tracking = buffered.flush_and_into_inner(ctx).await?; Ok(size_tracking.into_inner()) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 90453b1922..757fb9d032 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -216,26 +216,47 @@ fn is_false(b: &bool) -> bool { impl Lineage { const REMEMBER_AT_MOST: usize = 100; - pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) { + pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool { if self.reparenting_history.last() == Some(old_ancestor) { // do not re-record it - return; - } + false + } else { + #[cfg(feature = "testing")] + { + let existing = self + .reparenting_history + .iter() + .position(|x| x == old_ancestor); + assert_eq!( + existing, None, + "we cannot reparent onto and off and onto the same timeline twice" + ); + } + let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; - let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; - - self.reparenting_history_truncated |= drop_oldest; - if drop_oldest { - self.reparenting_history.remove(0); + self.reparenting_history_truncated |= drop_oldest; + if drop_oldest { + self.reparenting_history.remove(0); + } + self.reparenting_history.push(*old_ancestor); + true } - self.reparenting_history.push(*old_ancestor); } - pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) { - assert!(self.original_ancestor.is_none()); - - self.original_ancestor = - Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + /// Returns true if anything changed. + pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool { + if let Some((id, lsn, _)) = self.original_ancestor { + assert_eq!( + &(id, lsn), + branchpoint, + "detaching attempt has to be for the same ancestor we are already detached from" + ); + false + } else { + self.original_ancestor = + Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + true + } } /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed @@ -247,10 +268,16 @@ impl Lineage { .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn) } - pub(crate) fn is_detached_from_original_ancestor(&self) -> bool { + /// Returns true if the timeline originally had an ancestor, and no longer has one. + pub(crate) fn is_detached_from_ancestor(&self) -> bool { self.original_ancestor.is_some() } + /// Returns original ancestor timeline id and lsn that this timeline has been detached from. + pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> { + self.original_ancestor.map(|(id, lsn, _)| (id, lsn)) + } + pub(crate) fn is_reparented(&self) -> bool { !self.reparenting_history.is_empty() } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 135e73b57f..8cff1d2864 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -22,7 +22,7 @@ use crate::{ FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, - storage_layer::{layer::local_layer_path, LayerName}, + storage_layer::{layer::local_layer_path, LayerName, LayerVisibilityHint}, tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, @@ -296,6 +296,9 @@ impl SecondaryDetail { }), last_activity_ts: ods.access_time, relative_last_activity: finite_f32::FiniteF32::ZERO, + // Secondary location layers are presumed visible, because Covered layers + // are excluded from the heatmap + visibility: LayerVisibilityHint::Visible, } })); diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 166483ba5d..4a8e66d38a 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -29,16 +29,16 @@ pub(super) struct HeatMapTenant { #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapTimeline { #[serde_as(as = "DisplayFromStr")] - pub(super) timeline_id: TimelineId, + pub(crate) timeline_id: TimelineId, - pub(super) layers: Vec, + pub(crate) layers: Vec, } #[serde_as] #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapLayer { - pub(super) name: LayerName, - pub(super) metadata: LayerFileMetadata, + pub(crate) name: LayerName, + pub(crate) metadata: LayerFileMetadata, #[serde_as(as = "TimestampSeconds")] pub(super) access_time: SystemTime, diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index f4e965b99a..6c2391d72d 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -42,6 +42,7 @@ use crate::tenant::vectored_blob_io::{ VectoredReadPlanner, }; use crate::tenant::PageReconstructError; +use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; @@ -63,6 +64,7 @@ use std::os::unix::fs::FileExt; use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; +use tokio_epoll_uring::IoBufMut; use tracing::*; use utils::{ @@ -436,19 +438,28 @@ impl DeltaLayerWriterInner { ctx: &RequestContext, ) -> anyhow::Result<()> { let (_, res) = self - .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx) + .put_value_bytes( + key, + lsn, + Value::ser(&val)?.slice_len(), + val.will_init(), + ctx, + ) .await; res } - async fn put_value_bytes( + async fn put_value_bytes( &mut self, key: Key, lsn: Lsn, - val: Vec, + val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (Vec, anyhow::Result<()>) { + ) -> (FullSlice, anyhow::Result<()>) + where + Buf: IoBufMut + Send, + { assert!( self.lsn_range.start <= lsn, "lsn_start={}, lsn={}", @@ -514,7 +525,7 @@ impl DeltaLayerWriterInner { file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) .await?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; } assert!(self.lsn_range.start < self.lsn_range.end); @@ -534,7 +545,7 @@ impl DeltaLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; let metadata = file @@ -646,14 +657,17 @@ impl DeltaLayerWriter { .await } - pub async fn put_value_bytes( + pub async fn put_value_bytes( &mut self, key: Key, lsn: Lsn, - val: Vec, + val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (Vec, anyhow::Result<()>) { + ) -> (FullSlice, anyhow::Result<()>) + where + Buf: IoBufMut + Send, + { self.inner .as_mut() .unwrap() @@ -743,7 +757,7 @@ impl DeltaLayer { // TODO: could use smallvec here, but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; Ok(()) } @@ -1020,7 +1034,7 @@ impl DeltaLayerInner { for (_, blob_meta) in read.blobs_at.as_slice() { reconstruct_state.on_key_error( blob_meta.key, - PageReconstructError::from(anyhow!( + PageReconstructError::Other(anyhow!( "Failed to read blobs from virtual file {}: {}", self.file.path, kind @@ -1047,7 +1061,7 @@ impl DeltaLayerInner { Err(e) => { reconstruct_state.on_key_error( meta.meta.key, - PageReconstructError::from(anyhow!(e).context(format!( + PageReconstructError::Other(anyhow!(e).context(format!( "Failed to deserialize blob from virtual file {}", self.file.path, ))), @@ -1291,12 +1305,12 @@ impl DeltaLayerInner { .put_value_bytes( key, lsn, - std::mem::take(&mut per_blob_copy), + std::mem::take(&mut per_blob_copy).slice_len(), will_init, ctx, ) .await; - per_blob_copy = tmp; + per_blob_copy = tmp.into_raw_slice().into_inner(); res?; @@ -1871,7 +1885,7 @@ pub(crate) mod test { for entry in entries { let (_, res) = writer - .put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx) + .put_value_bytes(entry.key, entry.lsn, entry.value.slice_len(), false, &ctx) .await; res?; } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 16ba0fda94..9a19e4e2c7 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -38,6 +38,7 @@ use crate::tenant::vectored_blob_io::{ VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; +use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; @@ -354,7 +355,7 @@ impl ImageLayer { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; Ok(()) } @@ -369,9 +370,6 @@ impl ImageLayerInner { self.lsn } - /// Returns nested result following Result, Critical>: - /// - inner has the success or transient failure - /// - outer has the permanent failure pub(super) async fn load( path: &Utf8Path, lsn: Lsn, @@ -789,7 +787,7 @@ impl ImageLayerWriterInner { self.num_keys += 1; let (_img, res) = self .blob_writer - .write_blob_maybe_compressed(img, ctx, compression) + .write_blob_maybe_compressed(img.slice_len(), ctx, compression) .await; // TODO: re-use the buffer for `img` further upstack let (off, compression_info) = res?; @@ -841,7 +839,7 @@ impl ImageLayerWriterInner { .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; } @@ -861,7 +859,7 @@ impl ImageLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf, ctx).await; + let (_buf, res) = file.write_all(buf.slice_len(), ctx).await; res?; let metadata = file diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 57d93feaaf..748d79c149 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -12,9 +12,11 @@ use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::timeline::GetVectoredError; use crate::tenant::PageReconstructError; +use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache, walrecord}; use anyhow::{anyhow, Result}; use camino::Utf8PathBuf; +use pageserver_api::key::CompactKey; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; @@ -78,7 +80,7 @@ pub struct InMemoryLayerInner { /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. - index: BTreeMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. @@ -312,8 +314,12 @@ impl InMemoryLayer { let reader = inner.file.block_cursor(); for range in keyspace.ranges.iter() { - for (key, vec_map) in inner.index.range(range.start..range.end) { - let lsn_range = match reconstruct_state.get_cached_lsn(key) { + for (key, vec_map) in inner + .index + .range(range.start.to_compact()..range.end.to_compact()) + { + let key = Key::from_compact(*key); + let lsn_range = match reconstruct_state.get_cached_lsn(&key) { Some(cached_lsn) => (cached_lsn + 1)..end_lsn, None => self.start_lsn..end_lsn, }; @@ -324,20 +330,18 @@ impl InMemoryLayer { // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 let buf = reader.read_blob(*pos, &ctx).await; if let Err(e) = buf { - reconstruct_state - .on_key_error(*key, PageReconstructError::from(anyhow!(e))); + reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); break; } let value = Value::des(&buf.unwrap()); if let Err(e) = value { - reconstruct_state - .on_key_error(*key, PageReconstructError::from(anyhow!(e))); + reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); break; } let key_situation = - reconstruct_state.update_key(key, *entry_lsn, value.unwrap()); + reconstruct_state.update_key(&key, *entry_lsn, value.unwrap()); if key_situation == ValueReconstructSituation::Complete { break; } @@ -417,7 +421,7 @@ impl InMemoryLayer { /// Adds the page version to the in-memory tree pub async fn put_value( &self, - key: Key, + key: CompactKey, lsn: Lsn, buf: &[u8], ctx: &RequestContext, @@ -430,7 +434,7 @@ impl InMemoryLayer { async fn put_value_locked( &self, locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, - key: Key, + key: CompactKey, lsn: Lsn, buf: &[u8], ctx: &RequestContext, @@ -539,6 +543,8 @@ impl InMemoryLayer { let end_lsn = *self.end_lsn.get().unwrap(); let key_count = if let Some(key_range) = key_range { + let key_range = key_range.start.to_compact()..key_range.end.to_compact(); + inner .index .iter() @@ -576,11 +582,17 @@ impl InMemoryLayer { for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; let will_init = Value::des(&buf)?.will_init(); - let res; - (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init, &ctx) + let (tmp, res) = delta_layer_writer + .put_value_bytes( + Key::from_compact(*key), + *lsn, + buf.slice_len(), + will_init, + &ctx, + ) .await; res?; + buf = tmp.into_raw_slice().into_inner(); } } } @@ -615,11 +627,17 @@ impl InMemoryLayer { // => https://github.com/neondatabase/neon/issues/8183 cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?; let will_init = Value::des(&buf)?.will_init(); - let res; - (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init, ctx) + let (tmp, res) = delta_layer_writer + .put_value_bytes( + Key::from_compact(*key), + *lsn, + buf.slice_len(), + will_init, + ctx, + ) .await; res?; + buf = tmp.into_raw_slice().into_inner(); } } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 83450d24bb..774f97e1d9 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -312,7 +312,9 @@ impl Layer { .get_or_maybe_download(true, Some(ctx)) .await .map_err(|err| match err { - DownloadError::DownloadCancelled => GetVectoredError::Cancelled, + DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => { + GetVectoredError::Cancelled + } other => GetVectoredError::Other(anyhow::anyhow!(other)), })?; @@ -1612,6 +1614,12 @@ pub(crate) enum DownloadError { Failpoint(failpoints::FailpointKind), } +impl DownloadError { + pub(crate) fn is_cancelled(&self) -> bool { + matches!(self, DownloadError::DownloadCancelled) + } +} + #[derive(Debug, PartialEq)] pub(crate) enum NeedsDownload { NotFound, @@ -1848,8 +1856,8 @@ impl ResidentLayer { /// Read all they keys in this layer which match the ShardIdentity, and write them all to /// the provided writer. Return the number of keys written. #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))] - pub(crate) async fn filter<'a>( - &'a self, + pub(crate) async fn filter( + &self, shard_identity: &ShardIdentity, writer: &mut ImageLayerWriter, ctx: &RequestContext, diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index d7bfe48c60..e12e29cd45 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -208,6 +208,8 @@ impl SplitDeltaLayerWriter { #[cfg(test)] mod tests { + use rand::{RngCore, SeedableRng}; + use crate::{ tenant::{ harness::{TenantHarness, TIMELINE_ID}, @@ -229,7 +231,10 @@ mod tests { } fn get_large_img() -> Bytes { - vec![0; 8192].into() + let mut rng = rand::rngs::SmallRng::seed_from_u64(42); + let mut data = vec![0; 8192]; + rng.fill_bytes(&mut data); + data.into() } #[tokio::test] diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b4706ea59d..dbcd704b4e 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -211,6 +211,11 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } else { // Run compaction match tenant.compaction_iteration(&cancel, &ctx).await { + Ok(has_pending_task) => { + error_run_count = 0; + // schedule the next compaction immediately in case there is a pending compaction task + if has_pending_task { Duration::ZERO } else { period } + } Err(e) => { let wait_duration = backoff::exponential_backoff_duration_seconds( error_run_count + 1, @@ -227,11 +232,6 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { ); wait_duration } - Ok(has_pending_task) => { - error_run_count = 0; - // schedule the next compaction immediately in case there is a pending compaction task - if has_pending_task { Duration::from_secs(0) } else { period } - } } }; @@ -265,7 +265,8 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { count_throttled, sum_throttled_usecs, allowed_rps=%format_args!("{allowed_rps:.0}"), - "shard was throttled in the last n_seconds") + "shard was throttled in the last n_seconds" + ); }); // Sleep @@ -365,14 +366,13 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { if first { first = false; - if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel) - .await - .is_err() - { - break; - } + let delays = async { + delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?; + random_init_delay(period, &cancel).await?; + Ok::<_, Cancelled>(()) + }; - if random_init_delay(period, &cancel).await.is_err() { + if delays.await.is_err() { break; } } @@ -424,7 +424,6 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); - // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await .is_ok() diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f810df5a56..26dc87c373 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -511,7 +511,7 @@ pub(crate) struct TimelineVisitOutcome { #[derive(thiserror::Error, Debug)] pub(crate) enum PageReconstructError { #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), #[error("Ancestor LSN wait error: {0}")] AncestorLsnTimeout(WaitLsnError), @@ -527,6 +527,22 @@ pub(crate) enum PageReconstructError { MissingKey(MissingKeyError), } +impl From for PageReconstructError { + fn from(value: anyhow::Error) -> Self { + // with walingest.rs many PageReconstructError are wrapped in as anyhow::Error + match value.downcast::() { + Ok(pre) => pre, + Err(other) => PageReconstructError::Other(other), + } + } +} + +impl From for PageReconstructError { + fn from(value: utils::bin_ser::DeserializeError) -> Self { + PageReconstructError::Other(anyhow::Error::new(value).context("deserialization failure")) + } +} + impl From for PageReconstructError { fn from(_: layer_manager::Shutdown) -> Self { PageReconstructError::Cancelled @@ -546,6 +562,7 @@ impl From for GetVectoredError { } } +#[derive(thiserror::Error)] pub struct MissingKeyError { key: Key, shard: ShardNumber, @@ -585,11 +602,8 @@ impl PageReconstructError { pub(crate) fn is_stopping(&self) -> bool { use PageReconstructError::*; match self { - Other(_) => false, - AncestorLsnTimeout(_) => false, Cancelled => true, - WalRedo(_) => false, - MissingKey { .. } => false, + Other(_) | AncestorLsnTimeout(_) | WalRedo(_) | MissingKey(_) => false, } } } @@ -599,11 +613,11 @@ pub(crate) enum CreateImageLayersError { #[error("timeline shutting down")] Cancelled, - #[error(transparent)] - GetVectoredError(GetVectoredError), + #[error("read failed")] + GetVectoredError(#[source] GetVectoredError), - #[error(transparent)] - PageReconstructError(PageReconstructError), + #[error("reconstruction failed")] + PageReconstructError(#[source] PageReconstructError), #[error(transparent)] Other(#[from] anyhow::Error), @@ -627,10 +641,10 @@ pub(crate) enum FlushLayerError { // Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush // loop via a watch channel, where we can only borrow it. - #[error(transparent)] + #[error("create image layers (shared)")] CreateImageLayersError(Arc), - #[error(transparent)] + #[error("other (shared)")] Other(#[from] Arc), } @@ -663,34 +677,46 @@ pub(crate) enum GetVectoredError { #[error("timeline shutting down")] Cancelled, - #[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)] + #[error("requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)] Oversized(u64), - #[error("Requested at invalid LSN: {0}")] + #[error("requested at invalid LSN: {0}")] InvalidLsn(Lsn), - #[error("Requested key not found: {0}")] + #[error("requested key not found: {0}")] MissingKey(MissingKeyError), - #[error(transparent)] - GetReadyAncestorError(GetReadyAncestorError), + #[error("ancestry walk")] + GetReadyAncestorError(#[source] GetReadyAncestorError), #[error(transparent)] Other(#[from] anyhow::Error), } +impl From for GetVectoredError { + fn from(value: GetReadyAncestorError) -> Self { + use GetReadyAncestorError::*; + match value { + Cancelled => GetVectoredError::Cancelled, + AncestorLsnTimeout(_) | BadState { .. } => { + GetVectoredError::GetReadyAncestorError(value) + } + } + } +} + #[derive(thiserror::Error, Debug)] pub(crate) enum GetReadyAncestorError { - #[error("Ancestor LSN wait error: {0}")] + #[error("ancestor LSN wait error")] AncestorLsnTimeout(#[from] WaitLsnError), - #[error("Bad state on timeline {timeline_id}: {state:?}")] + #[error("bad state on timeline {timeline_id}: {state:?}")] BadState { timeline_id: TimelineId, state: TimelineState, }, - #[error("Cancelled")] + #[error("cancelled")] Cancelled, } @@ -802,40 +828,6 @@ impl From for PageReconstructError { } } -#[derive( - Eq, - PartialEq, - Debug, - Copy, - Clone, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -pub enum GetVectoredImpl { - Sequential, - Vectored, -} - -#[derive( - Eq, - PartialEq, - Debug, - Copy, - Clone, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -pub enum GetImpl { - Legacy, - Vectored, -} - pub(crate) enum WaitLsnWaiter<'a> { Timeline(&'a Timeline), Tenant, @@ -995,11 +987,10 @@ impl Timeline { } trace!( - "get vectored request for {:?}@{} from task kind {:?} will use {} implementation", + "get vectored request for {:?}@{} from task kind {:?}", keyspace, lsn, ctx.task_kind(), - self.conf.get_vectored_impl ); let start = crate::metrics::GET_VECTORED_LATENCY @@ -1654,6 +1645,20 @@ impl Timeline { self.last_record_lsn.shutdown(); if try_freeze_and_flush { + if let Some((open, frozen)) = self + .layers + .read() + .await + .layer_map() + .map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len())) + .ok() + .filter(|(open, frozen)| *open || *frozen > 0) + { + tracing::info!(?open, frozen, "flushing and freezing on shutdown"); + } else { + // this is double-shutdown, ignore it + } + // we shut down walreceiver above, so, we won't add anything more // to the InMemoryLayer; freeze it and wait for all frozen layers // to reach the disk & upload queue, then shut the upload queue and @@ -2972,11 +2977,7 @@ impl Timeline { LayerVisibilityHint::Visible => { // Layer is visible to one or more read LSNs: elegible for inclusion in layer map let last_activity_ts = layer.latest_activity(); - Some(HeatMapLayer::new( - layer.layer_desc().layer_name(), - layer.metadata(), - last_activity_ts, - )) + Some((layer.layer_desc(), layer.metadata(), last_activity_ts)) } LayerVisibilityHint::Covered => { // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap. @@ -2985,7 +2986,23 @@ impl Timeline { } }); - let layers = resident.collect(); + let mut layers = resident.collect::>(); + + // Sort layers in order of which to download first. For a large set of layers to download, we + // want to prioritize those layers which are most likely to still be in the resident many minutes + // or hours later: + // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might + // only exist for a few minutes before being compacted into L1s. + // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner + // the layer is likely to be covered by an image layer during compaction. + layers.sort_by_key(|(desc, _meta, _atime)| { + std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end)) + }); + + let layers = layers + .into_iter() + .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime)) + .collect(); Some(HeatMapTimeline::new(self.timeline_id, layers)) } @@ -3081,8 +3098,7 @@ impl Timeline { cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline .get_ready_ancestor_timeline(ancestor_timeline, ctx) - .await - .map_err(GetVectoredError::GetReadyAncestorError)?; + .await?; timeline = &*timeline_owned; }; @@ -3952,6 +3968,10 @@ impl Timeline { .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) .await?; + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); + } + for (img_key, img) in results { let img = match img { Ok(img) => img, @@ -3975,7 +3995,7 @@ impl Timeline { warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); ZERO_PAGE.clone() } else { - return Err(CreateImageLayersError::PageReconstructError(err)); + return Err(CreateImageLayersError::from(err)); } } }; @@ -4035,7 +4055,7 @@ impl Timeline { let mut total_kb_retrieved = 0; let mut total_keys_retrieved = 0; for (k, v) in data { - let v = v.map_err(CreateImageLayersError::PageReconstructError)?; + let v = v?; total_kb_retrieved += KEY_SIZE + v.len(); total_keys_retrieved += 1; new_data.insert(k, v); @@ -4059,6 +4079,9 @@ impl Timeline { next_start_key: img_range.end, }); } + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); + } let mut wrote_any_image = false; for (k, v) in data { if v.is_empty() { @@ -4173,6 +4196,10 @@ impl Timeline { let check_for_image_layers = self.should_check_if_image_layers_required(lsn); for partition in partitioning.parts.iter() { + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); + } + let img_range = start..partition.ranges.last().unwrap().end; let compact_metadata = partition.overlaps(&Key::metadata_key_range()); if compact_metadata { @@ -4352,18 +4379,34 @@ impl Timeline { detach_ancestor::prepare(self, tenant, options, ctx).await } - /// Completes the ancestor detach. This method is to be called while holding the - /// TenantManager's tenant slot, so during this method we cannot be deleted nor can any - /// timeline be deleted. After this method returns successfully, tenant must be reloaded. + /// Second step of detach from ancestor; detaches the `self` from it's current ancestor and + /// reparents any reparentable children of previous ancestor. /// - /// Pageserver receiving a SIGKILL during this operation is not supported (yet). - pub(crate) async fn complete_detaching_timeline_ancestor( + /// This method is to be called while holding the TenantManager's tenant slot, so during this + /// method we cannot be deleted nor can any timeline be deleted. After this method returns + /// successfully, tenant must be reloaded. + /// + /// Final step will be to [`Self::complete_detaching_timeline_ancestor`] after optionally + /// resetting the tenant. + pub(crate) async fn detach_from_ancestor_and_reparent( self: &Arc, tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - detach_ancestor::complete(self, tenant, prepared, ctx).await + ) -> Result { + detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await + } + + /// Final step which unblocks the GC. + /// + /// The tenant must've been reset if ancestry was modified previously (in tenant manager). + pub(crate) async fn complete_detaching_timeline_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + attempt: detach_ancestor::Attempt, + ctx: &RequestContext, + ) -> Result<(), detach_ancestor::Error> { + detach_ancestor::complete(self, tenant, attempt, ctx).await } /// Switch aux file policy and schedule upload to the index part. @@ -4421,22 +4464,24 @@ impl From for CompactionError { } } -impl CompactionError { - /// We cannot do compaction because we could not download a layer that is input to the compaction. - pub(crate) fn input_layer_download_failed( - e: super::storage_layer::layer::DownloadError, - ) -> Self { +impl From for CompactionError { + fn from(e: super::storage_layer::layer::DownloadError) -> Self { match e { - super::storage_layer::layer::DownloadError::TimelineShutdown | - /* TODO DownloadCancelled correct here? */ - super::storage_layer::layer::DownloadError::DownloadCancelled => CompactionError::ShuttingDown, - super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads | - super::storage_layer::layer::DownloadError::DownloadRequired | - super::storage_layer::layer::DownloadError::NotFile(_) | - super::storage_layer::layer::DownloadError::DownloadFailed | - super::storage_layer::layer::DownloadError::PreStatFailed(_)=>CompactionError::Other(anyhow::anyhow!(e)), + super::storage_layer::layer::DownloadError::TimelineShutdown + | super::storage_layer::layer::DownloadError::DownloadCancelled => { + CompactionError::ShuttingDown + } + super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads + | super::storage_layer::layer::DownloadError::DownloadRequired + | super::storage_layer::layer::DownloadError::NotFile(_) + | super::storage_layer::layer::DownloadError::DownloadFailed + | super::storage_layer::layer::DownloadError::PreStatFailed(_) => { + CompactionError::Other(anyhow::anyhow!(e)) + } #[cfg(test)] - super::storage_layer::layer::DownloadError::Failpoint(_) => CompactionError::Other(anyhow::anyhow!(e)), + super::storage_layer::layer::DownloadError::Failpoint(_) => { + CompactionError::Other(anyhow::anyhow!(e)) + } } } } @@ -4483,6 +4528,7 @@ impl DurationRecorder { /// the layer descriptor requires the user to provide the ranges, which should cover all /// keys specified in the `data` field. #[cfg(test)] +#[derive(Clone)] pub struct DeltaLayerTestDesc { pub lsn_range: Range, pub key_range: Range, @@ -4512,6 +4558,13 @@ impl DeltaLayerTestDesc { data, } } + + pub(crate) fn layer_name(&self) -> LayerName { + LayerName::Delta(super::storage_layer::DeltaLayerName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + }) + } } impl Timeline { @@ -4521,7 +4574,12 @@ impl Timeline { new_images: &[ResidentLayer], layers_to_remove: &[Layer], ) -> Result<(), CompactionError> { - let mut guard = self.layers.write().await; + let mut guard = tokio::select! { + guard = self.layers.write() => guard, + _ = self.cancel.cancelled() => { + return Err(CompactionError::ShuttingDown); + } + }; let mut duplicated_layers = HashSet::new(); @@ -4990,15 +5048,7 @@ impl Timeline { result.layers_removed = gc_layers.len() as u64; - self.remote_client - .schedule_gc_update(&gc_layers) - .map_err(|e| { - if self.cancel.is_cancelled() { - GcError::TimelineCancelled - } else { - GcError::Remote(e) - } - })?; + self.remote_client.schedule_gc_update(&gc_layers)?; guard.open_mut()?.finish_gc_timeline(&gc_layers); @@ -5275,6 +5325,7 @@ impl Timeline { layer: layer.to_owned().into(), last_activity_ts, relative_last_activity: finite_f32::FiniteF32::ZERO, + visibility: layer.visibility(), } }) .collect(); @@ -5559,7 +5610,7 @@ impl<'a> TimelineWriter<'a> { let action = self.get_open_layer_action(lsn, buf_size); let layer = self.handle_open_layer_action(lsn, action, ctx).await?; - let res = layer.put_value(key, lsn, &buf, ctx).await; + let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await; if res.is_ok() { // Update the current size only when the entire write was ok. @@ -5737,12 +5788,110 @@ fn is_send() { #[cfg(test)] mod tests { + use pageserver_api::key::Key; use utils::{id::TimelineId, lsn::Lsn}; - use crate::tenant::{ - harness::TenantHarness, storage_layer::Layer, timeline::EvictionError, Timeline, + use crate::{ + repository::Value, + tenant::{ + harness::{test_img, TenantHarness}, + layer_map::LayerMap, + storage_layer::{Layer, LayerName}, + timeline::{DeltaLayerTestDesc, EvictionError}, + Timeline, + }, }; + #[tokio::test] + async fn test_heatmap_generation() { + let harness = TenantHarness::create("heatmap_generation").await.unwrap(); + + let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + Lsn(0x11), + Value::Image(test_img("foo")), + )], + ); + let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![( + Key::from_hex("720000000033333333444444445500000000").unwrap(), + Lsn(0x11), + Value::Image(test_img("foo")), + )], + ); + let l0_delta = DeltaLayerTestDesc::new( + Lsn(0x20)..Lsn(0x30), + Key::from_hex("000000000000000000000000000000000000").unwrap() + ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(), + vec![( + Key::from_hex("720000000033333333444444445500000000").unwrap(), + Lsn(0x25), + Value::Image(test_img("foo")), + )], + ); + let delta_layers = vec![ + covered_delta.clone(), + visible_delta.clone(), + l0_delta.clone(), + ]; + + let image_layer = ( + Lsn(0x40), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + test_img("bar"), + )], + ); + let image_layers = vec![image_layer]; + + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline_with_layers( + TimelineId::generate(), + Lsn(0x10), + 14, + &ctx, + delta_layers, + image_layers, + Lsn(0x100), + ) + .await + .unwrap(); + + // Layer visibility is an input to heatmap generation, so refresh it first + timeline.update_layer_visibility().await.unwrap(); + + let heatmap = timeline + .generate_heatmap() + .await + .expect("Infallible while timeline is not shut down"); + + assert_eq!(heatmap.timeline_id, timeline.timeline_id); + + // L0 should come last + assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name()); + + let mut last_lsn = Lsn::MAX; + for layer in heatmap.layers { + // Covered layer should be omitted + assert!(layer.name != covered_delta.layer_name()); + + let layer_lsn = match &layer.name { + LayerName::Delta(d) => d.lsn_range.end, + LayerName::Image(i) => i.lsn, + }; + + // Apart from L0s, newest Layers should come first + if !LayerMap::is_l0(layer.name.key_range()) { + assert!(layer_lsn <= last_lsn); + last_lsn = layer_lsn; + } + } + } + #[tokio::test] async fn two_layer_eviction_attempts_at_the_same_time() { let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time") diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 87ec46c0b5..7370ec1386 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -489,10 +489,7 @@ impl Timeline { // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: // - GC, which at worst witnesses us "undelete" a layer that they just deleted. // - ingestion, which only inserts layers, therefore cannot collide with us. - let resident = layer - .download_and_keep_resident() - .await - .map_err(CompactionError::input_layer_download_failed)?; + let resident = layer.download_and_keep_resident().await?; let keys_written = resident .filter(&self.shard_identity, &mut image_layer_writer, ctx) @@ -693,23 +690,14 @@ impl Timeline { let mut fully_compacted = true; - deltas_to_compact.push( - first_level0_delta - .download_and_keep_resident() - .await - .map_err(CompactionError::input_layer_download_failed)?, - ); + deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; if lsn_range.start != prev_lsn_end { break; } - deltas_to_compact.push( - l.download_and_keep_resident() - .await - .map_err(CompactionError::input_layer_download_failed)?, - ); + deltas_to_compact.push(l.download_and_keep_resident().await?); deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; @@ -760,6 +748,9 @@ impl Timeline { let all_keys = { let mut all_keys = Vec::new(); for l in deltas_to_compact.iter() { + if self.cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?); } // The current stdlib sorting implementation is designed in a way where it is @@ -842,6 +833,11 @@ impl Timeline { }; stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); drop_rlock(guard); + + if self.cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } + stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); // This iterator walks through all key-value pairs from all the layers @@ -1052,11 +1048,22 @@ impl Timeline { let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key let mut next_hole = 0; // index of next hole in holes vector + let mut keys = 0; + while let Some((key, lsn, value)) = all_values_iter .next(ctx) .await .map_err(CompactionError::Other)? { + keys += 1; + + if keys % 32_768 == 0 && self.cancel.is_cancelled() { + // avoid hitting the cancellation token on every key. in benches, we end up + // shuffling an order of million keys per layer, this means we'll check it + // around tens of times per layer. + return Err(CompactionError::ShuttingDown); + } + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { @@ -1137,6 +1144,10 @@ impl Timeline { if !self.shard_identity.is_key_disposable(&key) { if writer.is_none() { + if self.cancel.is_cancelled() { + // to be somewhat responsive to cancellation, check for each new layer + return Err(CompactionError::ShuttingDown); + } // Create writer if not initiaized yet writer = Some( DeltaLayerWriter::new( @@ -1157,6 +1168,8 @@ impl Timeline { .await .map_err(CompactionError::Other)?, ); + + keys = 0; } writer @@ -2325,7 +2338,7 @@ impl CompactionJobExecutor for TimelineAdaptor { key_range, )) } else { - // The current compaction implementatin only ever requests the key space + // The current compaction implementation only ever requests the key space // at the compaction end LSN. anyhow::bail!("keyspace not available for requested lsn"); } diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 3b52adc77b..641faada25 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -5,12 +5,15 @@ use crate::{ context::{DownloadBehavior, RequestContext}, task_mgr::TaskKind, tenant::{ + remote_timeline_client::index::GcBlockingReason::DetachAncestor, storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, Tenant, }, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use anyhow::Context; use pageserver_api::models::detach_ancestor::AncestorDetached; +use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; @@ -19,50 +22,74 @@ use utils::{completion, generation::Generation, http::error::ApiError, id::Timel pub(crate) enum Error { #[error("no ancestors")] NoAncestor, + #[error("too many ancestors")] TooManyAncestors, + #[error("shutting down, please retry later")] ShuttingDown, - #[error("flushing failed")] - FlushAncestor(#[source] FlushLayerError), - #[error("layer download failed")] - RewrittenDeltaDownloadFailed(#[source] crate::tenant::storage_layer::layer::DownloadError), - #[error("copying LSN prefix locally failed")] - CopyDeltaPrefix(#[source] anyhow::Error), - #[error("upload rewritten layer")] - UploadRewritten(#[source] anyhow::Error), + + #[error(transparent)] + NotFound(crate::tenant::GetTimelineError), + + #[error("failed to reparent all candidate timelines, please retry")] + FailedToReparentAll, #[error("ancestor is already being detached by: {}", .0)] OtherTimelineDetachOngoing(TimelineId), - #[error("remote copying layer failed")] - CopyFailed(#[source] anyhow::Error), + #[error("preparing to timeline ancestor detach failed")] + Prepare(#[source] anyhow::Error), - #[error("unexpected error")] - Unexpected(#[source] anyhow::Error), + #[error("detaching and reparenting failed")] + DetachReparent(#[source] anyhow::Error), + + #[error("completing ancestor detach failed")] + Complete(#[source] anyhow::Error), #[error("failpoint: {}", .0)] Failpoint(&'static str), } +impl Error { + /// Try to catch cancellation from within the `anyhow::Error`, or wrap the anyhow as the given + /// variant or fancier `or_else`. + fn launder(e: anyhow::Error, or_else: F) -> Error + where + F: Fn(anyhow::Error) -> Error, + { + use crate::tenant::remote_timeline_client::WaitCompletionError; + use crate::tenant::upload_queue::NotInitialized; + use remote_storage::TimeoutOrCancel; + + if e.is::() + || TimeoutOrCancel::caused_by_cancel(&e) + || e.downcast_ref::() + .is_some_and(|e| e.is_cancelled()) + || e.is::() + { + Error::ShuttingDown + } else { + or_else(e) + } + } +} + impl From for ApiError { fn from(value: Error) -> Self { match value { - e @ Error::NoAncestor => ApiError::Conflict(e.to_string()), - // TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError? - e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)), + Error::NoAncestor => ApiError::Conflict(value.to_string()), + Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", value)), Error::ShuttingDown => ApiError::ShuttingDown, - Error::OtherTimelineDetachOngoing(_) => { - ApiError::ResourceUnavailable("other timeline detach is already ongoing".into()) + Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => { + ApiError::ResourceUnavailable(value.to_string().into()) } - // All of these contain shutdown errors, in fact, it's the most common - e @ Error::FlushAncestor(_) - | e @ Error::RewrittenDeltaDownloadFailed(_) - | e @ Error::CopyDeltaPrefix(_) - | e @ Error::UploadRewritten(_) - | e @ Error::CopyFailed(_) - | e @ Error::Unexpected(_) - | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()), + Error::NotFound(e) => ApiError::from(e), + // these variants should have no cancellation errors because of Error::launder + Error::Prepare(_) + | Error::DetachReparent(_) + | Error::Complete(_) + | Error::Failpoint(_) => ApiError::InternalServerError(value.into()), } } } @@ -80,24 +107,8 @@ impl From for Error { } } -impl From for Error { - fn from(value: FlushLayerError) -> Self { - match value { - FlushLayerError::Cancelled => Error::ShuttingDown, - FlushLayerError::NotRunning(_) => { - // FIXME(#6424): technically statically unreachable right now, given how we never - // drop the sender - Error::ShuttingDown - } - FlushLayerError::CreateImageLayersError(_) | FlushLayerError::Other(_) => { - Error::FlushAncestor(value) - } - } - } -} - pub(crate) enum Progress { - Prepared(completion::Completion, PreparedTimelineDetach), + Prepared(Attempt, PreparedTimelineDetach), Done(AncestorDetached), } @@ -121,6 +132,26 @@ impl Default for Options { } } +/// Represents an across tenant reset exclusive single attempt to detach ancestor. +#[derive(Debug)] +pub(crate) struct Attempt { + pub(crate) timeline_id: TimelineId, + + _guard: completion::Completion, + gate_entered: Option, +} + +impl Attempt { + pub(crate) fn before_reset_tenant(&mut self) { + let taken = self.gate_entered.take(); + assert!(taken.is_some()); + } + + pub(crate) fn new_barrier(&self) -> completion::Barrier { + self._guard.barrier() + } +} + /// See [`Timeline::prepare_to_detach_from_ancestor`] pub(super) async fn prepare( detached: &Arc, @@ -135,15 +166,33 @@ pub(super) async fn prepare( .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { - { + let still_in_progress = { let accessor = detached.remote_client.initialized_upload_queue()?; // we are safe to inspect the latest uploaded, because we can only witness this after // restart is complete and ancestor is no more. let latest = accessor.latest_uploaded_index_part(); - if !latest.lineage.is_detached_from_original_ancestor() { + if latest.lineage.detached_previous_ancestor().is_none() { return Err(NoAncestor); - } + }; + + latest + .gc_blocking + .as_ref() + .is_some_and(|b| b.blocked_by(DetachAncestor)) + }; + + if still_in_progress { + // gc is still blocked, we can still reparent and complete. + // we are safe to reparent remaining, because they were locked in in the beginning. + let attempt = continue_with_blocked_gc(detached, tenant).await?; + + // because the ancestor of detached is already set to none, we have published all + // of the layers, so we are still "prepared." + return Ok(Progress::Prepared( + attempt, + PreparedTimelineDetach { layers: Vec::new() }, + )); } let reparented_timelines = reparented_direct_children(detached, tenant)?; @@ -164,24 +213,9 @@ pub(super) async fn prepare( return Err(TooManyAncestors); } - // before we acquire the gate, we must mark the ancestor as having a detach operation - // ongoing which will block other concurrent detach operations so we don't get to ackward - // situations where there would be two branches trying to reparent earlier branches. - let (guard, barrier) = completion::channel(); + let attempt = start_new_attempt(detached, tenant).await?; - { - let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); - if let Some((tl, other)) = guard.as_ref() { - if !other.is_ready() { - return Err(OtherTimelineDetachOngoing(*tl)); - } - } - *guard = Some((detached.timeline_id, barrier)); - } - - let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; - - utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable"); + utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking-pausable"); fail::fail_point!( "timeline-detach-ancestor::before_starting_after_locking", @@ -210,7 +244,17 @@ pub(super) async fn prepare( } }; - res?; + res.map_err(|e| { + use FlushLayerError::*; + match e { + Cancelled | NotRunning(_) => { + // FIXME(#6424): technically statically unreachable right now, given how we never + // drop the sender + Error::ShuttingDown + } + CreateImageLayersError(_) | Other(_) => Error::Prepare(e.into()), + } + })?; // we do not need to wait for uploads to complete but we do need `struct Layer`, // copying delta prefix is unsupported currently for `InMemoryLayer`. @@ -245,7 +289,8 @@ pub(super) async fn prepare( }; // TODO: layers are already sorted by something: use that to determine how much of remote - // copies are already done. + // copies are already done -- gc is blocked, but a compaction could had happened on ancestor, + // which is something to keep in mind if copy skipping is implemented. tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers"); // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after @@ -259,34 +304,38 @@ pub(super) async fn prepare( let mut wrote_any = false; - let limiter = Arc::new(tokio::sync::Semaphore::new( - options.rewrite_concurrency.get(), - )); + let limiter = Arc::new(Semaphore::new(options.rewrite_concurrency.get())); for layer in straddling_branchpoint { let limiter = limiter.clone(); let timeline = detached.clone(); let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download); - tasks.spawn(async move { - let _permit = limiter.acquire().await; - let copied = - upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) - .await?; - Ok(copied) - }); + let span = tracing::info_span!("upload_rewritten_layer", %layer); + tasks.spawn( + async move { + let _permit = limiter.acquire().await; + let copied = + upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) + .await?; + if let Some(copied) = copied.as_ref() { + tracing::info!(%copied, "rewrote and uploaded"); + } + Ok(copied) + } + .instrument(span), + ); } while let Some(res) = tasks.join_next().await { match res { Ok(Ok(Some(copied))) => { wrote_any = true; - tracing::info!(layer=%copied, "rewrote and uploaded"); new_layers.push(copied); } Ok(Ok(None)) => {} Ok(Err(e)) => return Err(e), - Err(je) => return Err(Unexpected(je.into())), + Err(je) => return Err(Error::Prepare(je.into())), } } @@ -308,7 +357,7 @@ pub(super) async fn prepare( } let mut tasks = tokio::task::JoinSet::new(); - let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get())); + let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get())); for adopted in rest_of_historic { let limiter = limiter.clone(); @@ -334,7 +383,7 @@ pub(super) async fn prepare( Ok(Err(failed)) => { return Err(failed); } - Err(je) => return Err(Unexpected(je.into())), + Err(je) => return Err(Error::Prepare(je.into())), } } @@ -342,7 +391,55 @@ pub(super) async fn prepare( let prepared = PreparedTimelineDetach { layers: new_layers }; - Ok(Progress::Prepared(guard, prepared)) + Ok(Progress::Prepared(attempt, prepared)) +} + +async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result { + let attempt = obtain_exclusive_attempt(detached, tenant)?; + + // insert the block in the index_part.json, if not already there. + let _dont_care = tenant + .gc_block + .insert( + detached, + crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, + ) + .await + .map_err(|e| Error::launder(e, Error::Prepare))?; + + Ok(attempt) +} + +async fn continue_with_blocked_gc(detached: &Timeline, tenant: &Tenant) -> Result { + // FIXME: it would be nice to confirm that there is an in-memory version, since we've just + // verified there is a persistent one? + obtain_exclusive_attempt(detached, tenant) +} + +fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result { + use Error::{OtherTimelineDetachOngoing, ShuttingDown}; + + // ensure we are the only active attempt for this tenant + let (guard, barrier) = completion::channel(); + { + let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); + if let Some((tl, other)) = guard.as_ref() { + if !other.is_ready() { + return Err(OtherTimelineDetachOngoing(*tl)); + } + // FIXME: no test enters here + } + *guard = Some((detached.timeline_id, barrier)); + } + + // ensure the gate is still open + let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + + Ok(Attempt { + timeline_id: detached.timeline_id, + _guard: guard, + gate_entered: Some(_gate_entered), + }) } fn reparented_direct_children( @@ -437,19 +534,17 @@ async fn upload_rewritten_layer( cancel: &CancellationToken, ctx: &RequestContext, ) -> Result, Error> { - use Error::UploadRewritten; let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?; let Some(copied) = copied else { return Ok(None); }; - // FIXME: better shuttingdown error target .remote_client .upload_layer_file(&copied, cancel) .await - .map_err(UploadRewritten)?; + .map_err(|e| Error::launder(e, Error::Prepare))?; Ok(Some(copied.into())) } @@ -460,10 +555,8 @@ async fn copy_lsn_prefix( target_timeline: &Arc, ctx: &RequestContext, ) -> Result, Error> { - use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed, ShuttingDown}; - if target_timeline.cancel.is_cancelled() { - return Err(ShuttingDown); + return Err(Error::ShuttingDown); } tracing::debug!(%layer, %end_lsn, "copying lsn prefix"); @@ -477,18 +570,22 @@ async fn copy_lsn_prefix( ctx, ) .await - .map_err(CopyDeltaPrefix)?; + .with_context(|| format!("prepare to copy lsn prefix of ancestors {layer}")) + .map_err(Error::Prepare)?; - let resident = layer - .download_and_keep_resident() - .await - // likely shutdown - .map_err(RewrittenDeltaDownloadFailed)?; + let resident = layer.download_and_keep_resident().await.map_err(|e| { + if e.is_cancelled() { + Error::ShuttingDown + } else { + Error::Prepare(e.into()) + } + })?; let records = resident .copy_delta_prefix(&mut writer, end_lsn, ctx) .await - .map_err(CopyDeltaPrefix)?; + .with_context(|| format!("copy lsn prefix of ancestors {layer}")) + .map_err(Error::Prepare)?; drop(resident); @@ -506,9 +603,9 @@ async fn copy_lsn_prefix( let (desc, path) = writer .finish(reused_highest_key, ctx) .await - .map_err(CopyDeltaPrefix)?; + .map_err(Error::Prepare)?; let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path) - .map_err(CopyDeltaPrefix)?; + .map_err(Error::Prepare)?; tracing::debug!(%layer, %copied, "new layer produced"); @@ -524,8 +621,6 @@ async fn remote_copy( generation: Generation, cancel: &CancellationToken, ) -> Result { - use Error::CopyFailed; - // depending if Layer::keep_resident we could hardlink let mut metadata = adopted.metadata(); @@ -539,105 +634,216 @@ async fn remote_copy( metadata, ); - // FIXME: better shuttingdown error adoptee .remote_client .copy_timeline_layer(adopted, &owned, cancel) .await .map(move |()| owned) - .map_err(CopyFailed) + .map_err(|e| Error::launder(e, Error::Prepare)) } -/// See [`Timeline::complete_detaching_timeline_ancestor`]. -pub(super) async fn complete( +pub(crate) enum DetachingAndReparenting { + /// All of the following timeline ids were reparented and the timeline ancestor detach must be + /// marked as completed. + Reparented(HashSet), + + /// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as + /// completed. + /// + /// Nested `must_reset_tenant` is set to true when any restart requiring changes were made. + SomeReparentingFailed { must_reset_tenant: bool }, + + /// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach + /// must be marked as completed. + AlreadyDone(HashSet), +} + +impl DetachingAndReparenting { + pub(crate) fn reset_tenant_required(&self) -> bool { + use DetachingAndReparenting::*; + match self { + Reparented(_) => true, + SomeReparentingFailed { must_reset_tenant } => *must_reset_tenant, + AlreadyDone(_) => false, + } + } + + pub(crate) fn completed(self) -> Option> { + use DetachingAndReparenting::*; + match self { + Reparented(x) | AlreadyDone(x) => Some(x), + SomeReparentingFailed { .. } => None, + } + } +} + +/// See [`Timeline::detach_from_ancestor_and_reparent`]. +pub(super) async fn detach_and_reparent( detached: &Arc, tenant: &Tenant, prepared: PreparedTimelineDetach, _ctx: &RequestContext, -) -> Result, anyhow::Error> { +) -> Result { let PreparedTimelineDetach { layers } = prepared; - let ancestor = detached - .ancestor_timeline - .as_ref() - .expect("must still have a ancestor"); - let ancestor_lsn = detached.get_ancestor_lsn(); + #[derive(Debug)] + enum Ancestor { + NotDetached(Arc, Lsn), + Detached(Arc, Lsn), + } + + let (recorded_branchpoint, still_ongoing) = { + let access = detached.remote_client.initialized_upload_queue()?; + let latest = access.latest_uploaded_index_part(); + + ( + latest.lineage.detached_previous_ancestor(), + latest + .gc_blocking + .as_ref() + .is_some_and(|b| b.blocked_by(DetachAncestor)), + ) + }; + assert!( + still_ongoing, + "cannot (detach? reparent)? complete if the operation is not still ongoing" + ); + + let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) { + (Some(ancestor), None) => { + assert!( + !layers.is_empty(), + "there should always be at least one layer to inherit" + ); + Ancestor::NotDetached(ancestor.clone(), detached.ancestor_lsn) + } + (Some(_), Some(_)) => { + panic!( + "it should be impossible to get to here without having gone through the tenant reset; if the tenant was reset, then the ancestor_timeline would be None" + ); + } + (None, Some((ancestor_id, ancestor_lsn))) => { + // it has been either: + // - detached but still exists => we can try reparenting + // - detached and deleted + // + // either way, we must complete + assert!( + layers.is_empty(), + "no layers should had been copied as detach is done" + ); + + let existing = tenant.timelines.lock().unwrap().get(&ancestor_id).cloned(); + + if let Some(ancestor) = existing { + Ancestor::Detached(ancestor, ancestor_lsn) + } else { + let direct_children = reparented_direct_children(detached, tenant)?; + return Ok(DetachingAndReparenting::AlreadyDone(direct_children)); + } + } + (None, None) => { + // TODO: make sure there are no `?` before tenant_reset from after a questionmark from + // here. + panic!( + "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" + ); + } + }; // publish the prepared layers before we reparent any of the timelines, so that on restart // reparented timelines find layers. also do the actual detaching. // - // if we crash after this operation, we will at least come up having detached a timeline, but - // we cannot go back and reparent the timelines which would had been reparented in normal - // execution. - // - // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart - // which could give us a completely wrong layer combination. - detached - .remote_client - .schedule_adding_existing_layers_to_index_detach_and_wait( - &layers, - (ancestor.timeline_id, ancestor_lsn), - ) - .await?; + // if we crash after this operation, a retry will allow reparenting the remaining timelines as + // gc is blocked. + + let (ancestor, ancestor_lsn, was_detached) = match ancestor { + Ancestor::NotDetached(ancestor, ancestor_lsn) => { + // this has to complete before any reparentings because otherwise they would not have + // layers on the new parent. + detached + .remote_client + .schedule_adding_existing_layers_to_index_detach_and_wait( + &layers, + (ancestor.timeline_id, ancestor_lsn), + ) + .await + .context("publish layers and detach ancestor") + .map_err(|e| Error::launder(e, Error::DetachReparent))?; + + tracing::info!( + ancestor=%ancestor.timeline_id, + %ancestor_lsn, + inherited_layers=%layers.len(), + "detached from ancestor" + ); + (ancestor, ancestor_lsn, true) + } + Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false), + }; let mut tasks = tokio::task::JoinSet::new(); + // Returns a single permit semaphore which will be used to make one reparenting succeed, + // others will fail as if those timelines had been stopped for whatever reason. + #[cfg(feature = "testing")] + let failpoint_sem = || -> Option> { + fail::fail_point!("timeline-detach-ancestor::allow_one_reparented", |_| Some( + Arc::new(Semaphore::new(1)) + )); + None + }(); + // because we are now keeping the slot in progress, it is unlikely that there will be any // timeline deletions during this time. if we raced one, then we'll just ignore it. - tenant - .timelines - .lock() - .unwrap() - .values() - .filter_map(|tl| { - if Arc::ptr_eq(tl, detached) { - return None; - } + { + let g = tenant.timelines.lock().unwrap(); + reparentable_timelines(g.values(), detached, &ancestor, ancestor_lsn) + .cloned() + .for_each(|timeline| { + // important in this scope: we are holding the Tenant::timelines lock + let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); + let new_parent = detached.timeline_id; + #[cfg(feature = "testing")] + let failpoint_sem = failpoint_sem.clone(); - if !tl.is_active() { - return None; - } + tasks.spawn( + async move { + let res = async { + #[cfg(feature = "testing")] + if let Some(failpoint_sem) = failpoint_sem { + let _permit = failpoint_sem.acquire().await.map_err(|_| { + anyhow::anyhow!( + "failpoint: timeline-detach-ancestor::allow_one_reparented", + ) + })?; + failpoint_sem.close(); + } - let tl_ancestor = tl.ancestor_timeline.as_ref()?; - let is_same = Arc::ptr_eq(ancestor, tl_ancestor); - let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; - - let is_deleting = tl - .delete_progress - .try_lock() - .map(|flow| !flow.is_not_started()) - .unwrap_or(true); - - if is_same && is_earlier && !is_deleting { - Some(tl.clone()) - } else { - None - } - }) - .for_each(|timeline| { - // important in this scope: we are holding the Tenant::timelines lock - let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); - let new_parent = detached.timeline_id; - - tasks.spawn( - async move { - let res = timeline - .remote_client - .schedule_reparenting_and_wait(&new_parent) + timeline + .remote_client + .schedule_reparenting_and_wait(&new_parent) + .await + } .await; - match res { - Ok(()) => Some(timeline), - Err(e) => { - // with the use of tenant slot, we no longer expect these. - tracing::warn!("reparenting failed: {e:#}"); - None + match res { + Ok(()) => { + tracing::info!("reparented"); + Some(timeline) + } + Err(e) => { + // with the use of tenant slot, raced timeline deletion is the most + // likely reason. + tracing::warn!("reparenting failed: {e:#}"); + None + } } } - } - .instrument(span), - ); - }); + .instrument(span), + ); + }); + } let reparenting_candidates = tasks.len(); let mut reparented = HashSet::with_capacity(tasks.len()); @@ -645,33 +851,102 @@ pub(super) async fn complete( while let Some(res) = tasks.join_next().await { match res { Ok(Some(timeline)) => { - tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - assert!( reparented.insert(timeline.timeline_id), "duplicate reparenting? timeline_id={}", timeline.timeline_id ); } - Ok(None) => { - // lets just ignore this for now. one or all reparented timelines could had - // started deletion, and that is fine. - } Err(je) if je.is_cancelled() => unreachable!("not used"), - Err(je) if je.is_panic() => { - // ignore; it's better to continue with a single reparenting failing (or even - // all of them) in order to get to the goal state. - // - // these timelines will never be reparentable, but they can be always detached as - // separate tree roots. - } + // just ignore failures now, we can retry + Ok(None) => {} + Err(je) if je.is_panic() => {} Err(je) => tracing::error!("unexpected join error: {je:?}"), } } - if reparenting_candidates != reparented.len() { - tracing::info!("failed to reparent some candidates"); + let reparented_all = reparenting_candidates == reparented.len(); + + if reparented_all { + Ok(DetachingAndReparenting::Reparented(reparented)) + } else { + tracing::info!( + reparented = reparented.len(), + candidates = reparenting_candidates, + "failed to reparent all candidates; they can be retried after the tenant_reset", + ); + + let must_reset_tenant = !reparented.is_empty() || was_detached; + Ok(DetachingAndReparenting::SomeReparentingFailed { must_reset_tenant }) + } +} + +pub(super) async fn complete( + detached: &Arc, + tenant: &Tenant, + mut attempt: Attempt, + _ctx: &RequestContext, +) -> Result<(), Error> { + assert_eq!(detached.timeline_id, attempt.timeline_id); + + if attempt.gate_entered.is_none() { + let entered = detached.gate.enter().map_err(|_| Error::ShuttingDown)?; + attempt.gate_entered = Some(entered); + } else { + // Some(gate_entered) means the tenant was not restarted, as is not required } - Ok(reparented) + assert!(detached.ancestor_timeline.is_none()); + + // this should be an 503 at least...? + fail::fail_point!( + "timeline-detach-ancestor::complete_before_uploading", + |_| Err(Error::Failpoint( + "timeline-detach-ancestor::complete_before_uploading" + )) + ); + + tenant + .gc_block + .remove( + detached, + crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, + ) + .await + .map_err(|e| Error::launder(e, Error::Complete))?; + + Ok(()) +} + +/// Query against a locked `Tenant::timelines`. +fn reparentable_timelines<'a, I>( + timelines: I, + detached: &'a Arc, + ancestor: &'a Arc, + ancestor_lsn: Lsn, +) -> impl Iterator> + 'a +where + I: Iterator> + 'a, +{ + timelines.filter_map(move |tl| { + if Arc::ptr_eq(tl, detached) { + return None; + } + + let tl_ancestor = tl.ancestor_timeline.as_ref()?; + let is_same = Arc::ptr_eq(ancestor, tl_ancestor); + let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; + + let is_deleting = tl + .delete_progress + .try_lock() + .map(|flow| !flow.is_not_started()) + .unwrap_or(true); + + if is_same && is_earlier && !is_deleting { + Some(tl) + } else { + None + } + }) } diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 07d860eb80..eaa9c0ff62 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,7 +30,8 @@ use crate::{ pgdatadir_mapping::CollectKeySpaceError, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ - tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, + storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError, + LogicalSizeCalculationCause, Tenant, }, }; @@ -241,7 +242,22 @@ impl Timeline { } }; - no_activity_for > p.threshold + match layer.visibility() { + LayerVisibilityHint::Visible => { + // Usual case: a visible layer might be read any time, and we will keep it + // resident until it hits our configured TTL threshold. + no_activity_for > p.threshold + } + LayerVisibilityHint::Covered => { + // Covered layers: this is probably a layer that was recently covered by + // an image layer during compaction. We don't evict it immediately, but + // it doesn't stay resident for the full `threshold`: we just keep it + // for a shorter time in case + // - it is used for Timestamp->LSN lookups + // - a new branch is created in recent history which will read this layer + no_activity_for > p.period + } + } }) .cloned() .for_each(|layer| { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index a66900522a..b5c577af72 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -335,6 +335,9 @@ pub(super) async fn handle_walreceiver_connection( filtered_records += 1; } + // FIXME: this cannot be made pausable_failpoint without fixing the + // failpoint library; in tests, the added amount of debugging will cause us + // to timeout the tests. fail_point!("walreceiver-after-ingest"); last_rec_lsn = lsn; diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index e6c835aa75..3c48c84598 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -5,12 +5,17 @@ use anyhow::Context; use std::path::Path; +use utils::serde_percent::Percent; use pageserver_api::models::PageserverUtilization; -pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result { - // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough +use crate::{config::PageServerConf, tenant::mgr::TenantManager}; +pub(crate) fn regenerate( + conf: &PageServerConf, + tenants_path: &Path, + tenant_manager: &TenantManager, +) -> anyhow::Result { let statvfs = nix::sys::statvfs::statvfs(tenants_path) .map_err(std::io::Error::from) .context("statvfs tenants directory")?; @@ -34,16 +39,31 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result e.max_usage_pct, + None => Percent::new(100).unwrap(), + }; + + // Express a static value for how many shards we may schedule on one node + const MAX_SHARDS: u32 = 20000; + + let mut doc = PageserverUtilization { disk_usage_bytes: used, free_space_bytes: free, - // lower is better; start with a constant - // - // note that u64::MAX will be output as i64::MAX as u64, but that should not matter - utilization_score: u64::MAX, + disk_wanted_bytes, + disk_usable_pct, + shard_count, + max_shard_count: MAX_SHARDS, + utilization_score: 0, captured_at: utils::serde_system_time::SystemTime(captured_at), }; + doc.refresh_score(); + // TODO: make utilization_score into a metric Ok(doc) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 27f6fe90a4..b4695e5f40 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -17,6 +17,7 @@ use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; +use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::shard::TenantShardId; use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; @@ -50,6 +51,7 @@ pub(crate) mod owned_buffers_io { //! but for the time being we're proving out the primitives in the neon.git repo //! for faster iteration. + pub(crate) mod io_buf_ext; pub(crate) mod slice; pub(crate) mod write; pub(crate) mod util { @@ -637,24 +639,24 @@ impl VirtualFile { } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 - pub async fn write_all_at, Buf: IoBuf + Send>( + pub async fn write_all_at( &self, - buf: B, + buf: FullSlice, mut offset: u64, ctx: &RequestContext, - ) -> (B::Buf, Result<(), Error>) { - let buf_len = buf.bytes_init(); - if buf_len == 0 { - return (Slice::into_inner(buf.slice_full()), Ok(())); - } - let mut buf = buf.slice(0..buf_len); + ) -> (FullSlice, Result<(), Error>) { + let buf = buf.into_raw_slice(); + let bounds = buf.bounds(); + let restore = + |buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds)); + let mut buf = buf; while !buf.is_empty() { - let res; - (buf, res) = self.write_at(buf, offset, ctx).await; + let (tmp, res) = self.write_at(FullSlice::must_new(buf), offset, ctx).await; + buf = tmp.into_raw_slice(); match res { Ok(0) => { return ( - Slice::into_inner(buf), + restore(buf), Err(Error::new( std::io::ErrorKind::WriteZero, "failed to write whole buffer", @@ -666,33 +668,33 @@ impl VirtualFile { offset += n as u64; } Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return (Slice::into_inner(buf), Err(e)), + Err(e) => return (restore(buf), Err(e)), } } - (Slice::into_inner(buf), Ok(())) + (restore(buf), Ok(())) } - /// Writes `buf.slice(0..buf.bytes_init())`. - /// Returns the IoBuf that is underlying the BoundedBuf `buf`. - /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in. - /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant. - pub async fn write_all, Buf: IoBuf + Send>( + /// Writes `buf` to the file at the current offset. + /// + /// Panics if there is an uninitialized range in `buf`, as that is most likely a bug in the caller. + pub async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> (B::Buf, Result) { - let nbytes = buf.bytes_init(); - if nbytes == 0 { - return (Slice::into_inner(buf.slice_full()), Ok(0)); - } - let mut buf = buf.slice(0..nbytes); + ) -> (FullSlice, Result) { + let buf = buf.into_raw_slice(); + let bounds = buf.bounds(); + let restore = + |buf: Slice<_>| FullSlice::must_new(Slice::from_buf_bounds(buf.into_inner(), bounds)); + let nbytes = buf.len(); + let mut buf = buf; while !buf.is_empty() { - let res; - (buf, res) = self.write(buf, ctx).await; + let (tmp, res) = self.write(FullSlice::must_new(buf), ctx).await; + buf = tmp.into_raw_slice(); match res { Ok(0) => { return ( - Slice::into_inner(buf), + restore(buf), Err(Error::new( std::io::ErrorKind::WriteZero, "failed to write whole buffer", @@ -703,17 +705,17 @@ impl VirtualFile { buf = buf.slice(n..); } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return (Slice::into_inner(buf), Err(e)), + Err(e) => return (restore(buf), Err(e)), } } - (Slice::into_inner(buf), Ok(nbytes)) + (restore(buf), Ok(nbytes)) } async fn write( &mut self, - buf: Slice, + buf: FullSlice, ctx: &RequestContext, - ) -> (Slice, Result) { + ) -> (FullSlice, Result) { let pos = self.pos; let (buf, res) = self.write_at(buf, pos, ctx).await; let n = match res { @@ -756,10 +758,10 @@ impl VirtualFile { async fn write_at( &self, - buf: Slice, + buf: FullSlice, offset: u64, _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ - ) -> (Slice, Result) { + ) -> (FullSlice, Result) { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, Err(e) => return (buf, Err(e)), @@ -1093,11 +1095,11 @@ impl Drop for VirtualFile { impl OwnedAsyncWriter for VirtualFile { #[inline(always)] - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { + ) -> std::io::Result<(usize, FullSlice)> { let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; res.map(move |v| (v, buf)) } @@ -1159,7 +1161,8 @@ mod tests { use crate::task_mgr::TaskKind; use super::*; - use owned_buffers_io::slice::SliceExt; + use owned_buffers_io::io_buf_ext::IoBufExt; + use owned_buffers_io::slice::SliceMutExt; use rand::seq::SliceRandom; use rand::thread_rng; use rand::Rng; @@ -1193,9 +1196,9 @@ mod tests { } } } - async fn write_all_at, Buf: IoBuf + Send>( + async fn write_all_at( &self, - buf: B, + buf: FullSlice, offset: u64, ctx: &RequestContext, ) -> Result<(), Error> { @@ -1204,13 +1207,7 @@ mod tests { let (_buf, res) = file.write_all_at(buf, offset, ctx).await; res } - MaybeVirtualFile::File(file) => { - let buf_len = buf.bytes_init(); - if buf_len == 0 { - return Ok(()); - } - file.write_all_at(&buf.slice(0..buf_len), offset) - } + MaybeVirtualFile::File(file) => file.write_all_at(&buf[..], offset), } } async fn seek(&mut self, pos: SeekFrom) -> Result { @@ -1219,9 +1216,9 @@ mod tests { MaybeVirtualFile::File(file) => file.seek(pos), } } - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, ) -> Result<(), Error> { match self { @@ -1229,13 +1226,7 @@ mod tests { let (_buf, res) = file.write_all(buf, ctx).await; res.map(|_| ()) } - MaybeVirtualFile::File(file) => { - let buf_len = buf.bytes_init(); - if buf_len == 0 { - return Ok(()); - } - file.write_all(&buf.slice(0..buf_len)) - } + MaybeVirtualFile::File(file) => file.write_all(&buf[..]), } } @@ -1347,7 +1338,9 @@ mod tests { &ctx, ) .await?; - file_a.write_all(b"foobar".to_vec(), &ctx).await?; + file_a + .write_all(b"foobar".to_vec().slice_len(), &ctx) + .await?; // cannot read from a file opened in write-only mode let _ = file_a.read_string(&ctx).await.unwrap_err(); @@ -1356,7 +1349,10 @@ mod tests { let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?; // cannot write to a file opened in read-only mode - let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); + let _ = file_a + .write_all(b"bar".to_vec().slice_len(), &ctx) + .await + .unwrap_err(); // Try simple read assert_eq!("foobar", file_a.read_string(&ctx).await?); @@ -1399,8 +1395,12 @@ mod tests { &ctx, ) .await?; - file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; - file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?; + file_b + .write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx) + .await?; + file_b + .write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx) + .await?; assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 0ffcd9fa05..faef1ba9ff 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -12,7 +12,7 @@ #[cfg(target_os = "linux")] pub(super) mod tokio_epoll_uring_ext; -use tokio_epoll_uring::{IoBuf, Slice}; +use tokio_epoll_uring::IoBuf; use tracing::Instrument; pub(crate) use super::api::IoEngineKind; @@ -107,7 +107,10 @@ use std::{ sync::atomic::{AtomicU8, Ordering}, }; -use super::{owned_buffers_io::slice::SliceExt, FileGuard, Metadata}; +use super::{ + owned_buffers_io::{io_buf_ext::FullSlice, slice::SliceMutExt}, + FileGuard, Metadata, +}; #[cfg(target_os = "linux")] fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { @@ -206,8 +209,8 @@ impl IoEngine { &self, file_guard: FileGuard, offset: u64, - buf: Slice, - ) -> ((FileGuard, Slice), std::io::Result) { + buf: FullSlice, + ) -> ((FileGuard, FullSlice), std::io::Result) { match self { IoEngine::NotSet => panic!("not initialized"), IoEngine::StdFs => { @@ -217,8 +220,12 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.write(file_guard, offset, buf).await; - (resources, res.map_err(epoll_uring_error_to_std)) + let ((file_guard, slice), res) = + system.write(file_guard, offset, buf.into_raw_slice()).await; + ( + (file_guard, FullSlice::must_new(slice)), + res.map_err(epoll_uring_error_to_std), + ) } } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs new file mode 100644 index 0000000000..7c773b6b21 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -0,0 +1,78 @@ +//! See [`FullSlice`]. + +use bytes::{Bytes, BytesMut}; +use std::ops::{Deref, Range}; +use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; + +/// The true owned equivalent for Rust [`slice`]. Use this for the write path. +/// +/// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`, +/// [`FullSlice`] is guaranteed to have all its bytes initialized. This means that +/// [`>::len`] is equal to [`Slice::bytes_init`] and [`Slice::bytes_total`]. +/// +pub struct FullSlice { + slice: Slice, +} + +impl FullSlice +where + B: IoBuf, +{ + pub(crate) fn must_new(slice: Slice) -> Self { + assert_eq!(slice.bytes_init(), slice.bytes_total()); + FullSlice { slice } + } + pub(crate) fn into_raw_slice(self) -> Slice { + let FullSlice { slice: s } = self; + s + } +} + +impl Deref for FullSlice +where + B: IoBuf, +{ + type Target = [u8]; + + fn deref(&self) -> &[u8] { + let rust_slice = &self.slice[..]; + assert_eq!(rust_slice.len(), self.slice.bytes_init()); + assert_eq!(rust_slice.len(), self.slice.bytes_total()); + rust_slice + } +} + +pub(crate) trait IoBufExt { + /// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`. + fn slice_len(self) -> FullSlice + where + Self: Sized; +} + +macro_rules! impl_io_buf_ext { + ($T:ty) => { + impl IoBufExt for $T { + #[inline(always)] + fn slice_len(self) -> FullSlice { + let len = self.len(); + let s = if len == 0 { + // `BoundedBuf::slice(0..len)` or `BoundedBuf::slice(..)` has an incorrect assertion, + // causing a panic if len == 0. + // The Slice::from_buf_bounds has the correct assertion (<= instead of <). + // => https://github.com/neondatabase/tokio-epoll-uring/issues/46 + let slice = self.slice_full(); + let mut bounds: Range<_> = slice.bounds(); + bounds.end = bounds.start; + Slice::from_buf_bounds(slice.into_inner(), bounds) + } else { + self.slice(0..len) + }; + FullSlice::must_new(s) + } + } + }; +} + +impl_io_buf_ext!(Bytes); +impl_io_buf_ext!(BytesMut); +impl_io_buf_ext!(Vec); diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs index d19e5ddffe..6100593663 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/slice.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -3,14 +3,14 @@ use tokio_epoll_uring::BoundedBufMut; use tokio_epoll_uring::IoBufMut; use tokio_epoll_uring::Slice; -pub(crate) trait SliceExt { +pub(crate) trait SliceMutExt { /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO. /// /// See the test case `test_slice_full_zeroed` for the difference to just doing `&slice[..]` fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8]; } -impl SliceExt for Slice +impl SliceMutExt for Slice where B: IoBufMut, { diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index 55b1d0b46b..efcb61ba65 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -1,5 +1,8 @@ -use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter}; -use tokio_epoll_uring::{BoundedBuf, IoBuf}; +use crate::{ + context::RequestContext, + virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter}, +}; +use tokio_epoll_uring::IoBuf; pub struct Writer { dst: W, @@ -35,11 +38,11 @@ where W: OwnedAsyncWriter, { #[inline(always)] - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { + ) -> std::io::Result<(usize, FullSlice)> { let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; self.bytes_amount += u64::try_from(nwritten).unwrap(); Ok((nwritten, buf)) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 8599d95cdf..f8f37b17e3 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,16 +1,18 @@ use bytes::BytesMut; -use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use tokio_epoll_uring::IoBuf; use crate::context::RequestContext; +use super::io_buf_ext::{FullSlice, IoBufExt}; + /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. pub trait OwnedAsyncWriter { - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)>; + ) -> std::io::Result<(usize, FullSlice)>; } /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch @@ -79,9 +81,11 @@ where #[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn write_buffered( &mut self, - chunk: Slice, + chunk: FullSlice, ctx: &RequestContext, - ) -> std::io::Result<(usize, S)> { + ) -> std::io::Result<(usize, FullSlice)> { + let chunk = chunk.into_raw_slice(); + let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk if chunk.len() >= self.buf().cap() { @@ -94,7 +98,10 @@ where .pending(), 0 ); - let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?; + let (nwritten, chunk) = self + .writer + .write_all(FullSlice::must_new(chunk), ctx) + .await?; assert_eq!(nwritten, chunk_len); return Ok((nwritten, chunk)); } @@ -114,7 +121,7 @@ where } } assert!(slice.is_empty(), "by now we should have drained the chunk"); - Ok((chunk_len, chunk.into_inner())) + Ok((chunk_len, FullSlice::must_new(chunk))) } /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. @@ -150,9 +157,12 @@ where self.buf = Some(buf); return Ok(()); } - let (nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?; + let slice = buf.flush(); + let (nwritten, slice) = self.writer.write_all(slice, ctx).await?; assert_eq!(nwritten, buf_len); - self.buf = Some(Buffer::reuse_after_flush(io_buf)); + self.buf = Some(Buffer::reuse_after_flush( + slice.into_raw_slice().into_inner(), + )); Ok(()) } } @@ -172,9 +182,9 @@ pub trait Buffer { /// Number of bytes in the buffer. fn pending(&self) -> usize; - /// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data + /// Turns `self` into a [`FullSlice`] of the pending data /// so we can use [`tokio_epoll_uring`] to write it to disk. - fn flush(self) -> Slice; + fn flush(self) -> FullSlice; /// After the write to disk is done and we have gotten back the slice, /// [`BufferedWriter`] uses this method to re-use the io buffer. @@ -198,12 +208,8 @@ impl Buffer for BytesMut { self.len() } - fn flush(self) -> Slice { - if self.is_empty() { - return self.slice_full(); - } - let len = self.len(); - self.slice(0..len) + fn flush(self) -> FullSlice { + self.slice_len() } fn reuse_after_flush(mut iobuf: BytesMut) -> Self { @@ -213,18 +219,13 @@ impl Buffer for BytesMut { } impl OwnedAsyncWriter for Vec { - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, _: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { - let nbytes = buf.bytes_init(); - if nbytes == 0 { - return Ok((0, Slice::into_inner(buf.slice_full()))); - } - let buf = buf.slice(0..nbytes); + ) -> std::io::Result<(usize, FullSlice)> { self.extend_from_slice(&buf[..]); - Ok((buf.len(), Slice::into_inner(buf))) + Ok((buf.len(), buf)) } } @@ -241,19 +242,13 @@ mod tests { writes: Vec>, } impl OwnedAsyncWriter for RecorderWriter { - async fn write_all, Buf: IoBuf + Send>( + async fn write_all( &mut self, - buf: B, + buf: FullSlice, _: &RequestContext, - ) -> std::io::Result<(usize, B::Buf)> { - let nbytes = buf.bytes_init(); - if nbytes == 0 { - self.writes.push(vec![]); - return Ok((0, Slice::into_inner(buf.slice_full()))); - } - let buf = buf.slice(0..nbytes); + ) -> std::io::Result<(usize, FullSlice)> { self.writes.push(Vec::from(&buf[..])); - Ok((buf.len(), Slice::into_inner(buf))) + Ok((buf.len(), buf)) } } @@ -264,7 +259,7 @@ mod tests { macro_rules! write { ($writer:ident, $data:literal) => {{ $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx()) + .write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx()) .await?; }}; } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 804c7fca97..8425528740 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -515,7 +515,7 @@ impl WalIngest { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)? + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version) // do not materialize null pages because them most likely be soon replaced with real data && blk.bimg_len != 0 { @@ -1702,7 +1702,7 @@ async fn get_relsize( modification: &DatadirModification<'_>, rel: RelTag, ctx: &RequestContext, -) -> anyhow::Result { +) -> Result { let nblocks = if !modification .tline .get_rel_exists(rel, Version::Modified(modification), ctx) diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 62a3a91b0b..edddcefbe1 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -1018,7 +1018,7 @@ pub fn decode_wal_record( ); let blk_img_is_compressed = - postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?; + postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version); if blk_img_is_compressed { debug!("compressed block image , pg_version = {}", pg_version); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 770081b3b4..82585f9ed8 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -107,8 +107,10 @@ enum ProcessOnceCell { } struct Process { - _launched_processes_guard: utils::sync::gate::GateGuard, process: process::WalRedoProcess, + /// This field is last in this struct so the guard gets dropped _after_ [`Self::process`]. + /// (Reminder: dropping [`Self::process`] synchronously sends SIGKILL and then `wait()`s for it to exit). + _launched_processes_guard: utils::sync::gate::GateGuard, } impl std::ops::Deref for Process { @@ -327,20 +329,23 @@ impl PostgresRedoManager { }, Err(permit) => { let start = Instant::now(); - let proc = Arc::new(Process { - _launched_processes_guard: match self.launched_processes.enter() { + // acquire guard before spawning process, so that we don't spawn new processes + // if the gate is already closed. + let _launched_processes_guard = match self.launched_processes.enter() { Ok(guard) => guard, Err(GateError::GateClosed) => unreachable!( "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" ), - }, - process: process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - }); + }; + let proc = Arc::new(Process { + process: process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, + _launched_processes_guard, + }); let duration = start.elapsed(); WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); info!( diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 1894e8c72a..479209a537 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -41,6 +41,8 @@ #include "hll.h" +#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) + /* * Local file cache is used to temporary store relations pages in local file system. * All blocks of all relations are stored inside one file and addressed using shared hash map. @@ -51,19 +53,43 @@ * * Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about * its consistency. + + * + * ## Holes + * + * The LFC can be resized on the fly, up to a maximum size that's determined + * at server startup (neon.max_file_cache_size). After server startup, we + * expand the underlying file when needed, until it reaches the soft limit + * (neon.file_cache_size_limit). If the soft limit is later reduced, we shrink + * the LFC by punching holes in the underlying file with a + * fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't + * shrink, but the disk space it uses does. + * + * Each hole is tracked by a dummy FileCacheEntry, which are kept in the + * 'holes' linked list. They are entered into the chunk hash table, with a + * special key where the blockNumber is used to store the 'offset' of the + * hole, and all other fields are zero. Holes are never looked up in the hash + * table, we only enter them there to have a FileCacheEntry that we can keep + * in the linked list. If the soft limit is raised again, we reuse the holes + * before extending the nominal size of the file. */ /* Local file storage allocation chunk. - * Should be power of two and not less than 32. Using larger than page chunks can + * Should be power of two. Using larger than page chunks can * 1. Reduce hash-map memory footprint: 8TB database contains billion pages * and size of hash entry is 40 bytes, so we need 40Gb just for hash map. * 1Mb chunks can reduce hash map size to 320Mb. * 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed */ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ +/* + * Smaller chunk seems to be better for OLTP workload + */ +// #define BLOCKS_PER_CHUNK 8 /* 64kb chunk */ #define MB ((uint64)1024*1024) #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) +#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32) typedef struct FileCacheEntry { @@ -71,8 +97,8 @@ typedef struct FileCacheEntry uint32 hash; uint32 offset; uint32 access_count; - uint32 bitmap[BLOCKS_PER_CHUNK / 32]; - dlist_node lru_node; /* LRU list node */ + uint32 bitmap[CHUNK_BITMAP_SIZE]; + dlist_node list_node; /* LRU/holes list node */ } FileCacheEntry; typedef struct FileCacheControl @@ -87,6 +113,7 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ + dlist_head holes; /* double linked list of punched holes */ HyperLogLogState wss_estimation; /* estimation of working set size */ } FileCacheControl; @@ -135,6 +162,7 @@ lfc_disable(char const *op) lfc_ctl->used = 0; lfc_ctl->limit = 0; dlist_init(&lfc_ctl->lru); + dlist_init(&lfc_ctl->holes); if (lfc_desc > 0) { @@ -214,18 +242,18 @@ lfc_shmem_startup(void) if (!found) { int fd; - uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size); + uint32 n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size); lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock"); info.keysize = sizeof(BufferTag); info.entrysize = sizeof(FileCacheEntry); /* - * lfc_size+1 because we add new element to hash table before eviction + * n_chunks+1 because we add new element to hash table before eviction * of victim */ lfc_hash = ShmemInitHash("lfc_hash", - lfc_size + 1, lfc_size + 1, + n_chunks + 1, n_chunks + 1, &info, HASH_ELEM | HASH_BLOBS); lfc_ctl->generation = 0; @@ -235,6 +263,7 @@ lfc_shmem_startup(void) lfc_ctl->misses = 0; lfc_ctl->writes = 0; dlist_init(&lfc_ctl->lru); + dlist_init(&lfc_ctl->holes); /* Initialize hyper-log-log structure for estimating working set size */ initSHLL(&lfc_ctl->wss_estimation); @@ -310,14 +339,31 @@ lfc_change_limit_hook(int newval, void *extra) * Shrink cache by throwing away least recently accessed chunks and * returning their space to file system */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); + FileCacheEntry *hole; + uint32 offset = victim->offset; + uint32 hash; + bool found; + BufferTag holetag; - Assert(victim->access_count == 0); + CriticalAssert(victim->access_count == 0); #ifdef FALLOC_FL_PUNCH_HOLE if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0) neon_log(LOG, "Failed to punch hole in file: %m"); #endif + /* We remove the old entry, and re-enter a hole to the hash table */ hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); + + memset(&holetag, 0, sizeof(holetag)); + holetag.blockNum = offset; + hash = get_hash_value(lfc_hash, &holetag); + hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found); + hole->hash = hash; + hole->offset = offset; + hole->access_count = 0; + CriticalAssert(!found); + dlist_push_tail(&lfc_ctl->holes, &hole->list_node); + lfc_ctl->used -= 1; } lfc_ctl->limit = new_size; @@ -409,6 +455,8 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_SHARED); @@ -440,6 +488,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) tag.forkNum = forkNum; tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1)); + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -470,7 +519,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) { bool has_remaining_pages; - for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) { if (entry->bitmap[i] != 0) { @@ -485,8 +534,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) */ if (!has_remaining_pages) { - dlist_delete(&entry->lru_node); - dlist_push_head(&lfc_ctl->lru, &entry->lru_node); + dlist_delete(&entry->list_node); + dlist_push_head(&lfc_ctl->lru, &entry->list_node); } } @@ -525,6 +574,8 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -551,7 +602,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } /* Unlink entry from LRU list to pin it for the duration of IO operation */ if (entry->access_count++ == 0) - dlist_delete(&entry->lru_node); + dlist_delete(&entry->list_node); generation = lfc_ctl->generation; entry_offset = entry->offset; @@ -569,12 +620,12 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_ctl->generation == generation) { - Assert(LFC_ENABLED()); + CriticalAssert(LFC_ENABLED()); lfc_ctl->hits += 1; pgBufferUsage.file_cache.hits += 1; - Assert(entry->access_count > 0); + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); } else result = false; @@ -613,6 +664,8 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CopyNRelFileInfoToBufTag(tag, rinfo); + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -632,7 +685,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void * operation */ if (entry->access_count++ == 0) - dlist_delete(&entry->lru_node); + dlist_delete(&entry->list_node); } else { @@ -655,13 +708,26 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) { /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - Assert(victim->access_count == 0); + CriticalAssert(victim->access_count == 0); entry->offset = victim->offset; /* grab victim's chunk */ hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); neon_log(DEBUG2, "Swap file cache page"); } + else if (!dlist_is_empty(&lfc_ctl->holes)) + { + /* We can reuse a hole that was left behind when the LFC was shrunk previously */ + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool found; + + hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); + CriticalAssert(found); + + lfc_ctl->used += 1; + entry->offset = offset; /* reuse the hole */ + } else { lfc_ctl->used += 1; @@ -689,11 +755,11 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void if (lfc_ctl->generation == generation) { - Assert(LFC_ENABLED()); + CriticalAssert(LFC_ENABLED()); /* Place entry to the head of LRU list */ - Assert(entry->access_count > 0); + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31)); } @@ -708,7 +774,6 @@ typedef struct } NeonGetStatsCtx; #define NUM_NEON_GET_STATS_COLS 2 -#define NUM_NEON_GET_STATS_ROWS 3 PG_FUNCTION_INFO_V1(neon_get_lfc_stats); Datum @@ -744,7 +809,6 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) INT8OID, -1, 0); fctx->tupdesc = BlessTupleDesc(tupledesc); - funcctx->max_calls = NUM_NEON_GET_STATS_ROWS; funcctx->user_fctx = fctx; /* Return to original context when allocating transient memory */ @@ -778,6 +842,11 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) if (lfc_ctl) value = lfc_ctl->writes; break; + case 4: + key = "file_cache_size"; + if (lfc_ctl) + value = lfc_ctl->size; + break; default: SRF_RETURN_DONE(funcctx); } @@ -901,7 +970,7 @@ local_cache_pages(PG_FUNCTION_ARGS) hash_seq_init(&status, lfc_hash); while ((entry = hash_seq_search(&status)) != NULL) { - for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++) + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) n_pages += pg_popcount32(entry->bitmap[i]); } } diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index d107cdc1c2..fe8e276d1c 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -32,6 +32,7 @@ #include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/guc.h" +#include "utils/guc_tables.h" #include "utils/wait_event.h" #include "extension_server.h" @@ -68,10 +69,10 @@ InitLogicalReplicationMonitor(void) DefineCustomIntVariable( "neon.logical_replication_max_snap_files", - "Maximum allowed logical replication .snap files", + "Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.", NULL, &logical_replication_max_snap_files, - 300, 0, INT_MAX, + 300, -1, INT_MAX, PGC_SIGHUP, 0, NULL, NULL, NULL); @@ -191,6 +192,13 @@ LogicalSlotsMonitorMain(Datum main_arg) { XLogRecPtr cutoff_lsn; + /* In case of a SIGHUP, just reload the configuration. */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + /* * If there are too many .snap files, just drop all logical slots to * prevent aux files bloat. @@ -584,6 +592,40 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n return false; } + +/* + * pgbouncer is able to track GUCs reported by Postgres. + * But most parameters cannot be tracked this way. The only parameters that can be tracked are ones + * that Postgres reports to the client. Unfortunately `search_path` is not reported by Postgres: + * https://www.postgresql.org/message-id/flat/CAGECzQQ6xFcgrg%2Be0p9mCumtK362TiA6vTiiZKoYbS8OXggwuQ%40mail.gmail.com#be4bfd7a9cf1f0633bdb2d1790a0a1be + * This code sets GUC_REPORT flag for `search_path`making it possible to include it in + * pgbouncer's `track_extra_parameters` list. + * + * This code is inspired by how the Citus extension does this, see + * https://github.com/citusdata/citus/blob/2a263fe69a707d16ef24378f7650742386b0968f/src/backend/distributed/shared_library_init.c#L2694 + */ +static void +ReportSearchPath(void) +{ +#if PG_VERSION_NUM >= 160000 + int nGucs = 0; + struct config_generic **gucs = get_guc_variables(&nGucs); +#else + struct config_generic **gucs = get_guc_variables(); + int nGucs = GetNumConfigOptions(); +#endif + + for (int i = 0; i < nGucs; i++) + { + struct config_generic *guc = (struct config_generic *) gucs[i]; + + if (strcmp(guc->name, "search_path") == 0) + { + guc->flags |= GUC_REPORT; + } + } +} + void _PG_init(void) { @@ -599,6 +641,7 @@ _PG_init(void) pg_init_walproposer(); WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitLogicalReplicationMonitor(); @@ -626,6 +669,8 @@ _PG_init(void) * extension was loaded will be removed. */ EmitWarningsOnPlaceholders("neon"); + + ReportSearchPath(); } PG_FUNCTION_INFO_V1(pg_cluster_size); diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index f19732cbbb..addb6ccce6 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -54,6 +54,10 @@ #define BufTagGetNRelFileInfo(tag) tag.rnode +#define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode) + +#define InvalidRelFileNumber InvalidOid + #define SMgrRelGetRelInfo(reln) \ (reln->smgr_rnode.node) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 944b316344..f3ddc64061 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -512,7 +512,7 @@ replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRe } /* - * Start walsender streaming replication + * Start walproposer streaming replication */ static void walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c index 8f8d1dfc01..bd3856e9d9 100644 --- a/pgxn/neon/walsender_hooks.c +++ b/pgxn/neon/walsender_hooks.c @@ -20,6 +20,7 @@ #include "utils/guc.h" #include "postmaster/interrupt.h" +#include "neon.h" #include "neon_walreader.h" #include "walproposer.h" @@ -181,6 +182,13 @@ NeonWALReadSegmentClose(XLogReaderState *xlogreader) void NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) { + /* + * If safekeepers are not configured, assume we don't need neon_walreader, + * i.e. running neon fork locally. + */ + if (wal_acceptors_list[0] == '\0') + return; + if (!wal_reader) { XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn); diff --git a/pgxn/neon_rmgr/neon_rmgr.c b/pgxn/neon_rmgr/neon_rmgr.c index 496ca08c08..c3f726db84 100644 --- a/pgxn/neon_rmgr/neon_rmgr.c +++ b/pgxn/neon_rmgr/neon_rmgr.c @@ -186,7 +186,7 @@ static void fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) { *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | - HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK); + HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK | HEAP_COMBOCID); *infomask2 &= ~HEAP_KEYS_UPDATED; if (infobits & XLHL_XMAX_IS_MULTI) @@ -195,6 +195,8 @@ fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) *infomask |= HEAP_XMAX_LOCK_ONLY; if (infobits & XLHL_XMAX_EXCL_LOCK) *infomask |= HEAP_XMAX_EXCL_LOCK; + if (infobits & XLHL_COMBOCID) + *infomask |= HEAP_COMBOCID; /* note HEAP_XMAX_SHR_LOCK isn't considered here */ if (infobits & XLHL_XMAX_KEYSHR_LOCK) *infomask |= HEAP_XMAX_KEYSHR_LOCK; @@ -284,7 +286,7 @@ redo_neon_heap_insert(XLogReaderState *record) htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid; htup->t_ctid = target_tid; if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, @@ -373,7 +375,7 @@ redo_neon_heap_delete(XLogReaderState *record) HeapTupleHeaderSetXmax(htup, xlrec->xmax); else HeapTupleHeaderSetXmin(htup, InvalidTransactionId); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); @@ -490,7 +492,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update) fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; /* Set forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -623,7 +625,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update) htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid; HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -728,7 +730,7 @@ redo_neon_heap_lock(XLogReaderState *record) offnum); } HeapTupleHeaderSetXmax(htup, xlrec->xmax); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } @@ -840,7 +842,7 @@ redo_neon_heap_multi_insert(XLogReaderState *record) htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlrec->t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index b316c53034..21d92abb20 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -11,6 +11,7 @@ testing = [] [dependencies] ahash.workspace = true anyhow.workspace = true +arc-swap.workspace = true async-compression.workspace = true async-trait.workspace = true atomic-take.workspace = true @@ -73,7 +74,7 @@ rustls.workspace = true scopeguard.workspace = true serde.workspace = true serde_json.workspace = true -sha2 = { workspace = true, features = ["asm"] } +sha2 = { workspace = true, features = ["asm", "oid"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true @@ -103,6 +104,14 @@ x509-parser.workspace = true postgres-protocol.workspace = true redis.workspace = true +# jwt stuff +jose-jwa = "0.1.2" +jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] } +signature = "2" +ecdsa = "0.16" +p256 = "0.13" +rsa = "0.9" + workspace_hack.workspace = true [dev-dependencies] diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 90dea01bf3..c6a0b2af5a 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,5 +1,6 @@ mod classic; mod hacks; +pub mod jwt; mod link; use std::net::IpAddr; diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs new file mode 100644 index 0000000000..0c2ca8fb97 --- /dev/null +++ b/proxy/src/auth/backend/jwt.rs @@ -0,0 +1,556 @@ +use std::{future::Future, sync::Arc, time::Duration}; + +use anyhow::{bail, ensure, Context}; +use arc_swap::ArcSwapOption; +use dashmap::DashMap; +use jose_jwk::crypto::KeyInfo; +use signature::Verifier; +use tokio::time::Instant; + +use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt}; + +// TODO(conrad): make these configurable. +const MIN_RENEW: Duration = Duration::from_secs(30); +const AUTO_RENEW: Duration = Duration::from_secs(300); +const MAX_RENEW: Duration = Duration::from_secs(3600); +const MAX_JWK_BODY_SIZE: usize = 64 * 1024; + +/// How to get the JWT auth rules +pub trait FetchAuthRules: Clone + Send + Sync + 'static { + fn fetch_auth_rules(&self) -> impl Future> + Send; +} + +#[derive(Clone)] +struct FetchAuthRulesFromCplane { + #[allow(dead_code)] + endpoint: EndpointIdInt, +} + +impl FetchAuthRules for FetchAuthRulesFromCplane { + async fn fetch_auth_rules(&self) -> anyhow::Result { + Err(anyhow::anyhow!("not yet implemented")) + } +} + +pub struct AuthRules { + jwks_urls: Vec, +} + +#[derive(Default)] +pub struct JwkCache { + client: reqwest::Client, + + map: DashMap>, +} + +pub struct JwkCacheEntryLock { + cached: ArcSwapOption, + lookup: tokio::sync::Semaphore, +} + +impl Default for JwkCacheEntryLock { + fn default() -> Self { + JwkCacheEntryLock { + cached: ArcSwapOption::empty(), + lookup: tokio::sync::Semaphore::new(1), + } + } +} + +pub struct JwkCacheEntry { + /// Should refetch at least every hour to verify when old keys have been removed. + /// Should refetch when new key IDs are seen only every 5 minutes or so + last_retrieved: Instant, + + /// cplane will return multiple JWKs urls that we need to scrape. + key_sets: ahash::HashMap, +} + +impl JwkCacheEntryLock { + async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { + JwkRenewalPermit::acquire_permit(self).await + } + + fn try_acquire_permit<'a>(self: &'a Arc) -> Option> { + JwkRenewalPermit::try_acquire_permit(self) + } + + async fn renew_jwks( + &self, + _permit: JwkRenewalPermit<'_>, + client: &reqwest::Client, + auth_rules: &F, + ) -> anyhow::Result> { + // double check that no one beat us to updating the cache. + let now = Instant::now(); + let guard = self.cached.load_full(); + if let Some(cached) = guard { + let last_update = now.duration_since(cached.last_retrieved); + if last_update < Duration::from_secs(300) { + return Ok(cached); + } + } + + let rules = auth_rules.fetch_auth_rules().await?; + let mut key_sets = ahash::HashMap::with_capacity_and_hasher( + rules.jwks_urls.len(), + ahash::RandomState::new(), + ); + // TODO(conrad): run concurrently + // TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284) + for url in rules.jwks_urls { + let req = client.get(url.clone()); + // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`. + match req.send().await.and_then(|r| r.error_for_status()) { + // todo: should we re-insert JWKs if we want to keep this JWKs URL? + // I expect these failures would be quite sparse. + Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"), + Ok(r) => { + let resp: http::Response = r.into(); + match parse_json_body_with_limit::( + resp.into_body(), + MAX_JWK_BODY_SIZE, + ) + .await + { + Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"), + Ok(jwks) => { + key_sets.insert(url, jwks); + } + } + } + } + } + + let entry = Arc::new(JwkCacheEntry { + last_retrieved: now, + key_sets, + }); + self.cached.swap(Some(Arc::clone(&entry))); + + Ok(entry) + } + + async fn get_or_update_jwk_cache( + self: &Arc, + client: &reqwest::Client, + fetch: &F, + ) -> Result, anyhow::Error> { + let now = Instant::now(); + let guard = self.cached.load_full(); + + // if we have no cached JWKs, try and get some + let Some(cached) = guard else { + let permit = self.acquire_permit().await; + return self.renew_jwks(permit, client, fetch).await; + }; + + let last_update = now.duration_since(cached.last_retrieved); + + // check if the cached JWKs need updating. + if last_update > MAX_RENEW { + let permit = self.acquire_permit().await; + + // it's been too long since we checked the keys. wait for them to update. + return self.renew_jwks(permit, client, fetch).await; + } + + // every 5 minutes we should spawn a job to eagerly update the token. + if last_update > AUTO_RENEW { + if let Some(permit) = self.try_acquire_permit() { + tracing::debug!("JWKs should be renewed. Renewal permit acquired"); + let permit = permit.into_owned(); + let entry = self.clone(); + let client = client.clone(); + let fetch = fetch.clone(); + tokio::spawn(async move { + if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await { + tracing::warn!(error=?e, "could not fetch JWKs in background job"); + } + }); + } else { + tracing::debug!("JWKs should be renewed. Renewal permit already taken, skipping"); + } + } + + Ok(cached) + } + + async fn check_jwt( + self: &Arc, + jwt: String, + client: &reqwest::Client, + fetch: &F, + ) -> Result<(), anyhow::Error> { + // JWT compact form is defined to be + // || . || || . || + // where Signature = alg( || . || ); + + let (header_payload, signature) = jwt + .rsplit_once(".") + .context("Provided authentication token is not a valid JWT encoding")?; + let (header, _payload) = header_payload + .split_once(".") + .context("Provided authentication token is not a valid JWT encoding")?; + + let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD) + .context("Provided authentication token is not a valid JWT encoding")?; + let header = serde_json::from_slice::(&header) + .context("Provided authentication token is not a valid JWT encoding")?; + + let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) + .context("Provided authentication token is not a valid JWT encoding")?; + + ensure!(header.typ == "JWT"); + let kid = header.kid.context("missing key id")?; + + let mut guard = self.get_or_update_jwk_cache(client, fetch).await?; + + // get the key from the JWKs if possible. If not, wait for the keys to update. + let jwk = loop { + let jwk = guard + .key_sets + .values() + .flat_map(|jwks| &jwks.keys) + .find(|jwk| jwk.prm.kid.as_deref() == Some(kid)); + + match jwk { + Some(jwk) => break jwk, + None if guard.last_retrieved.elapsed() > MIN_RENEW => { + let permit = self.acquire_permit().await; + guard = self.renew_jwks(permit, client, fetch).await?; + } + _ => { + bail!("jwk not found"); + } + } + }; + + ensure!( + jwk.is_supported(&header.alg), + "signature algorithm not supported" + ); + + match &jwk.key { + jose_jwk::Key::Ec(key) => { + verify_ec_signature(header_payload.as_bytes(), &sig, key)?; + } + jose_jwk::Key::Rsa(key) => { + verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?; + } + key => bail!("unsupported key type {key:?}"), + }; + + // TODO(conrad): verify iss, exp, nbf, etc... + + Ok(()) + } +} + +impl JwkCache { + pub async fn check_jwt( + &self, + endpoint: EndpointIdInt, + jwt: String, + ) -> Result<(), anyhow::Error> { + // try with just a read lock first + let entry = self.map.get(&endpoint).as_deref().map(Arc::clone); + let entry = match entry { + Some(entry) => entry, + None => { + // acquire a write lock after to insert. + let entry = self.map.entry(endpoint).or_default(); + Arc::clone(&*entry) + } + }; + + let fetch = FetchAuthRulesFromCplane { endpoint }; + entry.check_jwt(jwt, &self.client, &fetch).await + } +} + +fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> { + use ecdsa::Signature; + use signature::Verifier; + + match key.crv { + jose_jwk::EcCurves::P256 => { + let pk = + p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?; + let key = p256::ecdsa::VerifyingKey::from(&pk); + let sig = Signature::from_slice(sig)?; + key.verify(data, &sig)?; + } + key => bail!("unsupported ec key type {key:?}"), + } + + Ok(()) +} + +fn verify_rsa_signature( + data: &[u8], + sig: &[u8], + key: &jose_jwk::Rsa, + alg: &Option, +) -> anyhow::Result<()> { + use jose_jwa::{Algorithm, Signing}; + use rsa::{ + pkcs1v15::{Signature, VerifyingKey}, + RsaPublicKey, + }; + + let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?; + + match alg { + Some(Algorithm::Signing(Signing::Rs256)) => { + let key = VerifyingKey::::new(key); + let sig = Signature::try_from(sig)?; + key.verify(data, &sig)?; + } + _ => bail!("invalid RSA signing algorithm"), + }; + + Ok(()) +} + +/// +#[derive(serde::Deserialize, serde::Serialize)] +struct JWTHeader<'a> { + /// must be "JWT" + typ: &'a str, + /// must be a supported alg + alg: jose_jwa::Algorithm, + /// key id, must be provided for our usecase + kid: Option<&'a str>, +} + +struct JwkRenewalPermit<'a> { + inner: Option>, +} + +enum JwkRenewalPermitInner<'a> { + Owned(Arc), + Borrowed(&'a Arc), +} + +impl JwkRenewalPermit<'_> { + fn into_owned(mut self) -> JwkRenewalPermit<'static> { + JwkRenewalPermit { + inner: self.inner.take().map(JwkRenewalPermitInner::into_owned), + } + } + + async fn acquire_permit(from: &Arc) -> JwkRenewalPermit { + match from.lookup.acquire().await { + Ok(permit) => { + permit.forget(); + JwkRenewalPermit { + inner: Some(JwkRenewalPermitInner::Borrowed(from)), + } + } + Err(_) => panic!("semaphore should not be closed"), + } + } + + fn try_acquire_permit(from: &Arc) -> Option { + match from.lookup.try_acquire() { + Ok(permit) => { + permit.forget(); + Some(JwkRenewalPermit { + inner: Some(JwkRenewalPermitInner::Borrowed(from)), + }) + } + Err(tokio::sync::TryAcquireError::NoPermits) => None, + Err(tokio::sync::TryAcquireError::Closed) => panic!("semaphore should not be closed"), + } + } +} + +impl JwkRenewalPermitInner<'_> { + fn into_owned(self) -> JwkRenewalPermitInner<'static> { + match self { + JwkRenewalPermitInner::Owned(p) => JwkRenewalPermitInner::Owned(p), + JwkRenewalPermitInner::Borrowed(p) => JwkRenewalPermitInner::Owned(Arc::clone(p)), + } + } +} + +impl Drop for JwkRenewalPermit<'_> { + fn drop(&mut self) { + let entry = match &self.inner { + None => return, + Some(JwkRenewalPermitInner::Owned(p)) => p, + Some(JwkRenewalPermitInner::Borrowed(p)) => *p, + }; + entry.lookup.add_permits(1); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::{future::IntoFuture, net::SocketAddr, time::SystemTime}; + + use base64::URL_SAFE_NO_PAD; + use bytes::Bytes; + use http::Response; + use http_body_util::Full; + use hyper1::service::service_fn; + use hyper_util::rt::TokioIo; + use rand::rngs::OsRng; + use signature::Signer; + use tokio::net::TcpListener; + + fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) { + let sk = p256::SecretKey::random(&mut OsRng); + let pk = sk.public_key().into(); + let jwk = jose_jwk::Jwk { + key: jose_jwk::Key::Ec(pk), + prm: jose_jwk::Parameters { + kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)), + ..Default::default() + }, + }; + (sk, jwk) + } + + fn new_rsa_jwk(kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) { + let sk = rsa::RsaPrivateKey::new(&mut OsRng, 2048).unwrap(); + let pk = sk.to_public_key().into(); + let jwk = jose_jwk::Jwk { + key: jose_jwk::Key::Rsa(pk), + prm: jose_jwk::Parameters { + kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)), + ..Default::default() + }, + }; + (sk, jwk) + } + + fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String { + let header = JWTHeader { + typ: "JWT", + alg: jose_jwa::Algorithm::Signing(sig), + kid: Some(&kid), + }; + let body = typed_json::json! {{ + "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600, + }}; + + let header = + base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD); + let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD); + + format!("{header}.{body}") + } + + fn new_ec_jwt(kid: String, key: p256::SecretKey) -> String { + use p256::ecdsa::{Signature, SigningKey}; + + let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256); + let sig: Signature = SigningKey::from(key).sign(payload.as_bytes()); + let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + + format!("{payload}.{sig}") + } + + fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String { + use rsa::pkcs1v15::SigningKey; + use rsa::signature::SignatureEncoding; + + let payload = build_jwt_payload(kid, jose_jwa::Signing::Rs256); + let sig = SigningKey::::new(key).sign(payload.as_bytes()); + let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + + format!("{payload}.{sig}") + } + + #[tokio::test] + async fn renew() { + let (rs1, jwk1) = new_rsa_jwk("1".into()); + let (rs2, jwk2) = new_rsa_jwk("2".into()); + let (ec1, jwk3) = new_ec_jwk("3".into()); + let (ec2, jwk4) = new_ec_jwk("4".into()); + + let jwt1 = new_rsa_jwt("1".into(), rs1); + let jwt2 = new_rsa_jwt("2".into(), rs2); + let jwt3 = new_ec_jwt("3".into(), ec1); + let jwt4 = new_ec_jwt("4".into(), ec2); + + let foo_jwks = jose_jwk::JwkSet { + keys: vec![jwk1, jwk3], + }; + let bar_jwks = jose_jwk::JwkSet { + keys: vec![jwk2, jwk4], + }; + + let service = service_fn(move |req| { + let foo_jwks = foo_jwks.clone(); + let bar_jwks = bar_jwks.clone(); + async move { + let jwks = match req.uri().path() { + "/foo" => &foo_jwks, + "/bar" => &bar_jwks, + _ => { + return Response::builder() + .status(404) + .body(Full::new(Bytes::new())); + } + }; + let body = serde_json::to_vec(jwks).unwrap(); + Response::builder() + .status(200) + .body(Full::new(Bytes::from(body))) + } + }); + + let listener = TcpListener::bind("0.0.0.0:0").await.unwrap(); + let server = hyper1::server::conn::http1::Builder::new(); + let addr = listener.local_addr().unwrap(); + tokio::spawn(async move { + loop { + let (s, _) = listener.accept().await.unwrap(); + let serve = server.serve_connection(TokioIo::new(s), service.clone()); + tokio::spawn(serve.into_future()); + } + }); + + let client = reqwest::Client::new(); + + #[derive(Clone)] + struct Fetch(SocketAddr); + + impl FetchAuthRules for Fetch { + async fn fetch_auth_rules(&self) -> anyhow::Result { + Ok(AuthRules { + jwks_urls: vec![ + format!("http://{}/foo", self.0).parse().unwrap(), + format!("http://{}/bar", self.0).parse().unwrap(), + ], + }) + } + } + + let jwk_cache = Arc::new(JwkCacheEntryLock::default()); + + jwk_cache + .check_jwt(jwt1, &client, &Fetch(addr)) + .await + .unwrap(); + jwk_cache + .check_jwt(jwt2, &client, &Fetch(addr)) + .await + .unwrap(); + jwk_cache + .check_jwt(jwt3, &client, &Fetch(addr)) + .await + .unwrap(); + jwk_cache + .check_jwt(jwt4, &client, &Fetch(addr)) + .await + .unwrap(); + } +} diff --git a/proxy/src/http.rs b/proxy/src/http.rs index dd7164181d..1f1dd8c415 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -6,6 +6,12 @@ pub mod health_server; use std::time::Duration; +use anyhow::bail; +use bytes::Bytes; +use http_body_util::BodyExt; +use hyper1::body::Body; +use serde::de::DeserializeOwned; + pub use reqwest::{Request, Response, StatusCode}; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; @@ -96,6 +102,33 @@ impl Endpoint { } } +pub async fn parse_json_body_with_limit( + mut b: impl Body + Unpin, + limit: usize, +) -> anyhow::Result { + // We could use `b.limited().collect().await.to_bytes()` here + // but this ends up being slightly more efficient as far as I can tell. + + // check the lower bound of the size hint. + // in reqwest, this value is influenced by the Content-Length header. + let lower_bound = match usize::try_from(b.size_hint().lower()) { + Ok(bound) if bound <= limit => bound, + _ => bail!("Content length exceeds limit of {limit} bytes"), + }; + let mut bytes = Vec::with_capacity(lower_bound); + + while let Some(frame) = b.frame().await.transpose()? { + if let Ok(data) = frame.into_data() { + if bytes.len() + data.len() > limit { + bail!("Content length exceeds limit of {limit} bytes") + } + bytes.extend_from_slice(&data); + } + } + + Ok(serde_json::from_slice::(&bytes)?) +} + #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index e5b6536328..c41df07a4d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -34,6 +34,7 @@ use tracing::error; use tracing::info; use typed_json::json; use url::Url; +use urlencoding; use utils::http::error::ApiError; use crate::auth::backend::ComputeUserInfo; @@ -168,7 +169,8 @@ fn get_conn_info( .path_segments() .ok_or(ConnInfoError::MissingDbName)?; - let dbname: DbName = url_path.next().ok_or(ConnInfoError::InvalidDbName)?.into(); + let dbname: DbName = + urlencoding::decode(url_path.next().ok_or(ConnInfoError::InvalidDbName)?)?.into(); ctx.set_dbname(dbname.clone()); let username = RoleName::from(urlencoding::decode(connection_url.username())?); diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index d574bb438f..c551cd3122 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -164,6 +164,30 @@ impl Deref for FileStorage { } } +impl TimelinePersistentState { + pub(crate) fn write_to_buf(&self) -> Result> { + let mut buf: Vec = Vec::new(); + WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; + + if self.eviction_state == EvictionState::Present { + // temp hack for forward compatibility + const PREV_FORMAT_VERSION: u32 = 8; + let prev = downgrade_v9_to_v8(self); + WriteBytesExt::write_u32::(&mut buf, PREV_FORMAT_VERSION)?; + prev.ser_into(&mut buf)?; + } else { + // otherwise, we write the current format version + WriteBytesExt::write_u32::(&mut buf, SK_FORMAT_VERSION)?; + self.ser_into(&mut buf)?; + } + + // calculate checksum before resize + let checksum = crc32c::crc32c(&buf); + buf.extend_from_slice(&checksum.to_le_bytes()); + Ok(buf) + } +} + #[async_trait::async_trait] impl Storage for FileStorage { /// Persists state durably to the underlying storage. @@ -180,24 +204,8 @@ impl Storage for FileStorage { &control_partial_path ) })?; - let mut buf: Vec = Vec::new(); - WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; - if s.eviction_state == EvictionState::Present { - // temp hack for forward compatibility - const PREV_FORMAT_VERSION: u32 = 8; - let prev = downgrade_v9_to_v8(s); - WriteBytesExt::write_u32::(&mut buf, PREV_FORMAT_VERSION)?; - prev.ser_into(&mut buf)?; - } else { - // otherwise, we write the current format version - WriteBytesExt::write_u32::(&mut buf, SK_FORMAT_VERSION)?; - s.ser_into(&mut buf)?; - } - - // calculate checksum before resize - let checksum = crc32c::crc32c(&buf); - buf.extend_from_slice(&checksum.to_le_bytes()); + let buf: Vec = s.write_to_buf()?; control_partial.write_all(&buf).await.with_context(|| { format!( diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs index 0bb31c200d..c56f7880d4 100644 --- a/safekeeper/src/http/client.rs +++ b/safekeeper/src/http/client.rs @@ -10,7 +10,7 @@ use reqwest::{IntoUrl, Method, StatusCode}; use utils::{ http::error::HttpErrorBody, - id::{TenantId, TimelineId}, + id::{NodeId, TenantId, TimelineId}, logging::SecretString, }; @@ -97,10 +97,11 @@ impl Client { &self, tenant_id: TenantId, timeline_id: TimelineId, + stream_to: NodeId, ) -> Result { let uri = format!( - "{}/v1/tenant/{}/timeline/{}/snapshot", - self.mgmt_api_endpoint, tenant_id, timeline_id + "{}/v1/tenant/{}/timeline/{}/snapshot/{}", + self.mgmt_api_endpoint, tenant_id, timeline_id, stream_to.0 ); self.get(&uri).await } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index fe6d325cee..c9defb0bcf 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -205,6 +205,7 @@ async fn timeline_pull_handler(mut request: Request) -> Result) -> Result, ApiError> { + let destination = parse_request_param(&request, "destination_id")?; let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, @@ -225,7 +226,13 @@ async fn timeline_snapshot_handler(request: Request) -> Result RouterBuilder request_span(r, tenant_delete_handler) }) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot", + "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", |r| request_span(r, timeline_snapshot_handler), ) .post("/v1/pull_timeline", |r| { diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 618c6b278f..1eacec9981 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -11,13 +11,8 @@ use std::{ io::{self, ErrorKind}, sync::Arc, }; -use tokio::{ - fs::{File, OpenOptions}, - io::AsyncWrite, - sync::mpsc, - task, -}; -use tokio_tar::{Archive, Builder}; +use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; +use tokio_tar::{Archive, Builder, Header}; use tokio_util::{ io::{CopyToBytes, SinkWriter}, sync::PollSender, @@ -32,13 +27,15 @@ use crate::{ routes::TimelineStatus, }, safekeeper::Term, + state::TimelinePersistentState, timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError, WalResidentTimeline}, + wal_backup, wal_storage::{self, open_wal_file, Storage}, GlobalTimelines, SafeKeeperConf, }; use utils::{ crashsafe::{durable_rename, fsync_async_opt}, - id::{TenantId, TenantTimelineId, TimelineId}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, logging::SecretString, lsn::Lsn, pausable_failpoint, @@ -46,8 +43,13 @@ use utils::{ /// Stream tar archive of timeline to tx. #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] -pub async fn stream_snapshot(tli: WalResidentTimeline, tx: mpsc::Sender>) { - if let Err(e) = stream_snapshot_guts(tli, tx.clone()).await { +pub async fn stream_snapshot( + tli: WalResidentTimeline, + source: NodeId, + destination: NodeId, + tx: mpsc::Sender>, +) { + if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await { // Error type/contents don't matter as they won't can't reach the client // (hyper likely doesn't do anything with it), but http stream will be // prematurely terminated. It would be nice to try to send the error in @@ -81,6 +83,8 @@ impl Drop for SnapshotContext { pub async fn stream_snapshot_guts( tli: WalResidentTimeline, + source: NodeId, + destination: NodeId, tx: mpsc::Sender>, ) -> Result<()> { // tokio-tar wants Write implementor, but we have mpsc tx >; @@ -104,7 +108,7 @@ pub async fn stream_snapshot_guts( // which is also likely suboptimal. let mut ar = Builder::new_non_terminated(pinned_writer); - let bctx = tli.start_snapshot(&mut ar).await?; + let bctx = tli.start_snapshot(&mut ar, source, destination).await?; pausable_failpoint!("sk-snapshot-after-list-pausable"); let tli_dir = tli.get_timeline_dir(); @@ -158,13 +162,43 @@ impl WalResidentTimeline { async fn start_snapshot( &self, ar: &mut tokio_tar::Builder, + source: NodeId, + destination: NodeId, ) -> Result { let mut shared_state = self.write_shared_state().await; let wal_seg_size = shared_state.get_wal_seg_size(); - let cf_path = self.get_timeline_dir().join(CONTROL_FILE_NAME); - let mut cf = File::open(cf_path).await?; - ar.append_file(CONTROL_FILE_NAME, &mut cf).await?; + let mut control_store = TimelinePersistentState::clone(shared_state.sk.state()); + // Modify the partial segment of the in-memory copy for the control file to + // point to the destination safekeeper. + let replace = control_store + .partial_backup + .replace_uploaded_segment(source, destination)?; + + if let Some(replace) = replace { + // The deserialized control file has an uploaded partial. We upload a copy + // of it to object storage for the destination safekeeper and send an updated + // control file in the snapshot. + tracing::info!( + "Replacing uploaded partial segment in in-mem control file: {replace:?}" + ); + + let remote_timeline_path = wal_backup::remote_timeline_path(&self.tli.ttid)?; + wal_backup::copy_partial_segment( + &replace.previous.remote_path(&remote_timeline_path), + &replace.current.remote_path(&remote_timeline_path), + ) + .await?; + } + + let buf = control_store + .write_to_buf() + .with_context(|| "failed to serialize control store")?; + let mut header = Header::new_gnu(); + header.set_size(buf.len().try_into().expect("never breaches u64")); + ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice()) + .await + .with_context(|| "failed to append to archive")?; // We need to stream since the oldest segment someone (s3 or pageserver) // still needs. This duplicates calc_horizon_lsn logic. @@ -342,7 +376,7 @@ async fn pull_timeline( let client = Client::new(host.clone(), sk_auth_token.clone()); // Request stream with basebackup archive. let bb_resp = client - .snapshot(status.tenant_id, status.timeline_id) + .snapshot(status.tenant_id, status.timeline_id, conf.my_id) .await?; // Make Stream of Bytes from it... diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 33ec39b852..0814d9ba67 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -92,7 +92,7 @@ impl TermHistory { } /// Find point of divergence between leader (walproposer) term history and - /// safekeeper. Arguments are not symmetrics as proposer history ends at + /// safekeeper. Arguments are not symmetric as proposer history ends at /// +infinity while safekeeper at flush_lsn. /// C version is at walproposer SendProposerElected. pub fn find_highest_common_point( @@ -701,7 +701,13 @@ where .with_label_values(&["handle_elected"]) .start_timer(); - info!("received ProposerElected {:?}", msg); + info!( + "received ProposerElected {:?}, term={}, last_log_term={}, flush_lsn={}", + msg, + self.state.acceptor_state.term, + self.get_last_log_term(), + self.flush_lsn() + ); if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); state.acceptor_state.term = msg.term; @@ -713,22 +719,43 @@ where return Ok(None); } - // This might happen in a rare race when another (old) connection from - // the same walproposer writes + flushes WAL after this connection - // already sent flush_lsn in VoteRequest. It is generally safe to - // proceed, but to prevent commit_lsn surprisingly going down we should - // either refuse the session (simpler) or skip the part we already have - // from the stream (can be implemented). - if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at { - bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help", - msg.term, self.flush_lsn(), msg.start_streaming_at) + // Before truncating WAL check-cross the check divergence point received + // from the walproposer. + let sk_th = self.get_term_history(); + let last_common_point = match TermHistory::find_highest_common_point( + &msg.term_history, + &sk_th, + self.flush_lsn(), + ) { + // No common point. Expect streaming from the beginning of the + // history like walproposer while we don't have proper init. + None => *msg.term_history.0.first().ok_or(anyhow::anyhow!( + "empty walproposer term history {:?}", + msg.term_history + ))?, + Some(lcp) => lcp, + }; + // This is expected to happen in a rare race when another connection + // from the same walproposer writes + flushes WAL after this connection + // sent flush_lsn in VoteRequest; for instance, very late + // ProposerElected message delivery after another connection was + // established and wrote WAL. In such cases error is transient; + // reconnection makes safekeeper send newest term history and flush_lsn + // and walproposer recalculates the streaming point. OTOH repeating + // error indicates a serious bug. + if last_common_point.lsn != msg.start_streaming_at { + bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", + last_common_point, msg.start_streaming_at, + self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, + ); } - // Otherwise we must never attempt to truncate committed data. + + // We are also expected to never attempt to truncate committed data. assert!( msg.start_streaming_at >= self.state.inmem.commit_lsn, - "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}", - msg.start_streaming_at, - self.state.inmem.commit_lsn + "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}", + msg.start_streaming_at, self.state.inmem.commit_lsn, + self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history, ); // Before first WAL write initialize its segment. It makes first segment @@ -743,9 +770,6 @@ where .await?; } - // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to - // intersection of our history and history from msg - // truncate wal, update the LSNs self.wal_store.truncate_wal(msg.start_streaming_at).await?; @@ -1069,7 +1093,7 @@ mod tests { let pem = ProposerElected { term: 1, - start_streaming_at: Lsn(1), + start_streaming_at: Lsn(3), term_history: TermHistory(vec![TermLsn { term: 1, lsn: Lsn(3), diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 234273e133..aa1a6696a1 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -483,6 +483,16 @@ pub(crate) async fn backup_partial_segment( .await } +pub(crate) async fn copy_partial_segment( + source: &RemotePath, + destination: &RemotePath, +) -> Result<()> { + let storage = get_configured_remote_storage(); + let cancel = CancellationToken::new(); + + storage.copy_object(source, destination, &cancel).await +} + pub async fn read_object( file_path: &RemotePath, offset: u64, diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 52765b0e98..675a051887 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -17,14 +17,13 @@ //! file. Code updates state in the control file before doing any S3 operations. //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. - use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use tracing::{debug, error, info, instrument, warn}; -use utils::lsn::Lsn; +use utils::{id::NodeId, lsn::Lsn}; use crate::{ metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, @@ -82,6 +81,12 @@ pub struct State { pub segments: Vec, } +#[derive(Debug)] +pub(crate) struct ReplaceUploadedSegment { + pub(crate) previous: PartialRemoteSegment, + pub(crate) current: PartialRemoteSegment, +} + impl State { /// Find an Uploaded segment. There should be only one Uploaded segment at a time. pub(crate) fn uploaded_segment(&self) -> Option { @@ -90,6 +95,54 @@ impl State { .find(|seg| seg.status == UploadStatus::Uploaded) .cloned() } + + /// Replace the name of the Uploaded segment (if one exists) in order to match + /// it with `destination` safekeeper. Returns a description of the change or None + /// wrapped in anyhow::Result. + pub(crate) fn replace_uploaded_segment( + &mut self, + source: NodeId, + destination: NodeId, + ) -> anyhow::Result> { + let current = self + .segments + .iter_mut() + .find(|seg| seg.status == UploadStatus::Uploaded); + + let current = match current { + Some(some) => some, + None => { + return anyhow::Ok(None); + } + }; + + // Sanity check that the partial segment we are replacing is belongs + // to the `source` SK. + if !current + .name + .ends_with(format!("sk{}.partial", source.0).as_str()) + { + anyhow::bail!( + "Partial segment name ({}) doesn't match self node id ({})", + current.name, + source + ); + } + + let previous = current.clone(); + + let new_name = current.name.replace( + format!("_sk{}", source.0).as_str(), + format!("_sk{}", destination.0).as_str(), + ); + + current.name = new_name; + + anyhow::Ok(Some(ReplaceUploadedSegment { + previous, + current: current.clone(), + })) + } } struct PartialBackup { diff --git a/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql new file mode 100644 index 0000000000..53222c614e --- /dev/null +++ b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql @@ -0,0 +1 @@ +DROP TABLE controllers; diff --git a/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql new file mode 100644 index 0000000000..90546948cb --- /dev/null +++ b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql @@ -0,0 +1,5 @@ +CREATE TABLE controllers ( + address VARCHAR NOT NULL, + started_at TIMESTAMPTZ NOT NULL, + PRIMARY KEY(address, started_at) +); diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index e8513b31eb..7bbd1541cf 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -500,7 +500,7 @@ async fn handle_node_configure(mut req: Request) -> Result, StatusCode::OK, state .service - .node_configure( + .external_node_configure( config_req.node_id, config_req.availability.map(NodeAvailability::from), config_req.scheduling, @@ -520,6 +520,19 @@ async fn handle_node_status(req: Request) -> Result, ApiErr json_response(StatusCode::OK, node_status) } +async fn handle_get_leader(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let leader = state.service.get_leader().await.map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "Failed to read leader from database: {err}" + )) + })?; + + json_response(StatusCode::OK, leader) +} + async fn handle_node_drain(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -1016,6 +1029,9 @@ pub fn make_router( .get("/control/v1/node/:node_id", |r| { named_request_span(r, handle_node_status, RequestName("control_v1_node_status")) }) + .get("/control/v1/leader", |r| { + named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader")) + }) .put("/control/v1/node/:node_id/drain", |r| { named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain")) }) diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index 26c258c466..2034addbe1 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -11,6 +11,7 @@ mod id_lock_map; pub mod metrics; mod node; mod pageserver_client; +mod peer_client; pub mod persistence; mod reconciler; mod scheduler; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index a66e9128bc..7387d36690 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,6 +1,7 @@ use anyhow::{anyhow, Context}; use clap::Parser; use diesel::Connection; +use hyper::Uri; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::path::PathBuf; @@ -83,6 +84,13 @@ struct Cli { #[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, + #[arg(long, default_value = "false")] + start_as_candidate: bool, + + // TODO: make this mandatory once the helm chart gets updated + #[arg(long)] + address_for_peers: Option, + /// `neon_local` sets this to the path of the neon_local repo dir. /// Only relevant for testing. // TODO: make `cfg(feature = "testing")` @@ -188,14 +196,26 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> { } fn main() -> anyhow::Result<()> { - let default_panic = std::panic::take_hook(); - std::panic::set_hook(Box::new(move |info| { - default_panic(info); - std::process::exit(1); - })); + logging::init( + LogFormat::Plain, + logging::TracingErrorLayerEnablement::Disabled, + logging::Output::Stdout, + )?; + + // log using tracing so we don't get confused output by default hook writing to stderr + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + let hook = std::panic::take_hook(); + std::panic::set_hook(Box::new(move |info| { + // let sentry send a message (and flush) + // and trace the error + hook(info); + + std::process::exit(1); + })); + tokio::runtime::Builder::new_current_thread() // We use spawn_blocking for database operations, so require approximately // as many blocking threads as we will open database connections. @@ -209,12 +229,6 @@ fn main() -> anyhow::Result<()> { async fn async_main() -> anyhow::Result<()> { let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate())); - logging::init( - LogFormat::Plain, - logging::TracingErrorLayerEnablement::Disabled, - logging::Output::Stdout, - )?; - preinitialize_metrics(); let args = Cli::parse(); @@ -285,6 +299,9 @@ async fn async_main() -> anyhow::Result<()> { split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, + address_for_peers: args.address_for_peers, + start_as_candidate: args.start_as_candidate, + http_service_port: args.listen.port() as i32, }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index a1a4b8543d..c2303e7a7f 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -12,6 +12,7 @@ use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, Metr use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; +use strum::IntoEnumIterator; use crate::{ persistence::{DatabaseError, DatabaseOperation}, @@ -241,3 +242,18 @@ impl DatabaseError { } } } + +/// Update the leadership status metric gauges to reflect the requested status +pub(crate) fn update_leadership_status(status: LeadershipStatus) { + let status_metric = &METRICS_REGISTRY + .metrics_group + .storage_controller_leadership_status; + + for s in LeadershipStatus::iter() { + if s == status { + status_metric.set(LeadershipStatusGroup { status: s }, 1); + } else { + status_metric.set(LeadershipStatusGroup { status: s }, 0); + } + } +} diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs new file mode 100644 index 0000000000..3f8520fe55 --- /dev/null +++ b/storage_controller/src/peer_client.rs @@ -0,0 +1,108 @@ +use crate::tenant_shard::ObservedState; +use pageserver_api::shard::TenantShardId; +use serde::{Deserialize, Serialize}; +use std::{collections::HashMap, time::Duration}; +use tokio_util::sync::CancellationToken; + +use hyper::Uri; +use reqwest::{StatusCode, Url}; +use utils::{backoff, http::error::HttpErrorBody}; + +#[derive(Debug, Clone)] +pub(crate) struct PeerClient { + uri: Uri, + jwt: Option, + client: reqwest::Client, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum StorageControllerPeerError { + #[error("failed to deserialize error response with status code {0} at {1}: {2}")] + DeserializationError(StatusCode, Url, reqwest::Error), + #[error("storage controller peer API error ({0}): {1}")] + ApiError(StatusCode, String), + #[error("failed to send HTTP request: {0}")] + SendError(reqwest::Error), + #[error("Cancelled")] + Cancelled, +} + +pub(crate) type Result = std::result::Result; + +pub(crate) trait ResponseErrorMessageExt: Sized { + fn error_from_body(self) -> impl std::future::Future> + Send; +} + +impl ResponseErrorMessageExt for reqwest::Response { + async fn error_from_body(self) -> Result { + let status = self.status(); + if !(status.is_client_error() || status.is_server_error()) { + return Ok(self); + } + + let url = self.url().to_owned(); + Err(match self.json::().await { + Ok(HttpErrorBody { msg }) => StorageControllerPeerError::ApiError(status, msg), + Err(err) => StorageControllerPeerError::DeserializationError(status, url, err), + }) + } +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub(crate) struct GlobalObservedState(pub(crate) HashMap); + +impl PeerClient { + pub(crate) fn new(uri: Uri, jwt: Option) -> Self { + Self { + uri, + jwt, + client: reqwest::Client::new(), + } + } + + async fn request_step_down(&self) -> Result { + let step_down_path = format!("{}control/v1/step_down", self.uri); + let req = self.client.put(step_down_path); + let req = if let Some(jwt) = &self.jwt { + req.header(reqwest::header::AUTHORIZATION, format!("Bearer {jwt}")) + } else { + req + }; + + let req = req.timeout(Duration::from_secs(2)); + + let res = req + .send() + .await + .map_err(StorageControllerPeerError::SendError)?; + let response = res.error_from_body().await?; + + let status = response.status(); + let url = response.url().to_owned(); + + response + .json() + .await + .map_err(|err| StorageControllerPeerError::DeserializationError(status, url, err)) + } + + /// Request the peer to step down and return its current observed state + /// All errors are retried with exponential backoff for a maximum of 4 attempts. + /// Assuming all retries are performed, the function times out after roughly 4 seconds. + pub(crate) async fn step_down( + &self, + cancel: &CancellationToken, + ) -> Result { + backoff::retry( + || self.request_step_down(), + |_e| false, + 2, + 4, + "Send step down request", + cancel, + ) + .await + .ok_or_else(|| StorageControllerPeerError::Cancelled) + .and_then(|x| x) + } +} diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 64a3e597ce..aebbdec0d1 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -95,6 +95,8 @@ pub(crate) enum DatabaseOperation { ListMetadataHealth, ListMetadataHealthUnhealthy, ListMetadataHealthOutdated, + GetLeader, + UpdateLeader, } #[must_use] @@ -785,6 +787,69 @@ impl Persistence { ) .await } + + /// Get the current entry from the `leader` table if one exists. + /// It is an error for the table to contain more than one entry. + pub(crate) async fn get_leader(&self) -> DatabaseResult> { + let mut leader: Vec = self + .with_measured_conn( + DatabaseOperation::GetLeader, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::controllers::table.load::(conn)?) + }, + ) + .await?; + + if leader.len() > 1 { + return Err(DatabaseError::Logical(format!( + "More than one entry present in the leader table: {leader:?}" + ))); + } + + Ok(leader.pop()) + } + + /// Update the new leader with compare-exchange semantics. If `prev` does not + /// match the current leader entry, then the update is treated as a failure. + /// When `prev` is not specified, the update is forced. + pub(crate) async fn update_leader( + &self, + prev: Option, + new: ControllerPersistence, + ) -> DatabaseResult<()> { + use crate::schema::controllers::dsl::*; + + let updated = self + .with_measured_conn( + DatabaseOperation::UpdateLeader, + move |conn| -> DatabaseResult { + let updated = match &prev { + Some(prev) => diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn)?, + None => diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn)?, + }; + + Ok(updated) + }, + ) + .await?; + + if updated == 0 { + return Err(DatabaseError::Logical( + "Leader table update failed".to_string(), + )); + } + + Ok(()) + } } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably @@ -910,3 +975,12 @@ impl From for MetadataHealthRecord { } } } + +#[derive( + Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq, Debug, Clone, +)] +#[diesel(table_name = crate::schema::controllers)] +pub(crate) struct ControllerPersistence { + pub(crate) address: String, + pub(crate) started_at: chrono::DateTime, +} diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index cb5ba3f38b..77ba47e114 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -1,5 +1,12 @@ // @generated automatically by Diesel CLI. +diesel::table! { + controllers (address, started_at) { + address -> Varchar, + started_at -> Timestamptz, + } +} + diesel::table! { metadata_health (tenant_id, shard_number, shard_count) { tenant_id -> Varchar, @@ -36,4 +43,4 @@ diesel::table! { } } -diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,); +diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 31b2d0c3f5..3459b44774 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,3 +1,4 @@ +use hyper::Uri; use std::{ borrow::Cow, cmp::Ordering, @@ -16,8 +17,12 @@ use crate::{ compute_hook::NotifyError, drain_utils::{self, TenantShardDrain, TenantShardIterator}, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, - metrics::LeadershipStatusGroup, - persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter}, + metrics, + peer_client::{GlobalObservedState, PeerClient}, + persistence::{ + AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, + TenantFilter, + }, reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, tenant_shard::{ @@ -83,7 +88,6 @@ use crate::{ ReconcilerWaiter, TenantShard, }, }; -use serde::{Deserialize, Serialize}; pub mod chaos_injector; @@ -140,7 +144,15 @@ enum NodeOperations { /// Allowed transitions are: /// 1. Leader -> SteppedDown /// 2. Candidate -> Leader -#[derive(Copy, Clone, strum_macros::Display, measured::FixedCardinalityLabel)] +#[derive( + Eq, + PartialEq, + Copy, + Clone, + strum_macros::Display, + strum_macros::EnumIter, + measured::FixedCardinalityLabel, +)] #[strum(serialize_all = "snake_case")] pub(crate) enum LeadershipStatus { /// This is the steady state where the storage controller can produce @@ -226,22 +238,12 @@ impl ServiceState { tenants: BTreeMap, scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, + initial_leadership_status: LeadershipStatus, ) -> Self { - let status = &crate::metrics::METRICS_REGISTRY - .metrics_group - .storage_controller_leadership_status; - - status.set( - LeadershipStatusGroup { - status: LeadershipStatus::Leader, - }, - 1, - ); + metrics::update_leadership_status(initial_leadership_status); Self { - // TODO: Starting up as Leader is a transient state. Once we enable rolling - // upgrades on the k8s side, we should start up as Candidate. - leadership_status: LeadershipStatus::Leader, + leadership_status: initial_leadership_status, tenants, nodes: Arc::new(nodes), scheduler, @@ -266,29 +268,12 @@ impl ServiceState { fn step_down(&mut self) { self.leadership_status = LeadershipStatus::SteppedDown; + metrics::update_leadership_status(self.leadership_status); + } - let status = &crate::metrics::METRICS_REGISTRY - .metrics_group - .storage_controller_leadership_status; - - status.set( - LeadershipStatusGroup { - status: LeadershipStatus::SteppedDown, - }, - 1, - ); - status.set( - LeadershipStatusGroup { - status: LeadershipStatus::Leader, - }, - 0, - ); - status.set( - LeadershipStatusGroup { - status: LeadershipStatus::Candidate, - }, - 0, - ); + fn become_leader(&mut self) { + self.leadership_status = LeadershipStatus::Leader; + metrics::update_leadership_status(self.leadership_status); } } @@ -332,6 +317,12 @@ pub struct Config { // by more than the configured amount, then the secondary is not // upgraded to primary. pub max_secondary_lag_bytes: Option, + + pub address_for_peers: Option, + + pub start_as_candidate: bool, + + pub http_service_port: i32, } impl From for ApiError { @@ -499,10 +490,6 @@ pub(crate) enum ReconcileResultRequest { Stop, } -// TODO: move this into the storcon peer client when that gets added -#[derive(Serialize, Deserialize, Debug, Default)] -pub(crate) struct GlobalObservedState(HashMap); - impl Service { pub fn get_config(&self) -> &Config { &self.config @@ -513,15 +500,12 @@ impl Service { #[instrument(skip_all)] async fn startup_reconcile( self: &Arc, + current_leader: Option, + leader_step_down_state: Option, bg_compute_notify_result_tx: tokio::sync::mpsc::Sender< Result<(), (TenantShardId, NotifyError)>, >, ) { - // For all tenant shards, a vector of observed states on nodes (where None means - // indeterminate, same as in [`ObservedStateLocation`]) - let mut observed: HashMap)>> = - HashMap::new(); - // Startup reconciliation does I/O to other services: whether they // are responsive or not, we should aim to finish within our deadline, because: // - If we don't, a k8s readiness hook watching /ready will kill us. @@ -535,26 +519,26 @@ impl Service { .checked_add(STARTUP_RECONCILE_TIMEOUT / 2) .expect("Reconcile timeout is a modest constant"); + let observed = if let Some(state) = leader_step_down_state { + tracing::info!( + "Using observed state received from leader at {}", + current_leader.as_ref().unwrap().address + ); + + state + } else { + self.build_global_observed_state(node_scan_deadline).await + }; + // Accumulate a list of any tenant locations that ought to be detached let mut cleanup = Vec::new(); - let node_listings = self.scan_node_locations(node_scan_deadline).await; - // Send initial heartbeat requests to nodes that replied to the location listing above. - let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await; - - for (node_id, list_response) in node_listings { - let tenant_shards = list_response.tenant_shards; - tracing::info!( - "Received {} shard statuses from pageserver {}, setting it to Active", - tenant_shards.len(), - node_id - ); - - for (tenant_shard_id, conf_opt) in tenant_shards { - let shard_observations = observed.entry(tenant_shard_id).or_default(); - shard_observations.push((node_id, conf_opt)); - } - } + // Send initial heartbeat requests to all nodes loaded from the database + let all_nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await; // List of tenants for which we will attempt to notify compute of their location at startup let mut compute_notifications = Vec::new(); @@ -577,17 +561,16 @@ impl Service { } *nodes = Arc::new(new_nodes); - for (tenant_shard_id, shard_observations) in observed { - for (node_id, observed_loc) in shard_observations { - let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { - cleanup.push((tenant_shard_id, node_id)); - continue; - }; - tenant_shard - .observed - .locations - .insert(node_id, ObservedStateLocation { conf: observed_loc }); - } + for (tenant_shard_id, observed_state) in observed.0 { + let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { + for node_id in observed_state.locations.keys() { + cleanup.push((tenant_shard_id, *node_id)); + } + + continue; + }; + + tenant_shard.observed = observed_state; } // Populate each tenant's intent state @@ -621,6 +604,28 @@ impl Service { tenants.len() }; + // Before making any obeservable changes to the cluster, persist self + // as leader in database and memory. + if let Some(address_for_peers) = &self.config.address_for_peers { + // TODO: `address-for-peers` can become a mandatory cli arg + // after we update the k8s setup + let proposed_leader = ControllerPersistence { + address: address_for_peers.to_string(), + started_at: chrono::Utc::now(), + }; + + if let Err(err) = self + .persistence + .update_leader(current_leader, proposed_leader) + .await + { + tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ..."); + std::process::exit(1); + } + } + + self.inner.write().unwrap().become_leader(); + // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that // generation_pageserver in the database. @@ -786,6 +791,31 @@ impl Service { node_results } + async fn build_global_observed_state(&self, deadline: Instant) -> GlobalObservedState { + let node_listings = self.scan_node_locations(deadline).await; + let mut observed = GlobalObservedState::default(); + + for (node_id, location_confs) in node_listings { + tracing::info!( + "Received {} shard statuses from pageserver {}", + location_confs.tenant_shards.len(), + node_id + ); + + for (tid, location_conf) in location_confs.tenant_shards { + let entry = observed.0.entry(tid).or_default(); + entry.locations.insert( + node_id, + ObservedStateLocation { + conf: location_conf, + }, + ); + } + } + + observed + } + /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers. /// /// This is safe to run in the background, because if we don't have this TenantShardId in our map of @@ -1264,12 +1294,20 @@ impl Service { config.max_warming_up_interval, cancel.clone(), ); + + let initial_leadership_status = if config.start_as_candidate { + LeadershipStatus::Candidate + } else { + LeadershipStatus::Leader + }; + let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( nodes, tenants, scheduler, delayed_reconcile_rx, + initial_leadership_status, ))), config: config.clone(), persistence, @@ -1338,7 +1376,35 @@ impl Service { return; }; - this.startup_reconcile(bg_compute_notify_result_tx).await; + let leadership_status = this.inner.read().unwrap().get_leadership_status(); + let leader = match this.get_leader().await { + Ok(ok) => ok, + Err(err) => { + tracing::error!( + "Failed to query database for current leader: {err}. Aborting start-up ..." + ); + std::process::exit(1); + } + }; + + let leader_step_down_state = match leadership_status { + LeadershipStatus::Candidate => { + if let Some(ref leader) = leader { + this.request_step_down(leader).await + } else { + tracing::info!( + "No leader found to request step down from. Will build observed state." + ); + None + } + } + LeadershipStatus::Leader => None, + LeadershipStatus::SteppedDown => unreachable!(), + }; + + this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx) + .await; + drop(startup_completion); } }); @@ -2937,6 +3003,7 @@ impl Service { ); let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client .timeline_detach_ancestor(tenant_shard_id, timeline_id) .await @@ -2953,7 +3020,13 @@ impl Service { Error::ApiError(StatusCode::BAD_REQUEST, msg) => { ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) } - // rest can be mapped + Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, msg) => { + // avoid turning these into conflicts to remain compatible with + // pageservers, 500 errors are sadly retryable with timeline ancestor + // detach + ApiError::InternalServerError(anyhow::anyhow!("{node}: {msg}")) + } + // rest can be mapped as usual other => passthrough_api_error(&node, other), } }) @@ -2987,6 +3060,8 @@ impl Service { ?mismatching, "shards returned different results" ); + + return Err(ApiError::InternalServerError(anyhow::anyhow!("pageservers returned mixed results for ancestor detach; manual intervention is required."))); } Ok(any.1) @@ -4589,6 +4664,10 @@ impl Service { )) } + pub(crate) async fn get_leader(&self) -> DatabaseResult> { + self.persistence.get_leader().await + } + pub(crate) async fn node_register( &self, register_req: NodeRegisterRequest, @@ -4851,6 +4930,26 @@ impl Service { Ok(()) } + /// Wrapper around [`Self::node_configure`] which only allows changes while there is no ongoing + /// operation for HTTP api. + pub(crate) async fn external_node_configure( + &self, + node_id: NodeId, + availability: Option, + scheduling: Option, + ) -> Result<(), ApiError> { + { + let locked = self.inner.read().unwrap(); + if let Some(op) = locked.ongoing_operation.as_ref().map(|op| op.operation) { + return Err(ApiError::PreconditionFailed( + format!("Ongoing background operation forbids configuring: {op}").into(), + )); + } + } + + self.node_configure(node_id, availability, scheduling).await + } + pub(crate) async fn start_node_drain( self: &Arc, node_id: NodeId, @@ -4908,6 +5007,8 @@ impl Service { cancel: cancel.clone(), }); + let span = tracing::info_span!(parent: None, "drain_node", %node_id); + tokio::task::spawn({ let service = self.clone(); let cancel = cancel.clone(); @@ -4924,21 +5025,21 @@ impl Service { } } - tracing::info!(%node_id, "Drain background operation starting"); + tracing::info!("Drain background operation starting"); let res = service.drain_node(node_id, cancel).await; match res { Ok(()) => { - tracing::info!(%node_id, "Drain background operation completed successfully"); + tracing::info!("Drain background operation completed successfully"); } Err(OperationError::Cancelled) => { - tracing::info!(%node_id, "Drain background operation was cancelled"); + tracing::info!("Drain background operation was cancelled"); } Err(err) => { - tracing::error!(%node_id, "Drain background operation encountered: {err}") + tracing::error!("Drain background operation encountered: {err}") } } } - }); + }.instrument(span)); } NodeSchedulingPolicy::Draining => { return Err(ApiError::Conflict(format!( @@ -4956,14 +5057,14 @@ impl Service { } pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> { - let (node_available, node_policy) = { + let node_available = { let locked = self.inner.read().unwrap(); let nodes = &locked.nodes; let node = nodes.get(&node_id).ok_or(ApiError::NotFound( anyhow::anyhow!("Node {} not registered", node_id).into(), ))?; - (node.is_available(), node.get_scheduling()) + node.is_available() }; if !node_available { @@ -4972,12 +5073,6 @@ impl Service { )); } - if !matches!(node_policy, NodeSchedulingPolicy::Draining) { - return Err(ApiError::PreconditionFailed( - format!("Node {node_id} has no drain in progress").into(), - )); - } - if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() { if let Operation::Drain(drain) = op_handler.operation { if drain.node_id == node_id { @@ -5043,6 +5138,8 @@ impl Service { cancel: cancel.clone(), }); + let span = tracing::info_span!(parent: None, "fill_node", %node_id); + tokio::task::spawn({ let service = self.clone(); let cancel = cancel.clone(); @@ -5059,21 +5156,21 @@ impl Service { } } - tracing::info!(%node_id, "Fill background operation starting"); + tracing::info!("Fill background operation starting"); let res = service.fill_node(node_id, cancel).await; match res { Ok(()) => { - tracing::info!(%node_id, "Fill background operation completed successfully"); + tracing::info!("Fill background operation completed successfully"); } Err(OperationError::Cancelled) => { - tracing::info!(%node_id, "Fill background operation was cancelled"); + tracing::info!("Fill background operation was cancelled"); } Err(err) => { - tracing::error!(%node_id, "Fill background operation encountered: {err}") + tracing::error!("Fill background operation encountered: {err}") } } } - }); + }.instrument(span)); } NodeSchedulingPolicy::Filling => { return Err(ApiError::Conflict(format!( @@ -5091,14 +5188,14 @@ impl Service { } pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> { - let (node_available, node_policy) = { + let node_available = { let locked = self.inner.read().unwrap(); let nodes = &locked.nodes; let node = nodes.get(&node_id).ok_or(ApiError::NotFound( anyhow::anyhow!("Node {} not registered", node_id).into(), ))?; - (node.is_available(), node.get_scheduling()) + node.is_available() }; if !node_available { @@ -5107,12 +5204,6 @@ impl Service { )); } - if !matches!(node_policy, NodeSchedulingPolicy::Filling) { - return Err(ApiError::PreconditionFailed( - format!("Node {node_id} has no fill in progress").into(), - )); - } - if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() { if let Operation::Fill(fill) = op_handler.operation { if fill.node_id == node_id { @@ -5921,7 +6012,7 @@ impl Service { .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) .await; - failpoint_support::sleep_millis_async!("sleepy-drain-loop"); + failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel); } while !waiters.is_empty() { @@ -6269,6 +6360,7 @@ impl Service { pub(crate) async fn step_down(&self) -> GlobalObservedState { tracing::info!("Received step down request from peer"); + failpoint_support::sleep_millis_async!("sleep-on-step-down-handling"); self.inner.write().unwrap().step_down(); // TODO: would it make sense to have a time-out for this? @@ -6285,4 +6377,42 @@ impl Service { global_observed } + + /// Request step down from the currently registered leader in the database + /// + /// If such an entry is persisted, the success path returns the observed + /// state and details of the leader. Otherwise, None is returned indicating + /// there is no leader currently. + /// + /// On failures to query the database or step down error responses the process is killed + /// and we rely on k8s to retry. + async fn request_step_down( + &self, + leader: &ControllerPersistence, + ) -> Option { + tracing::info!("Sending step down request to {leader:?}"); + + // TODO: jwt token + let client = PeerClient::new( + Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"), + self.config.jwt_token.clone(), + ); + let state = client.step_down(&self.cancel).await; + match state { + Ok(state) => Some(state), + Err(err) => { + // TODO: Make leaders periodically update a timestamp field in the + // database and, if the leader is not reachable from the current instance, + // but inferred as alive from the timestamp, abort start-up. This avoids + // a potential scenario in which we have two controllers acting as leaders. + tracing::error!( + "Leader ({}) did not respond to step-down request: {}", + leader.address, + err + ); + + None + } + } + } } diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index cbc836755a..3935e513e3 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -3,9 +3,10 @@ use camino::Utf8PathBuf; use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; use reqwest::{Method, Url}; +use storage_controller_client::control_api; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; -use storage_scrubber::scan_pageserver_metadata::scan_metadata; +use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata; use storage_scrubber::tenant_snapshot::SnapshotDownloader; use storage_scrubber::{find_large_objects, ControllerClientConfig}; use storage_scrubber::{ @@ -68,7 +69,7 @@ enum Command { #[arg(long = "tenant-id", num_args = 0..)] tenant_ids: Vec, #[arg(long = "post", default_value_t = false)] - post_to_storage_controller: bool, + post_to_storcon: bool, #[arg(long, default_value = None)] /// For safekeeper node_kind only, points to db with debug dump dump_db_connstr: Option, @@ -100,6 +101,16 @@ enum Command { #[arg(long = "concurrency", short = 'j', default_value_t = 64)] concurrency: usize, }, + CronJob { + // PageserverPhysicalGc + #[arg(long = "min-age")] + gc_min_age: humantime::Duration, + #[arg(short, long, default_value_t = GcMode::IndicesOnly)] + gc_mode: GcMode, + // ScanMetadata + #[arg(long = "post", default_value_t = false)] + post_to_storcon: bool, + }, } #[tokio::main] @@ -117,6 +128,7 @@ async fn main() -> anyhow::Result<()> { Command::TenantSnapshot { .. } => "tenant-snapshot", Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc", Command::FindLargeObjects { .. } => "find-large-objects", + Command::CronJob { .. } => "cron-job", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -126,12 +138,13 @@ async fn main() -> anyhow::Result<()> { chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") )); - let controller_client_conf = cli.controller_api.map(|controller_api| { + let controller_client = cli.controller_api.map(|controller_api| { ControllerClientConfig { controller_api, // Default to no key: this is a convenience when working in a development environment controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()), } + .build_client() }); match cli.command { @@ -139,7 +152,7 @@ async fn main() -> anyhow::Result<()> { json, tenant_ids, node_kind, - post_to_storage_controller, + post_to_storcon, dump_db_connstr, dump_db_table, } => { @@ -178,53 +191,14 @@ async fn main() -> anyhow::Result<()> { } Ok(()) } else { - if controller_client_conf.is_none() && post_to_storage_controller { - return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); - } - match scan_metadata(bucket_config.clone(), tenant_ids).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) - } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", summary.summary_string()); - } - - if post_to_storage_controller { - if let Some(conf) = controller_client_conf { - let controller_client = conf.build_client(); - let body = summary.build_health_update_request(); - controller_client - .dispatch::( - Method::POST, - "control/v1/metadata_health/update".to_string(), - Some(body), - ) - .await?; - } - } - - if summary.is_fatal() { - tracing::error!("Fatal scrub errors detected"); - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - tracing::error!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - ); - } - - Ok(()) - } - } + scan_pageserver_metadata_cmd( + bucket_config, + controller_client.as_ref(), + tenant_ids, + json, + post_to_storcon, + ) + .await } } Command::FindGarbage { @@ -254,31 +228,14 @@ async fn main() -> anyhow::Result<()> { min_age, mode, } => { - match (&controller_client_conf, mode) { - (Some(_), _) => { - // Any mode may run when controller API is set - } - (None, GcMode::Full) => { - // The part of physical GC where we erase ancestor layers cannot be done safely without - // confirming the most recent complete shard split with the controller. Refuse to run, rather - // than doing it unsafely. - return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run")); - } - (None, GcMode::DryRun | GcMode::IndicesOnly) => { - // These GcModes do not require the controller to run. - } - } - - let summary = pageserver_physical_gc( - bucket_config, - controller_client_conf, + pageserver_physical_gc_cmd( + &bucket_config, + controller_client.as_ref(), tenant_ids, - min_age.into(), + min_age, mode, ) - .await?; - println!("{}", serde_json::to_string(&summary).unwrap()); - Ok(()) + .await } Command::FindLargeObjects { min_size, @@ -295,5 +252,142 @@ async fn main() -> anyhow::Result<()> { println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } + Command::CronJob { + gc_min_age, + gc_mode, + post_to_storcon, + } => { + run_cron_job( + bucket_config, + controller_client.as_ref(), + gc_min_age, + gc_mode, + post_to_storcon, + ) + .await + } + } +} + +/// Runs the scrubber cron job. +/// 1. Do pageserver physical gc +/// 2. Scan pageserver metadata +pub async fn run_cron_job( + bucket_config: BucketConfig, + controller_client: Option<&control_api::Client>, + gc_min_age: humantime::Duration, + gc_mode: GcMode, + post_to_storcon: bool, +) -> anyhow::Result<()> { + tracing::info!(%gc_min_age, %gc_mode, "Running pageserver-physical-gc"); + pageserver_physical_gc_cmd( + &bucket_config, + controller_client, + Vec::new(), + gc_min_age, + gc_mode, + ) + .await?; + tracing::info!(%post_to_storcon, node_kind = %NodeKind::Pageserver, "Running scan-metadata"); + scan_pageserver_metadata_cmd( + bucket_config, + controller_client, + Vec::new(), + true, + post_to_storcon, + ) + .await?; + + Ok(()) +} + +pub async fn pageserver_physical_gc_cmd( + bucket_config: &BucketConfig, + controller_client: Option<&control_api::Client>, + tenant_shard_ids: Vec, + min_age: humantime::Duration, + mode: GcMode, +) -> anyhow::Result<()> { + match (controller_client, mode) { + (Some(_), _) => { + // Any mode may run when controller API is set + } + (None, GcMode::Full) => { + // The part of physical GC where we erase ancestor layers cannot be done safely without + // confirming the most recent complete shard split with the controller. Refuse to run, rather + // than doing it unsafely. + return Err(anyhow!( + "Full physical GC requires `--controller-api` and `--controller-jwt` to run" + )); + } + (None, GcMode::DryRun | GcMode::IndicesOnly) => { + // These GcModes do not require the controller to run. + } + } + + let summary = pageserver_physical_gc( + bucket_config, + controller_client, + tenant_shard_ids, + min_age.into(), + mode, + ) + .await?; + println!("{}", serde_json::to_string(&summary).unwrap()); + Ok(()) +} + +pub async fn scan_pageserver_metadata_cmd( + bucket_config: BucketConfig, + controller_client: Option<&control_api::Client>, + tenant_shard_ids: Vec, + json: bool, + post_to_storcon: bool, +) -> anyhow::Result<()> { + if controller_client.is_none() && post_to_storcon { + return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); + } + match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) + } + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + + if post_to_storcon { + if let Some(client) = controller_client { + let body = summary.build_health_update_request(); + client + .dispatch::( + Method::POST, + "control/v1/metadata_health/update".to_string(), + Some(body), + ) + .await?; + } + } + + if summary.is_fatal() { + tracing::error!("Fatal scrub errors detected"); + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + tracing::error!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + ); + } + + Ok(()) + } } } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index ff230feae3..20d9bd6dd4 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -1,12 +1,10 @@ -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::time::{Duration, SystemTime}; use crate::checks::{list_timeline_blobs, BlobDataParseResult}; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{ - init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId, -}; +use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; @@ -117,7 +115,7 @@ use refs::AncestorRefs; // - Are there any refs to ancestor shards' layers? #[derive(Default)] struct TenantRefAccumulator { - shards_seen: HashMap>, + shards_seen: HashMap>, // For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to ancestor_ref_shards: AncestorRefs, @@ -130,7 +128,7 @@ impl TenantRefAccumulator { .shards_seen .entry(ttid.tenant_shard_id.tenant_id) .or_default()) - .push(this_shard_idx); + .insert(this_shard_idx); let mut ancestor_refs = Vec::new(); for (layer_name, layer_metadata) in &index_part.layer_metadata { @@ -154,7 +152,7 @@ impl TenantRefAccumulator { summary: &mut GcSummary, ) -> (Vec, AncestorRefs) { let mut ancestors_to_gc = Vec::new(); - for (tenant_id, mut shard_indices) in self.shards_seen { + for (tenant_id, shard_indices) in self.shards_seen { // Find the highest shard count let latest_count = shard_indices .iter() @@ -162,6 +160,7 @@ impl TenantRefAccumulator { .max() .expect("Always at least one shard"); + let mut shard_indices = shard_indices.iter().collect::>(); let (mut latest_shards, ancestor_shards) = { let at = itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count); @@ -174,7 +173,7 @@ impl TenantRefAccumulator { // to scan the S3 bucket halfway through a shard split. if latest_shards.len() != latest_count.count() as usize { // This should be extremely rare, so we warn on it. - tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count); + tracing::warn!(%tenant_id, "Missed some shards at count {:?}: {latest_shards:?}", latest_count); continue; } @@ -212,7 +211,7 @@ impl TenantRefAccumulator { .iter() .map(|s| s.tenant_shard_id.to_index()) .collect(); - if controller_indices != latest_shards { + if !controller_indices.iter().eq(latest_shards.iter().copied()) { tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})"); continue; } @@ -472,8 +471,8 @@ async fn gc_ancestor( /// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and /// make sure that object listings don't get slowed down by large numbers of garbage objects. pub async fn pageserver_physical_gc( - bucket_config: BucketConfig, - controller_client_conf: Option, + bucket_config: &BucketConfig, + controller_client: Option<&control_api::Client>, tenant_shard_ids: Vec, min_age: Duration, mode: GcMode, @@ -557,7 +556,7 @@ pub async fn pageserver_physical_gc( let timelines = timelines.map_ok(|ttid| { gc_timeline( &s3_client, - &bucket_config, + bucket_config, &min_age, &target, mode, @@ -573,7 +572,7 @@ pub async fn pageserver_physical_gc( } // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC - let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else { + let Some(client) = controller_client else { tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified"); return Ok(summary); }; @@ -582,13 +581,13 @@ pub async fn pageserver_physical_gc( .unwrap() .into_inner() .unwrap() - .into_gc_ancestors(&controller_client, &mut summary) + .into_gc_ancestors(client, &mut summary) .await; for ancestor_shard in ancestor_shards { gc_ancestor( &s3_client, - &bucket_config, + bucket_config, &target, &min_age, ancestor_shard, diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index b9630056e1..2409b7b132 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -116,7 +116,7 @@ Index versions: {version_summary} } /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. -pub async fn scan_metadata( +pub async fn scan_pageserver_metadata( bucket_config: BucketConfig, tenant_ids: Vec, ) -> anyhow::Result { diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 4b0c9ac71d..996ca4d652 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -3,6 +3,7 @@ pytest_plugins = ( "fixtures.parametrize", "fixtures.httpserver", "fixtures.compute_reconfigure", + "fixtures.storage_controller_proxy", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 08215438e1..5fe544b3bd 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -42,7 +42,11 @@ class PgCompare(ABC): pass @abstractmethod - def flush(self): + def flush(self, compact: bool = False, gc: bool = False): + pass + + @abstractmethod + def compact(self): pass @abstractmethod @@ -129,13 +133,16 @@ class NeonCompare(PgCompare): def pg_bin(self) -> PgBin: return self._pg_bin - def flush(self): + def flush(self, compact: bool = True, gc: bool = True): wait_for_last_flush_lsn(self.env, self._pg, self.tenant, self.timeline) - self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline) - self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0) + self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline, compact=compact) + if gc: + self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0) def compact(self): - self.pageserver_http_client.timeline_compact(self.tenant, self.timeline) + self.pageserver_http_client.timeline_compact( + self.tenant, self.timeline, wait_until_uploaded=True + ) def report_peak_memory_use(self): self.zenbenchmark.record( @@ -215,9 +222,12 @@ class VanillaCompare(PgCompare): def pg_bin(self) -> PgBin: return self._pg.pg_bin - def flush(self): + def flush(self, compact: bool = False, gc: bool = False): self.cur.execute("checkpoint") + def compact(self): + pass + def report_peak_memory_use(self): pass # TODO find something @@ -266,6 +276,9 @@ class RemoteCompare(PgCompare): # TODO: flush the remote pageserver pass + def compact(self): + pass + def report_peak_memory_use(self): # TODO: get memory usage from remote pageserver pass diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 844a23d327..ba6fbc003a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -24,7 +24,7 @@ from functools import cached_property, partial from itertools import chain, product from pathlib import Path from types import TracebackType -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast +from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union, cast from urllib.parse import quote, urlparse import asyncpg @@ -67,6 +67,7 @@ from fixtures.pageserver.utils import ( from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( + LocalFsStorage, MockS3Server, RemoteStorage, RemoteStorageKind, @@ -388,7 +389,7 @@ class PgProtocol: return self.safe_psql_many([query], **kwargs)[0] def safe_psql_many( - self, queries: List[str], log_query=True, **kwargs: Any + self, queries: Iterable[str], log_query=True, **kwargs: Any ) -> List[List[Tuple[Any, ...]]]: """ Execute queries against the node and return all rows. @@ -496,6 +497,7 @@ class NeonEnvBuilder: pageserver_aux_file_policy: Optional[AuxFileStore] = None, pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, safekeeper_extra_opts: Optional[list[str]] = None, + storage_controller_port_override: Optional[int] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -548,6 +550,8 @@ class NeonEnvBuilder: self.safekeeper_extra_opts = safekeeper_extra_opts + self.storage_controller_port_override = storage_controller_port_override + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -963,7 +967,7 @@ class NeonEnvBuilder: if self.env: log.info("Cleaning up all storage and compute nodes") self.env.stop( - immediate=True, + immediate=False, # if the test threw an exception, don't check for errors # as a failing assertion would cause the cleanup below to fail ps_assert_metric_no_errors=(exc_type is None), @@ -1053,6 +1057,7 @@ class NeonEnv: """ BASE_PAGESERVER_ID = 1 + storage_controller: NeonStorageController | NeonProxiedStorageController def __init__(self, config: NeonEnvBuilder): self.repo_dir = config.repo_dir @@ -1083,27 +1088,41 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline - # Find two adjacent ports for storage controller and its postgres DB. This - # loop would eventually throw from get_port() if we run out of ports (extremely - # unlikely): usually we find two adjacent free ports on the first iteration. - while True: - self.storage_controller_port = self.port_distributor.get_port() - storage_controller_pg_port = self.port_distributor.get_port() - if storage_controller_pg_port == self.storage_controller_port + 1: - break - # The URL for the pageserver to use as its control_plane_api config - self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1" - # The base URL of the storage controller - self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}" + if config.storage_controller_port_override is not None: + log.info( + f"Using storage controller api override {config.storage_controller_port_override}" + ) + + self.storage_controller_port = config.storage_controller_port_override + self.storage_controller = NeonProxiedStorageController( + self, config.storage_controller_port_override, config.auth_enabled + ) + else: + # Find two adjacent ports for storage controller and its postgres DB. This + # loop would eventually throw from get_port() if we run out of ports (extremely + # unlikely): usually we find two adjacent free ports on the first iteration. + while True: + storage_controller_port = self.port_distributor.get_port() + storage_controller_pg_port = self.port_distributor.get_port() + if storage_controller_pg_port == storage_controller_port + 1: + break + + self.storage_controller_port = storage_controller_port + self.storage_controller = NeonStorageController( + self, storage_controller_port, config.auth_enabled + ) + + log.info( + f"Using generated control_plane_api: {self.storage_controller.upcall_api_endpoint()}" + ) + + self.storage_controller_api: str = self.storage_controller.api_root() + self.control_plane_api: str = self.storage_controller.upcall_api_endpoint() # For testing this with a fake HTTP server, enable passing through a URL from config self.control_plane_compute_hook_api = config.control_plane_compute_hook_api - self.storage_controller: NeonStorageController = NeonStorageController( - self, config.auth_enabled - ) - self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_aux_file_policy = config.pageserver_aux_file_policy @@ -1143,7 +1162,6 @@ class NeonEnv: "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, - "image_compression": "zstd", } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine @@ -1251,21 +1269,57 @@ class NeonEnv: def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this method returns, there should be no child processes running. + + Unless of course, some stopping failed, in that case, all remaining child processes are leaked. """ - self.endpoints.stop_all(fail_on_endpoint_errors) + + # the commonly failing components have special try-except behavior, + # trying to get us to actually shutdown all processes over easier error + # reporting. + + raise_later = None + try: + self.endpoints.stop_all(fail_on_endpoint_errors) + except Exception as e: + raise_later = e # Stop storage controller before pageservers: we don't want it to spuriously # detect a pageserver "failure" during test teardown self.storage_controller.stop(immediate=immediate) + stop_later = [] + metric_errors = [] + for sk in self.safekeepers: sk.stop(immediate=immediate) for pageserver in self.pageservers: if ps_assert_metric_no_errors: - pageserver.assert_no_metric_errors() - pageserver.stop(immediate=immediate) + try: + pageserver.assert_no_metric_errors() + except Exception as e: + metric_errors.append(e) + log.error(f"metric validation failed on {pageserver.id}: {e}") + try: + pageserver.stop(immediate=immediate) + except RuntimeError: + stop_later.append(pageserver) self.broker.stop(immediate=immediate) + # TODO: for nice logging we need python 3.11 ExceptionGroup + for ps in stop_later: + ps.stop(immediate=True) + + if raise_later is not None: + raise raise_later + + for error in metric_errors: + raise error + + if len(stop_later) > 0: + raise RuntimeError( + f"{len(stop_later)} out of {len(self.pageservers)} pageservers failed to stop gracefully" + ) + @property def pageserver(self) -> NeonPageserver: """ @@ -1832,16 +1886,24 @@ class NeonCli(AbstractNeonCli): def storage_controller_start( self, timeout_in_seconds: Optional[int] = None, + instance_id: Optional[int] = None, + base_port: Optional[int] = None, ): cmd = ["storage_controller", "start"] if timeout_in_seconds is not None: cmd.append(f"--start-timeout={timeout_in_seconds}s") + if instance_id is not None: + cmd.append(f"--instance-id={instance_id}") + if base_port is not None: + cmd.append(f"--base-port={base_port}") return self.raw_cli(cmd) - def storage_controller_stop(self, immediate: bool): + def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None): cmd = ["storage_controller", "stop"] if immediate: cmd.extend(["-m", "immediate"]) + if instance_id is not None: + cmd.append(f"--instance-id={instance_id}") return self.raw_cli(cmd) def pageserver_start( @@ -2152,17 +2214,30 @@ class PageserverSchedulingPolicy(str, Enum): PAUSE_FOR_RESTART = "PauseForRestart" +class StorageControllerLeadershipStatus(str, Enum): + LEADER = "leader" + STEPPED_DOWN = "stepped_down" + CANDIDATE = "candidate" + + class NeonStorageController(MetricsGetter, LogUtils): - def __init__(self, env: NeonEnv, auth_enabled: bool): + def __init__(self, env: NeonEnv, port: int, auth_enabled: bool): self.env = env + self.port: int = port + self.api: str = f"http://127.0.0.1:{port}" self.running = False self.auth_enabled = auth_enabled self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS - self.logfile = self.workdir / "storage_controller.log" + self.logfile = self.env.repo_dir / "storage_controller_1" / "storage_controller.log" - def start(self, timeout_in_seconds: Optional[int] = None): + def start( + self, + timeout_in_seconds: Optional[int] = None, + instance_id: Optional[int] = None, + base_port: Optional[int] = None, + ): assert not self.running - self.env.neon_cli.storage_controller_start(timeout_in_seconds) + self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) self.running = True return self @@ -2172,6 +2247,12 @@ class NeonStorageController(MetricsGetter, LogUtils): self.running = False return self + def upcall_api_endpoint(self) -> str: + return f"{self.api}/upcall/v1" + + def api_root(self) -> str: + return self.api + @staticmethod def retryable_node_operation(op, ps_id, max_attempts, backoff): while max_attempts > 0: @@ -2200,7 +2281,9 @@ class NeonStorageController(MetricsGetter, LogUtils): def assert_no_errors(self): assert_no_errors( - self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors + self.logfile, + "storage_controller", + self.allowed_errors, ) def pageserver_api(self) -> PageserverHttpClient: @@ -2212,7 +2295,7 @@ class NeonStorageController(MetricsGetter, LogUtils): auth_token = None if self.auth_enabled: auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API) - return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token) + return PageserverHttpClient(self.port, lambda: True, auth_token) def request(self, method, *args, **kwargs) -> requests.Response: resp = requests.request(method, *args, **kwargs) @@ -2229,13 +2312,13 @@ class NeonStorageController(MetricsGetter, LogUtils): return headers def get_metrics(self) -> Metrics: - res = self.request("GET", f"{self.env.storage_controller_api}/metrics") + res = self.request("GET", f"{self.api}/metrics") return parse_metrics(res.text) def ready(self) -> bool: status = None try: - resp = self.request("GET", f"{self.env.storage_controller_api}/ready") + resp = self.request("GET", f"{self.api}/ready") status = resp.status_code except StorageControllerApiException as e: status = e.status_code @@ -2268,7 +2351,7 @@ class NeonStorageController(MetricsGetter, LogUtils): response = self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/attach-hook", + f"{self.api}/debug/v1/attach-hook", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2279,7 +2362,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]): self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/attach-hook", + f"{self.api}/debug/v1/attach-hook", json={"tenant_shard_id": str(tenant_shard_id), "node_id": None}, headers=self.headers(TokenScope.ADMIN), ) @@ -2290,7 +2373,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ response = self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/inspect", + f"{self.api}/debug/v1/inspect", json={"tenant_shard_id": str(tenant_shard_id)}, headers=self.headers(TokenScope.ADMIN), ) @@ -2313,7 +2396,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"node_register({body})") self.request( "POST", - f"{self.env.storage_controller_api}/control/v1/node", + f"{self.api}/control/v1/node", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2322,7 +2405,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"node_delete({node_id})") self.request( "DELETE", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}", + f"{self.api}/control/v1/node/{node_id}", headers=self.headers(TokenScope.ADMIN), ) @@ -2330,7 +2413,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"node_drain({node_id})") self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain", + f"{self.api}/control/v1/node/{node_id}/drain", headers=self.headers(TokenScope.ADMIN), ) @@ -2338,7 +2421,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"cancel_node_drain({node_id})") self.request( "DELETE", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain", + f"{self.api}/control/v1/node/{node_id}/drain", headers=self.headers(TokenScope.ADMIN), ) @@ -2346,7 +2429,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"node_fill({node_id})") self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill", + f"{self.api}/control/v1/node/{node_id}/fill", headers=self.headers(TokenScope.ADMIN), ) @@ -2354,14 +2437,22 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"cancel_node_fill({node_id})") self.request( "DELETE", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill", + f"{self.api}/control/v1/node/{node_id}/fill", headers=self.headers(TokenScope.ADMIN), ) def node_status(self, node_id): response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}", + f"{self.api}/control/v1/node/{node_id}", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def get_leader(self): + response = self.request( + "GET", + f"{self.api}/control/v1/leader", headers=self.headers(TokenScope.ADMIN), ) return response.json() @@ -2369,7 +2460,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def node_list(self): response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/node", + f"{self.api}/control/v1/node", headers=self.headers(TokenScope.ADMIN), ) return response.json() @@ -2377,7 +2468,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_list(self): response = self.request( "GET", - f"{self.env.storage_controller_api}/debug/v1/tenant", + f"{self.api}/debug/v1/tenant", headers=self.headers(TokenScope.ADMIN), ) return response.json() @@ -2387,7 +2478,7 @@ class NeonStorageController(MetricsGetter, LogUtils): body["node_id"] = node_id self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config", + f"{self.api}/control/v1/node/{node_id}/config", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2422,7 +2513,7 @@ class NeonStorageController(MetricsGetter, LogUtils): response = self.request( "POST", - f"{self.env.storage_controller_api}/v1/tenant", + f"{self.api}/v1/tenant", json=body, headers=self.headers(TokenScope.PAGE_SERVER_API), ) @@ -2435,7 +2526,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ response = self.request( "GET", - f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate", + f"{self.api}/debug/v1/tenant/{tenant_id}/locate", headers=self.headers(TokenScope.ADMIN), ) body = response.json() @@ -2448,7 +2539,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}", + f"{self.api}/control/v1/tenant/{tenant_id}", headers=self.headers(TokenScope.ADMIN), ) response.raise_for_status() @@ -2459,7 +2550,7 @@ class NeonStorageController(MetricsGetter, LogUtils): ) -> list[TenantShardId]: response = self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split", + f"{self.api}/control/v1/tenant/{tenant_id}/shard_split", json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size}, headers=self.headers(TokenScope.ADMIN), ) @@ -2471,7 +2562,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int): self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate", + f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate", json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id}, headers=self.headers(TokenScope.ADMIN), ) @@ -2482,7 +2573,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"tenant_policy_update({tenant_id}, {body})") self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy", + f"{self.api}/control/v1/tenant/{tenant_id}/policy", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2490,14 +2581,14 @@ class NeonStorageController(MetricsGetter, LogUtils): def tenant_import(self, tenant_id: TenantId): self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import", + f"{self.api}/debug/v1/tenant/{tenant_id}/import", headers=self.headers(TokenScope.ADMIN), ) def reconcile_all(self): r = self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/reconcile_all", + f"{self.api}/debug/v1/reconcile_all", headers=self.headers(TokenScope.ADMIN), ) r.raise_for_status() @@ -2530,7 +2621,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ self.request( "POST", - f"{self.env.storage_controller_api}/debug/v1/consistency_check", + f"{self.api}/debug/v1/consistency_check", headers=self.headers(TokenScope.ADMIN), ) log.info("storage controller passed consistency check") @@ -2603,7 +2694,7 @@ class NeonStorageController(MetricsGetter, LogUtils): self.request( "POST", - f"{self.env.storage_controller_api}/control/v1/metadata_health/update", + f"{self.api}/control/v1/metadata_health/update", json=body, headers=self.headers(TokenScope.SCRUBBER), ) @@ -2611,7 +2702,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def metadata_health_list_unhealthy(self): response = self.request( "GET", - f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy", + f"{self.api}/control/v1/metadata_health/unhealthy", headers=self.headers(TokenScope.ADMIN), ) return response.json() @@ -2621,7 +2712,7 @@ class NeonStorageController(MetricsGetter, LogUtils): response = self.request( "POST", - f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated", + f"{self.api}/control/v1/metadata_health/outdated", json=body, headers=self.headers(TokenScope.ADMIN), ) @@ -2644,7 +2735,7 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info("Asking storage controller to step down") response = self.request( "PUT", - f"{self.env.storage_controller_api}/control/v1/step_down", + f"{self.api}/control/v1/step_down", headers=self.headers(TokenScope.ADMIN), ) @@ -2661,7 +2752,7 @@ class NeonStorageController(MetricsGetter, LogUtils): res = self.request( "PUT", - f"{self.env.storage_controller_api}/debug/v1/failpoints", + f"{self.api}/debug/v1/failpoints", json=[{"name": name, "actions": actions} for name, actions in pairs], headers=self.headers(TokenScope.ADMIN), ) @@ -2731,9 +2822,21 @@ class NeonStorageController(MetricsGetter, LogUtils): parsed_tid, wait_ms=250 ) - @property - def workdir(self) -> Path: - return self.env.repo_dir + def get_leadership_status(self) -> StorageControllerLeadershipStatus: + metric_values = {} + for status in StorageControllerLeadershipStatus: + metric_value = self.get_metric_value( + "storage_controller_leadership_status", filter={"status": status} + ) + metric_values[status] = metric_value + + assert list(metric_values.values()).count(1) == 1 + + for status, metric_value in metric_values.items(): + if metric_value == 1: + return status + + raise AssertionError("unreachable") def __enter__(self) -> "NeonStorageController": return self @@ -2747,6 +2850,59 @@ class NeonStorageController(MetricsGetter, LogUtils): self.stop(immediate=True) +class NeonProxiedStorageController(NeonStorageController): + def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool): + super(NeonProxiedStorageController, self).__init__(env, proxy_port, auth_enabled) + self.instances: dict[int, dict[str, Any]] = {} + + def start( + self, + timeout_in_seconds: Optional[int] = None, + instance_id: Optional[int] = None, + base_port: Optional[int] = None, + ): + assert instance_id is not None and base_port is not None + + self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) + self.instances[instance_id] = {"running": True} + + self.running = True + return self + + def stop_instance( + self, immediate: bool = False, instance_id: Optional[int] = None + ) -> "NeonStorageController": + assert instance_id in self.instances + if self.instances[instance_id]["running"]: + self.env.neon_cli.storage_controller_stop(immediate, instance_id) + self.instances[instance_id]["running"] = False + + self.running = any(meta["running"] for meta in self.instances.values()) + return self + + def stop(self, immediate: bool = False) -> "NeonStorageController": + for iid, details in self.instances.items(): + if details["running"]: + self.env.neon_cli.storage_controller_stop(immediate, iid) + self.instances[iid]["running"] = False + + self.running = False + return self + + def assert_no_errors(self): + for instance_id in self.instances.keys(): + assert_no_errors( + self.env.repo_dir / f"storage_controller_{instance_id}" / "storage_controller.log", + "storage_controller", + self.allowed_errors, + ) + + def log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Optional[Tuple[str, LogCursor]]: + raise NotImplementedError() + + @dataclass class LogCursor: _line_no: int @@ -4098,6 +4254,17 @@ class Endpoint(PgProtocol, LogUtils): assert self.pgdata_dir is not None # please mypy return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024 + def clear_shared_buffers(self, cursor: Optional[Any] = None): + """ + Best-effort way to clear postgres buffers. Pinned buffers will not be 'cleared.' + + Might also clear LFC. + """ + if cursor is not None: + cursor.execute("select clear_buffer_cache()") + else: + self.safe_psql("select clear_buffer_cache()") + class EndpointFactory: """An object representing multiple compute endpoints.""" @@ -4378,14 +4545,32 @@ class Safekeeper(LogUtils): def timeline_dir(self, tenant_id, timeline_id) -> Path: return self.data_dir / str(tenant_id) / str(timeline_id) + def list_uploaded_segments(self, tenant_id: TenantId, timeline_id: TimelineId): + tline_path = ( + self.env.repo_dir + / "local_fs_remote_storage" + / "safekeeper" + / str(tenant_id) + / str(timeline_id) + ) + assert isinstance(self.env.safekeepers_remote_storage, LocalFsStorage) + return self._list_segments_in_dir( + tline_path, lambda name: ".metadata" not in name and ".___temp" not in name + ) + def list_segments(self, tenant_id, timeline_id) -> List[str]: """ Get list of segment names of the given timeline. """ tli_dir = self.timeline_dir(tenant_id, timeline_id) + return self._list_segments_in_dir( + tli_dir, lambda name: not name.startswith("safekeeper.control") + ) + + def _list_segments_in_dir(self, path: Path, keep_filter: Callable[[str], bool]) -> list[str]: segments = [] - for _, _, filenames in os.walk(tli_dir): - segments.extend([f for f in filenames if not f.startswith("safekeeper.control")]) + for _, _, filenames in os.walk(path): + segments.extend([f for f in filenames if keep_filter(f)]) segments.sort() return segments @@ -4454,7 +4639,7 @@ class StorageScrubber: base_args = [ str(self.env.neon_binpath / "storage_scrubber"), - f"--controller-api={self.env.storage_controller_api}", + f"--controller-api={self.env.storage_controller.api_root()}", ] args = base_args + args @@ -4893,7 +5078,7 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -> Lsn: +def logical_replication_sync(subscriber: PgProtocol, publisher: PgProtocol) -> Lsn: """Wait logical replication subscriber to sync with publisher.""" publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) while True: diff --git a/test_runner/fixtures/storage_controller_proxy.py b/test_runner/fixtures/storage_controller_proxy.py new file mode 100644 index 0000000000..3477f8b1f2 --- /dev/null +++ b/test_runner/fixtures/storage_controller_proxy.py @@ -0,0 +1,73 @@ +import re +from typing import Any, Optional + +import pytest +import requests +from pytest_httpserver import HTTPServer +from werkzeug.datastructures import Headers +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + +from fixtures.log_helper import log + + +class StorageControllerProxy: + def __init__(self, server: HTTPServer): + self.server: HTTPServer = server + self.listen: str = f"http://{server.host}:{server.port}" + self.routing_to: Optional[str] = None + + def route_to(self, storage_controller_api: str): + self.routing_to = storage_controller_api + + def port(self) -> int: + return self.server.port + + def upcall_api_endpoint(self) -> str: + return f"{self.listen}/upcall/v1" + + +def proxy_request(method: str, url: str, **kwargs) -> requests.Response: + return requests.request(method, url, **kwargs) + + +@pytest.fixture(scope="function") +def storage_controller_proxy(make_httpserver): + """ + Proxies requests into the storage controller to the currently + selected storage controller instance via `StorageControllerProxy.route_to`. + + This fixture is intended for tests that need to run multiple instances + of the storage controller at the same time. + """ + server = make_httpserver + + self = StorageControllerProxy(server) + + log.info(f"Storage controller proxy listening on {self.listen}") + + def handler(request: Request): + if self.route_to is None: + log.info(f"Storage controller proxy has no routing configured for {request.url}") + return Response("Routing not configured", status=503) + + route_to_url = f"{self.routing_to}{request.path}" + + log.info(f"Routing {request.url} to {route_to_url}") + + args: dict[str, Any] = {"headers": request.headers} + if request.is_json: + args["json"] = request.json + + response = proxy_request(request.method, route_to_url, **args) + + headers = Headers() + for key, value in response.headers.items(): + headers.add(key, value) + + return Response(response.content, headers=headers, status=response.status_code) + + self.server.expect_request(re.compile(".*")).respond_with_handler(handler) + + yield self + server.clear() diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 4dc9f7caae..80f1c9e4e3 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -403,7 +403,7 @@ def wait_until( try: res = func() except Exception as e: - log.info("waiting for %s iteration %s failed", func, i + 1) + log.info("waiting for %s iteration %s failed: %s", func, i + 1, e) last_exception = e if show_intermediate_error: log.info(e) diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index dfd9caba3e..cc93762175 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -182,14 +182,8 @@ class Workload: def validate(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) - result = endpoint.safe_psql_many( - [ - "select clear_buffer_cache()", - f""" - SELECT COUNT(*) FROM {self.table} - """, - ] - ) + endpoint.clear_shared_buffers() + result = endpoint.safe_psql(f"SELECT COUNT(*) FROM {self.table}") log.info(f"validate({self.expect_rows}): {result}") - assert result == [[("",)], [(self.expect_rows,)]] + assert result == [(self.expect_rows,)] diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index 3258d4dcfa..8b934057e4 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -44,8 +44,7 @@ def test_basebackup_with_high_slru_count( page_cache_size = 16384 max_file_descriptors = 500000 neon_env_builder.pageserver_config_override = ( - f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; " - f"get_vectored_impl='vectored'; validate_vectored_get=false" + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" ) params.update( { diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py index 644c1f559b..0348b08f04 100644 --- a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py +++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py @@ -62,6 +62,9 @@ def test_download_churn( run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration) + # see https://github.com/neondatabase/neon/issues/8712 + env.stop(immediate=True) + def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): remote_storage_kind = s3_storage() diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 3dad348976..69df7974b9 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,9 +1,9 @@ from contextlib import closing -import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare +from fixtures.log_helper import log from fixtures.pg_version import PgVersion @@ -17,7 +17,6 @@ from fixtures.pg_version import PgVersion # 3. Disk space used # 4. Peak memory usage # -@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/7124") def test_bulk_insert(neon_with_baseline: PgCompare): env = neon_with_baseline @@ -30,8 +29,8 @@ def test_bulk_insert(neon_with_baseline: PgCompare): # Run INSERT, recording the time and I/O it takes with env.record_pageserver_writes("pageserver_writes"): with env.record_duration("insert"): - cur.execute("insert into huge values (generate_series(1, 5000000), 0);") - env.flush() + cur.execute("insert into huge values (generate_series(1, 20000000), 0);") + env.flush(compact=False, gc=False) env.report_peak_memory_use() env.report_size() @@ -49,6 +48,9 @@ def test_bulk_insert(neon_with_baseline: PgCompare): if isinstance(env, NeonCompare): measure_recovery_time(env) + with env.record_duration("compaction"): + env.compact() + def measure_recovery_time(env: NeonCompare): client = env.env.pageserver.http_client() @@ -71,7 +73,9 @@ def measure_recovery_time(env: NeonCompare): # Measure recovery time with env.record_duration("wal_recovery"): + log.info("Entering recovery...") client.timeline_create(pg_version, env.tenant, env.timeline) # Flush, which will also wait for lsn to catch up - env.flush() + env.flush(compact=False, gc=False) + log.info("Finished recovery.") diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 9b20954d45..890b70b9fc 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -36,3 +36,6 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): with zenbenchmark.record_duration("test_query"): cur.execute("SELECT count(*) from t") assert cur.fetchone() == (n_iters * n_records,) + + # see https://github.com/neondatabase/neon/issues/8712 + env.stop(immediate=True) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 4b4ffc1fee..077f73ac06 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -262,3 +262,86 @@ def test_publisher_restart( sub_workload.terminate() finally: pub_workload.terminate() + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_snap_files( + pg_bin: PgBin, + benchmark_project_pub: NeonApiEndpoint, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a node with a replication slot. Generates pgbench into the replication slot, + then runs pgbench inserts while generating large numbers of snapfiles. Then restarts + the node and tries to peek the replication changes. + """ + test_duration_min = 60 + test_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + env = benchmark_project_pub.pgbench_env + connstr = benchmark_project_pub.connstr + + with psycopg2.connect(connstr) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'") + is_super = cur.fetchall()[0][0] + assert is_super, "This benchmark won't work if we don't have superuser" + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env) + + conn = psycopg2.connect(connstr) + conn.autocommit = True + cur = conn.cursor() + cur.execute("ALTER SYSTEM SET neon.logical_replication_max_snap_files = -1") + + with psycopg2.connect(connstr) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute("SELECT pg_reload_conf()") + + with psycopg2.connect(connstr) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute( + """ + DO $$ + BEGIN + IF EXISTS ( + SELECT 1 + FROM pg_replication_slots + WHERE slot_name = 'slotter' + ) THEN + PERFORM pg_drop_replication_slot('slotter'); + END IF; + END $$; + """ + ) + cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')") + + workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env) + try: + start = time.time() + prev_measurement = time.time() + while time.time() - start < test_duration_min * 60: + with psycopg2.connect(connstr) as conn: + with conn.cursor() as cur: + cur.execute( + "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s" + ) + check_pgbench_still_running(workload) + cur.execute( + "SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())" + ) + + # Measure storage + if time.time() - prev_measurement > test_interval_min * 60: + storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("storage", storage, "B", MetricReport.LOWER_IS_BETTER) + prev_measurement = time.time() + time.sleep(test_interval_min * 60 / 3) + + finally: + workload.terminate() diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index 7e40081aa2..f83b44a7ad 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -20,7 +20,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): } ) - pageserver_http.configure_failpoints(("flush-frozen-pausable", "sleep(10000)")) + failpoint = "flush-frozen-pausable" + + pageserver_http.configure_failpoints((failpoint, "sleep(10000)")) endpoint_branch0 = env.endpoints.create_start("main", tenant_id=tenant) branch0_cur = endpoint_branch0.connect().cursor() @@ -96,3 +98,5 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): assert query_scalar(branch1_cur, "SELECT count(*) FROM foo") == 200000 assert query_scalar(branch2_cur, "SELECT count(*) FROM foo") == 300000 + + pageserver_http.configure_failpoints((failpoint, "off")) diff --git a/test_runner/regress/test_combocid.py b/test_runner/regress/test_combocid.py new file mode 100644 index 0000000000..6d2567b7ee --- /dev/null +++ b/test_runner/regress/test_combocid.py @@ -0,0 +1,139 @@ +from fixtures.neon_fixtures import NeonEnvBuilder + + +def do_combocid_op(neon_env_builder: NeonEnvBuilder, op): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers='1MB'", + ], + ) + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 1000 + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("create table t(id integer, val integer)") + + cur.execute("begin") + cur.execute("insert into t values (1, 0)") + cur.execute("insert into t values (2, 0)") + cur.execute(f"insert into t select g, 0 from generate_series(3,{n_records}) g") + + # Open a cursor that scroll it halfway through + cur.execute("DECLARE c1 NO SCROLL CURSOR WITHOUT HOLD FOR SELECT * FROM t") + cur.execute("fetch 500 from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + # Perform specified operation + cur.execute(op) + + # Clear the cache, so that we exercise reconstructing the pages + # from WAL + cur.execute("SELECT clear_buffer_cache()") + + # Check that the cursor opened earlier still works. If the + # combocids are not restored correctly, it won't. + cur.execute("fetch all from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + cur.execute("rollback") + + +def test_combocid_delete(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "delete from t") + + +def test_combocid_update(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "update t set val=val+1") + + +def test_combocid_lock(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "select * from t for update") + + +def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers='1MB'", + ], + ) + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 1000 + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("create table t(id integer, val integer)") + file_path = f"{endpoint.pg_data_dir_path()}/t.csv" + cur.execute(f"insert into t select g, 0 from generate_series(1,{n_records}) g") + cur.execute(f"copy t to '{file_path}'") + cur.execute("truncate table t") + + cur.execute("begin") + cur.execute(f"copy t from '{file_path}'") + + # Open a cursor that scroll it halfway through + cur.execute("DECLARE c1 NO SCROLL CURSOR WITHOUT HOLD FOR SELECT * FROM t") + cur.execute("fetch 500 from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + # Delete all the rows. Because all of the rows were inserted earlier in the + # same transaction, all the rows will get a combocid. + cur.execute("delete from t") + # Clear the cache, so that we exercise reconstructing the pages + # from WAL + cur.execute("SELECT clear_buffer_cache()") + + # Check that the cursor opened earlier still works. If the + # combocids are not restored correctly, it won't. + cur.execute("fetch all from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + cur.execute("rollback") + + +def test_combocid(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 100000 + + cur.execute("create table t(id integer, val integer)") + cur.execute(f"insert into t values (generate_series(1,{n_records}), 0)") + + cur.execute("begin") + + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + + cur.execute("delete from t") + assert cur.rowcount == n_records + cur.execute("delete from t") + assert cur.rowcount == 0 + + cur.execute(f"insert into t values (generate_series(1,{n_records}), 0)") + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + + cur.execute("rollback") diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 8edc8c554c..ae63136abb 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -168,7 +168,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): # re-execute the query, it will make GetPage # requests. This does not clear the last-written LSN cache # so we still remember the LSNs of the pages. - s_cur.execute("SELECT clear_buffer_cache()") + secondary.clear_shared_buffers(cursor=s_cur) if pause_apply: s_cur.execute("SELECT pg_wal_replay_pause()") @@ -332,6 +332,7 @@ def test_replica_query_race(neon_simple_env: NeonEnv): log.info(f"read {reads}: counter {readcounter}, last update {writecounter}") reads += 1 + # FIXME: what about LFC clearing? await conn.execute("SELECT clear_buffer_cache()") async def both(): diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 2a3442448a..1b2c7f808f 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -1,3 +1,7 @@ +import os +import random +import re +import subprocess import threading import time @@ -17,17 +21,17 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): "test_lfc_resize", config_lines=[ "neon.file_cache_path='file.cache'", - "neon.max_file_cache_size=1GB", - "neon.file_cache_size_limit=1GB", + "neon.max_file_cache_size=512MB", + "neon.file_cache_size_limit=512MB", ], ) n_resize = 10 - scale = 10 + scale = 100 def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) - pg_bin.run_capture(["pgbench", "-c4", f"-T{n_resize}", "-Mprepared", connstr]) + pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr]) thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) thread.start() @@ -35,9 +39,21 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): conn = endpoint.connect() cur = conn.cursor() - for i in range(n_resize): - cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'") + for _ in range(n_resize): + size = random.randint(1, 512) + cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'") cur.execute("select pg_reload_conf()") time.sleep(1) + cur.execute("alter system set neon.file_cache_size_limit='100MB'") + cur.execute("select pg_reload_conf()") + thread.join() + + lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache" + lfc_file_size = os.path.getsize(lfc_file_path) + res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True) + lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] + log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + assert lfc_file_size <= 512 * 1024 * 1024 + assert int(lfc_file_blocks) <= 128 * 1024 diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 66afe9ddfd..0d18aa43b7 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,11 +4,13 @@ from random import choice from string import ascii_lowercase import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( AuxFileStore, NeonEnv, NeonEnvBuilder, + PgProtocol, logical_replication_sync, wait_for_last_flush_lsn, ) @@ -253,6 +255,21 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of cur.execute( "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" ) + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + + endpoint.stop_and_destroy() + endpoint = env.endpoints.create_start("init") + with endpoint.connect().cursor() as cur: + log.info("advance slot") + cur.execute( + "SELECT * from pg_replication_slot_advance('slotty_mcslotface', pg_current_wal_lsn())" + ) # Tests that walsender correctly blocks until WAL is downloaded from safekeepers @@ -524,3 +541,90 @@ def test_replication_shutdown(neon_simple_env: NeonEnv): assert [r[0] for r in res] == [10, 20, 30, 40] wait_until(10, 0.5, check_that_changes_propagated) + + +def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn: + """ + Wait for logical replication subscriber reported flush_lsn to reach + pg_current_wal_flush_lsn on publisher. Note that this is somewhat unreliable + because for some WAL records like vacuum subscriber won't get any data at + all. + """ + publisher_flush_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + + def check_caughtup(): + res = publisher.safe_psql( + """ +select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication sr, pg_replication_slots s + where s.active_pid = sr.pid and s.slot_type = 'logical'; + """ + )[0] + sent_lsn, flush_lsn, curr_publisher_flush_lsn = Lsn(res[0]), Lsn(res[1]), Lsn(res[2]) + log.info( + f"sent_lsn={sent_lsn}, flush_lsn={flush_lsn}, publisher_flush_lsn={curr_publisher_flush_lsn}, waiting flush_lsn to reach {publisher_flush_lsn}" + ) + assert flush_lsn >= publisher_flush_lsn + + wait_until(30, 0.5, check_caughtup) + return publisher_flush_lsn + + +# Test that subscriber takes into account quorum committed flush_lsn in +# flush_lsn reporting to publisher. Without this, it may ack too far, losing +# data on restart because publisher advances START_REPLICATION position to the +# confirmed_flush_lsn of the slot. +def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): + env = neon_simple_env + # use vanilla as publisher to allow writes on it when safekeeper is down + vanilla_pg.configure( + [ + "wal_level = 'logical'", + # neon fork uses custom WAL records which won't work without extension installed with obscure + # ERROR: resource manager with ID 134 not registered + # error. + "shared_preload_libraries = 'neon'", + ] + ) + vanilla_pg.start() + vanilla_pg.safe_psql("create extension neon;") + + env.neon_cli.create_branch("subscriber") + sub = env.endpoints.create("subscriber") + sub.start() + + with vanilla_pg.cursor() as pcur: + with sub.cursor() as scur: + pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pcur.execute("CREATE PUBLICATION pub FOR TABLE t") + scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + + pub_connstr = vanilla_pg.connstr().replace("'", "''") + log.info(f"pub connstr is {pub_connstr}, subscriber connstr {sub.connstr()}") + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_connstr}' PUBLICATION pub with (synchronous_commit=off)" + scur.execute(query) + time.sleep(2) # let initial table sync complete + + # stop safekeeper so it won't get any data + for sk in env.safekeepers: + sk.stop() + # and insert to publisher + with vanilla_pg.cursor() as pcur: + for i in range(0, 1000): + pcur.execute("INSERT into t values (%s, random()*100000)", (i,)) + # wait until sub receives all data + logical_replication_sync(sub, vanilla_pg) + # Update confirmed_flush_lsn of the slot. If subscriber ack'ed recevied data + # as flushed we'll now lose it if subscriber restars. That's why + # logical_replication_wait_flush_lsn_sync is expected to hang while + # safekeeper is down. + vanilla_pg.safe_psql("checkpoint;") + assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000 + + # restart subscriber and ensure it can catch up lost tail again + sub.stop(mode="immediate") + for sk in env.safekeepers: + sk.start() + sub.start() + log.info("waiting for sync after restart") + logical_replication_wait_flush_lsn_sync(vanilla_pg) + assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000 diff --git a/test_runner/regress/test_oid_overflow.py b/test_runner/regress/test_oid_overflow.py index a94ae99ed9..e8eefc2414 100644 --- a/test_runner/regress/test_oid_overflow.py +++ b/test_runner/regress/test_oid_overflow.py @@ -37,7 +37,7 @@ def test_oid_overflow(neon_env_builder: NeonEnvBuilder): oid = cur.fetchall()[0][0] log.info(f"t2.relfilenode={oid}") - cur.execute("SELECT clear_buffer_cache()") + endpoint.clear_shared_buffers(cursor=cur) cur.execute("SELECT x from t1") assert cur.fetchone() == (1,) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 68a45f957c..bbf82fea4c 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -159,6 +159,8 @@ def test_pageserver_chaos( if build_type == "debug": pytest.skip("times out in debug builds") + # same rationale as with the immediate stop; we might leave orphan layers behind. + neon_env_builder.disable_scrub_on_exit() neon_env_builder.enable_pageserver_remote_storage(s3_storage()) if shard_count is not None: neon_env_builder.num_pageservers = shard_count @@ -220,3 +222,11 @@ def test_pageserver_chaos( # Check that all the updates are visible num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0] assert num_updates == i * 100000 + + # currently pageserver cannot tolerate the fact that "s3" goes away, and if + # we succeeded in a compaction before shutdown, there might be a lot of + # uploads pending, certainly more than what we can ingest with MOCK_S3 + # + # so instead, do a fast shutdown for this one test. + # See https://github.com/neondatabase/neon/issues/8709 + env.stop(immediate=True) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index f446f4f200..d2b8c2ed8b 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -2,6 +2,7 @@ import asyncio import json import subprocess import time +import urllib.parse from typing import Any, List, Optional, Tuple import psycopg2 @@ -275,6 +276,31 @@ def test_sql_over_http(static_proxy: NeonProxy): assert res["rowCount"] is None +def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy): + db = "db with spaces" + static_proxy.safe_psql_many( + ( + f'create database "{db}"', + "create role http with login password 'http' superuser", + ) + ) + + def q(sql: str, params: Optional[List[Any]] = None) -> Any: + params = params or [] + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/{urllib.parse.quote(db)}" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps({"query": sql, "params": params}), + headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200, response.text + return response.json() + + rows = q("select 42 as answer")["rows"] + assert rows == [{"answer": 42}] + + def test_sql_over_http_output_options(static_proxy: NeonProxy): static_proxy.safe_psql("create role http2 with login password 'http2' superuser") diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 2437c8f806..d128c60a99 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -61,7 +61,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Clear buffer cache to ensure no stale pages are brought into the cache") - c.execute("select clear_buffer_cache()") + endpoint.clear_shared_buffers(cursor=c) cache_entries = query_scalar( c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 9b2557a165..95c35e9641 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1,3 +1,4 @@ +import concurrent.futures import json import threading import time @@ -16,6 +17,7 @@ from fixtures.neon_fixtures import ( PageserverSchedulingPolicy, PgBin, StorageControllerApiException, + StorageControllerLeadershipStatus, TokenScope, last_flush_lsn_upload, ) @@ -30,7 +32,9 @@ from fixtures.pageserver.utils import ( timeline_delete_wait_completed, ) from fixtures.pg_version import PgVersion +from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, s3_storage +from fixtures.storage_controller_proxy import StorageControllerProxy from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until from fixtures.workload import Workload from mypy_boto3_s3.type_defs import ( @@ -2091,3 +2095,172 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): ) == 0 ) + + +# This is a copy of NeonEnv.start which injects the instance id and port +# into the call to NeonStorageController.start +def start_env(env: NeonEnv, storage_controller_port: int): + timeout_in_seconds = 30 + + # Storage controller starts first, so that pageserver /re-attach calls don't + # bounce through retries on startup + env.storage_controller.start(timeout_in_seconds, 1, storage_controller_port) + + # Wait for storage controller readiness to prevent unnecessary post start-up + # reconcile. + env.storage_controller.wait_until_ready() + + # Start up broker, pageserver and all safekeepers + futs = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=2 + len(env.pageservers) + len(env.safekeepers) + ) as executor: + futs.append( + executor.submit(lambda: env.broker.try_start() or None) + ) # The `or None` is for the linter + + for pageserver in env.pageservers: + futs.append( + executor.submit( + lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) + ) + ) + + for safekeeper in env.safekeepers: + futs.append( + executor.submit( + lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds) + ) + ) + + for f in futs: + f.result() + + +@pytest.mark.parametrize("step_down_times_out", [False, True]) +def test_storage_controller_leadership_transfer( + neon_env_builder: NeonEnvBuilder, + storage_controller_proxy: StorageControllerProxy, + port_distributor: PortDistributor, + step_down_times_out: bool, +): + neon_env_builder.num_pageservers = 3 + + neon_env_builder.storage_controller_config = { + "database_url": f"127.0.0.1:{port_distributor.get_port()}", + "start_as_candidate": True, + } + + neon_env_builder.storage_controller_port_override = storage_controller_proxy.port() + + storage_controller_1_port = port_distributor.get_port() + storage_controller_2_port = port_distributor.get_port() + + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + + env = neon_env_builder.init_configs() + start_env(env, storage_controller_1_port) + + assert ( + env.storage_controller.get_leadership_status() == StorageControllerLeadershipStatus.LEADER + ) + leader = env.storage_controller.get_leader() + assert leader["address"] == f"http://127.0.0.1:{storage_controller_1_port}/" + + if step_down_times_out: + env.storage_controller.configure_failpoints( + ("sleep-on-step-down-handling", "return(10000)") + ) + env.storage_controller.allowed_errors.append(".*request was dropped before completing.*") + + tenant_count = 2 + shard_count = 4 + tenants = set(TenantId.generate() for _ in range(0, tenant_count)) + + for tid in tenants: + env.storage_controller.tenant_create( + tid, shard_count=shard_count, placement_policy={"Attached": 1} + ) + env.storage_controller.reconcile_until_idle() + + env.storage_controller.start( + timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port + ) + + if not step_down_times_out: + + def previous_stepped_down(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.STEPPED_DOWN + ) + + wait_until(5, 1, previous_stepped_down) + + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") + + def new_becomes_leader(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.LEADER + ) + + wait_until(15, 1, new_becomes_leader) + leader = env.storage_controller.get_leader() + assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/" + + env.storage_controller.wait_until_ready() + env.storage_controller.consistency_check() + + if step_down_times_out: + env.storage_controller.allowed_errors.extend( + [ + ".*Leader.*did not respond to step-down request.*", + ".*Send step down request failed.*", + ".*Send step down request still failed.*", + ] + ) + + +def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder): + # single unsharded tenant, two locations + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_start() + + env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}}) + env.storage_controller.reconcile_until_idle() + + attached_id = int(env.storage_controller.locate(env.initial_tenant)[0]["node_id"]) + attached = next((ps for ps in env.pageservers if ps.id == attached_id)) + + def attached_is_draining(): + details = env.storage_controller.node_status(attached.id) + assert details["scheduling"] == "Draining" + + env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)")) + env.storage_controller.node_drain(attached.id) + + wait_until(10, 0.5, attached_is_draining) + + attached.restart() + + # we are unable to reconfigure node while the operation is still ongoing + with pytest.raises( + StorageControllerApiException, + match="Precondition failed: Ongoing background operation forbids configuring: drain.*", + ): + env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"}) + with pytest.raises( + StorageControllerApiException, + match="Precondition failed: Ongoing background operation forbids configuring: drain.*", + ): + env.storage_controller.node_configure(attached.id, {"availability": "Offline"}) + + env.storage_controller.cancel_node_drain(attached.id) + + def reconfigure_node_again(): + env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"}) + + # allow for small delay between actually having cancelled and being able reconfigure again + wait_until(4, 0.5, reconfigure_node_again) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 388f6a9e92..2844d1b1d2 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -204,6 +204,11 @@ def test_scrubber_physical_gc_ancestors( }, ) + # Create an extra timeline, to ensure the scrubber isn't confused by multiple timelines + env.storage_controller.pageserver_api().timeline_create( + env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate() + ) + # Make sure the original shard has some layers workload = Workload(env, tenant_id, timeline_id) workload.init() @@ -214,6 +219,11 @@ def test_scrubber_physical_gc_ancestors( shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately + # Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors + env.storage_controller.pageserver_api().timeline_create( + env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate() + ) + # Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which # compacts, and we only want to do tha explicitly later in the test. workload.write_rows(100, upload=False) @@ -305,10 +315,19 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder # Make sure the original shard has some layers workload = Workload(env, tenant_id, timeline_id) workload.init() - workload.write_rows(100) + workload.write_rows(100, upload=False) + workload.stop() new_shard_count = 4 shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + log.info(f"Waiting for shard {shard} on pageserver {ps.id}") + ps.http_client().timeline_checkpoint( + shard, timeline_id, compact=False, wait_until_uploaded=True + ) + + ps.http_client().deletion_queue_flush(execute=True) # Create a second timeline so that when we delete the first one, child shards still have some content in S3. # @@ -319,15 +338,6 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder PgVersion.NOT_SET, tenant_id, other_timeline_id ) - # Write after split so that child shards have some indices in S3 - workload.write_rows(100, upload=False) - for shard in shards: - ps = env.get_tenant_pageserver(shard) - log.info(f"Waiting for shard {shard} on pageserver {ps.id}") - ps.http_client().timeline_checkpoint( - shard, timeline_id, compact=False, wait_until_uploaded=True - ) - # The timeline still exists in child shards and they reference its layers, so scrubbing # now shouldn't delete anything. gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index b3767a2766..82fc26126d 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -5,7 +5,7 @@ import time from concurrent.futures import ThreadPoolExecutor from queue import Empty, Queue from threading import Barrier -from typing import List, Tuple +from typing import List, Set, Tuple import pytest from fixtures.common_types import Lsn, TimelineId @@ -97,7 +97,7 @@ def test_ancestor_detach_branched_from( client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") - wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + flush_ep_to_pageserver(env, ep, env.initial_tenant, env.initial_timeline) deltas = client.layer_map_info(env.initial_tenant, env.initial_timeline).delta_layers() # there is also the in-mem layer, but ignore it for now @@ -411,7 +411,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None - assert ep.safe_psql("SELECT clear_buffer_cache();") + ep.clear_shared_buffers() assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0 ep.stop() @@ -452,6 +452,9 @@ def test_compaction_induced_by_detaches_in_history( } ) env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + env.pageserver.allowed_errors.append( + ".*await_initial_logical_size: can't get semaphore cancel token, skipping" + ) client = env.pageserver.http_client() def delta_layers(timeline_id: TimelineId): @@ -524,6 +527,7 @@ def test_compaction_induced_by_detaches_in_history( assert len([filter(lambda x: x.l0, delta_layers(branch_timeline_id))]) == 1 skip_main = branches[1:] + branch_lsn = client.timeline_detail(env.initial_tenant, branch_timeline_id)["ancestor_lsn"] # take the fullbackup before and after inheriting the new L0s @@ -532,6 +536,13 @@ def test_compaction_induced_by_detaches_in_history( env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_before ) + # force initial logical sizes, so we can evict all layers from all + # timelines and exercise on-demand download for copy lsn prefix + client.timeline_detail( + env.initial_tenant, env.initial_timeline, force_await_initial_logical_size=True + ) + client.evict_all_layers(env.initial_tenant, env.initial_timeline) + for _, timeline_id in skip_main: reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert reparented == set(), "we have no earlier branches at any level" @@ -705,7 +716,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): log.info(f"stuck pageserver is id={stuck.id}") stuck_http = stuck.http_client() stuck_http.configure_failpoints( - ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause") + ("timeline-detach-ancestor::before_starting_after_locking-pausable", "pause") ) restarted = pageservers[int(shards[1]["node_id"])] @@ -716,7 +727,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): restarted_http = restarted.http_client() restarted_http.configure_failpoints( [ - ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause"), + ("timeline-detach-ancestor::before_starting_after_locking-pausable", "pause"), ] ) @@ -734,7 +745,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): target.detach_ancestor(env.initial_tenant, branch_timeline_id, timeout=1) stuck_http.configure_failpoints( - ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off") + ("timeline-detach-ancestor::before_starting_after_locking-pausable", "off") ) barrier = threading.Barrier(2) @@ -753,7 +764,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): # we have 10s, lets use 1/2 of that to help the shutdown start time.sleep(5) restarted_http.configure_failpoints( - ("timeline-detach-ancestor::before_starting_after_locking_pausable", "off") + ("timeline-detach-ancestor::before_starting_after_locking-pausable", "off") ) fut.result() @@ -806,23 +817,25 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( after starting the detach. What remains not tested by this: - - shutdown winning over complete - - Shutdown winning over complete needs gc blocking and reparenting any left-overs on retry. + - shutdown winning over complete, see test_timeline_is_deleted_before_timeline_detach_ancestor_completes """ if sharded and mode == "delete_tenant": # the shared/exclusive lock for tenant is blocking this: # timeline detach ancestor takes shared, delete tenant takes exclusive - pytest.skip( - "tenant deletion while timeline ancestor detach is underway is not supported yet" - ) + pytest.skip("tenant deletion while timeline ancestor detach is underway cannot happen") shard_count = 2 if sharded else 1 neon_env_builder.num_pageservers = shard_count - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count if sharded else None) + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count if sharded else None, + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + }, + ) for ps in env.pageservers: ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) @@ -831,7 +844,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( detached_timeline = env.neon_cli.create_branch("detached soon", "main") - failpoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable" env.storage_controller.reconcile_until_idle() shards = env.storage_controller.locate(env.initial_tenant) @@ -843,13 +856,20 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( victim = pageservers[int(shards[-1]["node_id"])] victim_http = victim.http_client() - victim_http.configure_failpoints((failpoint, "pause")) + victim_http.configure_failpoints((pausepoint, "pause")) def detach_ancestor(): target.detach_ancestor(env.initial_tenant, detached_timeline) - def at_failpoint() -> Tuple[str, LogCursor]: - return victim.assert_log_contains(f"at failpoint {failpoint}") + def at_failpoint() -> LogCursor: + msg, offset = victim.assert_log_contains(f"at failpoint {pausepoint}") + log.info(f"found {msg}") + msg, offset = victim.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + log.info(f"found {msg}") + return offset def start_delete(): if mode == "delete_timeline": @@ -882,26 +902,47 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( with ThreadPoolExecutor(max_workers=2) as pool: try: fut = pool.submit(detach_ancestor) - _, offset = wait_until(10, 1.0, at_failpoint) + offset = wait_until(10, 1.0, at_failpoint) delete = pool.submit(start_delete) - wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) - victim_http.configure_failpoints((failpoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) delete.result() assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + # TODO: match the error with pytest.raises(PageserverApiException) as exc: fut.result() + log.info(f"TODO: match this error: {exc.value}") assert exc.value.status_code == 503 finally: - victim_http.configure_failpoints((failpoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) + + if mode != "delete_timeline": + return + + # make sure the gc is unblocked + time.sleep(2) + victim.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) + + if not sharded: + # we have the other node only while sharded + return + + other = pageservers[int(shards[0]["node_id"])] + log.info(f"other is {other.id}") + _, offset = other.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/\\S+/timeline/\\S+/detach_ancestor .*\\}: Request handled, status: 200 OK", + ) + # this might be a lot earlier than the victims line, but that is okay. + _, offset = other.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) -@pytest.mark.parametrize("mode", ["delete_reparentable_timeline"]) +@pytest.mark.parametrize("mode", ["delete_reparentable_timeline", "create_reparentable_timeline"]) def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnvBuilder, mode: str): """ Technically possible storage controller concurrent interleaving timeline @@ -913,10 +954,6 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv must be detached. """ - assert ( - mode == "delete_reparentable_timeline" - ), "only one now, but we could have the create just as well, need gc blocking" - shard_count = 2 neon_env_builder.num_pageservers = shard_count env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) @@ -948,14 +985,21 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv for ps, shard_id in [(pageservers[int(x["node_id"])], x["shard_id"]) for x in shards]: ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline) - first_branch = env.neon_cli.create_branch( - "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn - ) + def create_reparentable_timeline() -> TimelineId: + return env.neon_cli.create_branch( + "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn + ) + + if mode == "delete_reparentable_timeline": + first_branch = create_reparentable_timeline() + else: + first_branch = None + detached_branch = env.neon_cli.create_branch( "detached_branch", ancestor_branch_name="main", ancestor_start_lsn=detached_branch_lsn ) - pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable" stuck = pageservers[int(shards[0]["node_id"])] stuck_http = stuck.http_client().without_status_retrying() @@ -967,12 +1011,6 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv (pausepoint, "pause"), ) - # noticed a surprising 409 if the other one would fail instead - # victim_http.configure_failpoints([ - # (pausepoint, "pause"), - # ("timeline-detach-ancestor::before_starting_after_locking", "return"), - # ]) - # interleaving a create_timeline which could be reparented will produce two # permanently different reparentings: one node has reparented, other has # not @@ -991,6 +1029,7 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv assert detail.get("ancestor_lsn") is None def first_branch_gone(): + assert first_branch is not None try: env.storage_controller.pageserver_api().timeline_detail( env.initial_tenant, first_branch @@ -1011,47 +1050,437 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv stuck_http.configure_failpoints((pausepoint, "off")) wait_until(10, 1.0, first_completed) - # if we would let victim fail, for some reason there'd be a 409 response instead of 500 - # victim_http.configure_failpoints((pausepoint, "off")) - # with pytest.raises(PageserverApiException, match=".* 500 Internal Server Error failpoint: timeline-detach-ancestor::before_starting_after_locking") as exc: - # fut.result() - # assert exc.value.status_code == 409 - - env.storage_controller.pageserver_api().timeline_delete( - env.initial_tenant, first_branch - ) - victim_http.configure_failpoints((pausepoint, "off")) - wait_until(10, 1.0, first_branch_gone) + if mode == "delete_reparentable_timeline": + assert first_branch is not None + env.storage_controller.pageserver_api().timeline_delete( + env.initial_tenant, first_branch + ) + victim_http.configure_failpoints((pausepoint, "off")) + wait_until(10, 1.0, first_branch_gone) + elif mode == "create_reparentable_timeline": + first_branch = create_reparentable_timeline() + victim_http.configure_failpoints((pausepoint, "off")) + else: + raise RuntimeError("{mode}") # it now passes, and we should get an error messages about mixed reparenting as the stuck still had something to reparent - fut.result() + mixed_results = "pageservers returned mixed results for ancestor detach; manual intervention is required." + with pytest.raises(PageserverApiException, match=mixed_results): + fut.result() msg, offset = env.storage_controller.assert_log_contains( ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*" ) - log.info(f"expected error message: {msg}") - env.storage_controller.allowed_errors.append( - ".*: shards returned different results matching=0 .*" + log.info(f"expected error message: {msg.rstrip()}") + env.storage_controller.allowed_errors.extend( + [ + ".*: shards returned different results matching=0 .*", + f".*: InternalServerError\\({mixed_results}", + ] ) - detach_timeline() + if mode == "create_reparentable_timeline": + with pytest.raises(PageserverApiException, match=mixed_results): + detach_timeline() + else: + # it is a bit shame to flag it and then it suceeds, but most + # likely there would be a retry loop which would take care of + # this in cplane + detach_timeline() - # FIXME: perhaps the above should be automatically retried, if we get mixed results? - not_found = env.storage_controller.log_contains( + retried = env.storage_controller.log_contains( ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*", - offset=offset, + offset, ) - - assert not_found is None + if mode == "delete_reparentable_timeline": + assert ( + retried is None + ), "detaching should had converged after both nodes saw the deletion" + elif mode == "create_reparentable_timeline": + assert retried is not None, "detaching should not have converged" + _, offset = retried finally: stuck_http.configure_failpoints((pausepoint, "off")) victim_http.configure_failpoints((pausepoint, "off")) + if mode == "create_reparentable_timeline": + assert first_branch is not None + # now we have mixed ancestry + assert ( + TimelineId( + stuck_http.timeline_detail(shards[0]["shard_id"], first_branch)[ + "ancestor_timeline_id" + ] + ) + == env.initial_timeline + ) + assert ( + TimelineId( + victim_http.timeline_detail(shards[-1]["shard_id"], first_branch)[ + "ancestor_timeline_id" + ] + ) + == detached_branch + ) + + # make sure we are still able to repair this by detaching the ancestor on the storage controller in case it ever happens + # if the ancestor would be deleted, we would partially fail, making deletion stuck. + env.storage_controller.pageserver_api().detach_ancestor(env.initial_tenant, first_branch) + + # and we should now have good results + not_found = env.storage_controller.log_contains( + ".*/timeline/\\S+/detach_ancestor.*: shards returned different results matching=0 .*", + offset, + ) + + assert not_found is None + assert ( + stuck_http.timeline_detail(shards[0]["shard_id"], first_branch)["ancestor_timeline_id"] + is None + ) + assert ( + victim_http.timeline_detail(shards[-1]["shard_id"], first_branch)[ + "ancestor_timeline_id" + ] + is None + ) + + +def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor( + neon_env_builder: NeonEnvBuilder, +): + shard_count = 2 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + + for ps in env.pageservers: + ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + pageservers = dict((int(p.id), p) for p in env.pageservers) + + env.storage_controller.reconcile_until_idle() + shards = env.storage_controller.locate(env.initial_tenant) + assert len(set(x["node_id"] for x in shards)) == shard_count + + detached_branch = env.neon_cli.create_branch("detached_branch", ancestor_branch_name="main") + + pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable" + failpoint = "timeline-detach-ancestor::before_starting_after_locking" + + stuck = pageservers[int(shards[0]["node_id"])] + stuck_http = stuck.http_client().without_status_retrying() + stuck_http.configure_failpoints( + (pausepoint, "pause"), + ) + + env.storage_controller.allowed_errors.append( + f".*Error processing HTTP request: .* failpoint: {failpoint}" + ) + http = env.storage_controller.pageserver_api() + + victim = pageservers[int(shards[-1]["node_id"])] + victim.allowed_errors.append( + f".*Error processing HTTP request: InternalServerError\\(failpoint: {failpoint}" + ) + victim_http = victim.http_client().without_status_retrying() + victim_http.configure_failpoints([(pausepoint, "pause"), (failpoint, "return")]) + + def detach_timeline(): + http.detach_ancestor(env.initial_tenant, detached_branch) + + def paused_at_failpoint(): + stuck.assert_log_contains(f"at failpoint {pausepoint}") + victim.assert_log_contains(f"at failpoint {pausepoint}") + + def first_completed(): + detail = stuck_http.timeline_detail(shards[0]["shard_id"], detached_branch) + log.info(detail) + assert detail.get("ancestor_lsn") is None + + with ThreadPoolExecutor(max_workers=1) as pool: + try: + fut = pool.submit(detach_timeline) + wait_until(10, 1.0, paused_at_failpoint) + + # let stuck complete + stuck_http.configure_failpoints((pausepoint, "off")) + wait_until(10, 1.0, first_completed) + + victim_http.configure_failpoints((pausepoint, "off")) + + with pytest.raises( + PageserverApiException, + match=f".*failpoint: {failpoint}", + ) as exc: + fut.result() + assert exc.value.status_code == 500 + + finally: + stuck_http.configure_failpoints((pausepoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) + + victim_http.configure_failpoints((failpoint, "off")) + detach_timeline() + + +def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: NeonEnvBuilder): + """ + Using a failpoint, force the completion step of timeline ancestor detach to + fail after reparenting a single timeline. + + Retrying should try reparenting until all reparentings are done, all the + time blocking gc even across restarts (first round). + + A completion failpoint is used to inhibit completion on second to last + round. + + On last round, the completion uses a path where no reparentings can happen + because original ancestor is deleted, and there is a completion to unblock + gc without restart. + """ + + # to get the remote storage metrics + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + env.pageserver.allowed_errors.extend( + [ + ".* reparenting failed: failpoint: timeline-detach-ancestor::allow_one_reparented", + ".* Error processing HTTP request: InternalServerError\\(failed to reparent all candidate timelines, please retry", + ".* Error processing HTTP request: InternalServerError\\(failpoint: timeline-detach-ancestor::complete_before_uploading", + ] + ) + + http = env.pageserver.http_client() + + def remote_storage_copy_requests(): + return http.get_metric_value( + "remote_storage_s3_request_seconds_count", + {"request_type": "copy_object", "result": "ok"}, + ) + + def reparenting_progress(timelines: List[TimelineId]) -> Tuple[int, Set[TimelineId]]: + reparented = 0 + not_reparented = set() + for timeline in timelines: + detail = http.timeline_detail(env.initial_tenant, timeline) + ancestor = TimelineId(detail["ancestor_timeline_id"]) + if ancestor == detached: + reparented += 1 + else: + not_reparented.add(timeline) + return (reparented, not_reparented) + + # main ------A-----B-----C-----D-----E> lsn + timelines = [] + with env.endpoints.create_start("main") as ep: + for counter in range(5): + ep.safe_psql( + f"create table foo_{counter} as select i::bigint from generate_series(1, 10000) t(i)" + ) + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + http.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + branch = env.neon_cli.create_branch( + f"branch_{counter}", "main", ancestor_start_lsn=branch_lsn + ) + timelines.append(branch) + + flush_ep_to_pageserver(env, ep, env.initial_tenant, env.initial_timeline) + + # detach "E" which has most reparentable timelines under it + detached = timelines.pop() + assert len(timelines) == 4 + + http = http.without_status_retrying() + + http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return")) + + not_reparented: Set[TimelineId] = set() + # tracked offset in the pageserver log which is at least at the most recent activation + offset = None + + def try_detach(): + with pytest.raises( + PageserverApiException, + match=".*failed to reparent all candidate timelines, please retry", + ) as exc: + http.detach_ancestor(env.initial_tenant, detached) + assert exc.value.status_code == 503 + + # first round -- do more checking to make sure the gc gets paused + try_detach() + + assert ( + http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None + ), "first round should had detached 'detached'" + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == 1 + + time.sleep(2) + _, offset = env.pageserver.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request", + offset, + ) + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + _, offset = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + metric = remote_storage_copy_requests() + assert metric != 0 + # make sure the gc blocking is persistent over a restart + env.pageserver.restart() + env.pageserver.quiesce_tenants() + time.sleep(2) + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + assert env.pageserver.log_contains(".* gc_loop.*: [0-9] timelines need GC", offset) is None + _, offset = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + # restore failpoint for the next reparented + http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return")) + + reparented_before = reparented + + # do two more rounds + for _ in range(2): + try_detach() + + assert ( + http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None + ), "first round should had detached 'detached'" + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == reparented_before + 1 + reparented_before = reparented + + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + metric = remote_storage_copy_requests() + assert metric == 0, "copies happen in the first round" + + assert offset is not None + assert len(not_reparented) == 1 + + http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "return")) + + # almost final round, the failpoint is hit no longer as there is only one reparented and one always gets to succeed. + # the tenant is restarted once more, but we fail during completing. + with pytest.raises( + PageserverApiException, match=".* timeline-detach-ancestor::complete_before_uploading" + ) as exc: + http.detach_ancestor(env.initial_tenant, detached) + assert exc.value.status_code == 500 + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + + # delete the previous ancestor to take a different path to completion. all + # other tests take the "detach? reparent complete", but this only hits + # "complete". + http.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20) + + http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off")) + + reparented_resp = http.detach_ancestor(env.initial_tenant, detached) + assert reparented_resp == set(timelines) + # no need to quiesce_tenants anymore, because completion does that + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == len(timelines) + + time.sleep(2) + assert ( + env.pageserver.log_contains(".*: attach finished, activating", offset) is None + ), "there should be no restart with the final detach_ancestor as it only completed" + + # gc is unblocked + env.pageserver.assert_log_contains(".* gc_loop.*: 5 timelines need GC", offset) + + metric = remote_storage_copy_requests() + assert metric == 0 + + +def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( + neon_env_builder: NeonEnvBuilder, +): + """ + Make sure that a timeline deleted after restart will unpause gc blocking. + """ + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + http = env.pageserver.http_client() + + detached = env.neon_cli.create_branch("detached") + + failpoint = "timeline-detach-ancestor::after_activating_before_finding-pausable" + + http.configure_failpoints((failpoint, "pause")) + + def detach_and_get_stuck(): + return http.detach_ancestor(env.initial_tenant, detached) + + def request_processing_noted_in_log(): + _, offset = env.pageserver.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request", + ) + return offset + + def delete_detached(): + return http.timeline_delete(env.initial_tenant, detached) + + try: + with ThreadPoolExecutor(max_workers=1) as pool: + detach = pool.submit(detach_and_get_stuck) + + offset = wait_until(10, 1.0, request_processing_noted_in_log) + + # make this named fn tor more clear failure test output logging + def pausepoint_hit_with_gc_paused() -> LogCursor: + env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + _, at = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + return at + + offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused) + + delete_detached() + + wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0) + + http.configure_failpoints((failpoint, "off")) + + with pytest.raises( + PageserverApiException, match="NotFound: Timeline .* was not found" + ) as exc: + detach.result() + assert exc.value.status_code == 404 + finally: + http.configure_failpoints((failpoint, "off")) + + # make sure gc has been unblocked + time.sleep(2) + + env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) + # TODO: -# - after starting the operation, pageserver is shutdown, restarted -# - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited -# - deletion of reparented while reparenting should fail once, then succeed (?) # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. # diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 1f220eec9e..642b9e449b 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1137,3 +1137,10 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) else: raise RuntimeError(activation_method) + + client.configure_failpoints( + [ + ("timeline-calculate-logical-size-pause", "off"), + ("walreceiver-after-ingest", "off"), + ] + ) diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 225b952e73..7272979c4a 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -62,7 +62,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # Clear the buffer cache, to force the VM page to be re-fetched from # the page server - cur.execute("SELECT clear_buffer_cache()") + endpoint.clear_shared_buffers(cursor=cur) # Check that an index-only scan doesn't see the deleted row. If the # clearing of the VM bit was not replayed correctly, this would incorrectly diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index f02f19c588..5d3b263936 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -49,7 +49,13 @@ from fixtures.remote_storage import ( ) from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import are_walreceivers_absent -from fixtures.utils import PropagatingThread, get_dir_size, query_scalar, start_in_background +from fixtures.utils import ( + PropagatingThread, + get_dir_size, + query_scalar, + start_in_background, + wait_until, +) def wait_lsn_force_checkpoint( @@ -63,6 +69,18 @@ def wait_lsn_force_checkpoint( lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") + wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + +def wait_lsn_force_checkpoint_at( + lsn: Lsn, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + pageserver_conn_options = pageserver_conn_options or {} + auth_token = None if "password" in pageserver_conn_options: auth_token = pageserver_conn_options["password"] @@ -2159,7 +2177,7 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder): # generate some data to commit WAL on safekeepers endpoint.safe_psql("insert into t select generate_series(1,100), 'action'") # clear the buffers - endpoint.safe_psql("select clear_buffer_cache()") + endpoint.clear_shared_buffers() # read data to fetch pages from pageserver endpoint.safe_psql("select sum(i) from t") @@ -2304,3 +2322,138 @@ def test_s3_eviction( ) assert event_metrics_seen + + +def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilder): + """ + Verify that pulling timeline from a SK with an uploaded partial segment + does not lead to consistency issues: + 1. Start 3 SKs - only use two + 2. Ingest a bit of WAL + 3. Wait for partial to be uploaded + 4. Pull timeline to the third SK + 6. Replace source with destination SK and start compute + 5. Wait for source SK to evict timeline + 6. Go back to initial compute SK config and validate that + source SK can unevict the timeline (S3 state is consistent) + """ + neon_env_builder.auth_enabled = True + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--delete-offloaded-wal", + "--partial-backup-timeout", + "500ms", + "--control-file-save-interval", + "500ms", + "--eviction-min-resident=500ms", + ] + + env = neon_env_builder.init_start(initial_tenant_conf={"checkpoint_timeout": "100ms"}) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + + log.info("use only first 2 safekeepers, 3rd will be seeded") + endpoint = env.endpoints.create("main") + endpoint.active_safekeepers = [1, 2] + endpoint.start() + endpoint.safe_psql("create table t(key int, value text)") + endpoint.safe_psql("insert into t select generate_series(1, 180000), 'papaya'") + + endpoint.stop() + + def source_partial_segment_uploaded(): + first_segment_name = "000000010000000000000001" + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + + candidate_seg = None + for seg in segs: + if "partial" in seg and "sk1" in seg and not seg.startswith(first_segment_name): + candidate_seg = seg + + if candidate_seg is not None: + # The term might change, causing the segment to be gc-ed shortly after, + # so give it a bit of time to make sure it's stable. + time.sleep(2) + + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + assert candidate_seg in segs + return candidate_seg + + raise Exception("Partial segment not uploaded yet") + + source_partial_segment = wait_until(15, 1, source_partial_segment_uploaded) + log.info( + f"Uploaded segments before pull are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + log.info(f"Tracking source partial segment: {source_partial_segment}") + + src_flush_lsn = src_sk.get_flush_lsn(tenant_id, timeline_id) + log.info(f"flush_lsn on src before pull_timeline: {src_flush_lsn}") + + pageserver_conn_options = {"password": env.auth_keys.generate_tenant_token(tenant_id)} + wait_lsn_force_checkpoint_at( + src_flush_lsn, tenant_id, timeline_id, env.pageserver, pageserver_conn_options + ) + + dst_sk.pull_timeline([src_sk], tenant_id, timeline_id) + + def evicted(): + evictions = src_sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + + if evictions is None or evictions == 0: + raise Exception("Eviction did not happen on source safekeeper yet") + + wait_until(30, 1, evicted) + + endpoint.start(safekeepers=[2, 3]) + + def new_partial_segment_uploaded(): + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + for seg in segs: + if "partial" in seg and "sk3" in seg: + return seg + + raise Exception("Partial segment not uploaded yet") + + log.info( + f"Uploaded segments before post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + + endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'") + wait_until(15, 1, new_partial_segment_uploaded) + + log.info( + f"Uploaded segments after post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + + # Allow for some gc iterations to happen and assert that the original + # uploaded partial segment remains in place. + time.sleep(5) + segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + assert source_partial_segment in segs + + log.info( + f"Uploaded segments at the end are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" + ) + + # Restart the endpoint in order to check that the source safekeeper + # can unevict the timeline + endpoint.stop() + endpoint.start(safekeepers=[1, 2]) + + def unevicted(): + unevictions = src_sk.http_client().get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + + if unevictions is None or unevictions == 0: + raise Exception("Uneviction did not happen on source safekeeper yet") + + wait_until(10, 1, unevicted) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 7bbe834c8c..3fd7a45f8a 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 7bbe834c8c2dc37802eca8484311599bc47341f6 +Subproject commit 3fd7a45f8aae85c080df6329e3c85887b7f3a737 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 9eba7dd382..46b4b235f3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 9eba7dd382606ffca43aca865f337ec21bcdac73 +Subproject commit 46b4b235f38413ab5974bb22c022f9b829257674 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 5377f5ed72..47a9122a5a 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3 +Subproject commit 47a9122a5a150a3217fafd3f3d4fe8e020ea718a diff --git a/vendor/revisions.json b/vendor/revisions.json index 570dfc1550..6e3e489b5d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,14 @@ { - "v16": ["16.3", "5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3"], - "v15": ["15.7", "9eba7dd382606ffca43aca865f337ec21bcdac73"], - "v14": ["14.12", "7bbe834c8c2dc37802eca8484311599bc47341f6"] + "v16": [ + "16.3", + "47a9122a5a150a3217fafd3f3d4fe8e020ea718a" + ], + "v15": [ + "15.7", + "46b4b235f38413ab5974bb22c022f9b829257674" + ], + "v14": [ + "14.12", + "3fd7a45f8aae85c080df6329e3c85887b7f3a737" + ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 832fe06bf6..2d9b372654 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -30,13 +30,17 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } +crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] } +der = { version = "0.7", default-features = false, features = ["oid", "pem", "std"] } deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } +digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["sink"] } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } +generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } @@ -44,6 +48,7 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } +lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } @@ -64,8 +69,10 @@ rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } -sha2 = { version = "0.10", features = ["asm"] } +sha2 = { version = "0.10", features = ["asm", "oid"] } +signature = { version = "2", default-features = false, features = ["digest", "rand_core", "std"] } smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } +spki = { version = "0.7", default-features = false, features = ["pem", "std"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } tikv-jemalloc-sys = { version = "0.5" } @@ -81,7 +88,7 @@ tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } -zeroize = { version = "1", features = ["derive"] } +zeroize = { version = "1", features = ["derive", "serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } @@ -97,6 +104,7 @@ getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } +lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" }