diff --git a/.github/actionlint.yml b/.github/actionlint.yml index d27fa01efa..a5282876d0 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -13,3 +13,4 @@ config-variables: - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID + - DEV_AWS_OIDC_ROLE_ARN diff --git a/.github/actions/set-docker-config-dir/action.yml b/.github/actions/set-docker-config-dir/action.yml new file mode 100644 index 0000000000..3ee8bec8c6 --- /dev/null +++ b/.github/actions/set-docker-config-dir/action.yml @@ -0,0 +1,36 @@ +name: "Set custom docker config directory" +description: "Create a directory for docker config and set DOCKER_CONFIG" + +# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings +runs: + using: "composite" + steps: + - name: Show warning on GitHub-hosted runners + if: runner.environment == 'github-hosted' + shell: bash -euo pipefail {0} + run: | + # Using the following environment variables to find a path to the workflow file + # ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch + # ${GITHUB_REPOSITORY} - octocat/hello-world + # ${GITHUB_REF} - refs/heads/my_branch + # From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables + + filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"} + filename=${filename_with_ref%"@$GITHUB_REF"} + + # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message + title='Unnecessary usage of `.github/actions/set-docker-config-dir`' + message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners' + echo "::warning file=${filename},title=${title}::${message}" + + - uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7 + env: + DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }} + with: + main: | + mkdir -p "${DOCKER_CONFIG}" + echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV + post: | + if [ -d "${DOCKER_CONFIG}" ]; then + rm -r "${DOCKER_CONFIG}" + fi diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml new file mode 100644 index 0000000000..7229776cd6 --- /dev/null +++ b/.github/workflows/_benchmarking_preparation.yml @@ -0,0 +1,152 @@ +name: Prepare benchmarking databases by restoring dumps + +on: + workflow_call: + # no inputs needed + +defaults: + run: + shell: bash -euxo pipefail {0} + +jobs: + setup-databases: + strategy: + fail-fast: false + matrix: + platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ] + database: [ clickbench, tpch, userexample ] + + env: + LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib + PLATFORM: ${{ matrix.platform }} + PG_BINARIES: /tmp/neon/pg_install/v16/bin + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: --init + + steps: + - name: Set up Connection String + id: set-up-prep-connstr + run: | + case "${PLATFORM}" in + neon) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} + ;; + aws-rds-postgres) + CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} + ;; + aws-aurora-serverless-v2-postgres) + CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }} + ;; + *) + echo >&2 "Unknown PLATFORM=${PLATFORM}" + exit 1 + ;; + esac + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + # we create a table that has one row for each database that we want to restore with the status whether the restore is done + - name: Create benchmark_restore_status table if it does not exist + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + # to avoid a race condition of multiple jobs trying to create the table at the same time, + # we use an advisory lock + run: | + ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c " + SELECT pg_advisory_lock(4711); + CREATE TABLE IF NOT EXISTS benchmark_restore_status ( + databasename text primary key, + restore_done boolean + ); + SELECT pg_advisory_unlock(4711); + " + + - name: Check if restore is already done + id: check-restore-done + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + run: | + skip=false + if ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM benchmark_restore_status WHERE databasename='${{ env.DATABASE_NAME }}' AND restore_done=true;" | grep -q 1; then + echo "Restore already done for database ${{ env.DATABASE_NAME }} on platform ${{ env.PLATFORM }}. Skipping this database." + skip=true + fi + echo "skip=${skip}" | tee -a $GITHUB_OUTPUT + + - name: Check and create database if it does not exist + if: steps.check-restore-done.outputs.skip != 'true' + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + run: | + DB_EXISTS=$(${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM pg_database WHERE datname='${{ env.DATABASE_NAME }}'") + if [ "$DB_EXISTS" != "1" ]; then + echo "Database ${{ env.DATABASE_NAME }} does not exist. Creating it..." + ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "CREATE DATABASE \"${{ env.DATABASE_NAME }}\";" + else + echo "Database ${{ env.DATABASE_NAME }} already exists." + fi + + - name: Download dump from S3 to /tmp/dumps + if: steps.check-restore-done.outputs.skip != 'true' + env: + DATABASE_NAME: ${{ matrix.database }} + run: | + mkdir -p /tmp/dumps + aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/ + + - name: Replace database name in connection string + if: steps.check-restore-done.outputs.skip != 'true' + id: replace-dbname + env: + DATABASE_NAME: ${{ matrix.database }} + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + run: | + # Extract the part before the database name + base_connstr="${BENCHMARK_CONNSTR%/*}" + # Extract the query parameters (if any) after the database name + query_params="${BENCHMARK_CONNSTR#*\?}" + # Reconstruct the new connection string + if [ "$query_params" != "$BENCHMARK_CONNSTR" ]; then + new_connstr="${base_connstr}/${DATABASE_NAME}?${query_params}" + else + new_connstr="${base_connstr}/${DATABASE_NAME}" + fi + echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT + + - name: Restore dump + if: steps.check-restore-done.outputs.skip != 'true' + env: + DATABASE_NAME: ${{ matrix.database }} + DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }} + # the following works only with larger computes: + # PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7" + # we add the || true because: + # the dumps were created with Neon and contain neon extensions that are not + # available in RDS, so we will always report an error, but we can ignore it + run: | + ${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \ + -d "${DATABASE_CONNSTR}" /tmp/dumps/${DATABASE_NAME}.pg_dump || true + + - name: Update benchmark_restore_status table + if: steps.check-restore-done.outputs.skip != 'true' + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }} + DATABASE_NAME: ${{ matrix.database }} + run: | + ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c " + INSERT INTO benchmark_restore_status (databasename, restore_done) VALUES ('${{ env.DATABASE_NAME }}', true) + ON CONFLICT (databasename) DO UPDATE SET restore_done = true; + " diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 0f4dac841e..106c3e3138 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -56,6 +56,10 @@ concurrency: jobs: bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} + permissions: + contents: write + statuses: write + id-token: write # Required for OIDC authentication in azure runners strategy: fail-fast: false matrix: @@ -63,9 +67,13 @@ jobs: - DEFAULT_PG_VERSION: 16 PLATFORM: "neon-staging" region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} + RUNNER: [ self-hosted, us-east-2, x64 ] + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - DEFAULT_PG_VERSION: 16 PLATFORM: "azure-staging" region_id: 'azure-eastus2' + RUNNER: [ self-hosted, eastus2, x64 ] + IMAGE: neondatabase/build-tools:pinned env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" @@ -76,14 +84,21 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.RUNNER }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.IMAGE }} options: --init steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials # necessary on Azure runners + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -161,6 +176,7 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -237,6 +253,9 @@ jobs: id: pgbench-compare-matrix run: | region_id_default=${{ env.DEFAULT_REGION_ID }} + runner_default='["self-hosted", "us-east-2", "x64"]' + runner_azure='["self-hosted", "eastus2", "x64"]' + image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned" matrix='{ "pg_version" : [ 16 @@ -250,16 +269,20 @@ jobs: "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] + "runner": ['"$runner_default"'], + "image": [ "'"$image_default"'" ], + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' - if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"}]') + if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then + matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -299,9 +322,17 @@ jobs: echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT + prepare_AWS_RDS_databases: + uses: ./.github/workflows/_benchmarking_preparation.yml + secrets: inherit + pgbench-compare: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} - needs: [ generate-matrices ] + needs: [ generate-matrices, prepare_AWS_RDS_databases ] + permissions: + contents: write + statuses: write + id-token: write # Required for OIDC authentication in azure runners strategy: fail-fast: false @@ -317,9 +348,9 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.runner }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.image }} options: --init # Increase timeout to 8h, default timeout is 6h @@ -328,6 +359,13 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials # necessary on Azure runners + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Download Neon artifact uses: ./.github/actions/download with: @@ -435,12 +473,20 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} pgbench-pgvector: + permissions: + contents: write + statuses: write + id-token: write # Required for OIDC authentication in azure runners strategy: fail-fast: false matrix: include: - PLATFORM: "neonvm-captest-pgvector" + RUNNER: [ self-hosted, us-east-2, x64 ] + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned - PLATFORM: "azure-captest-pgvector" + RUNNER: [ self-hosted, eastus2, x64 ] + IMAGE: neondatabase/build-tools:pinned env: TEST_PG_BENCH_DURATIONS_MATRIX: "15m" @@ -453,9 +499,9 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.PLATFORM }} - runs-on: [ self-hosted, us-east-2, x64 ] + runs-on: ${{ matrix.RUNNER }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + image: ${{ matrix.IMAGE }} options: --init steps: @@ -466,12 +512,12 @@ jobs: - name: Install postgresql-16 where pytest expects it run: | cd /home/nonroot - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb - wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb - dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg - dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg - dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb + wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb + dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg + dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg mkdir -p /tmp/neon/pg_install/v16/bin ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql @@ -496,6 +542,13 @@ jobs: esac echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + - name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3 + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set @@ -524,7 +577,7 @@ jobs: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - + - name: Create Allure report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -547,7 +600,7 @@ jobs: # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - needs: [ generate-matrices, pgbench-compare ] + needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -555,7 +608,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }} TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }} @@ -607,6 +660,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_clickbench + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -636,7 +690,7 @@ jobs: # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - needs: [ generate-matrices, clickbench-compare ] + needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -644,7 +698,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} @@ -676,7 +730,7 @@ jobs: ENV_PLATFORM=RDS_AURORA_TPCH ;; rds-postgres) - ENV_PLATFORM=RDS_AURORA_TPCH + ENV_PLATFORM=RDS_POSTGRES_TPCH ;; *) echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'" @@ -702,6 +756,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch + pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -723,7 +778,7 @@ jobs: user-examples-compare: if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} - needs: [ generate-matrices, tpch-compare ] + needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ] strategy: fail-fast: false @@ -731,7 +786,7 @@ jobs: env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - DEFAULT_PG_VERSION: 14 + DEFAULT_PG_VERSION: 16 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 76fc58151a..f4f6e6971f 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -56,13 +56,7 @@ jobs: - uses: actions/checkout@v4 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p /tmp/.docker-custom - echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV - + - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -89,11 +83,6 @@ jobs: cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }} tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf /tmp/.docker-custom - merge-images: needs: [ build-image ] runs-on: ubuntu-22.04 diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c7ae2aedd4..78f9f11a65 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -484,12 +484,7 @@ jobs: submodules: true fetch-depth: 0 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -521,11 +516,6 @@ jobs: tags: | neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - neon-image: needs: [ neon-image-arch, tag ] runs-on: ubuntu-22.04 @@ -570,12 +560,7 @@ jobs: submodules: true fetch-depth: 0 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + - uses: ./.github/actions/set-docker-config-dir - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -658,11 +643,6 @@ jobs: tags: | neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - compute-node-image: needs: [ compute-node-image-arch, tag ] runs-on: ubuntu-22.04 @@ -735,13 +715,7 @@ jobs: curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - + - uses: ./.github/actions/set-docker-config-dir - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} @@ -764,11 +738,6 @@ jobs: run: | docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - test-images: needs: [ check-permissions, tag, neon-image, compute-node-image ] strategy: @@ -784,13 +753,7 @@ jobs: with: fetch-depth: 0 - # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings - # The default value is ~/.docker - - name: Set custom docker config directory - run: | - mkdir -p .docker-custom - echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - + - uses: ./.github/actions/set-docker-config-dir - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} @@ -830,11 +793,6 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml logs || 0 docker compose -f ./docker-compose/docker-compose.yml down - - name: Remove custom docker config directory - if: always() - run: | - rm -rf .docker-custom - promote-images: permissions: contents: read # This is required for actions/checkout diff --git a/.github/workflows/label-for-external-users.yml b/.github/workflows/label-for-external-users.yml new file mode 100644 index 0000000000..2f19a746e0 --- /dev/null +++ b/.github/workflows/label-for-external-users.yml @@ -0,0 +1,35 @@ +name: Add `external` label to issues and PRs created by external users + +on: + issues: + types: + - opened + pull_request: + types: + - opened + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. +permissions: {} + +env: + LABEL: external + +jobs: + add-label: + # This workflow uses `author_association` for PRs and issues to determine if the user is an external user. + # Possible values for `author_association`: https://docs.github.com/en/graphql/reference/enums#commentauthorassociation + if: ${{ !contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].author_association) }} + + runs-on: ubuntu-22.04 + permissions: + pull-requests: write + issues: write + + steps: + - name: Label new ${{ github.event_name }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request' && 'pull_request' || 'issue'].number }} + GH_CLI_COMMAND: ${{ github.event_name == 'pull_request' && 'pr' || 'issue' }} + run: | + gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index d4870e16ad..2ee66cfdc1 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -149,8 +149,6 @@ jobs: env: BUILD_TYPE: release - # remove the cachepot wrapper and build without crate caches - RUSTC_WRAPPER: "" # build with incremental compilation produce partial results # so do not attempt to cache this build, also disable the incremental compilation CARGO_INCREMENTAL: 0 diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index cf10910b0b..2e79498fc4 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -7,12 +7,20 @@ on: description: 'Source tag' required: true type: string + force: + description: 'Force the image to be pinned' + default: false + type: boolean workflow_call: inputs: from-tag: description: 'Source tag' required: true type: string + force: + description: 'Force the image to be pinned' + default: false + type: boolean defaults: run: @@ -22,15 +30,18 @@ concurrency: group: pin-build-tools-image-${{ inputs.from-tag }} cancel-in-progress: false +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} -jobs: - tag-image: - runs-on: ubuntu-22.04 +env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned - env: - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: pinned +jobs: + check-manifests: + runs-on: ubuntu-22.04 + outputs: + skip: ${{ steps.check-manifests.outputs.skip }} steps: - name: Check if we really need to pin the image @@ -47,27 +58,31 @@ jobs: echo "skip=${skip}" | tee -a $GITHUB_OUTPUT + tag-image: + needs: check-manifests + + # use format(..) to catch both inputs.force = true AND inputs.force = 'true' + if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true' + + runs-on: ubuntu-22.04 + + permissions: + id-token: write # for `azure/login` + + steps: - uses: docker/login-action@v3 - if: steps.check-manifests.outputs.skip == 'false' + with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub - if: steps.check-manifests.outputs.skip == 'false' - run: | - docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \ - neondatabase/build-tools:${FROM_TAG} - - uses: docker/login-action@v3 - if: steps.check-manifests.outputs.skip == 'false' with: registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - name: Azure login - if: steps.check-manifests.outputs.skip == 'false' uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1 with: client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }} @@ -75,13 +90,12 @@ jobs: subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }} - name: Login to ACR - if: steps.check-manifests.outputs.skip == 'false' run: | az acr login --name=neoneastus2 - - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR and ACR - if: steps.check-manifests.outputs.skip == 'false' + - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \ + -t neondatabase/build-tools:${TO_TAG} \ neondatabase/build-tools:${FROM_TAG} diff --git a/Dockerfile b/Dockerfile index ace112cccf..ceb1c7cb55 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh -ENV BUILD_TYPE release +ENV BUILD_TYPE=release RUN set -e \ && mold -run make -j $(nproc) -s neon-pg-ext \ && rm -rf pg_install/build \ @@ -29,24 +29,12 @@ WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG -# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. -# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. -# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build -ARG RUSTC_WRAPPER=cachepot -ENV AWS_REGION=eu-central-1 -ENV CACHEPOT_S3_KEY_PREFIX=cachepot -ARG CACHEPOT_BUCKET=neon-github-dev -#ARG AWS_ACCESS_KEY_ID -#ARG AWS_SECRET_ACCESS_KEY - COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --chown=nonroot . . -# Show build caching stats to check if it was used in the end. -# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ --bin pg_sni_router \ @@ -58,8 +46,7 @@ RUN set -e \ --bin proxy \ --bin neon_local \ --bin storage_scrubber \ - --locked --release \ - && cachepot -s + --locked --release # Build final image # @@ -104,7 +91,7 @@ RUN mkdir -p /data/.neon/ && \ # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. -ENV LD_LIBRARY_PATH /usr/local/v16/lib +ENV LD_LIBRARY_PATH=/usr/local/v16/lib VOLUME ["/data"] @@ -112,5 +99,5 @@ USER neon EXPOSE 6400 EXPOSE 9898 -CMD /usr/local/bin/pageserver -D /data/.neon +CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"] diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index dfaab1cb2e..d6beb61369 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -58,7 +58,7 @@ RUN set -e \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* # protobuf-compiler (protoc) -ENV PROTOC_VERSION 25.1 +ENV PROTOC_VERSION=25.1 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ && unzip -q protoc.zip -d protoc \ && mv protoc/bin/protoc /usr/local/bin/protoc \ @@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION v2.31.0 +ENV MOLD_VERSION=v2.33.0 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ @@ -168,7 +168,7 @@ USER nonroot:nonroot WORKDIR /home/nonroot # Python -ENV PYTHON_VERSION=3.9.18 \ +ENV PYTHON_VERSION=3.9.19 \ PYENV_ROOT=/home/nonroot/.pyenv \ PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH RUN set -e \ @@ -192,9 +192,14 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.80.0 +ENV RUSTC_VERSION=1.80.1 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" +ARG RUSTFILT_VERSION=0.2.1 +ARG CARGO_HAKARI_VERSION=0.9.30 +ARG CARGO_DENY_VERSION=0.16.1 +ARG CARGO_HACK_VERSION=0.6.31 +ARG CARGO_NEXTEST_VERSION=0.9.72 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -203,15 +208,13 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux . "$HOME/.cargo/env" && \ cargo --version && rustup --version && \ rustup component add llvm-tools-preview rustfmt clippy && \ - cargo install --git https://github.com/paritytech/cachepot && \ - cargo install rustfilt && \ - cargo install cargo-hakari && \ - cargo install cargo-deny --locked && \ - cargo install cargo-hack && \ - cargo install cargo-nextest && \ + cargo install rustfilt --version ${RUSTFILT_VERSION} && \ + cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \ + cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \ + cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \ + cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \ rm -rf /home/nonroot/.cargo/registry && \ rm -rf /home/nonroot/.cargo/git -ENV RUSTC_WRAPPER=cachepot # Show versions RUN whoami \ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 054d44e0ec..7acaf2f2fd 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -94,7 +94,7 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ @@ -411,7 +411,7 @@ FROM build-deps AS timescaledb-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ @@ -444,7 +444,7 @@ FROM build-deps AS pg-hint-plan-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin:$PATH" +ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14") \ @@ -480,7 +480,7 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-cron-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ @@ -506,7 +506,7 @@ RUN apt-get update && \ libboost-system1.74-dev \ libeigen3-dev -ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ @@ -546,7 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. FROM build-deps AS pg-uuidv7-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ @@ -563,7 +563,7 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz FROM build-deps AS pg-roaringbitmap-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ @@ -580,7 +580,7 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 FROM build-deps AS pg-semver-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \ @@ -598,7 +598,7 @@ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -622,7 +622,7 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-anon-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ @@ -750,7 +750,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz - FROM build-deps AS wal2json-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ @@ -766,7 +766,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. FROM build-deps AS pg-ivm-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ @@ -783,7 +783,7 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv FROM build-deps AS pg-partman-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PATH "/usr/local/pgsql/bin/:$PATH" +ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ @@ -1034,6 +1034,6 @@ RUN apt update && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 -ENV LANG en_US.utf8 +ENV LANG=en_US.utf8 USER postgres ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index f9bb2da7e7..9f879c4b08 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -824,11 +824,12 @@ impl Endpoint { // cleanup work to do after postgres stops, like syncing safekeepers, // etc. // - // If destroying, send it SIGTERM before waiting. Sometimes we do *not* - // want this cleanup: tests intentionally do stop when majority of - // safekeepers is down, so sync-safekeepers would hang otherwise. This - // could be a separate flag though. - self.wait_for_compute_ctl_to_exit(destroy)?; + // If destroying or stop mode is immediate, send it SIGTERM before + // waiting. Sometimes we do *not* want this cleanup: tests intentionally + // do stop when majority of safekeepers is down, so sync-safekeepers + // would hang otherwise. This could be a separate flag though. + let send_sigterm = destroy || mode == "immediate"; + self.wait_for_compute_ctl_to_exit(send_sigterm)?; if destroy { println!( "Destroying postgres data directory '{}'", diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 505d157efd..15bbac702f 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -158,6 +158,8 @@ pub struct NeonStorageControllerConf { /// Threshold for auto-splitting a tenant into shards pub split_threshold: Option, + + pub max_secondary_lag_bytes: Option, } impl NeonStorageControllerConf { @@ -173,6 +175,7 @@ impl Default for NeonStorageControllerConf { max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL, max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL, split_threshold: None, + max_secondary_lag_bytes: None, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index e054e9ee57..f180e922e8 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -383,6 +383,10 @@ impl StorageController { args.push(format!("--split-threshold={split_threshold}")) } + if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() { + args.push(format!("--max-secondary-lag-bytes={lag}")) + } + args.push(format!( "--neon-local-repo-dir={}", self.env.base_data_dir.display() diff --git a/deny.toml b/deny.toml index 469609c496..dc985138e6 100644 --- a/deny.toml +++ b/deny.toml @@ -4,6 +4,7 @@ # to your expectations and requirements. # Root options +[graph] targets = [ { triple = "x86_64-unknown-linux-gnu" }, { triple = "aarch64-unknown-linux-gnu" }, @@ -12,6 +13,7 @@ targets = [ ] all-features = false no-default-features = false +[output] feature-depth = 1 # This section is considered when running `cargo deny check advisories` @@ -19,17 +21,13 @@ feature-depth = 1 # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html [advisories] db-urls = ["https://github.com/rustsec/advisory-db"] -vulnerability = "deny" -unmaintained = "warn" yanked = "warn" -notice = "warn" ignore = [] # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html [licenses] -unlicensed = "deny" allow = [ "Apache-2.0", "Artistic-2.0", @@ -42,10 +40,6 @@ allow = [ "OpenSSL", "Unicode-DFS-2016", ] -deny = [] -copyleft = "warn" -allow-osi-fsf-free = "neither" -default = "deny" confidence-threshold = 0.8 exceptions = [ # Zlib license has some restrictions if we decide to change sth diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index b275349168..5fd4080c28 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -1,13 +1,18 @@ # Summary +# Looking for `neon.tech` docs? + +This page linkes to a selection of technical content about the open source code in this repository. + +Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code +in this repository. + +# Architecture + [Introduction]() - [Separation of Compute and Storage](./separation-compute-storage.md) -# Architecture - - [Compute]() - - [WAL proposer]() - - [WAL Backpressure]() - [Postgres changes](./core_changes.md) - [Pageserver](./pageserver.md) @@ -16,33 +21,15 @@ - [WAL Redo](./pageserver-walredo.md) - [Page cache](./pageserver-pagecache.md) - [Storage](./pageserver-storage.md) - - [Datadir mapping]() - - [Layer files]() - - [Branching]() - - [Garbage collection]() - - [Cloud Storage]() - [Processing a GetPage request](./pageserver-processing-getpage.md) - [Processing WAL](./pageserver-processing-wal.md) - - [Management API]() - - [Tenant Rebalancing]() - [WAL Service](walservice.md) - [Consensus protocol](safekeeper-protocol.md) - - [Management API]() - - [Rebalancing]() - -- [Control Plane]() - -- [Proxy]() - [Source view](./sourcetree.md) - [docker.md](./docker.md) — Docker images and building pipeline. - [Error handling and logging](./error-handling.md) - - [Testing]() - - [Unit testing]() - - [Integration testing]() - - [Benchmarks]() - - [Glossary](./glossary.md) @@ -58,28 +45,6 @@ # RFCs -- [RFCs](./rfcs/README.md) - -- [002-storage](rfcs/002-storage.md) -- [003-laptop-cli](rfcs/003-laptop-cli.md) -- [004-durability](rfcs/004-durability.md) -- [005-zenith_local](rfcs/005-zenith_local.md) -- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md) -- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md) -- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md) -- [008-push-pull](rfcs/008-push-pull.md) -- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md) -- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md) -- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md) -- [010-storage_details](rfcs/010-storage_details.md) -- [011-retention-policy](rfcs/011-retention-policy.md) -- [012-background-tasks](rfcs/012-background-tasks.md) -- [013-term-history](rfcs/013-term-history.md) -- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md) -- [014-storage-lsm](rfcs/014-storage-lsm.md) -- [015-storage-messaging](rfcs/015-storage-messaging.md) -- [016-connection-routing](rfcs/016-connection-routing.md) -- [017-timeline-data-management](rfcs/017-timeline-data-management.md) -- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md) -- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md) -- [cluster-size-limits](rfcs/cluster-size-limits.md) +Major changes are documented in RFCS: +- See [RFCs](./rfcs/README.md) for more information +- view the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md new file mode 100644 index 0000000000..239ec58186 --- /dev/null +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -0,0 +1,495 @@ +# Safekeeper dynamic membership change + +To quickly recover from safekeeper node failures and do rebalancing we need to +be able to change set of safekeepers the timeline resides on. The procedure must +be safe (not lose committed log) regardless of safekeepers and compute state. It +should be able to progress if any majority of old safekeeper set, any majority +of new safekeeper set and compute are up and connected. This is known as a +consensus membership change. It always involves two phases: 1) switch old +majority to old + new configuration, preventing commits without acknowledge from +the new set 2) bootstrap the new set by ensuring majority of the new set has all +data which ever could have been committed before the first phase completed; +after that switch is safe to finish. Without two phases switch to the new set +which quorum might not intersect with quorum of the old set (and typical case of +ABC -> ABD switch is an example of that, because quorums AC and BD don't +intersect). Furthermore, procedure is typically carried out by the consensus +leader, and so enumeration of configurations which establishes order between +them is done through consensus log. + +In our case consensus leader is compute (walproposer), and we don't want to wake +up all computes for the change. Neither we want to fully reimplement the leader +logic second time outside compute. Because of that the proposed algorithm relies +for issuing configurations on the external fault tolerant (distributed) strongly +consisent storage with simple API: CAS (compare-and-swap) on the single key. +Properly configured postgres suits this. + +In the system consensus is implemented at the timeline level, so algorithm below +applies to the single timeline. + +## Algorithm + +### Definitions + +A configuration is + +``` +struct Configuration { + generation: Generation, // a number uniquely identifying configuration + sk_set: Vec, // current safekeeper set + new_sk_set: Optional>, +} +``` + +Configuration with `new_set` present is used for the intermediate step during +the change and called joint configuration. Generations establish order of +generations: we say `c1` is higher than `c2` if `c1.generation` > +`c2.generation`. + +### Persistently stored data changes + +Safekeeper starts storing its current configuration in the control file. Update +of is atomic, so in-memory value always matches the persistent one. + +External CAS providing storage (let's call it configuration storage here) also +stores configuration for each timeline. It is initialized with generation 1 and +initial set of safekeepers during timeline creation. Executed CAS on it must +never be lost. + +### Compute <-> safekeeper protocol changes + +`ProposerGreeting` message carries walproposer's configuration if it is already +established (see below), else null. `AcceptorGreeting` message carries +safekeeper's current `Configuration`. All further messages (`VoteRequest`, +`VoteResponse`, `ProposerElected`, `AppendRequest`, `AppendResponse`) carry +generation number, of walproposer in case of wp->sk message or of safekeeper in +case of sk->wp message. + +### Safekeeper changes + +Basic rule: once safekeeper observes configuration higher than his own it +immediately switches to it. It must refuse all messages with lower generation +that his. It also refuses messages if it is not member of the current generation +(that is, of either `sk_set` of `sk_new_set`), though it is likely not unsafe to +process them (walproposer should ignore result anyway). + +If there is non null configuration in `ProposerGreeting` and it is higher than +current safekeeper one, safekeeper switches to it. + +Safekeeper sends its current configuration in its first message to walproposer +`AcceptorGreeting`. It refuses all other walproposer messages if the +configuration generation in them is less than its current one. Namely, it +refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In +response it sends its current configuration generation to let walproposer know. + +Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` +accepting `Configuration`. Safekeeper switches to the given conf it is higher than its +current one and ignores it otherwise. In any case it replies with +``` +struct ConfigurationSwitchResponse { + conf: Configuration, + term: Term, + last_log_term: Term, + flush_lsn: Lsn, +} +``` + +### Compute (walproposer) changes + +Basic rule is that joint configuration requires votes from majorities in the +both `set` and `new_sk_set`. + +Compute receives list of safekeepers to connect to from the control plane as +currently and tries to communicate with all of them. However, the list does not +define consensus members. Instead, on start walproposer tracks highest +configuration it receives from `AcceptorGreeting`s. Once it assembles greetings +from majority of `sk_set` and majority of `new_sk_set` (if it is present), it +establishes this configuration as its own and moves to voting. + +It should stop talking to safekeepers not listed in the configuration at this +point, though it is not unsafe to continue doing so. + +To be elected it must receive votes from both majorites if `new_sk_set` is present. +Similarly, to commit WAL it must receive flush acknowledge from both majorities. + +If walproposer hears from safekeeper configuration higher than his own (i.e. +refusal to accept due to configuration change) it simply restarts. + +### Change algorithm + +The following algorithm can be executed anywhere having access to configuration +storage and safekeepers. It is safe to interrupt / restart it and run multiple +instances of it concurrently, though likely one of them won't make +progress then. It accepts `desired_set: Vec` as input. + +Algorithm will refuse to make the change if it encounters previous interrupted +change attempt, but in this case it will try to finish it. + +It will eventually converge if old majority, new majority and configuration +storage are reachable. + +1) Fetch current timeline configuration from the configuration storage. +2) If it is already joint one and `new_set` is different from `desired_set` + refuse to change. However, assign join conf to (in memory) var + `join_conf` and proceed to step 4 to finish the ongoing change. +3) Else, create joint `joint_conf: Configuration`: increment current conf number + `n` and put `desired_set` to `new_sk_set`. Persist it in the configuration + storage by doing CAS on the current generation: change happens only if + current configuration number is still `n`. Apart from guaranteeing uniqueness + of configurations, CAS linearizes them, ensuring that new configuration is + created only following the previous one when we know that the transition is + safe. Failed CAS aborts the procedure. +4) Call `PUT` `configuration` on safekeepers from the current set, + delivering them `joint_conf`. Collecting responses from majority is required + to proceed. If any response returned generation higher than + `joint_conf.generation`, abort (another switch raced us). Otherwise, choose + max `` among responses and establish it as + (in memory) `sync_position`. Also choose max `term` and establish it as (in + memory) `sync_term`. We can't finish the switch until majority of the new set + catches up to this `sync_position` because data before it could be committed + without ack from the new set. Similarly, we'll bump term on new majority + to `sync_term` so that two computes with the same term are never elected. +4) Initialize timeline on safekeeper(s) from `new_sk_set` where it + doesn't exist yet by doing `pull_timeline` from the majority of the + current set. Doing that on majority of `new_sk_set` is enough to + proceed, but it is reasonable to ensure that all `new_sk_set` members + are initialized -- if some of them are down why are we migrating there? +5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. + Success on majority is enough. +6) Repeatedly call `PUT` `configuration` on safekeepers from the new set, + delivering them `joint_conf` and collecting their positions. This will + switch them to the `joint_conf` which generally won't be needed + because `pull_timeline` already includes it and plus additionally would be + broadcast by compute. More importantly, we may proceed to the next step + only when `` on the majority of the new set reached + `sync_position`. Similarly, on the happy path no waiting is not needed because + `pull_timeline` already includes it. However, we should double + check to be safe. For example, timeline could have been created earlier e.g. + manually or after try-to-migrate, abort, try-to-migrate-again sequence. +7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new + safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration + storage under one more CAS. +8) Call `PUT` `configuration` on safekeepers from the new set, + delivering them `new_conf`. It is enough to deliver it to the majority + of the new set; the rest can be updated by compute. + +I haven't put huge effort to make the description above very precise, because it +is natural language prone to interpretations anyway. Instead I'd like to make TLA+ +spec of it. + +Description above focuses on safety. To make the flow practical and live, here a few more +considerations. +1) It makes sense to ping new set to ensure it we are migrating to live node(s) before + step 3. +2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed + it is safe to rollback to the old conf with one more CAS. +3) On step 4 timeline might be already created on members of the new set for various reasons; + the simplest is the procedure restart. There are more complicated scenarious like mentioned + in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving + generations, so seems simpler to treat existing timeline as success. However, this also + has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in + the step 5 is never reached until compute is (re)awaken up to synchronize new member(s). + I don't think we'll observe this in practice, but can add waking up compute if needed. +4) In the end timeline should be locally deleted on the safekeeper(s) which are + in the old set but not in the new one, unless they are unreachable. To be + safe this also should be done under generation number (deletion proceeds only if + current configuration is <= than one in request and safekeeper is not memeber of it). +5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`, + jump to step 7, using it as `new_conf`. + +## Implementation + +The procedure ought to be driven from somewhere. Obvious candidates are control +plane and storage_controller; and as each of them already has db we don't want +yet another storage. I propose to manage safekeepers in storage_controller +because 1) since it is in rust it simplifies simulation testing (more on this +below) 2) it already manages pageservers. + +This assumes that migration will be fully usable only after we migrate all +tenants/timelines to storage_controller. It is discussible whether we want also +to manage pageserver attachments for all of these, but likely we do. + +This requires us to define storcon <-> cplane interface. + +### storage_controller <-> control plane interface + +First of all, control plane should +[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829) +storing safekeepers per timeline instead of per tenant because we can't migrate +tenants atomically. + +The important question is how updated configuration is delivered from +storage_controller to control plane to provide it to computes. As always, there +are two options, pull and push. Let's do it the same push as with pageserver +`/notify-attach` because 1) it keeps storage_controller out of critical compute +start path 2) provides easier upgrade: there won't be such a thing as 'timeline +managed by control plane / storcon', cplane just takes the value out of its db +when needed 3) uniformity. It makes storage_controller responsible for retrying notifying +control plane until it succeeds. + +So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and +updates it in the db if the provided conf generation is higher (the cplane db +should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it +should update db which makes the call successful, and then try to schedule +`apply_config` if possible, it is ok if not. storage_controller +should rate limit calling the endpoint, but likely this won't be needed, as migration +throughput is limited by `pull_timeline`. + +Timeline (branch) creation in cplane should call storage_controller POST +`tenant/:tenant_id/timeline` like it currently does for sharded tenants. +Response should be augmented with `safekeeper_conf: Configuration`. The call +should be retried until succeeds. + +Timeline deletion and tenant deletion in cplane should call appropriate +storage_controller endpoints like it currently does for sharded tenants. The +calls should be retried until they succeed. + +### storage_controller implementation + +Current 'load everything on startup and keep in memory' easy design is fine. +Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16 +byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so +10^6 of timelines shouldn't take more than 100MB. + +Similar to pageserver attachment Intents storage_controller would have in-memory +`MigrationRequest` (or its absense) for each timeline and pool of tasks trying +to make these request reality; this ensures one instance of storage_controller +won't do several migrations on the same timeline concurrently. In the first +version it is simpler to have more manual control and no retries, i.e. migration +failure removes the request. Later we can build retries and automatic +scheduling/migration. `MigrationRequest` is +``` +enum MigrationRequest { + To(Vec), + FinishPending, +} +``` + +`FinishPending` requests to run the procedure to ensure state is clean: current +configuration is not joint and majority of safekeepers are aware of it, but do +not attempt to migrate anywhere. If current configuration fetched on step 1 is +not joint it jumps to step 7. It should be run at startup for all timelines (but +similarly, in the first version it is ok to trigger it manually). + +#### Schema + +`safekeepers` table mirroring current `nodes` should be added, except that for +`scheduling_policy` field (seems like `status` is a better name for it): it is enough +to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3) +`decomissioned`. + +`timelines` table: +``` +table! { + // timeline_id is primary key + timelines (tenant_id, timeline_id) { + timeline_id -> Varchar, + tenant_id -> Varchar, + generation -> Int4, + sk_set -> Array, // list of safekeeper ids + new_sk_set -> Nullable>, // list of safekeeper ids, null if not joint conf + cplane_notified_generation -> Int4, + } +} +``` + +#### API + +Node management is similar to pageserver: +1) POST `/control/v1/safekeepers` upserts safekeeper. +2) GET `/control/v1/safekeepers` lists safekeepers. +3) GET `/control/v1/safekeepers/:node_id` gets safekeeper. +4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g. + `offline` or `decomissioned`. Initially it is simpler not to schedule any + migrations here. + +Safekeeper deploy scripts should register safekeeper at storage_contorller as +they currently do with cplane, under the same id. + +Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline` +would 1) choose initial set of safekeepers; 2) write to the db initial +`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in +case of conflict; 3) create timeline on the majority of safekeepers (already +created is ok). + +We don't want to block timeline creation when one safekeeper is down. Currently +this is solved by compute implicitly creating timeline on any safekeeper it is +connected to. This creates ugly timeline state on safekeeper when timeline is +created, but start LSN is not defined yet. It would be nice to remove this; to +do that, controller can in the background retry to create timeline on +safekeeper(s) which missed that during initial creation call. It can do that +through `pull_timeline` from majority so it doesn't need to remember +`parent_lsn` in its db. + +Timeline deletion removes the row from the db and forwards deletion to the +current configuration members. Without additional actions deletions might leak, +see below on this; initially let's ignore these, reporting to cplane success if +at least one safekeeper deleted the timeline (this will remove s3 data). + +Tenant deletion repeats timeline deletion for all timelines. + +Migration API: the first version is the simplest and the most imperative: +1) PUT `/control/v1/safekeepers/migrate` schedules `MigrationRequest`s to move +all timelines from one safekeeper to another. It accepts json +``` +{ + "src_sk": u32, + "dst_sk": u32, + "limit": Optional, +} +``` + +Returns list of scheduled requests. + +2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest` + to move single timeline to given set of safekeepers: +``` +{ + "desired_set": Vec, +} +``` + +Returns scheduled request. + +Similar call should be added for the tenant. + +It would be great to have some way of subscribing to the results (apart from +looking at logs/metrics). + +Migration is executed as described above. One subtlety is that (local) deletion on +source safekeeper might fail, which is not a problem if we are going to +decomission the node but leaves garbage otherwise. I'd propose in the first version +1) Don't attempt deletion at all if node status is `offline`. +2) If it failed, just issue warning. +And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and +remove garbage timelines for manual use. It will 1) list all timelines on the +safekeeper 2) compare each one against configuration storage: if timeline +doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can +be deleted under generation number if node is not member of current generation. + +Automating this is untrivial; we'd need to register all potential missing +deletions in the same transaction +which switches configurations. Similarly when timeline is fully deleted to +prevent cplane operation from blocking when some safekeeper is not available +deletion should be also registered. + +One more task pool should infinitely retry notifying control plane about changed +safekeeper sets. + +3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return + current in memory state of the timeline and pending `MigrationRequest`, + if any. + +4) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort` tries to abort the + migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS + (incrementing generation as always). + +#### Dealing with multiple instances of storage_controller + +Operations described above executed concurrently might create some errors but do +not prevent progress, so while we normally don't want to run multiple instances +of storage_controller it is fine to have it temporarily, e.g. during redeploy. + +Any interactions with db update in-memory controller state, e.g. if migration +request failed because different one is in progress, controller remembers that +and tries to finish it. + +## Testing + +`neon_local` should be switched to use storage_controller, playing role of +control plane. + +There should be following layers of tests: +1) Model checked TLA+ spec specifies the algorithm and verifies its basic safety. + +2) To cover real code and at the same time test many schedules we should have + simulation tests. For that, configuration storage, storage_controller <-> + safekeeper communication and pull_timeline need to be mocked and main switch + procedure wrapped to as a node (thread) in simulation tests, using these + mocks. Test would inject migrations like it currently injects + safekeeper/walproposer restars. Main assert is the same -- committed WAL must + not be lost. + +3) Since simulation testing injects at relatively high level points (not + syscalls), it omits some code, in particular `pull_timeline`. Thus it is + better to have basic tests covering whole system as well. Extended version of + `test_restarts_under_load` would do: start background load and do migration + under it, then restart endpoint and check that no reported commits + had been lost. I'd also add one more creating classic network split scenario, with + one compute talking to AC and another to BD while migration from nodes ABC to ABD + happens. + +4) Simple e2e test should ensure that full flow including cplane notification works. + +## Order of implementation and rollout + +Note that +- Control plane parts and integration with it is fully independent from everything else + (tests would use simulation and neon_local). +- There is a lot of infra work making storage_controller aware of timelines and safekeepers + and its impl/rollout should be separate from migration itself. +- Initially walproposer can just stop working while it observers joint configuration. + Such window would be typically very short anyway. + +To rollout smoothly, both walproposer and safekeeper should have flag +`configurations_enabled`; when set to false, they would work as currently, i.e. +walproposer is able to commit on whatever safekeeper set it is provided. Until +all timelines are managed by storcon we'd need to use current script to migrate +and update/drop entries in the storage_controller database if it has any. + +Safekeepers would need to be able to talk both current and new protocol version +with compute to reduce number of computes restarted in prod once v2 protocol is +deployed (though before completely switching we'd need to force this). + +Let's have the following rollout order: +- storage_controller becomes aware of safekeepers; +- storage_controller gets timeline creation for new timelines and deletion requests, but + doesn't manage all timelines yet. Migration can be tested on these new timelines. + To keep control plane and storage_controller databases in sync while control + plane still chooses the safekeepers initially (until all timelines are imported + it can choose better), `TimelineCreateRequest` can get optional safekeepers + field with safekeepers chosen by cplane. +- Then we can import all existing timelines from control plane to + storage_controller and gradually enable configurations region by region. + + +Very rough implementation order: +- Add concept of configurations to safekeepers (including control file), + implement v3 protocol. +- Implement walproposer changes, including protocol. +- Implement storconn part. Use it in neon_local (and pytest). +- Make cplane store safekeepers per timeline instead of per tenant. +- Implement cplane/storcon integration. Route branch creation/deletion + through storcon. Then we can test migration of new branches. +- Finally import existing branches. Then we can drop cplane + safekeeper selection code. Gradually enable configurations at + computes and safekeepers. Before that, all computes must talk only + v3 protocol version. + +## Integration with evicted timelines + +Currently, `pull_timeline` doesn't work correctly with evicted timelines because +copy would point to original partial file. To fix let's just do s3 copy of the +file. It is a bit stupid as generally unnecessary work, but it makes sense to +implement proper migration before doing smarter timeline archival. [Issue](https://github.com/neondatabase/neon/issues/8542) + +## Possible optimizations + +Steps above suggest walproposer restart (with re-election) and thus reconnection +to safekeepers. Since by bumping term on new majority we ensure that leader +terms are unique even across generation switches it is possible to preserve +connections. However, it is more complicated, reconnection is very fast and it +is much more important to avoid compute restart than millisecond order of write +stall. + +Multiple joint consensus: algorithm above rejects attempt to change membership +while another attempt is in progress. It is possible to overlay them and AFAIK +Aurora does this but similarly I don't think this is needed. + +## Misc + +We should use Compute <-> safekeeper protocol change to include other (long +yearned) modifications: +- send data in network order to make arm work. +- remove term_start_lsn from AppendRequest +- add horizon to TermHistory +- add to ProposerGreeting number of connection from this wp to sk diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 3af3f74e9c..2fdd7de38f 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -22,6 +22,11 @@ pub struct Key { pub field6: u32, } +/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as +/// a struct of fields. +#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)] +pub struct CompactKey(i128); + /// The storage key size. pub const KEY_SIZE: usize = 18; @@ -130,6 +135,14 @@ impl Key { } } + pub fn to_compact(&self) -> CompactKey { + CompactKey(self.to_i128()) + } + + pub fn from_compact(k: CompactKey) -> Self { + Self::from_i128(k.0) + } + pub const fn next(&self) -> Key { self.add(1) } @@ -199,6 +212,13 @@ impl fmt::Display for Key { } } +impl fmt::Display for CompactKey { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let k = Key::from_compact(*self); + k.fmt(f) + } +} + impl Key { pub const MIN: Key = Key { field1: u8::MIN, diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs index e88cab5d6a..0fec221276 100644 --- a/libs/pageserver_api/src/models/utilization.rs +++ b/libs/pageserver_api/src/models/utilization.rs @@ -1,4 +1,5 @@ -use utils::serde_system_time::SystemTime; +use std::time::SystemTime; +use utils::{serde_percent::Percent, serde_system_time}; /// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// the next tenant. @@ -9,19 +10,88 @@ use utils::serde_system_time::SystemTime; /// not handle full u64 values properly. #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)] pub struct PageserverUtilization { - /// Used disk space + /// Used disk space (physical, ground truth from statfs()) #[serde(serialize_with = "ser_saturating_u63")] pub disk_usage_bytes: u64, /// Free disk space #[serde(serialize_with = "ser_saturating_u63")] pub free_space_bytes: u64, - /// Lower is better score for how good candidate for a next tenant would this pageserver be. - #[serde(serialize_with = "ser_saturating_u63")] + + /// Wanted disk space, based on the tenant shards currently present on this pageserver: this + /// is like disk_usage_bytes, but it is stable and does not change with the cache state of + /// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay + /// there, or may be unrealistically low if the pageserver has attached tenants which haven't + /// downloaded layers yet. + #[serde(serialize_with = "ser_saturating_u63", default)] + pub disk_wanted_bytes: u64, + + // What proportion of total disk space will this pageserver use before it starts evicting data? + #[serde(default = "unity_percent")] + pub disk_usable_pct: Percent, + + // How many shards are currently on this node? + #[serde(default)] + pub shard_count: u32, + + // How many shards should this node be able to handle at most? + #[serde(default)] + pub max_shard_count: u32, + + /// Cached result of [`Self::score`] pub utilization_score: u64, + /// When was this snapshot captured, pageserver local time. /// /// Use millis to give confidence that the value is regenerated often enough. - pub captured_at: SystemTime, + pub captured_at: serde_system_time::SystemTime, +} + +fn unity_percent() -> Percent { + Percent::new(0).unwrap() +} + +impl PageserverUtilization { + const UTILIZATION_FULL: u64 = 1000000; + + /// Calculate a utilization score. The result is to be inrepreted as a fraction of + /// Self::UTILIZATION_FULL. + /// + /// Lower values are more affine to scheduling more work on this node. + /// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work. + /// - 0.0 represents an empty node. + /// - Negative values are forbidden + /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to + /// layer eviction. + pub fn score(&self) -> u64 { + let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes) + * self.disk_usable_pct.get() as u64) + / 100; + let disk_utilization_score = + self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity; + + let shard_utilization_score = + self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64; + std::cmp::max(disk_utilization_score, shard_utilization_score) + } + + pub fn refresh_score(&mut self) { + self.utilization_score = self.score(); + } + + /// A utilization structure that has a full utilization score: use this as a placeholder when + /// you need a utilization but don't have real values yet. + pub fn full() -> Self { + Self { + disk_usage_bytes: 1, + free_space_bytes: 0, + disk_wanted_bytes: 1, + disk_usable_pct: Percent::new(100).unwrap(), + shard_count: 1, + max_shard_count: 1, + utilization_score: Self::UTILIZATION_FULL, + captured_at: serde_system_time::SystemTime(SystemTime::now()), + } + } } /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. @@ -49,15 +119,19 @@ mod tests { let doc = PageserverUtilization { disk_usage_bytes: u64::MAX, free_space_bytes: 0, - utilization_score: u64::MAX, - captured_at: SystemTime( + disk_wanted_bytes: u64::MAX, + utilization_score: 13, + disk_usable_pct: Percent::new(90).unwrap(), + shard_count: 100, + max_shard_count: 200, + captured_at: serde_system_time::SystemTime( std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), ), }; let s = serde_json::to_string(&doc).unwrap(); - let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#; + let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}"; assert_eq!(s, expected); } diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index 2fef8d35df..f65c080ad4 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -5,13 +5,40 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; /// Can be cloned, moved and kept around in futures as "guard objects". #[derive(Clone)] pub struct Completion { - _token: TaskTrackerToken, + token: TaskTrackerToken, +} + +impl std::fmt::Debug for Completion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Completion") + .field("siblings", &self.token.task_tracker().len()) + .finish() + } +} + +impl Completion { + /// Returns true if this completion is associated with the given barrier. + pub fn blocks(&self, barrier: &Barrier) -> bool { + TaskTracker::ptr_eq(self.token.task_tracker(), &barrier.0) + } + + pub fn barrier(&self) -> Barrier { + Barrier(self.token.task_tracker().clone()) + } } /// Barrier will wait until all clones of [`Completion`] have been dropped. #[derive(Clone)] pub struct Barrier(TaskTracker); +impl std::fmt::Debug for Barrier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Barrier") + .field("remaining", &self.0.len()) + .finish() + } +} + impl Default for Barrier { fn default() -> Self { let (_, rx) = channel(); @@ -51,5 +78,5 @@ pub fn channel() -> (Completion, Barrier) { tracker.close(); let token = tracker.token(); - (Completion { _token: token }, Barrier(tracker)) + (Completion { token }, Barrier(tracker)) } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index a46d68ef33..f4fc0ba57b 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -128,7 +128,7 @@ pub mod circuit_breaker; /// /// ############################################################################################# /// TODO this macro is not the way the library is intended to be used, see for details. -/// We use `cachepot` to reduce our current CI build times: +/// We used `cachepot` to reduce our current CI build times: /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation. /// The problem needs further investigation and regular `const` declaration instead of a macro. diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 9bab02e46c..0336302de0 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -95,7 +95,7 @@ async fn ingest( } } - layer.put_value(key, lsn, &data, &ctx).await?; + layer.put_value(key.to_compact(), lsn, &data, &ctx).await?; } layer.freeze(lsn + 1).await; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 932918410c..da0c11d9bf 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -124,8 +124,6 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); - info!(?conf.get_impl, "starting with get page implementation"); - info!(?conf.get_vectored_impl, "starting with vectored get page implementation"); info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); let tenants_path = conf.tenants_path(); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f4c367bd4d..3ac5ac539f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -29,12 +29,12 @@ use utils::{ logging::LogFormat, }; +use crate::l0_flush::L0FlushConfig; +use crate::tenant::config::TenantConfOpt; use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; -use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; -use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl}; use crate::{tenant::config::TenantConf, virtual_file}; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; @@ -133,14 +133,8 @@ pub mod defaults { #virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' -#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' - -#get_impl = '{DEFAULT_GET_IMPL}' - #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' -#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' - [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -278,14 +272,8 @@ pub struct PageServerConf { pub virtual_file_io_engine: virtual_file::IoEngineKind, - pub get_vectored_impl: GetVectoredImpl, - - pub get_impl: GetImpl, - pub max_vectored_read_bytes: MaxVectoredReadBytes, - pub validate_vectored_get: bool, - pub image_compression: ImageCompressionAlgorithm, /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this @@ -396,14 +384,8 @@ struct PageServerConfigBuilder { virtual_file_io_engine: BuilderValue, - get_vectored_impl: BuilderValue, - - get_impl: BuilderValue, - max_vectored_read_bytes: BuilderValue, - validate_vectored_get: BuilderValue, - image_compression: BuilderValue, ephemeral_bytes_per_memory_kb: BuilderValue, @@ -493,13 +475,10 @@ impl PageServerConfigBuilder { virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), - get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), - get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()), max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), image_compression: Set(DEFAULT_IMAGE_COMPRESSION), - validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: Set(L0FlushConfig::default()), compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), @@ -659,22 +638,10 @@ impl PageServerConfigBuilder { self.virtual_file_io_engine = BuilderValue::Set(value); } - pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) { - self.get_vectored_impl = BuilderValue::Set(value); - } - - pub fn get_impl(&mut self, value: GetImpl) { - self.get_impl = BuilderValue::Set(value); - } - pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { self.max_vectored_read_bytes = BuilderValue::Set(value); } - pub fn get_validate_vectored_get(&mut self, value: bool) { - self.validate_vectored_get = BuilderValue::Set(value); - } - pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) { self.image_compression = BuilderValue::Set(value); } @@ -745,10 +712,7 @@ impl PageServerConfigBuilder { heatmap_upload_concurrency, secondary_download_concurrency, ingest_batch_size, - get_vectored_impl, - get_impl, max_vectored_read_bytes, - validate_vectored_get, image_compression, ephemeral_bytes_per_memory_kb, l0_flush, @@ -1002,21 +966,12 @@ impl PageServerConf { "virtual_file_io_engine" => { builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) } - "get_vectored_impl" => { - builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) - } - "get_impl" => { - builder.get_impl(parse_toml_from_str("get_impl", item)?) - } "max_vectored_read_bytes" => { let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; builder.get_max_vectored_read_bytes( MaxVectoredReadBytes( NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0"))) } - "validate_vectored_get" => { - builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) - } "image_compression" => { builder.get_image_compression(parse_toml_from_str("image_compression", item)?) } @@ -1106,14 +1061,11 @@ impl PageServerConf { secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), - get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), ), image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, - validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), @@ -1349,13 +1301,10 @@ background_task_maximum_delay = '334 s' secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), - get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), - validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), @@ -1425,13 +1374,10 @@ background_task_maximum_delay = '334 s' secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), - get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), max_vectored_read_bytes: MaxVectoredReadBytes( NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant") ), - validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, l0_flush: L0FlushConfig::default(), diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a983d8c4c2..fd4ead9d47 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1787,9 +1787,11 @@ async fn timeline_checkpoint_handler( } if wait_until_uploaded { + tracing::info!("Waiting for uploads to complete..."); timeline.remote_client.wait_completion().await // XXX map to correct ApiError for the cases where it's due to shutdown .context("wait completion").map_err(ApiError::InternalServerError)?; + tracing::info!("Uploads completed up to {}", timeline.get_remote_consistent_lsn_projected().unwrap_or(Lsn(0))); } json_response(StatusCode::OK, ()) @@ -1887,7 +1889,7 @@ async fn timeline_detach_ancestor_handler( // drop(tenant); let resp = match progress { - detach_ancestor::Progress::Prepared(_guard, prepared) => { + detach_ancestor::Progress::Prepared(attempt, prepared) => { // it would be great to tag the guard on to the tenant activation future let reparented_timelines = state .tenant_manager @@ -1895,10 +1897,10 @@ async fn timeline_detach_ancestor_handler( tenant_shard_id, timeline_id, prepared, + attempt, ctx, ) .await - .context("timeline detach ancestor completion") .map_err(ApiError::InternalServerError)?; AncestorDetached { @@ -2357,8 +2359,9 @@ async fn get_utilization( // regenerate at most 1Hz to allow polling at any rate. if !still_valid { let path = state.conf.tenants_path(); - let doc = crate::utilization::regenerate(path.as_std_path()) - .map_err(ApiError::InternalServerError)?; + let doc = + crate::utilization::regenerate(state.conf, path.as_std_path(), &state.tenant_manager) + .map_err(ApiError::InternalServerError)?; let mut buf = Vec::new(); serde_json::to_writer(&mut buf, &doc) diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 45a516566f..ede1791afa 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -56,7 +56,6 @@ impl Statvfs { } pub mod mock { - use anyhow::Context; use camino::Utf8Path; use regex::Regex; use tracing::log::info; @@ -135,14 +134,30 @@ pub mod mock { { continue; } - total += entry - .metadata() - .with_context(|| format!("get metadata of {:?}", entry.path()))? - .len(); + let m = match entry.metadata() { + Ok(m) => m, + Err(e) if is_not_found(&e) => { + // some temp file which got removed right as we are walking + continue; + } + Err(e) => { + return Err(anyhow::Error::new(e) + .context(format!("get metadata of {:?}", entry.path()))) + } + }; + total += m.len(); } Ok(total) } + fn is_not_found(e: &walkdir::Error) -> bool { + let Some(io_error) = e.io_error() else { + return false; + }; + let kind = io_error.kind(); + matches!(kind, std::io::ErrorKind::NotFound) + } + pub struct Statvfs { pub blocks: u64, pub blocks_available: u64, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2422ab4cf2..b065f58382 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -41,6 +41,7 @@ use tokio::sync::watch; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; +use upload_queue::NotInitialized; use utils::backoff; use utils::circuit_breaker::CircuitBreaker; use utils::completion; @@ -301,7 +302,11 @@ pub struct Tenant { pub(crate) timeline_get_throttle: Arc>, - /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. + /// An ongoing timeline detach concurrency limiter. + /// + /// As a tenant will likely be restarted as part of timeline detach ancestor it makes no sense + /// to have two running at the same time. A different one can be started if an earlier one + /// has failed for whatever reason. ongoing_timeline_detach: std::sync::Mutex>, /// `index_part.json` based gc blocking reason tracking. @@ -601,6 +606,15 @@ impl From for GcError { } } +impl From for GcError { + fn from(value: NotInitialized) -> Self { + match value { + NotInitialized::Uninitialized => GcError::Remote(value.into()), + NotInitialized::Stopped | NotInitialized::ShuttingDown => GcError::TimelineCancelled, + } + } +} + impl From for GcError { fn from(_: timeline::layer_manager::Shutdown) -> Self { GcError::TimelineCancelled @@ -823,9 +837,9 @@ impl Tenant { // The Stopping case is for when we have passed control on to DeleteTenantFlow: // if it errors, we will call make_broken when tenant is already in Stopping. assert!( - matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), - "the attach task owns the tenant state until activation is complete" - ); + matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), + "the attach task owns the tenant state until activation is complete" + ); *state = TenantState::broken_from_reason(err.to_string()); }); @@ -3012,54 +3026,6 @@ impl Tenant { // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; - // Paranoia check: it is critical that GcInfo's list of child timelines is correct, to avoid incorrectly GC'ing data they - // depend on. So although GcInfo is updated continuously by Timeline::new and Timeline::drop, we also calculate it here - // and fail out if it's inaccurate. - // (this can be removed later, it's a risk mitigation for https://github.com/neondatabase/neon/pull/8427) - { - let mut all_branchpoints: BTreeMap> = - BTreeMap::new(); - timelines.iter().for_each(|timeline| { - if let Some(ancestor_timeline_id) = &timeline.get_ancestor_timeline_id() { - let ancestor_children = - all_branchpoints.entry(*ancestor_timeline_id).or_default(); - ancestor_children.push((timeline.get_ancestor_lsn(), timeline.timeline_id)); - } - }); - - for timeline in &timelines { - let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints - .remove(&timeline.timeline_id) - .unwrap_or_default(); - - branchpoints.sort_by_key(|b| b.0); - - let target = timeline.gc_info.read().unwrap(); - - // We require that retain_lsns contains everything in `branchpoints`, but not that - // they are exactly equal: timeline deletions can race with us, so retain_lsns - // may contain some extra stuff. It is safe to have extra timelines in there, because it - // just means that we retain slightly more data than we otherwise might. - let have_branchpoints = target.retain_lsns.iter().copied().collect::>(); - for b in &branchpoints { - if !have_branchpoints.contains(b) { - tracing::error!( - "Bug: `retain_lsns` is set incorrectly. Expected be {:?}, but found {:?}", - branchpoints, - target.retain_lsns - ); - debug_assert!(false); - // Do not GC based on bad information! - // (ab-use an existing GcError type rather than adding a new one, since this is a - // "should never happen" check that will be removed soon). - return Err(GcError::Remote(anyhow::anyhow!( - "retain_lsns failed validation!" - ))); - } - } - } - } - // Ok, we now know all the branch points. // Update the GC information for each timeline. let mut gc_timelines = Vec::with_capacity(timelines.len()); @@ -3770,6 +3736,19 @@ impl Tenant { pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt { self.tenant_conf.load().tenant_conf.clone() } + + /// How much local storage would this tenant like to have? It can cope with + /// less than this (via eviction and on-demand downloads), but this function enables + /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O + /// by keeping important things on local disk. + pub(crate) fn local_storage_wanted(&self) -> u64 { + let mut wanted = 0; + let timelines = self.timelines.lock().unwrap(); + for timeline in timelines.values() { + wanted += timeline.metrics.visible_physical_size_gauge.get(); + } + wanted + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index bbc070a81b..6073abc8c3 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -285,12 +285,15 @@ impl TimelineMetadata { } /// When reparenting, the `ancestor_lsn` does not change. + /// + /// Returns true if anything was changed. pub fn reparent(&mut self, timeline: &TimelineId) { assert!(self.body.ancestor_timeline.is_some()); // no assertion for redoing this: it's fine, we may have to repeat this multiple times over self.body.ancestor_timeline = Some(*timeline); } + /// Returns true if anything was changed pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { if let Some(ancestor) = self.body.ancestor_timeline { assert_eq!(ancestor, branchpoint.0); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 3f592f167e..5f2539d426 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -54,7 +54,7 @@ use utils::id::{TenantId, TimelineId}; use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; -use super::timeline::detach_ancestor::PreparedTimelineDetach; +use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; use super::{GlobalShutDown, TenantSharedResources}; /// For a tenant that appears in TenantsMap, it may either be @@ -224,21 +224,8 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result>); -enum BackgroundPurgesInner { - Open(tokio::task::JoinSet<()>), - // we use the async mutex for coalescing - ShuttingDown(Arc>>), -} - -impl Default for BackgroundPurges { - fn default() -> Self { - Self(Arc::new(std::sync::Mutex::new( - BackgroundPurgesInner::Open(JoinSet::new()), - ))) - } -} +#[derive(Clone, Default)] +pub struct BackgroundPurges(tokio_util::task::TaskTracker); impl BackgroundPurges { /// When we have moved a tenant's content to a temporary directory, we may delete it lazily in @@ -247,24 +234,32 @@ impl BackgroundPurges { /// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. /// Thus the [`BackgroundPurges`] type to keep track of these tasks. pub fn spawn(&self, tmp_path: Utf8PathBuf) { - let mut guard = self.0.lock().unwrap(); - let jset = match &mut *guard { - BackgroundPurgesInner::Open(ref mut jset) => jset, - BackgroundPurgesInner::ShuttingDown(_) => { - warn!("trying to spawn background purge during shutdown, ignoring"); - return; + // because on shutdown we close and wait, we are misusing TaskTracker a bit. + // + // so first acquire a token, then check if the tracker has been closed. the tracker might get closed + // right after, but at least the shutdown will wait for what we are spawning next. + let token = self.0.token(); + + if self.0.is_closed() { + warn!( + %tmp_path, + "trying to spawn background purge during shutdown, ignoring" + ); + return; + } + + let span = info_span!(parent: None, "background_purge", %tmp_path); + + let task = move || { + let _token = token; + let _entered = span.entered(); + if let Err(error) = std::fs::remove_dir_all(tmp_path.as_path()) { + // should we fatal_io_error here? + warn!(%error, "failed to purge tenant directory"); } }; - jset.spawn_on( - async move { - if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await { - // should we fatal_io_error here? - warn!(%error, path=%tmp_path, "failed to purge tenant directory"); - } - } - .instrument(info_span!(parent: None, "background_purge")), - BACKGROUND_RUNTIME.handle(), - ); + + BACKGROUND_RUNTIME.spawn_blocking(task); } /// When this future completes, all background purges have completed. @@ -278,42 +273,9 @@ impl BackgroundPurges { /// instances of this future will continue to be correct. #[instrument(skip_all)] pub async fn shutdown(&self) { - let jset = { - let mut guard = self.0.lock().unwrap(); - match &mut *guard { - BackgroundPurgesInner::Open(jset) => { - *guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new( - std::mem::take(jset), - ))) - } - BackgroundPurgesInner::ShuttingDown(_) => { - // calling shutdown multiple times is most likely a bug in pageserver shutdown code - warn!("already shutting down"); - } - }; - match &mut *guard { - BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(), - BackgroundPurgesInner::Open(_) => { - unreachable!("above code transitions into shut down state"); - } - } - }; - let mut jset = jset.lock().await; // concurrent callers coalesce here - while let Some(res) = jset.join_next().await { - match res { - Ok(()) => {} - Err(e) if e.is_panic() => { - // If it panicked, the error is already logged by the panic hook. - } - Err(e) if e.is_cancelled() => { - unreachable!("we don't cancel the joinset or runtime") - } - Err(e) => { - // No idea when this can happen, but let's log it. - warn!(%e, "background purge task failed or panicked"); - } - } - } + // forbid new tasks (can be called many times) + self.0.close(); + self.0.wait().await; } } @@ -1965,8 +1927,10 @@ impl TenantManager { tenant_shard_id: TenantShardId, timeline_id: TimelineId, prepared: PreparedTimelineDetach, + mut attempt: detach_ancestor::Attempt, ctx: &RequestContext, ) -> Result, anyhow::Error> { + use crate::tenant::timeline::detach_ancestor::Error; // FIXME: this is unnecessary, slotguard already has these semantics struct RevertOnDropSlot(Option); @@ -2015,43 +1979,98 @@ impl TenantManager { let timeline = tenant.get_timeline(timeline_id, true)?; - let reparented = timeline - .complete_detaching_timeline_ancestor(&tenant, prepared, ctx) + let resp = timeline + .detach_from_ancestor_and_reparent(&tenant, prepared, ctx) .await?; let mut slot_guard = slot_guard.into_inner(); - let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, ShutdownMode::Hard).await { - Ok(()) => { - slot_guard.drop_old_value()?; + let tenant = if resp.reset_tenant_required() { + attempt.before_reset_tenant(); + + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => { + slot_guard.drop_old_value()?; + } + Err(_barrier) => { + slot_guard.revert(); + // this really should not happen, at all, unless shutdown was already going? + anyhow::bail!("Cannot restart Tenant, already shutting down"); + } } - Err(_barrier) => { - slot_guard.revert(); - // this really should not happen, at all, unless shutdown was already going? - anyhow::bail!("Cannot restart Tenant, already shutting down"); + + let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; + + let shard_identity = config.shard; + let tenant = tenant_spawn( + self.conf, + tenant_shard_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(config)?, + shard_identity, + None, + SpawnMode::Eager, + ctx, + )?; + + { + let mut g = tenant.ongoing_timeline_detach.lock().unwrap(); + assert!( + g.is_none(), + "there cannot be any new timeline detach ancestor on newly created tenant" + ); + *g = Some((attempt.timeline_id, attempt.new_barrier())); } + + slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?; + tenant + } else { + tracing::info!("skipping tenant_reset as no changes made required it"); + tenant + }; + + if let Some(reparented) = resp.completed() { + // finally ask the restarted tenant to complete the detach + // + // rationale for 9999s: we don't really have a timetable here; if retried, the caller + // will get an 503. + tenant + .wait_to_become_active(std::time::Duration::from_secs(9999)) + .await + .map_err(|e| { + use pageserver_api::models::TenantState; + use GetActiveTenantError::{Cancelled, WillNotBecomeActive}; + match e { + Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => { + Error::ShuttingDown + } + other => Error::Unexpected(other.into()), + } + })?; + + utils::pausable_failpoint!( + "timeline-detach-ancestor::after_activating_before_finding-pausable" + ); + + let timeline = tenant + .get_timeline(attempt.timeline_id, true) + .map_err(|_| Error::DetachedNotFoundAfterRestart)?; + + timeline + .complete_detaching_timeline_ancestor(&tenant, attempt, ctx) + .await + .map(|()| reparented) + .map_err(|e| e.into()) + } else { + // at least the latest versions have now been downloaded and refreshed; be ready to + // retry another time. + Err(anyhow::anyhow!( + "failed to reparent all candidate timelines, please retry" + )) } - - let tenant_path = self.conf.tenant_path(&tenant_shard_id); - let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; - - let shard_identity = config.shard; - let tenant = tenant_spawn( - self.conf, - tenant_shard_id, - &tenant_path, - self.resources.clone(), - AttachedTenantConf::try_from(config)?, - shard_identity, - None, - SpawnMode::Eager, - ctx, - )?; - - slot_guard.upsert(TenantSlot::Attached(tenant))?; - - Ok(reparented) } /// A page service client sends a TenantId, and to look up the correct Tenant we must @@ -2123,6 +2142,57 @@ impl TenantManager { } } } + + /// Calculate the tenant shards' contributions to this pageserver's utilization metrics. The + /// returned values are: + /// - the number of bytes of local disk space this pageserver's shards are requesting, i.e. + /// how much space they would use if not impacted by disk usage eviction. + /// - the number of tenant shards currently on this pageserver, including attached + /// and secondary. + /// + /// This function is quite expensive: callers are expected to cache the result and + /// limit how often they call it. + pub(crate) fn calculate_utilization(&self) -> Result<(u64, u32), TenantMapListError> { + let tenants = self.tenants.read().unwrap(); + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + let shard_count = m.len(); + let mut wanted_bytes = 0; + + for tenant_slot in m.values() { + match tenant_slot { + TenantSlot::InProgress(_barrier) => { + // While a slot is being changed, we can't know how much storage it wants. This + // means this function's output can fluctuate if a lot of changes are going on + // (such as transitions from secondary to attached). + // + // We could wait for the barrier and retry, but it's important that the utilization + // API is responsive, and the data quality impact is not very significant. + continue; + } + TenantSlot::Attached(tenant) => { + wanted_bytes += tenant.local_storage_wanted(); + } + TenantSlot::Secondary(secondary) => { + let progress = secondary.progress.lock().unwrap(); + wanted_bytes += if progress.heatmap_mtime.is_some() { + // If we have heatmap info, then we will 'want' the sum + // of the size of layers in the heatmap: this is how much space + // we would use if not doing any eviction. + progress.bytes_total + } else { + // In the absence of heatmap info, assume that the secondary location simply + // needs as much space as it is currently using. + secondary.resident_size_metric.get() + } + } + } + } + + Ok((wanted_bytes, shard_count as u32)) + } } #[derive(Debug, thiserror::Error)] diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1344fe4192..b4d7ad1e97 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -736,12 +736,13 @@ impl RemoteTimelineClient { Ok(()) } + /// Reparent this timeline to a new parent. + /// + /// A retryable step of timeline ancestor detach. pub(crate) async fn schedule_reparenting_and_wait( self: &Arc, new_parent: &TimelineId, ) -> anyhow::Result<()> { - // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing - // and reads the in-memory part we cannot do the detaching like this let receiver = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -752,17 +753,25 @@ impl RemoteTimelineClient { )); }; - upload_queue.dirty.metadata.reparent(new_parent); - upload_queue.dirty.lineage.record_previous_ancestor(&prev); + let uploaded = &upload_queue.clean.0.metadata; - self.schedule_index_upload(upload_queue)?; + if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() { + // nothing to do + None + } else { + upload_queue.dirty.metadata.reparent(new_parent); + upload_queue.dirty.lineage.record_previous_ancestor(&prev); - self.schedule_barrier0(upload_queue) + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) + } }; - Self::wait_completion0(receiver) - .await - .context("wait completion") + if let Some(receiver) = receiver { + Self::wait_completion0(receiver).await?; + } + Ok(()) } /// Schedules uploading a new version of `index_part.json` with the given layers added, @@ -778,26 +787,30 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue.dirty.metadata.detach_from_ancestor(&adopted); - upload_queue.dirty.lineage.record_detaching(&adopted); + if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) { + None + } else { + upload_queue.dirty.metadata.detach_from_ancestor(&adopted); + upload_queue.dirty.lineage.record_detaching(&adopted); - for layer in layers { - upload_queue - .dirty - .layer_metadata - .insert(layer.layer_desc().layer_name(), layer.metadata()); + for layer in layers { + let prev = upload_queue + .dirty + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + assert!(prev.is_none(), "copied layer existed already {layer}"); + } + + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) } - - self.schedule_index_upload(upload_queue)?; - - let barrier = self.schedule_barrier0(upload_queue); - self.launch_queued_tasks(upload_queue); - barrier }; - Self::wait_completion0(barrier) - .await - .context("wait completion") + if let Some(barrier) = barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) } /// Adds a gc blocking reason for this timeline if one does not exist already. @@ -873,12 +886,7 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if let index::GcBlockingReason::DetachAncestor = reason { - if !upload_queue - .clean - .0 - .lineage - .is_detached_from_original_ancestor() - { + if !upload_queue.clean.0.lineage.is_detached_from_ancestor() { drop(guard); panic!("cannot complete timeline_ancestor_detach while not detached"); } @@ -985,7 +993,10 @@ impl RemoteTimelineClient { /// /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`] /// is invoked on them. - pub(crate) fn schedule_gc_update(self: &Arc, gc_layers: &[Layer]) -> anyhow::Result<()> { + pub(crate) fn schedule_gc_update( + self: &Arc, + gc_layers: &[Layer], + ) -> Result<(), NotInitialized> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 90453b1922..757fb9d032 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -216,26 +216,47 @@ fn is_false(b: &bool) -> bool { impl Lineage { const REMEMBER_AT_MOST: usize = 100; - pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) { + pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool { if self.reparenting_history.last() == Some(old_ancestor) { // do not re-record it - return; - } + false + } else { + #[cfg(feature = "testing")] + { + let existing = self + .reparenting_history + .iter() + .position(|x| x == old_ancestor); + assert_eq!( + existing, None, + "we cannot reparent onto and off and onto the same timeline twice" + ); + } + let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; - let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; - - self.reparenting_history_truncated |= drop_oldest; - if drop_oldest { - self.reparenting_history.remove(0); + self.reparenting_history_truncated |= drop_oldest; + if drop_oldest { + self.reparenting_history.remove(0); + } + self.reparenting_history.push(*old_ancestor); + true } - self.reparenting_history.push(*old_ancestor); } - pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) { - assert!(self.original_ancestor.is_none()); - - self.original_ancestor = - Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + /// Returns true if anything changed. + pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool { + if let Some((id, lsn, _)) = self.original_ancestor { + assert_eq!( + &(id, lsn), + branchpoint, + "detaching attempt has to be for the same ancestor we are already detached from" + ); + false + } else { + self.original_ancestor = + Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + true + } } /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed @@ -247,10 +268,16 @@ impl Lineage { .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn) } - pub(crate) fn is_detached_from_original_ancestor(&self) -> bool { + /// Returns true if the timeline originally had an ancestor, and no longer has one. + pub(crate) fn is_detached_from_ancestor(&self) -> bool { self.original_ancestor.is_some() } + /// Returns original ancestor timeline id and lsn that this timeline has been detached from. + pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> { + self.original_ancestor.map(|(id, lsn, _)| (id, lsn)) + } + pub(crate) fn is_reparented(&self) -> bool { !self.reparenting_history.is_empty() } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 27439d4f03..135e73b57f 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -55,7 +55,7 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; use utils::{ backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, - id::TimelineId, serde_system_time, + id::TimelineId, pausable_failpoint, serde_system_time, }; use super::{ @@ -1146,12 +1146,14 @@ impl<'a> TenantDownloader<'a> { layer: HeatMapLayer, ctx: &RequestContext, ) -> Result, UpdateError> { - // Failpoint for simulating slow remote storage + // Failpoints for simulating slow remote storage failpoint_support::sleep_millis_async!( "secondary-layer-download-sleep", &self.secondary_state.cancel ); + pausable_failpoint!("secondary-layer-download-pausable"); + let local_path = local_layer_path( self.conf, tenant_shard_id, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 16ba0fda94..f9d3fdf186 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -369,9 +369,6 @@ impl ImageLayerInner { self.lsn } - /// Returns nested result following Result, Critical>: - /// - inner has the success or transient failure - /// - outer has the permanent failure pub(super) async fn load( path: &Utf8Path, lsn: Lsn, diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 57d93feaaf..fb15ddfba9 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -15,6 +15,7 @@ use crate::tenant::PageReconstructError; use crate::{l0_flush, page_cache, walrecord}; use anyhow::{anyhow, Result}; use camino::Utf8PathBuf; +use pageserver_api::key::CompactKey; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; @@ -78,7 +79,7 @@ pub struct InMemoryLayerInner { /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. - index: BTreeMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. @@ -312,8 +313,12 @@ impl InMemoryLayer { let reader = inner.file.block_cursor(); for range in keyspace.ranges.iter() { - for (key, vec_map) in inner.index.range(range.start..range.end) { - let lsn_range = match reconstruct_state.get_cached_lsn(key) { + for (key, vec_map) in inner + .index + .range(range.start.to_compact()..range.end.to_compact()) + { + let key = Key::from_compact(*key); + let lsn_range = match reconstruct_state.get_cached_lsn(&key) { Some(cached_lsn) => (cached_lsn + 1)..end_lsn, None => self.start_lsn..end_lsn, }; @@ -324,20 +329,18 @@ impl InMemoryLayer { // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 let buf = reader.read_blob(*pos, &ctx).await; if let Err(e) = buf { - reconstruct_state - .on_key_error(*key, PageReconstructError::from(anyhow!(e))); + reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); break; } let value = Value::des(&buf.unwrap()); if let Err(e) = value { - reconstruct_state - .on_key_error(*key, PageReconstructError::from(anyhow!(e))); + reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); break; } let key_situation = - reconstruct_state.update_key(key, *entry_lsn, value.unwrap()); + reconstruct_state.update_key(&key, *entry_lsn, value.unwrap()); if key_situation == ValueReconstructSituation::Complete { break; } @@ -417,7 +420,7 @@ impl InMemoryLayer { /// Adds the page version to the in-memory tree pub async fn put_value( &self, - key: Key, + key: CompactKey, lsn: Lsn, buf: &[u8], ctx: &RequestContext, @@ -430,7 +433,7 @@ impl InMemoryLayer { async fn put_value_locked( &self, locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, - key: Key, + key: CompactKey, lsn: Lsn, buf: &[u8], ctx: &RequestContext, @@ -539,6 +542,8 @@ impl InMemoryLayer { let end_lsn = *self.end_lsn.get().unwrap(); let key_count = if let Some(key_range) = key_range { + let key_range = key_range.start.to_compact()..key_range.end.to_compact(); + inner .index .iter() @@ -578,7 +583,7 @@ impl InMemoryLayer { let will_init = Value::des(&buf)?.will_init(); let res; (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init, &ctx) + .put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, &ctx) .await; res?; } @@ -617,7 +622,7 @@ impl InMemoryLayer { let will_init = Value::des(&buf)?.will_init(); let res; (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init, ctx) + .put_value_bytes(Key::from_compact(*key), *lsn, buf, will_init, ctx) .await; res?; } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 83450d24bb..0175f32268 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1848,8 +1848,8 @@ impl ResidentLayer { /// Read all they keys in this layer which match the ShardIdentity, and write them all to /// the provided writer. Return the number of keys written. #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))] - pub(crate) async fn filter<'a>( - &'a self, + pub(crate) async fn filter( + &self, shard_identity: &ShardIdentity, writer: &mut ImageLayerWriter, ctx: &RequestContext, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b4706ea59d..dbcd704b4e 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -211,6 +211,11 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } else { // Run compaction match tenant.compaction_iteration(&cancel, &ctx).await { + Ok(has_pending_task) => { + error_run_count = 0; + // schedule the next compaction immediately in case there is a pending compaction task + if has_pending_task { Duration::ZERO } else { period } + } Err(e) => { let wait_duration = backoff::exponential_backoff_duration_seconds( error_run_count + 1, @@ -227,11 +232,6 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { ); wait_duration } - Ok(has_pending_task) => { - error_run_count = 0; - // schedule the next compaction immediately in case there is a pending compaction task - if has_pending_task { Duration::from_secs(0) } else { period } - } } }; @@ -265,7 +265,8 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { count_throttled, sum_throttled_usecs, allowed_rps=%format_args!("{allowed_rps:.0}"), - "shard was throttled in the last n_seconds") + "shard was throttled in the last n_seconds" + ); }); // Sleep @@ -365,14 +366,13 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { if first { first = false; - if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel) - .await - .is_err() - { - break; - } + let delays = async { + delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?; + random_init_delay(period, &cancel).await?; + Ok::<_, Cancelled>(()) + }; - if random_init_delay(period, &cancel).await.is_err() { + if delays.await.is_err() { break; } } @@ -424,7 +424,6 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); - // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await .is_ok() diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 76dcb5645f..f1587951c6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -802,40 +802,6 @@ impl From for PageReconstructError { } } -#[derive( - Eq, - PartialEq, - Debug, - Copy, - Clone, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -pub enum GetVectoredImpl { - Sequential, - Vectored, -} - -#[derive( - Eq, - PartialEq, - Debug, - Copy, - Clone, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -pub enum GetImpl { - Legacy, - Vectored, -} - pub(crate) enum WaitLsnWaiter<'a> { Timeline(&'a Timeline), Tenant, @@ -995,11 +961,10 @@ impl Timeline { } trace!( - "get vectored request for {:?}@{} from task kind {:?} will use {} implementation", + "get vectored request for {:?}@{} from task kind {:?}", keyspace, lsn, ctx.task_kind(), - self.conf.get_vectored_impl ); let start = crate::metrics::GET_VECTORED_LATENCY @@ -3952,6 +3917,10 @@ impl Timeline { .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) .await?; + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); + } + for (img_key, img) in results { let img = match img { Ok(img) => img, @@ -4059,6 +4028,9 @@ impl Timeline { next_start_key: img_range.end, }); } + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); + } let mut wrote_any_image = false; for (k, v) in data { if v.is_empty() { @@ -4173,6 +4145,10 @@ impl Timeline { let check_for_image_layers = self.should_check_if_image_layers_required(lsn); for partition in partitioning.parts.iter() { + if self.cancel.is_cancelled() { + return Err(CreateImageLayersError::Cancelled); + } + let img_range = start..partition.ranges.last().unwrap().end; let compact_metadata = partition.overlaps(&Key::metadata_key_range()); if compact_metadata { @@ -4352,18 +4328,34 @@ impl Timeline { detach_ancestor::prepare(self, tenant, options, ctx).await } - /// Completes the ancestor detach. This method is to be called while holding the - /// TenantManager's tenant slot, so during this method we cannot be deleted nor can any - /// timeline be deleted. After this method returns successfully, tenant must be reloaded. + /// Second step of detach from ancestor; detaches the `self` from it's current ancestor and + /// reparents any reparentable children of previous ancestor. /// - /// Pageserver receiving a SIGKILL during this operation is not supported (yet). - pub(crate) async fn complete_detaching_timeline_ancestor( + /// This method is to be called while holding the TenantManager's tenant slot, so during this + /// method we cannot be deleted nor can any timeline be deleted. After this method returns + /// successfully, tenant must be reloaded. + /// + /// Final step will be to [`Self::complete_detaching_timeline_ancestor`] after optionally + /// resetting the tenant. + pub(crate) async fn detach_from_ancestor_and_reparent( self: &Arc, tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - detach_ancestor::complete(self, tenant, prepared, ctx).await + ) -> Result { + detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await + } + + /// Final step which unblocks the GC. + /// + /// The tenant must've been reset if ancestry was modified previously (in tenant manager). + pub(crate) async fn complete_detaching_timeline_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + attempt: detach_ancestor::Attempt, + ctx: &RequestContext, + ) -> Result<(), detach_ancestor::Error> { + detach_ancestor::complete(self, tenant, attempt, ctx).await } /// Switch aux file policy and schedule upload to the index part. @@ -4412,31 +4404,33 @@ impl From for CompactionError { impl From for CompactionError { fn from(value: super::upload_queue::NotInitialized) -> Self { match value { - super::upload_queue::NotInitialized::Uninitialized - | super::upload_queue::NotInitialized::Stopped => { + super::upload_queue::NotInitialized::Uninitialized => { CompactionError::Other(anyhow::anyhow!(value)) } - super::upload_queue::NotInitialized::ShuttingDown => CompactionError::ShuttingDown, + super::upload_queue::NotInitialized::ShuttingDown + | super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown, } } } -impl CompactionError { - /// We cannot do compaction because we could not download a layer that is input to the compaction. - pub(crate) fn input_layer_download_failed( - e: super::storage_layer::layer::DownloadError, - ) -> Self { +impl From for CompactionError { + fn from(e: super::storage_layer::layer::DownloadError) -> Self { match e { - super::storage_layer::layer::DownloadError::TimelineShutdown | - /* TODO DownloadCancelled correct here? */ - super::storage_layer::layer::DownloadError::DownloadCancelled => CompactionError::ShuttingDown, - super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads | - super::storage_layer::layer::DownloadError::DownloadRequired | - super::storage_layer::layer::DownloadError::NotFile(_) | - super::storage_layer::layer::DownloadError::DownloadFailed | - super::storage_layer::layer::DownloadError::PreStatFailed(_)=>CompactionError::Other(anyhow::anyhow!(e)), + super::storage_layer::layer::DownloadError::TimelineShutdown + | super::storage_layer::layer::DownloadError::DownloadCancelled => { + CompactionError::ShuttingDown + } + super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads + | super::storage_layer::layer::DownloadError::DownloadRequired + | super::storage_layer::layer::DownloadError::NotFile(_) + | super::storage_layer::layer::DownloadError::DownloadFailed + | super::storage_layer::layer::DownloadError::PreStatFailed(_) => { + CompactionError::Other(anyhow::anyhow!(e)) + } #[cfg(test)] - super::storage_layer::layer::DownloadError::Failpoint(_) => CompactionError::Other(anyhow::anyhow!(e)), + super::storage_layer::layer::DownloadError::Failpoint(_) => { + CompactionError::Other(anyhow::anyhow!(e)) + } } } } @@ -4990,15 +4984,7 @@ impl Timeline { result.layers_removed = gc_layers.len() as u64; - self.remote_client - .schedule_gc_update(&gc_layers) - .map_err(|e| { - if self.cancel.is_cancelled() { - GcError::TimelineCancelled - } else { - GcError::Remote(e) - } - })?; + self.remote_client.schedule_gc_update(&gc_layers)?; guard.open_mut()?.finish_gc_timeline(&gc_layers); @@ -5559,7 +5545,7 @@ impl<'a> TimelineWriter<'a> { let action = self.get_open_layer_action(lsn, buf_size); let layer = self.handle_open_layer_action(lsn, action, ctx).await?; - let res = layer.put_value(key, lsn, &buf, ctx).await; + let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await; if res.is_ok() { // Update the current size only when the entire write was ok. diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 87ec46c0b5..9ac0086cde 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -489,10 +489,7 @@ impl Timeline { // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: // - GC, which at worst witnesses us "undelete" a layer that they just deleted. // - ingestion, which only inserts layers, therefore cannot collide with us. - let resident = layer - .download_and_keep_resident() - .await - .map_err(CompactionError::input_layer_download_failed)?; + let resident = layer.download_and_keep_resident().await?; let keys_written = resident .filter(&self.shard_identity, &mut image_layer_writer, ctx) @@ -693,23 +690,14 @@ impl Timeline { let mut fully_compacted = true; - deltas_to_compact.push( - first_level0_delta - .download_and_keep_resident() - .await - .map_err(CompactionError::input_layer_download_failed)?, - ); + deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); for l in level0_deltas_iter { let lsn_range = &l.layer_desc().lsn_range; if lsn_range.start != prev_lsn_end { break; } - deltas_to_compact.push( - l.download_and_keep_resident() - .await - .map_err(CompactionError::input_layer_download_failed)?, - ); + deltas_to_compact.push(l.download_and_keep_resident().await?); deltas_to_compact_bytes += l.metadata().file_size; prev_lsn_end = lsn_range.end; @@ -760,6 +748,9 @@ impl Timeline { let all_keys = { let mut all_keys = Vec::new(); for l in deltas_to_compact.iter() { + if self.cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?); } // The current stdlib sorting implementation is designed in a way where it is @@ -842,6 +833,11 @@ impl Timeline { }; stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); drop_rlock(guard); + + if self.cancel.is_cancelled() { + return Err(CompactionError::ShuttingDown); + } + stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); // This iterator walks through all key-value pairs from all the layers @@ -1137,6 +1133,10 @@ impl Timeline { if !self.shard_identity.is_key_disposable(&key) { if writer.is_none() { + if self.cancel.is_cancelled() { + // to be somewhat responsive to cancellation, check for each new layer + return Err(CompactionError::ShuttingDown); + } // Create writer if not initiaized yet writer = Some( DeltaLayerWriter::new( diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 3b52adc77b..969da2662b 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -5,12 +5,16 @@ use crate::{ context::{DownloadBehavior, RequestContext}, task_mgr::TaskKind, tenant::{ + mgr::GetActiveTenantError, + remote_timeline_client::index::GcBlockingReason::DetachAncestor, storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, Tenant, }, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use anyhow::Context; use pageserver_api::models::detach_ancestor::AncestorDetached; +use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; @@ -38,6 +42,12 @@ pub(crate) enum Error { #[error("remote copying layer failed")] CopyFailed(#[source] anyhow::Error), + #[error("wait for tenant to activate after restarting")] + WaitToActivate(#[source] GetActiveTenantError), + + #[error("detached timeline was not found after restart")] + DetachedNotFoundAfterRestart, + #[error("unexpected error")] Unexpected(#[source] anyhow::Error), @@ -55,6 +65,10 @@ impl From for ApiError { Error::OtherTimelineDetachOngoing(_) => { ApiError::ResourceUnavailable("other timeline detach is already ongoing".into()) } + e @ Error::WaitToActivate(_) => { + let s = utils::error::report_compact_sources(&e).to_string(); + ApiError::ResourceUnavailable(s.into()) + } // All of these contain shutdown errors, in fact, it's the most common e @ Error::FlushAncestor(_) | e @ Error::RewrittenDeltaDownloadFailed(_) @@ -63,6 +77,7 @@ impl From for ApiError { | e @ Error::CopyFailed(_) | e @ Error::Unexpected(_) | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()), + Error::DetachedNotFoundAfterRestart => ApiError::NotFound(value.into()), } } } @@ -96,8 +111,25 @@ impl From for Error { } } +impl From for Error { + fn from(value: GetActiveTenantError) -> Self { + use pageserver_api::models::TenantState; + use GetActiveTenantError::*; + + match value { + Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) | SwitchedTenant => { + Error::ShuttingDown + } + WaitForActiveTimeout { .. } | NotFound(_) | Broken(_) | WillNotBecomeActive(_) => { + // NotFound seems out-of-place + Error::WaitToActivate(value) + } + } + } +} + pub(crate) enum Progress { - Prepared(completion::Completion, PreparedTimelineDetach), + Prepared(Attempt, PreparedTimelineDetach), Done(AncestorDetached), } @@ -121,6 +153,26 @@ impl Default for Options { } } +/// Represents an across tenant reset exclusive single attempt to detach ancestor. +#[derive(Debug)] +pub(crate) struct Attempt { + pub(crate) timeline_id: TimelineId, + + _guard: completion::Completion, + gate_entered: Option, +} + +impl Attempt { + pub(crate) fn before_reset_tenant(&mut self) { + let taken = self.gate_entered.take(); + assert!(taken.is_some()); + } + + pub(crate) fn new_barrier(&self) -> completion::Barrier { + self._guard.barrier() + } +} + /// See [`Timeline::prepare_to_detach_from_ancestor`] pub(super) async fn prepare( detached: &Arc, @@ -135,15 +187,33 @@ pub(super) async fn prepare( .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { - { + let still_in_progress = { let accessor = detached.remote_client.initialized_upload_queue()?; // we are safe to inspect the latest uploaded, because we can only witness this after // restart is complete and ancestor is no more. let latest = accessor.latest_uploaded_index_part(); - if !latest.lineage.is_detached_from_original_ancestor() { + if latest.lineage.detached_previous_ancestor().is_none() { return Err(NoAncestor); - } + }; + + latest + .gc_blocking + .as_ref() + .is_some_and(|b| b.blocked_by(DetachAncestor)) + }; + + if still_in_progress { + // gc is still blocked, we can still reparent and complete. + // we are safe to reparent remaining, because they were locked in in the beginning. + let attempt = continue_with_blocked_gc(detached, tenant).await?; + + // because the ancestor of detached is already set to none, we have published all + // of the layers, so we are still "prepared." + return Ok(Progress::Prepared( + attempt, + PreparedTimelineDetach { layers: Vec::new() }, + )); } let reparented_timelines = reparented_direct_children(detached, tenant)?; @@ -164,22 +234,7 @@ pub(super) async fn prepare( return Err(TooManyAncestors); } - // before we acquire the gate, we must mark the ancestor as having a detach operation - // ongoing which will block other concurrent detach operations so we don't get to ackward - // situations where there would be two branches trying to reparent earlier branches. - let (guard, barrier) = completion::channel(); - - { - let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); - if let Some((tl, other)) = guard.as_ref() { - if !other.is_ready() { - return Err(OtherTimelineDetachOngoing(*tl)); - } - } - *guard = Some((detached.timeline_id, barrier)); - } - - let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + let attempt = start_new_attempt(detached, tenant).await?; utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable"); @@ -245,7 +300,8 @@ pub(super) async fn prepare( }; // TODO: layers are already sorted by something: use that to determine how much of remote - // copies are already done. + // copies are already done -- gc is blocked, but a compaction could had happened on ancestor, + // which is something to keep in mind if copy skipping is implemented. tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers"); // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after @@ -259,29 +315,33 @@ pub(super) async fn prepare( let mut wrote_any = false; - let limiter = Arc::new(tokio::sync::Semaphore::new( - options.rewrite_concurrency.get(), - )); + let limiter = Arc::new(Semaphore::new(options.rewrite_concurrency.get())); for layer in straddling_branchpoint { let limiter = limiter.clone(); let timeline = detached.clone(); let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download); - tasks.spawn(async move { - let _permit = limiter.acquire().await; - let copied = - upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) - .await?; - Ok(copied) - }); + let span = tracing::info_span!("upload_rewritten_layer", %layer); + tasks.spawn( + async move { + let _permit = limiter.acquire().await; + let copied = + upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) + .await?; + if let Some(copied) = copied.as_ref() { + tracing::info!(%copied, "rewrote and uploaded"); + } + Ok(copied) + } + .instrument(span), + ); } while let Some(res) = tasks.join_next().await { match res { Ok(Ok(Some(copied))) => { wrote_any = true; - tracing::info!(layer=%copied, "rewrote and uploaded"); new_layers.push(copied); } Ok(Ok(None)) => {} @@ -308,7 +368,7 @@ pub(super) async fn prepare( } let mut tasks = tokio::task::JoinSet::new(); - let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get())); + let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get())); for adopted in rest_of_historic { let limiter = limiter.clone(); @@ -342,7 +402,56 @@ pub(super) async fn prepare( let prepared = PreparedTimelineDetach { layers: new_layers }; - Ok(Progress::Prepared(guard, prepared)) + Ok(Progress::Prepared(attempt, prepared)) +} + +async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result { + let attempt = obtain_exclusive_attempt(detached, tenant)?; + + // insert the block in the index_part.json, if not already there. + let _dont_care = tenant + .gc_block + .insert( + detached, + crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, + ) + .await + // FIXME: better error + .map_err(Error::Unexpected)?; + + Ok(attempt) +} + +async fn continue_with_blocked_gc(detached: &Timeline, tenant: &Tenant) -> Result { + // FIXME: it would be nice to confirm that there is an in-memory version, since we've just + // verified there is a persistent one? + obtain_exclusive_attempt(detached, tenant) +} + +fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result { + use Error::{OtherTimelineDetachOngoing, ShuttingDown}; + + // ensure we are the only active attempt for this tenant + let (guard, barrier) = completion::channel(); + { + let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); + if let Some((tl, other)) = guard.as_ref() { + if !other.is_ready() { + return Err(OtherTimelineDetachOngoing(*tl)); + } + // FIXME: no test enters here + } + *guard = Some((detached.timeline_id, barrier)); + } + + // ensure the gate is still open + let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + + Ok(Attempt { + timeline_id: detached.timeline_id, + _guard: guard, + gate_entered: Some(_gate_entered), + }) } fn reparented_direct_children( @@ -548,96 +657,207 @@ async fn remote_copy( .map_err(CopyFailed) } -/// See [`Timeline::complete_detaching_timeline_ancestor`]. -pub(super) async fn complete( +pub(crate) enum DetachingAndReparenting { + /// All of the following timeline ids were reparented and the timeline ancestor detach must be + /// marked as completed. + Reparented(HashSet), + + /// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as + /// completed. + /// + /// Nested `must_reset_tenant` is set to true when any restart requiring changes were made. + SomeReparentingFailed { must_reset_tenant: bool }, + + /// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach + /// must be marked as completed. + AlreadyDone(HashSet), +} + +impl DetachingAndReparenting { + pub(crate) fn reset_tenant_required(&self) -> bool { + use DetachingAndReparenting::*; + match self { + Reparented(_) => true, + SomeReparentingFailed { must_reset_tenant } => *must_reset_tenant, + AlreadyDone(_) => false, + } + } + + pub(crate) fn completed(self) -> Option> { + use DetachingAndReparenting::*; + match self { + Reparented(x) | AlreadyDone(x) => Some(x), + SomeReparentingFailed { .. } => None, + } + } +} + +/// See [`Timeline::detach_from_ancestor_and_reparent`]. +pub(super) async fn detach_and_reparent( detached: &Arc, tenant: &Tenant, prepared: PreparedTimelineDetach, _ctx: &RequestContext, -) -> Result, anyhow::Error> { +) -> Result { let PreparedTimelineDetach { layers } = prepared; - let ancestor = detached - .ancestor_timeline - .as_ref() - .expect("must still have a ancestor"); - let ancestor_lsn = detached.get_ancestor_lsn(); + #[derive(Debug)] + enum Ancestor { + NotDetached(Arc, Lsn), + Detached(Arc, Lsn), + } + + let (recorded_branchpoint, still_ongoing) = { + let access = detached.remote_client.initialized_upload_queue()?; + let latest = access.latest_uploaded_index_part(); + + ( + latest.lineage.detached_previous_ancestor(), + latest + .gc_blocking + .as_ref() + .is_some_and(|b| b.blocked_by(DetachAncestor)), + ) + }; + assert!( + still_ongoing, + "cannot (detach? reparent)? complete if the operation is not still ongoing" + ); + + let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) { + (Some(ancestor), None) => { + assert!( + !layers.is_empty(), + "there should always be at least one layer to inherit" + ); + Ancestor::NotDetached(ancestor.clone(), detached.ancestor_lsn) + } + (Some(_), Some(_)) => { + panic!( + "it should be impossible to get to here without having gone through the tenant reset; if the tenant was reset, then the ancestor_timeline would be None" + ); + } + (None, Some((ancestor_id, ancestor_lsn))) => { + // it has been either: + // - detached but still exists => we can try reparenting + // - detached and deleted + // + // either way, we must complete + assert!( + layers.is_empty(), + "no layers should had been copied as detach is done" + ); + + let existing = tenant.timelines.lock().unwrap().get(&ancestor_id).cloned(); + + if let Some(ancestor) = existing { + Ancestor::Detached(ancestor, ancestor_lsn) + } else { + let direct_children = reparented_direct_children(detached, tenant)?; + return Ok(DetachingAndReparenting::AlreadyDone(direct_children)); + } + } + (None, None) => { + // TODO: make sure there are no `?` before tenant_reset from after a questionmark from + // here. + panic!( + "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" + ); + } + }; // publish the prepared layers before we reparent any of the timelines, so that on restart // reparented timelines find layers. also do the actual detaching. // - // if we crash after this operation, we will at least come up having detached a timeline, but - // we cannot go back and reparent the timelines which would had been reparented in normal - // execution. - // - // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart - // which could give us a completely wrong layer combination. - detached - .remote_client - .schedule_adding_existing_layers_to_index_detach_and_wait( - &layers, - (ancestor.timeline_id, ancestor_lsn), - ) - .await?; + // if we crash after this operation, a retry will allow reparenting the remaining timelines as + // gc is blocked. + + let (ancestor, ancestor_lsn, was_detached) = match ancestor { + Ancestor::NotDetached(ancestor, ancestor_lsn) => { + // this has to complete before any reparentings because otherwise they would not have + // layers on the new parent. + detached + .remote_client + .schedule_adding_existing_layers_to_index_detach_and_wait( + &layers, + (ancestor.timeline_id, ancestor_lsn), + ) + .await + .context("publish layers and detach ancestor")?; + + tracing::info!( + ancestor=%ancestor.timeline_id, + %ancestor_lsn, + inherited_layers=%layers.len(), + "detached from ancestor" + ); + (ancestor, ancestor_lsn, true) + } + Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false), + }; let mut tasks = tokio::task::JoinSet::new(); + // Returns a single permit semaphore which will be used to make one reparenting succeed, + // others will fail as if those timelines had been stopped for whatever reason. + #[cfg(feature = "testing")] + let failpoint_sem = || -> Option> { + fail::fail_point!("timeline-detach-ancestor::allow_one_reparented", |_| Some( + Arc::new(Semaphore::new(1)) + )); + None + }(); + // because we are now keeping the slot in progress, it is unlikely that there will be any // timeline deletions during this time. if we raced one, then we'll just ignore it. - tenant - .timelines - .lock() - .unwrap() - .values() - .filter_map(|tl| { - if Arc::ptr_eq(tl, detached) { - return None; - } + { + let g = tenant.timelines.lock().unwrap(); + reparentable_timelines(g.values(), detached, &ancestor, ancestor_lsn) + .cloned() + .for_each(|timeline| { + // important in this scope: we are holding the Tenant::timelines lock + let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); + let new_parent = detached.timeline_id; + #[cfg(feature = "testing")] + let failpoint_sem = failpoint_sem.clone(); - if !tl.is_active() { - return None; - } + tasks.spawn( + async move { + let res = async { + #[cfg(feature = "testing")] + if let Some(failpoint_sem) = failpoint_sem { + let _permit = failpoint_sem.acquire().await.map_err(|_| { + anyhow::anyhow!( + "failpoint: timeline-detach-ancestor::allow_one_reparented", + ) + })?; + failpoint_sem.close(); + } - let tl_ancestor = tl.ancestor_timeline.as_ref()?; - let is_same = Arc::ptr_eq(ancestor, tl_ancestor); - let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; - - let is_deleting = tl - .delete_progress - .try_lock() - .map(|flow| !flow.is_not_started()) - .unwrap_or(true); - - if is_same && is_earlier && !is_deleting { - Some(tl.clone()) - } else { - None - } - }) - .for_each(|timeline| { - // important in this scope: we are holding the Tenant::timelines lock - let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); - let new_parent = detached.timeline_id; - - tasks.spawn( - async move { - let res = timeline - .remote_client - .schedule_reparenting_and_wait(&new_parent) + timeline + .remote_client + .schedule_reparenting_and_wait(&new_parent) + .await + } .await; - match res { - Ok(()) => Some(timeline), - Err(e) => { - // with the use of tenant slot, we no longer expect these. - tracing::warn!("reparenting failed: {e:#}"); - None + match res { + Ok(()) => { + tracing::info!("reparented"); + Some(timeline) + } + Err(e) => { + // with the use of tenant slot, raced timeline deletion is the most + // likely reason. + tracing::warn!("reparenting failed: {e:#}"); + None + } } } - } - .instrument(span), - ); - }); + .instrument(span), + ); + }); + } let reparenting_candidates = tasks.len(); let mut reparented = HashSet::with_capacity(tasks.len()); @@ -645,33 +865,103 @@ pub(super) async fn complete( while let Some(res) = tasks.join_next().await { match res { Ok(Some(timeline)) => { - tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - assert!( reparented.insert(timeline.timeline_id), "duplicate reparenting? timeline_id={}", timeline.timeline_id ); } - Ok(None) => { - // lets just ignore this for now. one or all reparented timelines could had - // started deletion, and that is fine. - } Err(je) if je.is_cancelled() => unreachable!("not used"), - Err(je) if je.is_panic() => { - // ignore; it's better to continue with a single reparenting failing (or even - // all of them) in order to get to the goal state. - // - // these timelines will never be reparentable, but they can be always detached as - // separate tree roots. - } + // just ignore failures now, we can retry + Ok(None) => {} + Err(je) if je.is_panic() => {} Err(je) => tracing::error!("unexpected join error: {je:?}"), } } - if reparenting_candidates != reparented.len() { - tracing::info!("failed to reparent some candidates"); + let reparented_all = reparenting_candidates == reparented.len(); + + if reparented_all { + Ok(DetachingAndReparenting::Reparented(reparented)) + } else { + tracing::info!( + reparented = reparented.len(), + candidates = reparenting_candidates, + "failed to reparent all candidates; they can be retried after the tenant_reset", + ); + + let must_reset_tenant = !reparented.is_empty() || was_detached; + Ok(DetachingAndReparenting::SomeReparentingFailed { must_reset_tenant }) + } +} + +pub(super) async fn complete( + detached: &Arc, + tenant: &Tenant, + mut attempt: Attempt, + _ctx: &RequestContext, +) -> Result<(), Error> { + assert_eq!(detached.timeline_id, attempt.timeline_id); + + if attempt.gate_entered.is_none() { + let entered = detached.gate.enter().map_err(|_| Error::ShuttingDown)?; + attempt.gate_entered = Some(entered); + } else { + // Some(gate_entered) means the tenant was not restarted, as is not required } - Ok(reparented) + assert!(detached.ancestor_timeline.is_none()); + + // this should be an 503 at least...? + fail::fail_point!( + "timeline-detach-ancestor::complete_before_uploading", + |_| Err(Error::Failpoint( + "timeline-detach-ancestor::complete_before_uploading" + )) + ); + + tenant + .gc_block + .remove( + detached, + crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, + ) + .await + // FIXME: better error + .map_err(Error::Unexpected)?; + + Ok(()) +} + +/// Query against a locked `Tenant::timelines`. +fn reparentable_timelines<'a, I>( + timelines: I, + detached: &'a Arc, + ancestor: &'a Arc, + ancestor_lsn: Lsn, +) -> impl Iterator> + 'a +where + I: Iterator> + 'a, +{ + timelines.filter_map(move |tl| { + if Arc::ptr_eq(tl, detached) { + return None; + } + + let tl_ancestor = tl.ancestor_timeline.as_ref()?; + let is_same = Arc::ptr_eq(ancestor, tl_ancestor); + let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; + + let is_deleting = tl + .delete_progress + .try_lock() + .map(|flow| !flow.is_not_started()) + .unwrap_or(true); + + if is_same && is_earlier && !is_deleting { + Some(tl) + } else { + None + } + }) } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index a66900522a..b5c577af72 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -335,6 +335,9 @@ pub(super) async fn handle_walreceiver_connection( filtered_records += 1; } + // FIXME: this cannot be made pausable_failpoint without fixing the + // failpoint library; in tests, the added amount of debugging will cause us + // to timeout the tests. fail_point!("walreceiver-after-ingest"); last_rec_lsn = lsn; diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index e6c835aa75..3c48c84598 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -5,12 +5,17 @@ use anyhow::Context; use std::path::Path; +use utils::serde_percent::Percent; use pageserver_api::models::PageserverUtilization; -pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result { - // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough +use crate::{config::PageServerConf, tenant::mgr::TenantManager}; +pub(crate) fn regenerate( + conf: &PageServerConf, + tenants_path: &Path, + tenant_manager: &TenantManager, +) -> anyhow::Result { let statvfs = nix::sys::statvfs::statvfs(tenants_path) .map_err(std::io::Error::from) .context("statvfs tenants directory")?; @@ -34,16 +39,31 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result e.max_usage_pct, + None => Percent::new(100).unwrap(), + }; + + // Express a static value for how many shards we may schedule on one node + const MAX_SHARDS: u32 = 20000; + + let mut doc = PageserverUtilization { disk_usage_bytes: used, free_space_bytes: free, - // lower is better; start with a constant - // - // note that u64::MAX will be output as i64::MAX as u64, but that should not matter - utilization_score: u64::MAX, + disk_wanted_bytes, + disk_usable_pct, + shard_count, + max_shard_count: MAX_SHARDS, + utilization_score: 0, captured_at: utils::serde_system_time::SystemTime(captured_at), }; + doc.refresh_score(); + // TODO: make utilization_score into a metric Ok(doc) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 770081b3b4..82585f9ed8 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -107,8 +107,10 @@ enum ProcessOnceCell { } struct Process { - _launched_processes_guard: utils::sync::gate::GateGuard, process: process::WalRedoProcess, + /// This field is last in this struct so the guard gets dropped _after_ [`Self::process`]. + /// (Reminder: dropping [`Self::process`] synchronously sends SIGKILL and then `wait()`s for it to exit). + _launched_processes_guard: utils::sync::gate::GateGuard, } impl std::ops::Deref for Process { @@ -327,20 +329,23 @@ impl PostgresRedoManager { }, Err(permit) => { let start = Instant::now(); - let proc = Arc::new(Process { - _launched_processes_guard: match self.launched_processes.enter() { + // acquire guard before spawning process, so that we don't spawn new processes + // if the gate is already closed. + let _launched_processes_guard = match self.launched_processes.enter() { Ok(guard) => guard, Err(GateError::GateClosed) => unreachable!( "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" ), - }, - process: process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - }); + }; + let proc = Arc::new(Process { + process: process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, + _launched_processes_guard, + }); let duration = start.elapsed(); WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); info!( diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 93252e6b29..de023da5c4 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -45,6 +45,7 @@ static const char *jwt_token = NULL; /* GUCs */ static char *ConsoleURL = NULL; static bool ForwardDDL = true; +static bool RegressTestMode = false; /* * CURL docs say that this buffer must exist until we call curl_easy_cleanup @@ -802,6 +803,14 @@ NeonProcessUtility( case T_DropRoleStmt: HandleDropRole(castNode(DropRoleStmt, parseTree)); break; + case T_CreateTableSpaceStmt: + if (!RegressTestMode) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("CREATE TABLESPACE is not supported on Neon"))); + } + break; default: break; } @@ -864,6 +873,18 @@ InitControlPlaneConnector() NULL, NULL); + DefineCustomBoolVariable( + "neon.regress_test_mode", + "Controls whether we are running in the regression test mode", + NULL, + &RegressTestMode, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); if (!jwt_token) { diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index d107cdc1c2..784d0f1da3 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -32,6 +32,7 @@ #include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/guc.h" +#include "utils/guc_tables.h" #include "utils/wait_event.h" #include "extension_server.h" @@ -68,10 +69,10 @@ InitLogicalReplicationMonitor(void) DefineCustomIntVariable( "neon.logical_replication_max_snap_files", - "Maximum allowed logical replication .snap files", + "Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.", NULL, &logical_replication_max_snap_files, - 300, 0, INT_MAX, + 300, -1, INT_MAX, PGC_SIGHUP, 0, NULL, NULL, NULL); @@ -584,6 +585,40 @@ RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *n return false; } + +/* + * pgbouncer is able to track GUCs reported by Postgres. + * But most parameters cannot be tracked this way. The only parameters that can be tracked are ones + * that Postgres reports to the client. Unfortunately `search_path` is not reported by Postgres: + * https://www.postgresql.org/message-id/flat/CAGECzQQ6xFcgrg%2Be0p9mCumtK362TiA6vTiiZKoYbS8OXggwuQ%40mail.gmail.com#be4bfd7a9cf1f0633bdb2d1790a0a1be + * This code sets GUC_REPORT flag for `search_path`making it possible to include it in + * pgbouncer's `track_extra_parameters` list. + * + * This code is inspired by how the Citus extension does this, see + * https://github.com/citusdata/citus/blob/2a263fe69a707d16ef24378f7650742386b0968f/src/backend/distributed/shared_library_init.c#L2694 + */ +static void +ReportSearchPath(void) +{ +#if PG_VERSION_NUM >= 160000 + int nGucs = 0; + struct config_generic **gucs = get_guc_variables(&nGucs); +#else + struct config_generic **gucs = get_guc_variables(); + int nGucs = GetNumConfigOptions(); +#endif + + for (int i = 0; i < nGucs; i++) + { + struct config_generic *guc = (struct config_generic *) gucs[i]; + + if (strcmp(guc->name, "search_path") == 0) + { + guc->flags |= GUC_REPORT; + } + } +} + void _PG_init(void) { @@ -599,6 +634,7 @@ _PG_init(void) pg_init_walproposer(); WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitLogicalReplicationMonitor(); @@ -626,6 +662,8 @@ _PG_init(void) * extension was loaded will be removed. */ EmitWarningsOnPlaceholders("neon"); + + ReportSearchPath(); } PG_FUNCTION_INFO_V1(pg_cluster_size); diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 944b316344..f3ddc64061 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -512,7 +512,7 @@ replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRe } /* - * Start walsender streaming replication + * Start walproposer streaming replication */ static void walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c index 8f8d1dfc01..bd3856e9d9 100644 --- a/pgxn/neon/walsender_hooks.c +++ b/pgxn/neon/walsender_hooks.c @@ -20,6 +20,7 @@ #include "utils/guc.h" #include "postmaster/interrupt.h" +#include "neon.h" #include "neon_walreader.h" #include "walproposer.h" @@ -181,6 +182,13 @@ NeonWALReadSegmentClose(XLogReaderState *xlogreader) void NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) { + /* + * If safekeepers are not configured, assume we don't need neon_walreader, + * i.e. running neon fork locally. + */ + if (wal_acceptors_list[0] == '\0') + return; + if (!wal_reader) { XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn); diff --git a/pgxn/neon_rmgr/neon_rmgr.c b/pgxn/neon_rmgr/neon_rmgr.c index 496ca08c08..c3f726db84 100644 --- a/pgxn/neon_rmgr/neon_rmgr.c +++ b/pgxn/neon_rmgr/neon_rmgr.c @@ -186,7 +186,7 @@ static void fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) { *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | - HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK); + HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK | HEAP_COMBOCID); *infomask2 &= ~HEAP_KEYS_UPDATED; if (infobits & XLHL_XMAX_IS_MULTI) @@ -195,6 +195,8 @@ fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) *infomask |= HEAP_XMAX_LOCK_ONLY; if (infobits & XLHL_XMAX_EXCL_LOCK) *infomask |= HEAP_XMAX_EXCL_LOCK; + if (infobits & XLHL_COMBOCID) + *infomask |= HEAP_COMBOCID; /* note HEAP_XMAX_SHR_LOCK isn't considered here */ if (infobits & XLHL_XMAX_KEYSHR_LOCK) *infomask |= HEAP_XMAX_KEYSHR_LOCK; @@ -284,7 +286,7 @@ redo_neon_heap_insert(XLogReaderState *record) htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid; htup->t_ctid = target_tid; if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, @@ -373,7 +375,7 @@ redo_neon_heap_delete(XLogReaderState *record) HeapTupleHeaderSetXmax(htup, xlrec->xmax); else HeapTupleHeaderSetXmin(htup, InvalidTransactionId); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); @@ -490,7 +492,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update) fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; /* Set forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -623,7 +625,7 @@ redo_neon_heap_update(XLogReaderState *record, bool hot_update) htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlhdr.t_cid; HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -728,7 +730,7 @@ redo_neon_heap_lock(XLogReaderState *record) offnum); } HeapTupleHeaderSetXmax(htup, xlrec->xmax); - HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } @@ -840,7 +842,7 @@ redo_neon_heap_multi_insert(XLogReaderState *record) htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlrec->t_cid); + htup->t_choice.t_heap.t_field3.t_cid = xlrec->t_cid; ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); diff --git a/poetry.lock b/poetry.lock index 9026824558..7db91e51f7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,91 +1,103 @@ # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +[[package]] +name = "aiohappyeyeballs" +version = "2.3.5" +description = "Happy Eyeballs for asyncio" +optional = false +python-versions = ">=3.8" +files = [ + {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"}, + {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"}, +] + [[package]] name = "aiohttp" -version = "3.9.4" +version = "3.10.2" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"}, - {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"}, - {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"}, - {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"}, - {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"}, - {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"}, - {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"}, - {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"}, - {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"}, - {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"}, - {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"}, - {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"}, - {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"}, - {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"}, - {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"}, - {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"}, - {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"}, - {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"}, - {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"}, - {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"}, - {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"}, - {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"}, - {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"}, - {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"}, - {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"}, - {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"}, - {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"}, - {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"}, + {file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"}, + {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"}, + {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"}, + {file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"}, + {file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"}, + {file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"}, + {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"}, + {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"}, + {file = "aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"}, + {file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"}, + {file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"}, + {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"}, + {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"}, + {file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"}, + {file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"}, + {file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"}, + {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"}, + {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"}, + {file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"}, + {file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"}, + {file = "aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"}, + {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"}, + {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"}, + {file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"}, + {file = "aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"}, + {file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"}, ] [package.dependencies] +aiohappyeyeballs = ">=2.3.0" aiosignal = ">=1.1.2" async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} attrs = ">=17.3.0" @@ -94,7 +106,7 @@ multidict = ">=4.5,<7.0" yarl = ">=1.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns", "brotlicffi"] +speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] [[package]] name = "aiopg" @@ -3371,4 +3383,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72" +content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055" diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index e5b6536328..c41df07a4d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -34,6 +34,7 @@ use tracing::error; use tracing::info; use typed_json::json; use url::Url; +use urlencoding; use utils::http::error::ApiError; use crate::auth::backend::ComputeUserInfo; @@ -168,7 +169,8 @@ fn get_conn_info( .path_segments() .ok_or(ConnInfoError::MissingDbName)?; - let dbname: DbName = url_path.next().ok_or(ConnInfoError::InvalidDbName)?.into(); + let dbname: DbName = + urlencoding::decode(url_path.next().ok_or(ConnInfoError::InvalidDbName)?)?.into(); ctx.set_dbname(dbname.clone()); let username = RoleName::from(urlencoding::decode(connection_url.username())?); diff --git a/pyproject.toml b/pyproject.toml index cfb569b2ba..ad3961ef55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.4" +aiohttp = "3.10.2" pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 3510359591..368b8d300a 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.80.0" +channel = "1.80.1" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql new file mode 100644 index 0000000000..53222c614e --- /dev/null +++ b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql @@ -0,0 +1 @@ +DROP TABLE controllers; diff --git a/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql new file mode 100644 index 0000000000..90546948cb --- /dev/null +++ b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql @@ -0,0 +1,5 @@ +CREATE TABLE controllers ( + address VARCHAR NOT NULL, + started_at TIMESTAMPTZ NOT NULL, + PRIMARY KEY(address, started_at) +); diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs new file mode 100644 index 0000000000..dea1f04649 --- /dev/null +++ b/storage_controller/src/drain_utils.rs @@ -0,0 +1,225 @@ +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, +}; + +use pageserver_api::controller_api::NodeSchedulingPolicy; +use utils::{id::NodeId, shard::TenantShardId}; + +use crate::{ + background_node_operations::OperationError, node::Node, scheduler::Scheduler, + tenant_shard::TenantShard, +}; + +pub(crate) struct TenantShardIterator { + tenants_accessor: F, + inspected_all_shards: bool, + last_inspected_shard: Option, +} + +/// A simple iterator which can be used in tandem with [`crate::service::Service`] +/// to iterate over all known tenant shard ids without holding the lock on the +/// service state at all times. +impl TenantShardIterator +where + F: Fn(Option) -> Option, +{ + pub(crate) fn new(tenants_accessor: F) -> Self { + Self { + tenants_accessor, + inspected_all_shards: false, + last_inspected_shard: None, + } + } + + /// Returns the next tenant shard id if one exists + pub(crate) fn next(&mut self) -> Option { + if self.inspected_all_shards { + return None; + } + + match (self.tenants_accessor)(self.last_inspected_shard) { + Some(tid) => { + self.last_inspected_shard = Some(tid); + Some(tid) + } + None => { + self.inspected_all_shards = true; + None + } + } + } + + /// Returns true when the end of the iterator is reached and false otherwise + pub(crate) fn finished(&self) -> bool { + self.inspected_all_shards + } +} + +/// Check that the state of the node being drained is as expected: +/// node is present in memory and scheduling policy is set to [`NodeSchedulingPolicy::Draining`] +pub(crate) fn validate_node_state( + node_id: &NodeId, + nodes: Arc>, +) -> Result<(), OperationError> { + let node = nodes.get(node_id).ok_or(OperationError::NodeStateChanged( + format!("node {} was removed", node_id).into(), + ))?; + + let current_policy = node.get_scheduling(); + if !matches!(current_policy, NodeSchedulingPolicy::Draining) { + // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think + // about it + return Err(OperationError::NodeStateChanged( + format!("node {} changed state to {:?}", node_id, current_policy).into(), + )); + } + + Ok(()) +} + +/// Struct that houses a few utility methods for draining pageserver nodes +pub(crate) struct TenantShardDrain { + pub(crate) drained_node: NodeId, + pub(crate) tenant_shard_id: TenantShardId, +} + +impl TenantShardDrain { + /// Check if the tenant shard under question is eligible for drainining: + /// it's primary attachment is on the node being drained + pub(crate) fn tenant_shard_eligible_for_drain( + &self, + tenants: &BTreeMap, + scheduler: &Scheduler, + ) -> Option { + let tenant_shard = tenants.get(&self.tenant_shard_id)?; + + if *tenant_shard.intent.get_attached() != Some(self.drained_node) { + return None; + } + + match scheduler.node_preferred(tenant_shard.intent.get_secondary()) { + Some(node) => Some(node), + None => { + tracing::warn!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "No eligible secondary while draining {}", self.drained_node + ); + + None + } + } + } + + /// Attempt to reschedule the tenant shard under question to one of its secondary locations + /// Returns an Err when the operation should be aborted and Ok(None) when the tenant shard + /// should be skipped. + pub(crate) fn reschedule_to_secondary<'a>( + &self, + destination: NodeId, + tenants: &'a mut BTreeMap, + scheduler: &mut Scheduler, + nodes: &Arc>, + ) -> Result, OperationError> { + let tenant_shard = match tenants.get_mut(&self.tenant_shard_id) { + Some(some) => some, + None => { + // Tenant shard was removed in the meantime. + // Skip to the next one, but don't fail the overall operation + return Ok(None); + } + }; + + if !nodes.contains_key(&destination) { + return Err(OperationError::NodeStateChanged( + format!("node {} was removed", destination).into(), + )); + } + + if !tenant_shard.intent.get_secondary().contains(&destination) { + tracing::info!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Secondary moved away from {destination} during drain" + ); + + return Ok(None); + } + + match tenant_shard.reschedule_to_secondary(Some(destination), scheduler) { + Err(e) => { + tracing::warn!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Scheduling error when draining pageserver {} : {}", self.drained_node, e + ); + + Ok(None) + } + Ok(()) => { + let scheduled_to = tenant_shard.intent.get_attached(); + tracing::info!( + tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Rescheduled shard while draining node {}: {} -> {:?}", + self.drained_node, + self.drained_node, + scheduled_to + ); + + Ok(Some(tenant_shard)) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use utils::{ + id::TenantId, + shard::{ShardCount, ShardNumber, TenantShardId}, + }; + + use super::TenantShardIterator; + + #[test] + fn test_tenant_shard_iterator() { + let tenant_id = TenantId::generate(); + let shard_count = ShardCount(8); + + let mut tenant_shards = Vec::default(); + for i in 0..shard_count.0 { + tenant_shards.push(( + TenantShardId { + tenant_id, + shard_number: ShardNumber(i), + shard_count, + }, + (), + )) + } + + let tenant_shards = Arc::new(tenant_shards); + + let mut tid_iter = TenantShardIterator::new({ + let tenants = tenant_shards.clone(); + move |last_inspected_shard: Option| { + let entry = match last_inspected_shard { + Some(skip_past) => { + let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past); + cursor.nth(1) + } + None => tenants.first(), + }; + + entry.map(|(tid, _)| tid).copied() + } + }); + + let mut iterated_over = Vec::default(); + while let Some(tid) = tid_iter.next() { + iterated_over.push((tid, ())); + } + + assert_eq!(iterated_over, *tenant_shards); + } +} diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index 8caf638904..2034addbe1 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -4,12 +4,14 @@ use utils::seqwait::MonotonicCounter; mod auth; mod background_node_operations; mod compute_hook; +mod drain_utils; mod heartbeater; pub mod http; mod id_lock_map; pub mod metrics; mod node; mod pageserver_client; +mod peer_client; pub mod persistence; mod reconciler; mod scheduler; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 2799f21fdc..5a68799141 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,6 +1,7 @@ use anyhow::{anyhow, Context}; use clap::Parser; use diesel::Connection; +use hyper::Uri; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::path::PathBuf; @@ -83,6 +84,13 @@ struct Cli { #[arg(long, default_value = "5s")] db_connect_timeout: humantime::Duration, + #[arg(long, default_value = "false")] + start_as_candidate: bool, + + // TODO: make this mandatory once the helm chart gets updated + #[arg(long)] + address_for_peers: Option, + /// `neon_local` sets this to the path of the neon_local repo dir. /// Only relevant for testing. // TODO: make `cfg(feature = "testing")` @@ -92,6 +100,11 @@ struct Cli { /// Chaos testing #[arg(long)] chaos_interval: Option, + + // Maximum acceptable lag for the secondary location while draining + // a pageserver + #[arg(long)] + max_secondary_lag_bytes: Option, } enum StrictMode { @@ -279,6 +292,10 @@ async fn async_main() -> anyhow::Result<()> { .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, + max_secondary_lag_bytes: args.max_secondary_lag_bytes, + address_for_peers: args.address_for_peers, + start_as_candidate: args.start_as_candidate, + http_service_port: args.listen.port() as i32, }; // After loading secrets & config, but before starting anything else, apply database migrations diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index a1a4b8543d..c2303e7a7f 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -12,6 +12,7 @@ use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, Metr use metrics::NeonMetrics; use once_cell::sync::Lazy; use std::sync::Mutex; +use strum::IntoEnumIterator; use crate::{ persistence::{DatabaseError, DatabaseOperation}, @@ -241,3 +242,18 @@ impl DatabaseError { } } } + +/// Update the leadership status metric gauges to reflect the requested status +pub(crate) fn update_leadership_status(status: LeadershipStatus) { + let status_metric = &METRICS_REGISTRY + .metrics_group + .storage_controller_leadership_status; + + for s in LeadershipStatus::iter() { + if s == status { + status_metric.set(LeadershipStatusGroup { status: s }, 1); + } else { + status_metric.set(LeadershipStatusGroup { status: s }, 0); + } + } +} diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs new file mode 100644 index 0000000000..ebb59a1720 --- /dev/null +++ b/storage_controller/src/peer_client.rs @@ -0,0 +1,106 @@ +use crate::tenant_shard::ObservedState; +use pageserver_api::shard::TenantShardId; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use tokio_util::sync::CancellationToken; + +use hyper::Uri; +use reqwest::{StatusCode, Url}; +use utils::{backoff, http::error::HttpErrorBody}; + +#[derive(Debug, Clone)] +pub(crate) struct PeerClient { + uri: Uri, + jwt: Option, + client: reqwest::Client, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum StorageControllerPeerError { + #[error("failed to deserialize error response with status code {0} at {1}: {2}")] + DeserializationError(StatusCode, Url, reqwest::Error), + #[error("storage controller peer API error ({0}): {1}")] + ApiError(StatusCode, String), + #[error("failed to send HTTP request: {0}")] + SendError(reqwest::Error), + #[error("Cancelled")] + Cancelled, +} + +pub(crate) type Result = std::result::Result; + +pub(crate) trait ResponseErrorMessageExt: Sized { + fn error_from_body(self) -> impl std::future::Future> + Send; +} + +impl ResponseErrorMessageExt for reqwest::Response { + async fn error_from_body(self) -> Result { + let status = self.status(); + if !(status.is_client_error() || status.is_server_error()) { + return Ok(self); + } + + let url = self.url().to_owned(); + Err(match self.json::().await { + Ok(HttpErrorBody { msg }) => StorageControllerPeerError::ApiError(status, msg), + Err(err) => StorageControllerPeerError::DeserializationError(status, url, err), + }) + } +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub(crate) struct GlobalObservedState(pub(crate) HashMap); + +impl PeerClient { + pub(crate) fn new(uri: Uri, jwt: Option) -> Self { + Self { + uri, + jwt, + client: reqwest::Client::new(), + } + } + + async fn request_step_down(&self) -> Result { + let step_down_path = format!("{}control/v1/step_down", self.uri); + let req = self.client.put(step_down_path); + let req = if let Some(jwt) = &self.jwt { + req.header(reqwest::header::AUTHORIZATION, format!("Bearer {jwt}")) + } else { + req + }; + + let res = req + .send() + .await + .map_err(StorageControllerPeerError::SendError)?; + let response = res.error_from_body().await?; + + let status = response.status(); + let url = response.url().to_owned(); + + response + .json() + .await + .map_err(|err| StorageControllerPeerError::DeserializationError(status, url, err)) + } + + /// Request the peer to step down and return its current observed state + /// All errors are retried with exponential backoff for a maximum of 4 attempts. + /// Assuming all retries are performed, the function times out after roughly 4 seconds. + pub(crate) async fn step_down( + &self, + cancel: &CancellationToken, + ) -> Result { + backoff::retry( + || self.request_step_down(), + |_e| false, + 2, + 4, + "Send step down request", + cancel, + ) + .await + .ok_or_else(|| StorageControllerPeerError::Cancelled) + .and_then(|x| x) + } +} diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 64a3e597ce..aebbdec0d1 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -95,6 +95,8 @@ pub(crate) enum DatabaseOperation { ListMetadataHealth, ListMetadataHealthUnhealthy, ListMetadataHealthOutdated, + GetLeader, + UpdateLeader, } #[must_use] @@ -785,6 +787,69 @@ impl Persistence { ) .await } + + /// Get the current entry from the `leader` table if one exists. + /// It is an error for the table to contain more than one entry. + pub(crate) async fn get_leader(&self) -> DatabaseResult> { + let mut leader: Vec = self + .with_measured_conn( + DatabaseOperation::GetLeader, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::controllers::table.load::(conn)?) + }, + ) + .await?; + + if leader.len() > 1 { + return Err(DatabaseError::Logical(format!( + "More than one entry present in the leader table: {leader:?}" + ))); + } + + Ok(leader.pop()) + } + + /// Update the new leader with compare-exchange semantics. If `prev` does not + /// match the current leader entry, then the update is treated as a failure. + /// When `prev` is not specified, the update is forced. + pub(crate) async fn update_leader( + &self, + prev: Option, + new: ControllerPersistence, + ) -> DatabaseResult<()> { + use crate::schema::controllers::dsl::*; + + let updated = self + .with_measured_conn( + DatabaseOperation::UpdateLeader, + move |conn| -> DatabaseResult { + let updated = match &prev { + Some(prev) => diesel::update(controllers) + .filter(address.eq(prev.address.clone())) + .filter(started_at.eq(prev.started_at)) + .set(( + address.eq(new.address.clone()), + started_at.eq(new.started_at), + )) + .execute(conn)?, + None => diesel::insert_into(controllers) + .values(new.clone()) + .execute(conn)?, + }; + + Ok(updated) + }, + ) + .await?; + + if updated == 0 { + return Err(DatabaseError::Logical( + "Leader table update failed".to_string(), + )); + } + + Ok(()) + } } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably @@ -910,3 +975,12 @@ impl From for MetadataHealthRecord { } } } + +#[derive( + Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq, Debug, Clone, +)] +#[diesel(table_name = crate::schema::controllers)] +pub(crate) struct ControllerPersistence { + pub(crate) address: String, + pub(crate) started_at: chrono::DateTime, +} diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 254fdb364e..94db879ade 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -39,6 +39,9 @@ pub(super) struct Reconciler { /// to detach this tenant shard. pub(crate) detach: Vec, + /// Configuration specific to this reconciler + pub(crate) reconciler_config: ReconcilerConfig, + pub(crate) config: TenantConfig, pub(crate) observed: ObservedState, @@ -73,6 +76,65 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } +pub(crate) struct ReconcilerConfigBuilder { + config: ReconcilerConfig, +} + +impl ReconcilerConfigBuilder { + pub(crate) fn new() -> Self { + Self { + config: ReconcilerConfig::default(), + } + } + + pub(crate) fn secondary_warmup_timeout(self, value: Duration) -> Self { + Self { + config: ReconcilerConfig { + secondary_warmup_timeout: Some(value), + ..self.config + }, + } + } + + pub(crate) fn secondary_download_request_timeout(self, value: Duration) -> Self { + Self { + config: ReconcilerConfig { + secondary_download_request_timeout: Some(value), + ..self.config + }, + } + } + + pub(crate) fn build(self) -> ReconcilerConfig { + self.config + } +} + +#[derive(Default, Debug, Copy, Clone)] +pub(crate) struct ReconcilerConfig { + // During live migration give up on warming-up the secondary + // after this timeout. + secondary_warmup_timeout: Option, + + // During live migrations this is the amount of time that + // the pagserver will hold our poll. + secondary_download_request_timeout: Option, +} + +impl ReconcilerConfig { + pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration { + const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300); + self.secondary_warmup_timeout + .unwrap_or(SECONDARY_WARMUP_TIMEOUT_DEFAULT) + } + + pub(crate) fn get_secondary_download_request_timeout(&self) -> Duration { + const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT: Duration = Duration::from_secs(20); + self.secondary_download_request_timeout + .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT) + } +} + /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O pub(crate) struct ReconcileUnits { _sem_units: tokio::sync::OwnedSemaphorePermit, @@ -300,11 +362,13 @@ impl Reconciler { ) -> Result<(), ReconcileError> { // This is not the timeout for a request, but the total amount of time we're willing to wait // for a secondary location to get up to date before - const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300); + let total_download_timeout = self.reconciler_config.get_secondary_warmup_timeout(); // This the long-polling interval for the secondary download requests we send to destination pageserver // during a migration. - const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20); + let request_download_timeout = self + .reconciler_config + .get_secondary_download_request_timeout(); let started_at = Instant::now(); @@ -315,14 +379,14 @@ impl Reconciler { client .tenant_secondary_download( tenant_shard_id, - Some(REQUEST_DOWNLOAD_TIMEOUT), + Some(request_download_timeout), ) .await }, &self.service_config.jwt_token, 1, 3, - REQUEST_DOWNLOAD_TIMEOUT * 2, + request_download_timeout * 2, &self.cancel, ) .await @@ -350,7 +414,7 @@ impl Reconciler { return Ok(()); } else if status == StatusCode::ACCEPTED { let total_runtime = started_at.elapsed(); - if total_runtime > TOTAL_DOWNLOAD_TIMEOUT { + if total_runtime > total_download_timeout { tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", total_runtime.as_millis(), progress.layers_downloaded, diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index cb5ba3f38b..77ba47e114 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -1,5 +1,12 @@ // @generated automatically by Diesel CLI. +diesel::table! { + controllers (address, started_at) { + address -> Varchar, + started_at -> Timestamptz, + } +} + diesel::table! { metadata_health (tenant_id, shard_number, shard_count) { tenant_id -> Varchar, @@ -36,4 +43,4 @@ diesel::table! { } } -diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,); +diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e391ce65e6..ee8e9ac5a1 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,3 +1,4 @@ +use hyper::Uri; use std::{ borrow::Cow, cmp::Ordering, @@ -14,10 +15,14 @@ use crate::{ Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, }, compute_hook::NotifyError, + drain_utils::{self, TenantShardDrain, TenantShardIterator}, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, - metrics::LeadershipStatusGroup, - persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter}, - reconciler::{ReconcileError, ReconcileUnits}, + metrics, + peer_client::{GlobalObservedState, PeerClient}, + persistence::{ + AbortShardSplitStatus, ControllerPersistence, MetadataHealthPersistence, TenantFilter, + }, + reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, @@ -82,7 +87,6 @@ use crate::{ ReconcilerWaiter, TenantShard, }, }; -use serde::{Deserialize, Serialize}; pub mod chaos_injector; @@ -139,7 +143,15 @@ enum NodeOperations { /// Allowed transitions are: /// 1. Leader -> SteppedDown /// 2. Candidate -> Leader -#[derive(Copy, Clone, strum_macros::Display, measured::FixedCardinalityLabel)] +#[derive( + Eq, + PartialEq, + Copy, + Clone, + strum_macros::Display, + strum_macros::EnumIter, + measured::FixedCardinalityLabel, +)] #[strum(serialize_all = "snake_case")] pub(crate) enum LeadershipStatus { /// This is the steady state where the storage controller can produce @@ -225,22 +237,12 @@ impl ServiceState { tenants: BTreeMap, scheduler: Scheduler, delayed_reconcile_rx: tokio::sync::mpsc::Receiver, + initial_leadership_status: LeadershipStatus, ) -> Self { - let status = &crate::metrics::METRICS_REGISTRY - .metrics_group - .storage_controller_leadership_status; - - status.set( - LeadershipStatusGroup { - status: LeadershipStatus::Leader, - }, - 1, - ); + metrics::update_leadership_status(initial_leadership_status); Self { - // TODO: Starting up as Leader is a transient state. Once we enable rolling - // upgrades on the k8s side, we should start up as Candidate. - leadership_status: LeadershipStatus::Leader, + leadership_status: initial_leadership_status, tenants, nodes: Arc::new(nodes), scheduler, @@ -265,29 +267,12 @@ impl ServiceState { fn step_down(&mut self) { self.leadership_status = LeadershipStatus::SteppedDown; + metrics::update_leadership_status(self.leadership_status); + } - let status = &crate::metrics::METRICS_REGISTRY - .metrics_group - .storage_controller_leadership_status; - - status.set( - LeadershipStatusGroup { - status: LeadershipStatus::SteppedDown, - }, - 1, - ); - status.set( - LeadershipStatusGroup { - status: LeadershipStatus::Leader, - }, - 0, - ); - status.set( - LeadershipStatusGroup { - status: LeadershipStatus::Candidate, - }, - 0, - ); + fn become_leader(&mut self) { + self.leadership_status = LeadershipStatus::Leader; + metrics::update_leadership_status(self.leadership_status); } } @@ -325,6 +310,18 @@ pub struct Config { // TODO: make this cfg(feature = "testing") pub neon_local_repo_dir: Option, + + // Maximum acceptable download lag for the secondary location + // while draining a node. If the secondary location is lagging + // by more than the configured amount, then the secondary is not + // upgraded to primary. + pub max_secondary_lag_bytes: Option, + + pub address_for_peers: Option, + + pub start_as_candidate: bool, + + pub http_service_port: i32, } impl From for ApiError { @@ -492,9 +489,10 @@ pub(crate) enum ReconcileResultRequest { Stop, } -// TODO: move this into the storcon peer client when that gets added -#[derive(Serialize, Deserialize, Debug, Default)] -pub(crate) struct GlobalObservedState(HashMap); +struct LeaderStepDownState { + observed: GlobalObservedState, + leader: ControllerPersistence, +} impl Service { pub fn get_config(&self) -> &Config { @@ -506,15 +504,11 @@ impl Service { #[instrument(skip_all)] async fn startup_reconcile( self: &Arc, + leader_step_down_state: Option, bg_compute_notify_result_tx: tokio::sync::mpsc::Sender< Result<(), (TenantShardId, NotifyError)>, >, ) { - // For all tenant shards, a vector of observed states on nodes (where None means - // indeterminate, same as in [`ObservedStateLocation`]) - let mut observed: HashMap)>> = - HashMap::new(); - // Startup reconciliation does I/O to other services: whether they // are responsive or not, we should aim to finish within our deadline, because: // - If we don't, a k8s readiness hook watching /ready will kill us. @@ -528,26 +522,28 @@ impl Service { .checked_add(STARTUP_RECONCILE_TIMEOUT / 2) .expect("Reconcile timeout is a modest constant"); + let (observed, current_leader) = if let Some(state) = leader_step_down_state { + tracing::info!( + "Using observed state received from leader at {}", + state.leader.address, + ); + (state.observed, Some(state.leader)) + } else { + ( + self.build_global_observed_state(node_scan_deadline).await, + None, + ) + }; + // Accumulate a list of any tenant locations that ought to be detached let mut cleanup = Vec::new(); - let node_listings = self.scan_node_locations(node_scan_deadline).await; - // Send initial heartbeat requests to nodes that replied to the location listing above. - let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await; - - for (node_id, list_response) in node_listings { - let tenant_shards = list_response.tenant_shards; - tracing::info!( - "Received {} shard statuses from pageserver {}, setting it to Active", - tenant_shards.len(), - node_id - ); - - for (tenant_shard_id, conf_opt) in tenant_shards { - let shard_observations = observed.entry(tenant_shard_id).or_default(); - shard_observations.push((node_id, conf_opt)); - } - } + // Send initial heartbeat requests to all nodes loaded from the database + let all_nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await; // List of tenants for which we will attempt to notify compute of their location at startup let mut compute_notifications = Vec::new(); @@ -570,17 +566,16 @@ impl Service { } *nodes = Arc::new(new_nodes); - for (tenant_shard_id, shard_observations) in observed { - for (node_id, observed_loc) in shard_observations { - let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { - cleanup.push((tenant_shard_id, node_id)); - continue; - }; - tenant_shard - .observed - .locations - .insert(node_id, ObservedStateLocation { conf: observed_loc }); - } + for (tenant_shard_id, observed_state) in observed.0 { + let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { + for node_id in observed_state.locations.keys() { + cleanup.push((tenant_shard_id, *node_id)); + } + + continue; + }; + + tenant_shard.observed = observed_state; } // Populate each tenant's intent state @@ -614,6 +609,28 @@ impl Service { tenants.len() }; + // Before making any obeservable changes to the cluster, persist self + // as leader in database and memory. + if let Some(address_for_peers) = &self.config.address_for_peers { + // TODO: `address-for-peers` can become a mandatory cli arg + // after we update the k8s setup + let proposed_leader = ControllerPersistence { + address: address_for_peers.to_string(), + started_at: chrono::Utc::now(), + }; + + if let Err(err) = self + .persistence + .update_leader(current_leader, proposed_leader) + .await + { + tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ..."); + std::process::exit(1); + } + } + + self.inner.write().unwrap().become_leader(); + // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that // generation_pageserver in the database. @@ -779,6 +796,31 @@ impl Service { node_results } + async fn build_global_observed_state(&self, deadline: Instant) -> GlobalObservedState { + let node_listings = self.scan_node_locations(deadline).await; + let mut observed = GlobalObservedState::default(); + + for (node_id, location_confs) in node_listings { + tracing::info!( + "Received {} shard statuses from pageserver {}", + location_confs.tenant_shards.len(), + node_id + ); + + for (tid, location_conf) in location_confs.tenant_shards { + let entry = observed.0.entry(tid).or_default(); + entry.locations.insert( + node_id, + ObservedStateLocation { + conf: location_conf, + }, + ); + } + } + + observed + } + /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers. /// /// This is safe to run in the background, because if we don't have this TenantShardId in our map of @@ -1257,12 +1299,20 @@ impl Service { config.max_warming_up_interval, cancel.clone(), ); + + let initial_leadership_status = if config.start_as_candidate { + LeadershipStatus::Candidate + } else { + LeadershipStatus::Leader + }; + let this = Arc::new(Self { inner: Arc::new(std::sync::RwLock::new(ServiceState::new( nodes, tenants, scheduler, delayed_reconcile_rx, + initial_leadership_status, ))), config: config.clone(), persistence, @@ -1331,7 +1381,16 @@ impl Service { return; }; - this.startup_reconcile(bg_compute_notify_result_tx).await; + let leadership_status = this.inner.read().unwrap().get_leadership_status(); + let peer_observed_state = match leadership_status { + LeadershipStatus::Candidate => this.request_step_down().await, + LeadershipStatus::Leader => None, + LeadershipStatus::SteppedDown => unreachable!(), + }; + + this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx) + .await; + drop(startup_completion); } }); @@ -2930,6 +2989,7 @@ impl Service { ); let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client .timeline_detach_ancestor(tenant_shard_id, timeline_id) .await @@ -2946,7 +3006,8 @@ impl Service { Error::ApiError(StatusCode::BAD_REQUEST, msg) => { ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) } - // rest can be mapped + // rest can be mapped as usual + // FIXME: this converts some 500 to 409 which is not per openapi other => passthrough_api_error(&node, other), } }) @@ -5187,11 +5248,22 @@ impl Service { Ok(()) } - /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], + /// Like [`Self::maybe_configured_reconcile_shard`], but uses the default reconciler + /// configuration fn maybe_reconcile_shard( &self, shard: &mut TenantShard, nodes: &Arc>, + ) -> Option { + self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default()) + } + + /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], + fn maybe_configured_reconcile_shard( + &self, + shard: &mut TenantShard, + nodes: &Arc>, + reconciler_config: ReconcilerConfig, ) -> Option { let reconcile_needed = shard.get_reconcile_needed(nodes); @@ -5241,6 +5313,7 @@ impl Service { &self.result_tx, nodes, &self.compute_hook, + reconciler_config, &self.config, &self.persistence, units, @@ -5715,18 +5788,92 @@ impl Service { self.gate.close().await; } + /// Spot check the download lag for a secondary location of a shard. + /// Should be used as a heuristic, since it's not always precise: the + /// secondary might have not downloaded the new heat map yet and, hence, + /// is not aware of the lag. + /// + /// Returns: + /// * Ok(None) if the lag could not be determined from the status, + /// * Ok(Some(_)) if the lag could be determind + /// * Err on failures to query the pageserver. + async fn secondary_lag( + &self, + secondary: &NodeId, + tenant_shard_id: TenantShardId, + ) -> Result, mgmt_api::Error> { + let nodes = self.inner.read().unwrap().nodes.clone(); + let node = nodes.get(secondary).ok_or(mgmt_api::Error::ApiError( + StatusCode::NOT_FOUND, + format!("Node with id {} not found", secondary), + ))?; + + match node + .with_client_retries( + |client| async move { client.tenant_secondary_status(tenant_shard_id).await }, + &self.config.jwt_token, + 1, + 3, + Duration::from_millis(250), + &self.cancel, + ) + .await + { + Some(Ok(status)) => match status.heatmap_mtime { + Some(_) => Ok(Some(status.bytes_total - status.bytes_downloaded)), + None => Ok(None), + }, + Some(Err(e)) => Err(e), + None => Err(mgmt_api::Error::Cancelled), + } + } + /// Drain a node by moving the shards attached to it as primaries. /// This is a long running operation and it should run as a separate Tokio task. pub(crate) async fn drain_node( - &self, + self: &Arc, node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - let mut last_inspected_shard: Option = None; - let mut inspected_all_shards = false; + const MAX_SECONDARY_LAG_BYTES_DEFAULT: u64 = 256 * 1024 * 1024; + let max_secondary_lag_bytes = self + .config + .max_secondary_lag_bytes + .unwrap_or(MAX_SECONDARY_LAG_BYTES_DEFAULT); + + // By default, live migrations are generous about the wait time for getting + // the secondary location up to speed. When draining, give up earlier in order + // to not stall the operation when a cold secondary is encountered. + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); + let reconciler_config = ReconcilerConfigBuilder::new() + .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) + .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) + .build(); + let mut waiters = Vec::new(); - while !inspected_all_shards { + let mut tid_iter = TenantShardIterator::new({ + let service = self.clone(); + move |last_inspected_shard: Option| { + let locked = &service.inner.read().unwrap(); + let tenants = &locked.tenants; + let entry = match last_inspected_shard { + Some(skip_past) => { + // Skip to the last seen tenant shard id + let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past); + + // Skip past the last seen + cursor.nth(1) + } + None => tenants.first_key_value(), + }; + + entry.map(|(tid, _)| tid).copied() + } + }); + + while !tid_iter.finished() { if cancel.is_cancelled() { match self .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) @@ -5745,71 +5892,82 @@ impl Service { } } - { - let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, scheduler) = locked.parts_mut(); + drain_utils::validate_node_state(&node_id, self.inner.read().unwrap().nodes.clone())?; - let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged( - format!("node {node_id} was removed").into(), - ))?; - - let current_policy = node.get_scheduling(); - if !matches!(current_policy, NodeSchedulingPolicy::Draining) { - // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think - // about it - return Err(OperationError::NodeStateChanged( - format!("node {node_id} changed state to {current_policy:?}").into(), - )); - } - - let mut cursor = tenants.iter_mut().skip_while({ - let skip_past = last_inspected_shard; - move |(tid, _)| match skip_past { - Some(last) => **tid != last, - None => false, + while waiters.len() < MAX_RECONCILES_PER_OPERATION { + let tid = match tid_iter.next() { + Some(tid) => tid, + None => { + break; } - }); + }; - while waiters.len() < MAX_RECONCILES_PER_OPERATION { - let (tid, tenant_shard) = match cursor.next() { - Some(some) => some, + let tid_drain = TenantShardDrain { + drained_node: node_id, + tenant_shard_id: tid, + }; + + let dest_node_id = { + let locked = self.inner.read().unwrap(); + + match tid_drain + .tenant_shard_eligible_for_drain(&locked.tenants, &locked.scheduler) + { + Some(node_id) => node_id, None => { - inspected_all_shards = true; - break; + continue; } - }; + } + }; - // If the shard is not attached to the node being drained, skip it. - if *tenant_shard.intent.get_attached() != Some(node_id) { - last_inspected_shard = Some(*tid); + match self.secondary_lag(&dest_node_id, tid).await { + Ok(Some(lag)) if lag <= max_secondary_lag_bytes => { + // The secondary is reasonably up to date. + // Migrate to it + } + Ok(Some(lag)) => { + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Secondary on node {dest_node_id} is lagging by {lag}. Skipping reconcile." + ); continue; } + Ok(None) => { + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Could not determine lag for secondary on node {dest_node_id}. Skipping reconcile." + ); + continue; + } + Err(err) => { + tracing::warn!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Failed to get secondary lag from node {dest_node_id}. Skipping reconcile: {err}" + ); + continue; + } + } - match tenant_shard.reschedule_to_secondary(None, scheduler) { - Err(e) => { - tracing::warn!( - tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), - "Scheduling error when draining pageserver {} : {e}", node_id - ); - } - Ok(()) => { - let scheduled_to = tenant_shard.intent.get_attached(); - tracing::info!( - tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), - "Rescheduled shard while draining node {}: {} -> {:?}", - node_id, - node_id, - scheduled_to - ); + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + let rescheduled = tid_drain.reschedule_to_secondary( + dest_node_id, + tenants, + scheduler, + nodes, + )?; - let waiter = self.maybe_reconcile_shard(tenant_shard, nodes); - if let Some(some) = waiter { - waiters.push(some); - } + if let Some(tenant_shard) = rescheduled { + let waiter = self.maybe_configured_reconcile_shard( + tenant_shard, + nodes, + reconciler_config, + ); + if let Some(some) = waiter { + waiters.push(some); } } - - last_inspected_shard = Some(*tid); } } @@ -6181,4 +6339,61 @@ impl Service { global_observed } + + /// Request step down from the currently registered leader in the database + /// + /// If such an entry is persisted, the success path returns the observed + /// state and details of the leader. Otherwise, None is returned indicating + /// there is no leader currently. + /// + /// On failures to query the database or step down error responses the process is killed + /// and we rely on k8s to retry. + async fn request_step_down(&self) -> Option { + let leader = match self.persistence.get_leader().await { + Ok(leader) => leader, + Err(err) => { + tracing::error!( + "Failed to query database for current leader: {err}. Aborting start-up ..." + ); + std::process::exit(1); + } + }; + + match leader { + Some(leader) => { + tracing::info!("Sending step down request to {leader:?}"); + + // TODO: jwt token + let client = PeerClient::new( + Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"), + self.config.jwt_token.clone(), + ); + let state = client.step_down(&self.cancel).await; + match state { + Ok(state) => Some(LeaderStepDownState { + observed: state, + leader: leader.clone(), + }), + Err(err) => { + // TODO: Make leaders periodically update a timestamp field in the + // database and, if the leader is not reachable from the current instance, + // but inferred as alive from the timestamp, abort start-up. This avoids + // a potential scenario in which we have two controllers acting as leaders. + tracing::error!( + "Leader ({}) did not respond to step-down request: {}", + leader.address, + err + ); + None + } + } + } + None => { + tracing::info!( + "No leader found to request step down from. Will build observed state." + ); + None + } + } + } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index e250f29f98..1fcc3c8547 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -7,7 +7,7 @@ use std::{ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, - reconciler::ReconcileUnits, + reconciler::{ReconcileUnits, ReconcilerConfig}, scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext}, service::ReconcileResultRequest, }; @@ -1063,6 +1063,7 @@ impl TenantShard { result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, + reconciler_config: ReconcilerConfig, service_config: &service::Config, persistence: &Arc, units: ReconcileUnits, @@ -1101,6 +1102,7 @@ impl TenantShard { generation: self.generation, intent: reconciler_intent, detach, + reconciler_config, config: self.config.clone(), observed: self.observed.clone(), compute_hook: compute_hook.clone(), diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index ff230feae3..c8b1ed49f4 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::time::{Duration, SystemTime}; @@ -117,7 +117,7 @@ use refs::AncestorRefs; // - Are there any refs to ancestor shards' layers? #[derive(Default)] struct TenantRefAccumulator { - shards_seen: HashMap>, + shards_seen: HashMap>, // For each shard that has refs to an ancestor's layers, the set of ancestor layers referred to ancestor_ref_shards: AncestorRefs, @@ -130,7 +130,7 @@ impl TenantRefAccumulator { .shards_seen .entry(ttid.tenant_shard_id.tenant_id) .or_default()) - .push(this_shard_idx); + .insert(this_shard_idx); let mut ancestor_refs = Vec::new(); for (layer_name, layer_metadata) in &index_part.layer_metadata { @@ -154,7 +154,7 @@ impl TenantRefAccumulator { summary: &mut GcSummary, ) -> (Vec, AncestorRefs) { let mut ancestors_to_gc = Vec::new(); - for (tenant_id, mut shard_indices) in self.shards_seen { + for (tenant_id, shard_indices) in self.shards_seen { // Find the highest shard count let latest_count = shard_indices .iter() @@ -162,6 +162,7 @@ impl TenantRefAccumulator { .max() .expect("Always at least one shard"); + let mut shard_indices = shard_indices.iter().collect::>(); let (mut latest_shards, ancestor_shards) = { let at = itertools::partition(&mut shard_indices, |i| i.shard_count == latest_count); @@ -174,7 +175,7 @@ impl TenantRefAccumulator { // to scan the S3 bucket halfway through a shard split. if latest_shards.len() != latest_count.count() as usize { // This should be extremely rare, so we warn on it. - tracing::warn!(%tenant_id, "Missed some shards at count {:?}", latest_count); + tracing::warn!(%tenant_id, "Missed some shards at count {:?}: {latest_shards:?}", latest_count); continue; } @@ -212,7 +213,7 @@ impl TenantRefAccumulator { .iter() .map(|s| s.tenant_shard_id.to_index()) .collect(); - if controller_indices != latest_shards { + if !controller_indices.iter().eq(latest_shards.iter().copied()) { tracing::info!(%tenant_id, "Latest shards seen in S3 ({latest_shards:?}) don't match controller state ({controller_indices:?})"); continue; } diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 08215438e1..5fe544b3bd 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -42,7 +42,11 @@ class PgCompare(ABC): pass @abstractmethod - def flush(self): + def flush(self, compact: bool = False, gc: bool = False): + pass + + @abstractmethod + def compact(self): pass @abstractmethod @@ -129,13 +133,16 @@ class NeonCompare(PgCompare): def pg_bin(self) -> PgBin: return self._pg_bin - def flush(self): + def flush(self, compact: bool = True, gc: bool = True): wait_for_last_flush_lsn(self.env, self._pg, self.tenant, self.timeline) - self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline) - self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0) + self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline, compact=compact) + if gc: + self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0) def compact(self): - self.pageserver_http_client.timeline_compact(self.tenant, self.timeline) + self.pageserver_http_client.timeline_compact( + self.tenant, self.timeline, wait_until_uploaded=True + ) def report_peak_memory_use(self): self.zenbenchmark.record( @@ -215,9 +222,12 @@ class VanillaCompare(PgCompare): def pg_bin(self) -> PgBin: return self._pg.pg_bin - def flush(self): + def flush(self, compact: bool = False, gc: bool = False): self.cur.execute("checkpoint") + def compact(self): + pass + def report_peak_memory_use(self): pass # TODO find something @@ -266,6 +276,9 @@ class RemoteCompare(PgCompare): # TODO: flush the remote pageserver pass + def compact(self): + pass + def report_peak_memory_use(self): # TODO: get memory usage from remote pageserver pass diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c6f4404784..aaa1f21997 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,6 +14,7 @@ import textwrap import threading import time import uuid +from collections import defaultdict from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime @@ -23,7 +24,7 @@ from functools import cached_property, partial from itertools import chain, product from pathlib import Path from types import TracebackType -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast +from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union, cast from urllib.parse import quote, urlparse import asyncpg @@ -387,7 +388,7 @@ class PgProtocol: return self.safe_psql_many([query], **kwargs)[0] def safe_psql_many( - self, queries: List[str], log_query=True, **kwargs: Any + self, queries: Iterable[str], log_query=True, **kwargs: Any ) -> List[List[Tuple[Any, ...]]]: """ Execute queries against the node and return all rows. @@ -962,7 +963,7 @@ class NeonEnvBuilder: if self.env: log.info("Cleaning up all storage and compute nodes") self.env.stop( - immediate=True, + immediate=False, # if the test threw an exception, don't check for errors # as a failing assertion would cause the cleanup below to fail ps_assert_metric_no_errors=(exc_type is None), @@ -1250,21 +1251,57 @@ class NeonEnv: def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this method returns, there should be no child processes running. + + Unless of course, some stopping failed, in that case, all remaining child processes are leaked. """ - self.endpoints.stop_all(fail_on_endpoint_errors) + + # the commonly failing components have special try-except behavior, + # trying to get us to actually shutdown all processes over easier error + # reporting. + + raise_later = None + try: + self.endpoints.stop_all(fail_on_endpoint_errors) + except Exception as e: + raise_later = e # Stop storage controller before pageservers: we don't want it to spuriously # detect a pageserver "failure" during test teardown self.storage_controller.stop(immediate=immediate) + stop_later = [] + metric_errors = [] + for sk in self.safekeepers: sk.stop(immediate=immediate) for pageserver in self.pageservers: if ps_assert_metric_no_errors: - pageserver.assert_no_metric_errors() - pageserver.stop(immediate=immediate) + try: + pageserver.assert_no_metric_errors() + except Exception as e: + metric_errors.append(e) + log.error(f"metric validation failed on {pageserver.id}: {e}") + try: + pageserver.stop(immediate=immediate) + except RuntimeError: + stop_later.append(pageserver) self.broker.stop(immediate=immediate) + # TODO: for nice logging we need python 3.11 ExceptionGroup + for ps in stop_later: + ps.stop(immediate=True) + + if raise_later is not None: + raise raise_later + + for error in metric_errors: + raise error + + if len(stop_later) > 0: + raise RuntimeError( + f"{len(stop_later)} out of {len(self.pageservers)} pageservers failed to stop gracefully" + ) + @property def pageserver(self) -> NeonPageserver: """ @@ -2667,6 +2704,69 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"Got failpoints request response code {res.status_code}") res.raise_for_status() + def get_tenants_placement(self) -> defaultdict[str, Dict[str, Any]]: + """ + Get the intent and observed placements of all tenants known to the storage controller. + """ + tenants = self.tenant_list() + + tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( + lambda: { + "observed": {"attached": None, "secondary": []}, + "intent": {"attached": None, "secondary": []}, + } + ) + + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] + in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) + + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "Secondary" + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append( + int(node_id) + ) + + if "attached" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"][ + "attached" + ] + + if "secondary" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ + "secondary" + ] + + return tenant_placement + + def warm_up_all_secondaries(self): + log.info("Warming up all secondary locations") + + tenant_placement = self.get_tenants_placement() + for tid, placement in tenant_placement.items(): + assert placement["observed"]["attached"] is not None + primary_id = placement["observed"]["attached"] + + assert len(placement["observed"]["secondary"]) == 1 + secondary_id = placement["observed"]["secondary"][0] + + parsed_tid = TenantShardId.parse(tid) + self.env.get_pageserver(primary_id).http_client().tenant_heatmap_upload(parsed_tid) + self.env.get_pageserver(secondary_id).http_client().tenant_secondary_download( + parsed_tid, wait_ms=250 + ) + @property def workdir(self) -> Path: return self.env.repo_dir @@ -4034,6 +4134,17 @@ class Endpoint(PgProtocol, LogUtils): assert self.pgdata_dir is not None # please mypy return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024 + def clear_shared_buffers(self, cursor: Optional[Any] = None): + """ + Best-effort way to clear postgres buffers. Pinned buffers will not be 'cleared.' + + Might also clear LFC. + """ + if cursor is not None: + cursor.execute("select clear_buffer_cache()") + else: + self.safe_psql("select clear_buffer_cache()") + class EndpointFactory: """An object representing multiple compute endpoints.""" @@ -4829,7 +4940,7 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -> Lsn: +def logical_replication_sync(subscriber: PgProtocol, publisher: PgProtocol) -> Lsn: """Wait logical replication subscriber to sync with publisher.""" publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) while True: diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 65d6ff5d62..cd4261f1b8 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -361,6 +361,12 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return (res.status_code, res.json()) + def tenant_secondary_status(self, tenant_id: Union[TenantId, TenantShardId]): + url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/status" + res = self.get(url) + self.verbose_error(res) + return res.json() + def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): assert "tenant_id" not in config.keys() res = self.put( diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 0f2a997b1e..1b6c3c23ba 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -177,9 +177,14 @@ class S3Storage: def access_env_vars(self) -> Dict[str, str]: if self.aws_profile is not None: - return { + env = { "AWS_PROFILE": self.aws_profile, } + # Pass through HOME env var because AWS_PROFILE needs it in order to work + home = os.getenv("HOME") + if home is not None: + env["HOME"] = home + return env if self.access_key is not None and self.secret_key is not None: return { "AWS_ACCESS_KEY_ID": self.access_key, diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index dfd9caba3e..cc93762175 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -182,14 +182,8 @@ class Workload: def validate(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) - result = endpoint.safe_psql_many( - [ - "select clear_buffer_cache()", - f""" - SELECT COUNT(*) FROM {self.table} - """, - ] - ) + endpoint.clear_shared_buffers() + result = endpoint.safe_psql(f"SELECT COUNT(*) FROM {self.table}") log.info(f"validate({self.expect_rows}): {result}") - assert result == [[("",)], [(self.expect_rows,)]] + assert result == [(self.expect_rows,)] diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py index 700b731418..5426a06ca1 100644 --- a/test_runner/logical_repl/test_debezium.py +++ b/test_runner/logical_repl/test_debezium.py @@ -12,7 +12,6 @@ import requests from fixtures.log_helper import log from fixtures.neon_fixtures import RemotePostgres from fixtures.utils import wait_until -from kafka import KafkaConsumer class DebeziumAPI: @@ -95,6 +94,8 @@ def debezium(remote_pg: RemotePostgres): log.debug("%s %s %s", resp.status_code, resp.ok, resp.text) assert resp.status_code == 201 assert len(dbz.list_connectors()) == 1 + from kafka import KafkaConsumer + consumer = KafkaConsumer( "dbserver1.inventory.customers", bootstrap_servers=["kafka:9092"], diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index 3258d4dcfa..8b934057e4 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -44,8 +44,7 @@ def test_basebackup_with_high_slru_count( page_cache_size = 16384 max_file_descriptors = 500000 neon_env_builder.pageserver_config_override = ( - f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; " - f"get_vectored_impl='vectored'; validate_vectored_get=false" + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" ) params.update( { diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py index 644c1f559b..0348b08f04 100644 --- a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py +++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py @@ -62,6 +62,9 @@ def test_download_churn( run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration) + # see https://github.com/neondatabase/neon/issues/8712 + env.stop(immediate=True) + def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): remote_storage_kind = s3_storage() diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 3dad348976..69df7974b9 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,9 +1,9 @@ from contextlib import closing -import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare +from fixtures.log_helper import log from fixtures.pg_version import PgVersion @@ -17,7 +17,6 @@ from fixtures.pg_version import PgVersion # 3. Disk space used # 4. Peak memory usage # -@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/7124") def test_bulk_insert(neon_with_baseline: PgCompare): env = neon_with_baseline @@ -30,8 +29,8 @@ def test_bulk_insert(neon_with_baseline: PgCompare): # Run INSERT, recording the time and I/O it takes with env.record_pageserver_writes("pageserver_writes"): with env.record_duration("insert"): - cur.execute("insert into huge values (generate_series(1, 5000000), 0);") - env.flush() + cur.execute("insert into huge values (generate_series(1, 20000000), 0);") + env.flush(compact=False, gc=False) env.report_peak_memory_use() env.report_size() @@ -49,6 +48,9 @@ def test_bulk_insert(neon_with_baseline: PgCompare): if isinstance(env, NeonCompare): measure_recovery_time(env) + with env.record_duration("compaction"): + env.compact() + def measure_recovery_time(env: NeonCompare): client = env.env.pageserver.http_client() @@ -71,7 +73,9 @@ def measure_recovery_time(env: NeonCompare): # Measure recovery time with env.record_duration("wal_recovery"): + log.info("Entering recovery...") client.timeline_create(pg_version, env.tenant, env.timeline) # Flush, which will also wait for lsn to catch up - env.flush() + env.flush(compact=False, gc=False) + log.info("Finished recovery.") diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 9b20954d45..890b70b9fc 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -36,3 +36,6 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): with zenbenchmark.record_duration("test_query"): cur.execute("SELECT count(*) from t") assert cur.fetchone() == (n_iters * n_records,) + + # see https://github.com/neondatabase/neon/issues/8712 + env.stop(immediate=True) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 04785f7184..297aedfbed 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -2,7 +2,6 @@ import concurrent.futures import random import time from collections import defaultdict -from typing import Any, Dict import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId @@ -24,51 +23,14 @@ def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[ This function takes into account the intersection of the intent and the observed state. If they do not match, it asserts out. """ - tenants = env.storage_controller.tenant_list() - - intent = dict() - observed = dict() - - tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( - lambda: { - "observed": {"attached": None, "secondary": []}, - "intent": {"attached": None, "secondary": []}, - } - ) - - for t in tenants: - for node_id, loc_state in t["observed"]["locations"].items(): - if ( - loc_state is not None - and "conf" in loc_state - and loc_state["conf"] is not None - and loc_state["conf"]["mode"] - in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) - ): - observed[t["tenant_shard_id"]] = int(node_id) - tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) - - if ( - loc_state is not None - and "conf" in loc_state - and loc_state["conf"] is not None - and loc_state["conf"]["mode"] == "Secondary" - ): - tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(int(node_id)) - - if "attached" in t["intent"]: - intent[t["tenant_shard_id"]] = t["intent"]["attached"] - tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"]["attached"] - - if "secondary" in t["intent"]: - tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ - "secondary" - ] - + tenant_placement = env.storage_controller.get_tenants_placement() log.info(f"{tenant_placement=}") matching = { - tid: intent[tid] for tid in observed if tid in intent and intent[tid] == observed[tid] + tid: tenant_placement[tid]["intent"]["attached"] + for tid in tenant_placement + if tenant_placement[tid]["intent"]["attached"] + == tenant_placement[tid]["observed"]["attached"] } assert len(matching) == total_shards diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index 7e40081aa2..f83b44a7ad 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -20,7 +20,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): } ) - pageserver_http.configure_failpoints(("flush-frozen-pausable", "sleep(10000)")) + failpoint = "flush-frozen-pausable" + + pageserver_http.configure_failpoints((failpoint, "sleep(10000)")) endpoint_branch0 = env.endpoints.create_start("main", tenant_id=tenant) branch0_cur = endpoint_branch0.connect().cursor() @@ -96,3 +98,5 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): assert query_scalar(branch1_cur, "SELECT count(*) FROM foo") == 200000 assert query_scalar(branch2_cur, "SELECT count(*) FROM foo") == 300000 + + pageserver_http.configure_failpoints((failpoint, "off")) diff --git a/test_runner/regress/test_combocid.py b/test_runner/regress/test_combocid.py new file mode 100644 index 0000000000..6d2567b7ee --- /dev/null +++ b/test_runner/regress/test_combocid.py @@ -0,0 +1,139 @@ +from fixtures.neon_fixtures import NeonEnvBuilder + + +def do_combocid_op(neon_env_builder: NeonEnvBuilder, op): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers='1MB'", + ], + ) + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 1000 + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("create table t(id integer, val integer)") + + cur.execute("begin") + cur.execute("insert into t values (1, 0)") + cur.execute("insert into t values (2, 0)") + cur.execute(f"insert into t select g, 0 from generate_series(3,{n_records}) g") + + # Open a cursor that scroll it halfway through + cur.execute("DECLARE c1 NO SCROLL CURSOR WITHOUT HOLD FOR SELECT * FROM t") + cur.execute("fetch 500 from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + # Perform specified operation + cur.execute(op) + + # Clear the cache, so that we exercise reconstructing the pages + # from WAL + cur.execute("SELECT clear_buffer_cache()") + + # Check that the cursor opened earlier still works. If the + # combocids are not restored correctly, it won't. + cur.execute("fetch all from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + cur.execute("rollback") + + +def test_combocid_delete(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "delete from t") + + +def test_combocid_update(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "update t set val=val+1") + + +def test_combocid_lock(neon_env_builder: NeonEnvBuilder): + do_combocid_op(neon_env_builder, "select * from t for update") + + +def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers='1MB'", + ], + ) + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 1000 + + cur.execute("CREATE EXTENSION neon_test_utils") + + cur.execute("create table t(id integer, val integer)") + file_path = f"{endpoint.pg_data_dir_path()}/t.csv" + cur.execute(f"insert into t select g, 0 from generate_series(1,{n_records}) g") + cur.execute(f"copy t to '{file_path}'") + cur.execute("truncate table t") + + cur.execute("begin") + cur.execute(f"copy t from '{file_path}'") + + # Open a cursor that scroll it halfway through + cur.execute("DECLARE c1 NO SCROLL CURSOR WITHOUT HOLD FOR SELECT * FROM t") + cur.execute("fetch 500 from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + # Delete all the rows. Because all of the rows were inserted earlier in the + # same transaction, all the rows will get a combocid. + cur.execute("delete from t") + # Clear the cache, so that we exercise reconstructing the pages + # from WAL + cur.execute("SELECT clear_buffer_cache()") + + # Check that the cursor opened earlier still works. If the + # combocids are not restored correctly, it won't. + cur.execute("fetch all from c1") + rows = cur.fetchall() + assert len(rows) == 500 + + cur.execute("rollback") + + +def test_combocid(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + + conn = endpoint.connect() + cur = conn.cursor() + n_records = 100000 + + cur.execute("create table t(id integer, val integer)") + cur.execute(f"insert into t values (generate_series(1,{n_records}), 0)") + + cur.execute("begin") + + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + + cur.execute("delete from t") + assert cur.rowcount == n_records + cur.execute("delete from t") + assert cur.rowcount == 0 + + cur.execute(f"insert into t values (generate_series(1,{n_records}), 0)") + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + cur.execute("update t set val=val+1") + assert cur.rowcount == n_records + + cur.execute("rollback") diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 8edc8c554c..ae63136abb 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -168,7 +168,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): # re-execute the query, it will make GetPage # requests. This does not clear the last-written LSN cache # so we still remember the LSNs of the pages. - s_cur.execute("SELECT clear_buffer_cache()") + secondary.clear_shared_buffers(cursor=s_cur) if pause_apply: s_cur.execute("SELECT pg_wal_replay_pause()") @@ -332,6 +332,7 @@ def test_replica_query_race(neon_simple_env: NeonEnv): log.info(f"read {reads}: counter {readcounter}, last update {writecounter}") reads += 1 + # FIXME: what about LFC clearing? await conn.execute("SELECT clear_buffer_cache()") async def both(): diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 66afe9ddfd..0d18aa43b7 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,11 +4,13 @@ from random import choice from string import ascii_lowercase import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( AuxFileStore, NeonEnv, NeonEnvBuilder, + PgProtocol, logical_replication_sync, wait_for_last_flush_lsn, ) @@ -253,6 +255,21 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of cur.execute( "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" ) + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + + endpoint.stop_and_destroy() + endpoint = env.endpoints.create_start("init") + with endpoint.connect().cursor() as cur: + log.info("advance slot") + cur.execute( + "SELECT * from pg_replication_slot_advance('slotty_mcslotface', pg_current_wal_lsn())" + ) # Tests that walsender correctly blocks until WAL is downloaded from safekeepers @@ -524,3 +541,90 @@ def test_replication_shutdown(neon_simple_env: NeonEnv): assert [r[0] for r in res] == [10, 20, 30, 40] wait_until(10, 0.5, check_that_changes_propagated) + + +def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn: + """ + Wait for logical replication subscriber reported flush_lsn to reach + pg_current_wal_flush_lsn on publisher. Note that this is somewhat unreliable + because for some WAL records like vacuum subscriber won't get any data at + all. + """ + publisher_flush_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + + def check_caughtup(): + res = publisher.safe_psql( + """ +select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication sr, pg_replication_slots s + where s.active_pid = sr.pid and s.slot_type = 'logical'; + """ + )[0] + sent_lsn, flush_lsn, curr_publisher_flush_lsn = Lsn(res[0]), Lsn(res[1]), Lsn(res[2]) + log.info( + f"sent_lsn={sent_lsn}, flush_lsn={flush_lsn}, publisher_flush_lsn={curr_publisher_flush_lsn}, waiting flush_lsn to reach {publisher_flush_lsn}" + ) + assert flush_lsn >= publisher_flush_lsn + + wait_until(30, 0.5, check_caughtup) + return publisher_flush_lsn + + +# Test that subscriber takes into account quorum committed flush_lsn in +# flush_lsn reporting to publisher. Without this, it may ack too far, losing +# data on restart because publisher advances START_REPLICATION position to the +# confirmed_flush_lsn of the slot. +def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): + env = neon_simple_env + # use vanilla as publisher to allow writes on it when safekeeper is down + vanilla_pg.configure( + [ + "wal_level = 'logical'", + # neon fork uses custom WAL records which won't work without extension installed with obscure + # ERROR: resource manager with ID 134 not registered + # error. + "shared_preload_libraries = 'neon'", + ] + ) + vanilla_pg.start() + vanilla_pg.safe_psql("create extension neon;") + + env.neon_cli.create_branch("subscriber") + sub = env.endpoints.create("subscriber") + sub.start() + + with vanilla_pg.cursor() as pcur: + with sub.cursor() as scur: + pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pcur.execute("CREATE PUBLICATION pub FOR TABLE t") + scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + + pub_connstr = vanilla_pg.connstr().replace("'", "''") + log.info(f"pub connstr is {pub_connstr}, subscriber connstr {sub.connstr()}") + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_connstr}' PUBLICATION pub with (synchronous_commit=off)" + scur.execute(query) + time.sleep(2) # let initial table sync complete + + # stop safekeeper so it won't get any data + for sk in env.safekeepers: + sk.stop() + # and insert to publisher + with vanilla_pg.cursor() as pcur: + for i in range(0, 1000): + pcur.execute("INSERT into t values (%s, random()*100000)", (i,)) + # wait until sub receives all data + logical_replication_sync(sub, vanilla_pg) + # Update confirmed_flush_lsn of the slot. If subscriber ack'ed recevied data + # as flushed we'll now lose it if subscriber restars. That's why + # logical_replication_wait_flush_lsn_sync is expected to hang while + # safekeeper is down. + vanilla_pg.safe_psql("checkpoint;") + assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000 + + # restart subscriber and ensure it can catch up lost tail again + sub.stop(mode="immediate") + for sk in env.safekeepers: + sk.start() + sub.start() + log.info("waiting for sync after restart") + logical_replication_wait_flush_lsn_sync(vanilla_pg) + assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000 diff --git a/test_runner/regress/test_oid_overflow.py b/test_runner/regress/test_oid_overflow.py index a94ae99ed9..e8eefc2414 100644 --- a/test_runner/regress/test_oid_overflow.py +++ b/test_runner/regress/test_oid_overflow.py @@ -37,7 +37,7 @@ def test_oid_overflow(neon_env_builder: NeonEnvBuilder): oid = cur.fetchall()[0][0] log.info(f"t2.relfilenode={oid}") - cur.execute("SELECT clear_buffer_cache()") + endpoint.clear_shared_buffers(cursor=cur) cur.execute("SELECT x from t1") assert cur.fetchone() == (1,) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 68a45f957c..bbf82fea4c 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -159,6 +159,8 @@ def test_pageserver_chaos( if build_type == "debug": pytest.skip("times out in debug builds") + # same rationale as with the immediate stop; we might leave orphan layers behind. + neon_env_builder.disable_scrub_on_exit() neon_env_builder.enable_pageserver_remote_storage(s3_storage()) if shard_count is not None: neon_env_builder.num_pageservers = shard_count @@ -220,3 +222,11 @@ def test_pageserver_chaos( # Check that all the updates are visible num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0] assert num_updates == i * 100000 + + # currently pageserver cannot tolerate the fact that "s3" goes away, and if + # we succeeded in a compaction before shutdown, there might be a lot of + # uploads pending, certainly more than what we can ingest with MOCK_S3 + # + # so instead, do a fast shutdown for this one test. + # See https://github.com/neondatabase/neon/issues/8709 + env.stop(immediate=True) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 6f7ea0092a..45ce5b1c5b 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -144,7 +144,13 @@ def test_pg_regress( ) # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("main") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. @@ -207,7 +213,14 @@ def test_isolation( # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them - endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"]) + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "max_prepared_transactions=100", + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_isolation_regress to run in. @@ -268,7 +281,13 @@ def test_sql_regress( ) # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("main") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # Enable the test mode, so that we don't need to patch the test cases. + "neon.regress_test_mode = true", + ], + ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index f446f4f200..d2b8c2ed8b 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -2,6 +2,7 @@ import asyncio import json import subprocess import time +import urllib.parse from typing import Any, List, Optional, Tuple import psycopg2 @@ -275,6 +276,31 @@ def test_sql_over_http(static_proxy: NeonProxy): assert res["rowCount"] is None +def test_sql_over_http_db_name_with_space(static_proxy: NeonProxy): + db = "db with spaces" + static_proxy.safe_psql_many( + ( + f'create database "{db}"', + "create role http with login password 'http' superuser", + ) + ) + + def q(sql: str, params: Optional[List[Any]] = None) -> Any: + params = params or [] + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/{urllib.parse.quote(db)}" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps({"query": sql, "params": params}), + headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200, response.text + return response.json() + + rows = q("select 42 as answer")["rows"] + assert rows == [{"answer": 42}] + + def test_sql_over_http_output_options(static_proxy: NeonProxy): static_proxy.safe_psql("create role http2 with login password 'http2' superuser") diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index 2437c8f806..d128c60a99 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -61,7 +61,7 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Clear buffer cache to ensure no stale pages are brought into the cache") - c.execute("select clear_buffer_cache()") + endpoint.clear_shared_buffers(cursor=c) cache_entries = query_scalar( c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index eb2cdccdb9..9b2557a165 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -17,6 +17,7 @@ from fixtures.neon_fixtures import ( PgBin, StorageControllerApiException, TokenScope, + last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( @@ -1597,6 +1598,8 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): # Perform a graceful rolling restart for ps in env.pageservers: + env.storage_controller.warm_up_all_secondaries() + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -1645,6 +1648,115 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): assert_shard_counts_balanced(env, shard_counts, total_shards) +def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Artificially make a tenant shard's secondary location lag behind the primary + and check that storage controller driven node drains skip the lagging tenant shard. + Finally, validate that the tenant shard is migrated when a new drain request comes + in and it's no longer lagging. + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.storage_controller_config = { + "max_secondary_lag_bytes": 1 * 1024 * 1024, + } + + env = neon_env_builder.init_configs() + env.start() + + tid, timeline_id = env.neon_cli.create_tenant(placement_policy='{"Attached":1}') + + # Give things a chance to settle. + env.storage_controller.reconcile_until_idle(timeout_secs=30) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + primary: int = locations[0]["node_id"] + not_primary = [ps.id for ps in env.pageservers if ps.id != primary] + assert len(not_primary) == 1 + secondary = not_primary[0] + + log.info(f"Paused secondary downloads on {secondary}") + env.get_pageserver(secondary).http_client().configure_failpoints( + ("secondary-layer-download-pausable", "pause") + ) + + log.info(f"Ingesting some data for {tid}") + + with env.endpoints.create_start("main", tenant_id=tid) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql("CREATE TABLE created_foo(id integer);") + last_flush_lsn_upload(env, endpoint, tid, timeline_id) + + log.info(f"Uploading heatmap from {primary} and requesting download from {secondary}") + + env.get_pageserver(primary).http_client().tenant_heatmap_upload(tid) + env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100) + + def secondary_is_lagging(): + resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid) + lag = resp["bytes_total"] - resp["bytes_downloaded"] + + if lag <= 1 * 1024 * 1024: + raise Exception(f"Secondary lag not big enough: {lag}") + + log.info(f"Looking for lag to develop on the secondary {secondary}") + wait_until(10, 1, secondary_is_lagging) + + log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}") + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + primary, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + assert locations[0]["node_id"] == primary + + log.info(f"Unpausing secondary downloads on {secondary}") + env.get_pageserver(secondary).http_client().configure_failpoints( + ("secondary-layer-download-pausable", "off") + ) + env.get_pageserver(secondary).http_client().tenant_secondary_download(tid, wait_ms=100) + + log.info(f"Waiting for lag to reduce on {secondary}") + + def lag_is_acceptable(): + resp = env.get_pageserver(secondary).http_client().tenant_secondary_status(tid) + lag = resp["bytes_total"] - resp["bytes_downloaded"] + + if lag > 1 * 1024 * 1024: + raise Exception(f"Secondary lag not big enough: {lag}") + + wait_until(10, 1, lag_is_acceptable) + + env.storage_controller.node_configure(primary, {"scheduling": "Active"}) + + log.info(f"Starting drain of primary {primary} with non-laggy secondary {secondary}") + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), primary, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + primary, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.PAUSE_FOR_RESTART, + max_attempts=6, + backoff=5, + ) + + locations = env.storage_controller.locate(tid) + assert len(locations) == 1 + assert locations[0]["node_id"] == secondary + + def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_configs() @@ -1671,6 +1783,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): ps_id_to_drain = env.pageservers[0].id + env.storage_controller.warm_up_all_secondaries() env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps_id_to_drain, diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 388f6a9e92..2844d1b1d2 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -204,6 +204,11 @@ def test_scrubber_physical_gc_ancestors( }, ) + # Create an extra timeline, to ensure the scrubber isn't confused by multiple timelines + env.storage_controller.pageserver_api().timeline_create( + env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate() + ) + # Make sure the original shard has some layers workload = Workload(env, tenant_id, timeline_id) workload.init() @@ -214,6 +219,11 @@ def test_scrubber_physical_gc_ancestors( shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately + # Create a timeline after split, to ensure scrubber can handle timelines that exist in child shards but not ancestors + env.storage_controller.pageserver_api().timeline_create( + env.pg_version, tenant_id=tenant_id, new_timeline_id=TimelineId.generate() + ) + # Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which # compacts, and we only want to do tha explicitly later in the test. workload.write_rows(100, upload=False) @@ -305,10 +315,19 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder # Make sure the original shard has some layers workload = Workload(env, tenant_id, timeline_id) workload.init() - workload.write_rows(100) + workload.write_rows(100, upload=False) + workload.stop() new_shard_count = 4 shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) + for shard in shards: + ps = env.get_tenant_pageserver(shard) + log.info(f"Waiting for shard {shard} on pageserver {ps.id}") + ps.http_client().timeline_checkpoint( + shard, timeline_id, compact=False, wait_until_uploaded=True + ) + + ps.http_client().deletion_queue_flush(execute=True) # Create a second timeline so that when we delete the first one, child shards still have some content in S3. # @@ -319,15 +338,6 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder PgVersion.NOT_SET, tenant_id, other_timeline_id ) - # Write after split so that child shards have some indices in S3 - workload.write_rows(100, upload=False) - for shard in shards: - ps = env.get_tenant_pageserver(shard) - log.info(f"Waiting for shard {shard} on pageserver {ps.id}") - ps.http_client().timeline_checkpoint( - shard, timeline_id, compact=False, wait_until_uploaded=True - ) - # The timeline still exists in child shards and they reference its layers, so scrubbing # now shouldn't delete anything. gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=0, mode="full") diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index 91caad7220..4581008022 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -37,7 +37,9 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin" - query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + # synchronous_commit=on to test a hypothesis for why this test has been flaky. + # XXX: Add link to the issue + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub with (synchronous_commit=on)" scur.execute(query) time.sleep(2) # let initial table sync complete diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index c01b3a2e89..dadf5ca672 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -128,6 +128,8 @@ def test_tenant_delete_smoke( assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0 + env.pageserver.stop() + def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder): """Reproduction of 2023-11-23 stuck tenants investigation""" @@ -200,11 +202,10 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE if deletion is not None: deletion.join() + env.pageserver.stop() -def test_tenant_delete_races_timeline_creation( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, -): + +def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder): """ Validate that timeline creation executed in parallel with deletion works correctly. @@ -318,6 +319,8 @@ def test_tenant_delete_races_timeline_creation( # We deleted our only tenant, and the scrubber fails if it detects nothing neon_env_builder.disable_scrub_on_exit() + env.pageserver.stop() + def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): """ diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index b3767a2766..902457c2ac 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -5,7 +5,7 @@ import time from concurrent.futures import ThreadPoolExecutor from queue import Empty, Queue from threading import Barrier -from typing import List, Tuple +from typing import List, Set, Tuple import pytest from fixtures.common_types import Lsn, TimelineId @@ -411,7 +411,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None - assert ep.safe_psql("SELECT clear_buffer_cache();") + ep.clear_shared_buffers() assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0 ep.stop() @@ -807,22 +807,24 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( What remains not tested by this: - shutdown winning over complete - - Shutdown winning over complete needs gc blocking and reparenting any left-overs on retry. """ if sharded and mode == "delete_tenant": # the shared/exclusive lock for tenant is blocking this: # timeline detach ancestor takes shared, delete tenant takes exclusive - pytest.skip( - "tenant deletion while timeline ancestor detach is underway is not supported yet" - ) + pytest.skip("tenant deletion while timeline ancestor detach is underway cannot happen") shard_count = 2 if sharded else 1 neon_env_builder.num_pageservers = shard_count - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count if sharded else None) + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count if sharded else None, + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + }, + ) for ps in env.pageservers: ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) @@ -831,7 +833,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( detached_timeline = env.neon_cli.create_branch("detached soon", "main") - failpoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" env.storage_controller.reconcile_until_idle() shards = env.storage_controller.locate(env.initial_tenant) @@ -843,13 +845,20 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( victim = pageservers[int(shards[-1]["node_id"])] victim_http = victim.http_client() - victim_http.configure_failpoints((failpoint, "pause")) + victim_http.configure_failpoints((pausepoint, "pause")) def detach_ancestor(): target.detach_ancestor(env.initial_tenant, detached_timeline) - def at_failpoint() -> Tuple[str, LogCursor]: - return victim.assert_log_contains(f"at failpoint {failpoint}") + def at_failpoint() -> LogCursor: + msg, offset = victim.assert_log_contains(f"at failpoint {pausepoint}") + log.info(f"found {msg}") + msg, offset = victim.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + log.info(f"found {msg}") + return offset def start_delete(): if mode == "delete_timeline": @@ -882,23 +891,44 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( with ThreadPoolExecutor(max_workers=2) as pool: try: fut = pool.submit(detach_ancestor) - _, offset = wait_until(10, 1.0, at_failpoint) + offset = wait_until(10, 1.0, at_failpoint) delete = pool.submit(start_delete) - wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) - victim_http.configure_failpoints((failpoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) delete.result() assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + # TODO: match the error with pytest.raises(PageserverApiException) as exc: fut.result() + log.info(f"TODO: match this error: {exc.value}") assert exc.value.status_code == 503 finally: - victim_http.configure_failpoints((failpoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) + + if mode != "delete_timeline": + return + + # make sure the gc is unblocked + time.sleep(2) + victim.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) + + if not sharded: + # we have the other node only while sharded + return + + other = pageservers[int(shards[0]["node_id"])] + log.info(f"other is {other.id}") + _, offset = other.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/\\S+/timeline/\\S+/detach_ancestor .*\\}: Request handled, status: 200 OK", + ) + # this might be a lot earlier than the victims line, but that is okay. + _, offset = other.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) @pytest.mark.parametrize("mode", ["delete_reparentable_timeline"]) @@ -915,7 +945,9 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv assert ( mode == "delete_reparentable_timeline" - ), "only one now, but we could have the create just as well, need gc blocking" + ), "only one now, but creating reparentable timelines cannot be supported even with gc blocking" + # perhaps it could be supported by always doing this for the shard0 first, and after that for others. + # when we run shard0 to completion, we can use it's timelines to restrict which can be reparented. shard_count = 2 neon_env_builder.num_pageservers = shard_count @@ -1048,10 +1080,267 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv victim_http.configure_failpoints((pausepoint, "off")) +def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: NeonEnvBuilder): + """ + Using a failpoint, force the completion step of timeline ancestor detach to + fail after reparenting a single timeline. + + Retrying should try reparenting until all reparentings are done, all the + time blocking gc even across restarts (first round). + + A completion failpoint is used to inhibit completion on second to last + round. + + On last round, the completion uses a path where no reparentings can happen + because original ancestor is deleted, and there is a completion to unblock + gc without restart. + """ + + # to get the remote storage metrics + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + env.pageserver.allowed_errors.extend( + [ + ".* reparenting failed: failpoint: timeline-detach-ancestor::allow_one_reparented", + ".* Error processing HTTP request: InternalServerError\\(failed to reparent all candidate timelines, please retry", + ".* Error processing HTTP request: InternalServerError\\(failpoint: timeline-detach-ancestor::complete_before_uploading", + ] + ) + + http = env.pageserver.http_client() + + def remote_storage_copy_requests(): + return http.get_metric_value( + "remote_storage_s3_request_seconds_count", + {"request_type": "copy_object", "result": "ok"}, + ) + + def reparenting_progress(timelines: List[TimelineId]) -> Tuple[int, Set[TimelineId]]: + reparented = 0 + not_reparented = set() + for timeline in timelines: + detail = http.timeline_detail(env.initial_tenant, timeline) + ancestor = TimelineId(detail["ancestor_timeline_id"]) + if ancestor == detached: + reparented += 1 + else: + not_reparented.add(timeline) + return (reparented, not_reparented) + + # main ------A-----B-----C-----D-----E> lsn + timelines = [] + with env.endpoints.create_start("main") as ep: + for counter in range(5): + ep.safe_psql( + f"create table foo_{counter} as select i::bigint from generate_series(1, 10000) t(i)" + ) + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + http.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + branch = env.neon_cli.create_branch( + f"branch_{counter}", "main", ancestor_start_lsn=branch_lsn + ) + timelines.append(branch) + + flush_ep_to_pageserver(env, ep, env.initial_tenant, env.initial_timeline) + + # detach "E" which has most reparentable timelines under it + detached = timelines.pop() + assert len(timelines) == 4 + + http = http.without_status_retrying() + + http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return")) + + not_reparented: Set[TimelineId] = set() + # tracked offset in the pageserver log which is at least at the most recent activation + offset = None + + def try_detach(): + with pytest.raises( + PageserverApiException, + match=".*failed to reparent all candidate timelines, please retry", + ) as exc: + http.detach_ancestor(env.initial_tenant, detached) + assert exc.value.status_code == 500 + + # first round -- do more checking to make sure the gc gets paused + try_detach() + + assert ( + http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None + ), "first round should had detached 'detached'" + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == 1 + + time.sleep(2) + _, offset = env.pageserver.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request", + offset, + ) + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + _, offset = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + metric = remote_storage_copy_requests() + assert metric != 0 + # make sure the gc blocking is persistent over a restart + env.pageserver.restart() + env.pageserver.quiesce_tenants() + time.sleep(2) + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + assert env.pageserver.log_contains(".* gc_loop.*: [0-9] timelines need GC", offset) is None + _, offset = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + # restore failpoint for the next reparented + http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return")) + + reparented_before = reparented + + # do two more rounds + for _ in range(2): + try_detach() + + assert ( + http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None + ), "first round should had detached 'detached'" + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == reparented_before + 1 + reparented_before = reparented + + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + metric = remote_storage_copy_requests() + assert metric == 0, "copies happen in the first round" + + assert offset is not None + assert len(not_reparented) == 1 + + http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "return")) + + # almost final round, the failpoint is hit no longer as there is only one reparented and one always gets to succeed. + # the tenant is restarted once more, but we fail during completing. + with pytest.raises( + PageserverApiException, match=".* timeline-detach-ancestor::complete_before_uploading" + ) as exc: + http.detach_ancestor(env.initial_tenant, detached) + assert exc.value.status_code == 500 + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + + # delete the previous ancestor to take a different path to completion. all + # other tests take the "detach? reparent complete", but this only hits + # "complete". + http.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20) + + http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off")) + + reparented_resp = http.detach_ancestor(env.initial_tenant, detached) + assert reparented_resp == set(timelines) + # no need to quiesce_tenants anymore, because completion does that + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == len(timelines) + + time.sleep(2) + assert ( + env.pageserver.log_contains(".*: attach finished, activating", offset) is None + ), "there should be no restart with the final detach_ancestor as it only completed" + + # gc is unblocked + env.pageserver.assert_log_contains(".* gc_loop.*: 5 timelines need GC", offset) + + metric = remote_storage_copy_requests() + assert metric == 0 + + +def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( + neon_env_builder: NeonEnvBuilder, +): + """ + Make sure that a timeline deleted after restart will unpause gc blocking. + """ + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + http = env.pageserver.http_client() + + detached = env.neon_cli.create_branch("detached") + + failpoint = "timeline-detach-ancestor::after_activating_before_finding-pausable" + + http.configure_failpoints((failpoint, "pause")) + + def detach_and_get_stuck(): + return http.detach_ancestor(env.initial_tenant, detached) + + def request_processing_noted_in_log(): + _, offset = env.pageserver.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request", + ) + return offset + + def delete_detached(): + return http.timeline_delete(env.initial_tenant, detached) + + try: + with ThreadPoolExecutor(max_workers=1) as pool: + detach = pool.submit(detach_and_get_stuck) + + offset = wait_until(10, 1.0, request_processing_noted_in_log) + + # make this named fn tor more clear failure test output logging + def pausepoint_hit_with_gc_paused() -> LogCursor: + env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + _, at = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + return at + + offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused) + + delete_detached() + + wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0) + + http.configure_failpoints((failpoint, "off")) + + with pytest.raises(PageserverApiException) as exc: + detach.result() + + # FIXME: this should be 404 but because there is another Anyhow conversion it is 500 + assert exc.value.status_code == 500 + env.pageserver.allowed_errors.append( + ".*Error processing HTTP request: InternalServerError\\(detached timeline was not found after restart" + ) + finally: + http.configure_failpoints((failpoint, "off")) + + # make sure gc has been unblocked + time.sleep(2) + + env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) + + # TODO: -# - after starting the operation, pageserver is shutdown, restarted -# - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited -# - deletion of reparented while reparenting should fail once, then succeed (?) # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. # diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 1f220eec9e..642b9e449b 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1137,3 +1137,10 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) else: raise RuntimeError(activation_method) + + client.configure_failpoints( + [ + ("timeline-calculate-logical-size-pause", "off"), + ("walreceiver-after-ingest", "off"), + ] + ) diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 225b952e73..7272979c4a 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -62,7 +62,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # Clear the buffer cache, to force the VM page to be re-fetched from # the page server - cur.execute("SELECT clear_buffer_cache()") + endpoint.clear_shared_buffers(cursor=cur) # Check that an index-only scan doesn't see the deleted row. If the # clearing of the VM bit was not replayed correctly, this would incorrectly diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index f02f19c588..bf7829fc84 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2159,7 +2159,7 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder): # generate some data to commit WAL on safekeepers endpoint.safe_psql("insert into t select generate_series(1,100), 'action'") # clear the buffers - endpoint.safe_psql("select clear_buffer_cache()") + endpoint.clear_shared_buffers() # read data to fetch pages from pageserver endpoint.safe_psql("select sum(i) from t") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 7bbe834c8c..3fd7a45f8a 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 7bbe834c8c2dc37802eca8484311599bc47341f6 +Subproject commit 3fd7a45f8aae85c080df6329e3c85887b7f3a737 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 9eba7dd382..46b4b235f3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 9eba7dd382606ffca43aca865f337ec21bcdac73 +Subproject commit 46b4b235f38413ab5974bb22c022f9b829257674 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 5377f5ed72..47a9122a5a 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3 +Subproject commit 47a9122a5a150a3217fafd3f3d4fe8e020ea718a diff --git a/vendor/revisions.json b/vendor/revisions.json index 570dfc1550..6e3e489b5d 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,14 @@ { - "v16": ["16.3", "5377f5ed7290af45b7cb6b0d98d43cbf4a4e77f3"], - "v15": ["15.7", "9eba7dd382606ffca43aca865f337ec21bcdac73"], - "v14": ["14.12", "7bbe834c8c2dc37802eca8484311599bc47341f6"] + "v16": [ + "16.3", + "47a9122a5a150a3217fafd3f3d4fe8e020ea718a" + ], + "v15": [ + "15.7", + "46b4b235f38413ab5974bb22c022f9b829257674" + ], + "v14": [ + "14.12", + "3fd7a45f8aae85c080df6329e3c85887b7f3a737" + ] } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 7d005c7139..41d6e11725 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -416,7 +416,7 @@ build: | # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor # requires cgroup v2, so we'll build cgroup-tools ourselves. FROM debian:bullseye-slim as libcgroup-builder - ENV LIBCGROUP_VERSION v2.0.3 + ENV LIBCGROUP_VERSION=v2.0.3 RUN set -exu \ && apt update \ @@ -460,7 +460,7 @@ build: | pkg-config # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) - ENV PGBOUNCER_TAG pgbouncer_1_22_1 + ENV PGBOUNCER_TAG=pgbouncer_1_22_1 RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \