diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 1b602883c5..29c4d18f4a 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -20,3 +20,4 @@ config-variables: - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID - DEV_AWS_OIDC_ROLE_ARN + - BENCHMARK_INGEST_TARGET_PROJECTID diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 2bdb727719..16b6e71498 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -221,6 +221,8 @@ runs: REPORT_URL: ${{ steps.generate-report.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 script: | const { REPORT_URL, COMMIT_SHA } = process.env diff --git a/.github/actions/set-docker-config-dir/action.yml b/.github/actions/set-docker-config-dir/action.yml deleted file mode 100644 index 3ee8bec8c6..0000000000 --- a/.github/actions/set-docker-config-dir/action.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: "Set custom docker config directory" -description: "Create a directory for docker config and set DOCKER_CONFIG" - -# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings -runs: - using: "composite" - steps: - - name: Show warning on GitHub-hosted runners - if: runner.environment == 'github-hosted' - shell: bash -euo pipefail {0} - run: | - # Using the following environment variables to find a path to the workflow file - # ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch - # ${GITHUB_REPOSITORY} - octocat/hello-world - # ${GITHUB_REF} - refs/heads/my_branch - # From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables - - filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"} - 
filename=${filename_with_ref%"@$GITHUB_REF"} - - # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message - title='Unnecessary usage of `.github/actions/set-docker-config-dir`' - message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners' - echo "::warning file=${filename},title=${title}::${message}" - - - uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7 - env: - DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }} - with: - main: | - mkdir -p "${DOCKER_CONFIG}" - echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV - post: | - if [ -d "${DOCKER_CONFIG}" ]; then - rm -r "${DOCKER_CONFIG}" - fi diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 22c025dd89..89328f20ee 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,14 +1,3 @@ ## Problem ## Summary of changes - -## Checklist before requesting a review - -- [ ] I have performed a self-review of my code. -- [ ] If it is a core feature, I have added thorough tests. -- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? -- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
- -## Checklist before merging - -- [ ] Do not forget to reformat commit message to not include the above checklist diff --git a/.github/workflows/_check-codestyle-python.yml b/.github/workflows/_check-codestyle-python.yml new file mode 100644 index 0000000000..9ae28a1379 --- /dev/null +++ b/.github/workflows/_check-codestyle-python.yml @@ -0,0 +1,37 @@ +name: Check Codestyle Python + +on: + workflow_call: + inputs: + build-tools-image: + description: 'build-tools image' + required: true + type: string + +defaults: + run: + shell: bash -euxo pipefail {0} + +jobs: + check-codestyle-python: + runs-on: [ self-hosted, small ] + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - uses: actions/checkout@v4 + + - uses: actions/cache@v4 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} + + - run: ./scripts/pysync + + - run: poetry run ruff check . + - run: poetry run ruff format --check . + - run: poetry run mypy . 
diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 10750089b2..82b065c524 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -64,7 +64,7 @@ jobs: - uses: actions/checkout@v4 - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/setup-buildx-action@v3 with: cache-binary: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bba51ddc92..cc6f91d28e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -90,35 +90,10 @@ jobs: check-codestyle-python: needs: [ check-permissions, build-build-tools-image ] - runs-on: [ self-hosted, small ] - container: - image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Cache poetry deps - uses: actions/cache@v4 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - run: ./scripts/pysync - - - name: Run `ruff check` to ensure code format - run: poetry run ruff check . - - - name: Run `ruff format` to ensure code format - run: poetry run ruff format --check . - - - name: Run mypy to check types - run: poetry run mypy . + uses: ./.github/workflows/_check-codestyle-python.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + secrets: inherit check-codestyle-jsonnet: needs: [ check-permissions, build-build-tools-image ] @@ -141,6 +116,7 @@ jobs: # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. 
check-submodules: + needs: [ check-permissions ] runs-on: ubuntu-22.04 steps: - name: Checkout @@ -521,6 +497,8 @@ jobs: REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 script: | const { REPORT_URL_NEW, COMMIT_SHA } = process.env @@ -552,7 +530,7 @@ jobs: with: submodules: true - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -643,7 +621,7 @@ jobs: with: submodules: true - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -824,7 +802,7 @@ jobs: curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} @@ -860,7 +838,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml new file mode 100644 index 0000000000..d770bb2bb5 --- /dev/null +++ b/.github/workflows/ingest_benchmark.yml @@ -0,0 +1,372 @@ +name: Benchmarking + +on: + # uncomment to run on push for debugging your PR + # push: + # branches: [ your branch ] + schedule: + # 
* is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 9 * * *' # run once a day, timezone is utc + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow globally because we need dedicated resources which only exist once + group: ingest-bench-workflow + cancel-in-progress: true + +jobs: + ingest: + strategy: + matrix: + target_project: [new_empty_project, large_existing_project] + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config + PSQL: /tmp/neon/pg_install/v16/bin/psql + PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib + PGCOPYDB: /pgcopydb/bin/pgcopydb + PGCOPYDB_LIB_PATH: /pgcopydb/lib + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + timeout-minutes: 1440 + + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials # necessary to download artefacts + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + if: ${{ matrix.target_project == 'new_empty_project' }} + id: create-neon-project-ingest-target + uses: 
./.github/actions/neon-project-create + with: + region_id: aws-us-east-2 + postgres_version: 16 + compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Initialize Neon project and retrieve current backpressure seconds + if: ${{ matrix.target_project == 'new_empty_project' }} + env: + NEW_PROJECT_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }} + NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} + run: | + echo "Initializing Neon project with project_id: ${NEW_PROJECT_ID}" + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" + BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;") + echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV + echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV + + - name: Create Neon Branch for large tenant + if: ${{ matrix.target_project == 'large_existing_project' }} + id: create-neon-branch-ingest-target + uses: ./.github/actions/neon-branch-create + with: + project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Initialize Neon project and retrieve current backpressure seconds + if: ${{ matrix.target_project == 'large_existing_project' }} + env: + NEW_PROJECT_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }} + NEW_BRANCH_ID: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }} + run: | + echo "Initializing Neon branch with branch_id: ${NEW_BRANCH_ID}" + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + # Extract the part before the database name + base_connstr="${NEW_PROJECT_CONNSTR%/*}" + # Extract the query parameters (if any) after the database name + 
query_params="${NEW_PROJECT_CONNSTR#*\?}" + # Reconstruct the new connection string + if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then + new_connstr="${base_connstr}/neondb?${query_params}" + else + new_connstr="${base_connstr}/neondb" + fi + ${PSQL} "${new_connstr}" -c "drop database ludicrous;" + ${PSQL} "${new_connstr}" -c "CREATE DATABASE ludicrous;" + if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then + NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous?${query_params}" + else + NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous" + fi + ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" + BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;") + echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV + echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV + + + - name: Create pgcopydb filter file + run: | + cat << EOF > /tmp/pgcopydb_filter.txt + [include-only-table] + public.events + public.emails + public.email_transmissions + public.payments + public.editions + public.edition_modules + public.sp_content + public.email_broadcasts + public.user_collections + public.devices + public.user_accounts + public.lessons + public.lesson_users + public.payment_methods + public.orders + public.course_emails + public.modules + public.users + public.module_users + public.courses + public.payment_gateway_keys + public.accounts + public.roles + public.payment_gateways + public.management + public.event_names + EOF + + - name: Invoke pgcopydb + env: + BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }} + run: | + export LD_LIBRARY_PATH=${PGCOPYDB_LIB_PATH}:${PG_16_LIB_PATH} + export PGCOPYDB_SOURCE_PGURI="${BENCHMARK_INGEST_SOURCE_CONNSTR}" + export PGCOPYDB_TARGET_PGURI="${NEW_PROJECT_CONNSTR}" + export PGOPTIONS="-c maintenance_work_mem=8388608 -c 
max_parallel_maintenance_workers=7" + ${PG_CONFIG} --bindir + ${PGCOPYDB} --version + ${PGCOPYDB} clone --skip-vacuum --no-owner --no-acl --skip-db-properties --table-jobs 4 \ + --index-jobs 4 --restore-jobs 4 --split-tables-larger-than 10GB --skip-extensions \ + --use-copy-binary --filters /tmp/pgcopydb_filter.txt 2>&1 | tee /tmp/pgcopydb_${{ matrix.target_project }}.log + + # create dummy pgcopydb log to test parsing + # - name: create dummy log for parser test + # run: | + # cat << EOF > /tmp/pgcopydb_${{ matrix.target_project }}.log + # 2024-11-04 18:00:53.433 500861 INFO main.c:136 Running pgcopydb version 0.17.10.g8361a93 from "/usr/lib/postgresql/17/bin/pgcopydb" + # 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1225 [SOURCE] Copying database from "postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60" + # 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1226 [TARGET] Copying database into "postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60" + # 2024-11-04 18:00:53.442 500861 INFO copydb.c:105 Using work dir "/tmp/pgcopydb" + # 2024-11-04 18:00:53.541 500861 INFO snapshot.c:107 Exported snapshot "00000008-00000033-1" from the source database + # 2024-11-04 18:00:53.556 500865 INFO cli_clone_follow.c:543 STEP 1: fetch source database tables, indexes, and sequences + # 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:716 Splitting source candidate tables larger than 10 GB + # 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:829 Table public.events is 96 GB large which is larger than --split-tables-larger-than 10 GB, and does not have a unique column of type integer: splitting by CTID + # 2024-11-04 18:01:05.538 500865 INFO copydb_schema.c:905 Table public.events is 96 GB large, 10 COPY processes will be used, 
partitioning on ctid. + # 2024-11-04 18:01:05.564 500865 INFO copydb_schema.c:905 Table public.email_transmissions is 27 GB large, 4 COPY processes will be used, partitioning on id. + # 2024-11-04 18:01:05.584 500865 INFO copydb_schema.c:905 Table public.lessons is 25 GB large, 4 COPY processes will be used, partitioning on id. + # 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:905 Table public.lesson_users is 16 GB large, 3 COPY processes will be used, partitioning on id. + # 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:761 Fetched information for 26 tables (including 4 tables split in 21 partitions total), with an estimated total of 907 million tuples and 175 GB on-disk + # 2024-11-04 18:01:05.687 500865 INFO copydb_schema.c:968 Fetched information for 57 indexes (supporting 25 constraints) + # 2024-11-04 18:01:05.753 500865 INFO sequences.c:78 Fetching information for 24 sequences + # 2024-11-04 18:01:05.903 500865 INFO copydb_schema.c:1122 Fetched information for 4 extensions + # 2024-11-04 18:01:06.178 500865 INFO copydb_schema.c:1538 Found 0 indexes (supporting 0 constraints) in the target database + # 2024-11-04 18:01:06.184 500865 INFO cli_clone_follow.c:584 STEP 2: dump the source database schema (pre/post data) + # 2024-11-04 18:01:06.186 500865 INFO pgcmd.c:468 /usr/lib/postgresql/16/bin/pg_dump -Fc --snapshot 00000008-00000033-1 --section=pre-data --section=post-data --file /tmp/pgcopydb/schema/schema.dump 'postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' + # 2024-11-04 18:01:06.952 500865 INFO cli_clone_follow.c:592 STEP 3: restore the pre-data section to the target database + # 2024-11-04 18:01:07.004 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 
'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section pre-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/pre-filtered.list /tmp/pgcopydb/schema/schema.dump + # 2024-11-04 18:01:07.438 500874 INFO table-data.c:656 STEP 4: starting 4 table-data COPY processes + # 2024-11-04 18:01:07.451 500877 INFO vacuum.c:139 STEP 8: skipping VACUUM jobs per --skip-vacuum + # 2024-11-04 18:01:07.457 500875 INFO indexes.c:182 STEP 6: starting 4 CREATE INDEX processes + # 2024-11-04 18:01:07.457 500875 INFO indexes.c:183 STEP 7: constraints are built by the CREATE INDEX processes + # 2024-11-04 18:01:07.507 500865 INFO blobs.c:74 Skipping large objects: none found. + # 2024-11-04 18:01:07.509 500865 INFO sequences.c:194 STEP 9: reset sequences values + # 2024-11-04 18:01:07.510 500886 INFO sequences.c:290 Set sequences values on the target database + # 2024-11-04 20:49:00.587 500865 INFO cli_clone_follow.c:608 STEP 10: restore the post-data section to the target database + # 2024-11-04 20:49:00.600 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section post-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/post-filtered.list /tmp/pgcopydb/schema/schema.dump + # 2024-11-05 10:50:58.508 500865 INFO cli_clone_follow.c:639 All step are now done, 16h49m elapsed + # 2024-11-05 10:50:58.508 500865 INFO summary.c:3155 Printing summary for 26 tables and 57 indexes + + # OID | Schema | Name | Parts | copy duration | transmitted bytes | indexes | create index duration + # ------+--------+----------------------+-------+---------------+-------------------+---------+---------------------- + # 24654 | public | events | 10 | 1d11h | 878 GB 
| 1 | 1h41m + # 24623 | public | email_transmissions | 4 | 4h46m | 99 GB | 3 | 2h04m + # 24665 | public | lessons | 4 | 4h42m | 161 GB | 4 | 1m11s + # 24661 | public | lesson_users | 3 | 2h46m | 49 GB | 3 | 39m35s + # 24631 | public | emails | 1 | 34m07s | 10 GB | 2 | 17s + # 24739 | public | payments | 1 | 5m47s | 1848 MB | 4 | 4m40s + # 24681 | public | module_users | 1 | 4m57s | 1610 MB | 3 | 1m50s + # 24694 | public | orders | 1 | 2m50s | 835 MB | 3 | 1m05s + # 24597 | public | devices | 1 | 1m45s | 498 MB | 2 | 40s + # 24723 | public | payment_methods | 1 | 1m24s | 548 MB | 2 | 31s + # 24765 | public | user_collections | 1 | 2m17s | 1005 MB | 2 | 968ms + # 24774 | public | users | 1 | 52s | 291 MB | 4 | 27s + # 24760 | public | user_accounts | 1 | 16s | 172 MB | 3 | 16s + # 24606 | public | edition_modules | 1 | 8s983 | 46 MB | 3 | 4s749 + # 24583 | public | course_emails | 1 | 8s526 | 26 MB | 2 | 996ms + # 24685 | public | modules | 1 | 1s592 | 21 MB | 3 | 1s696 + # 24610 | public | editions | 1 | 2s199 | 7483 kB | 2 | 1s032 + # 24755 | public | sp_content | 1 | 1s555 | 4177 kB | 0 | 0ms + # 24619 | public | email_broadcasts | 1 | 744ms | 2645 kB | 2 | 677ms + # 24590 | public | courses | 1 | 387ms | 1540 kB | 2 | 367ms + # 24704 | public | payment_gateway_keys | 1 | 1s972 | 164 kB | 2 | 27ms + # 24576 | public | accounts | 1 | 58ms | 24 kB | 1 | 14ms + # 24647 | public | event_names | 1 | 32ms | 397 B | 1 | 8ms + # 24716 | public | payment_gateways | 1 | 1s675 | 117 B | 1 | 11ms + # 24748 | public | roles | 1 | 71ms | 173 B | 1 | 8ms + # 24676 | public | management | 1 | 33ms | 40 B | 1 | 19ms + + + # Step Connection Duration Transfer Concurrency + # -------------------------------------------------- ---------- ---------- ---------- ------------ + # Catalog Queries (table ordering, filtering, etc) source 12s 1 + # Dump Schema source 765ms 1 + # Prepare Schema target 466ms 1 + # COPY, INDEX, CONSTRAINTS, VACUUM (wall clock) both 2h47m 12 + # COPY (cumulative) 
both 7h46m 1225 GB 4 + # CREATE INDEX (cumulative) target 4h36m 4 + # CONSTRAINTS (cumulative) target 8s493 4 + # VACUUM (cumulative) target 0ms 4 + # Reset Sequences both 60ms 1 + # Large Objects (cumulative) (null) 0ms 0 + # Finalize Schema both 14h01m 4 + # -------------------------------------------------- ---------- ---------- ---------- ------------ + # Total Wall Clock Duration both 16h49m 20 + + + # EOF + + + - name: show tables sizes and retrieve current backpressure seconds + run: | + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "\dt+" + BACKPRESSURE_TIME_AFTER_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;") + echo "BACKPRESSURE_TIME_AFTER_INGEST=${BACKPRESSURE_TIME_AFTER_INGEST}" >> $GITHUB_ENV + + - name: Parse pgcopydb log and report performance metrics + env: + PERF_TEST_RESULT_CONNSTR: ${{ secrets.PERF_TEST_RESULT_CONNSTR }} + run: | + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + + # Define the log file path + LOG_FILE="/tmp/pgcopydb_${{ matrix.target_project }}.log" + + # Get the current git commit hash + git config --global --add safe.directory /__w/neon/neon + COMMIT_HASH=$(git rev-parse --short HEAD) + + # Define the platform and test suite + PLATFORM="pg16-${{ matrix.target_project }}-us-east-2-staging" + SUIT="pgcopydb_ingest_bench" + + # Function to convert time (e.g., "2h47m", "4h36m", "118ms", "8s493") to seconds + convert_to_seconds() { + local duration=$1 + local total_seconds=0 + + # Check for hours (h) + if [[ "$duration" =~ ([0-9]+)h ]]; then + total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 3600)) + fi + + # Check for seconds (s) + if [[ "$duration" =~ ([0-9]+)s ]]; then + total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0})) + fi + + # Check for milliseconds (ms) (if applicable) + if [[ "$duration" =~ ([0-9]+)ms ]]; then + total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} / 1000)) + duration=${duration/${BASH_REMATCH[0]}/} # need to remove 
it to avoid double counting with m + fi + + # Check for minutes (m) - must be checked after ms because m is contained in ms + if [[ "$duration" =~ ([0-9]+)m ]]; then + total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 60)) + fi + + echo $total_seconds + } + + # Calculate the backpressure difference in seconds + BACKPRESSURE_TIME_DIFF=$(awk "BEGIN {print $BACKPRESSURE_TIME_AFTER_INGEST - $BACKPRESSURE_TIME_BEFORE_INGEST}") + + # Insert the backpressure time difference into the performance database + if [ -n "$BACKPRESSURE_TIME_DIFF" ]; then + PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \" + INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp) + VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', 'backpressure_time', ${BACKPRESSURE_TIME_DIFF}, 'seconds', 'lower_is_better', now()); + \"" + echo "Inserting backpressure time difference: ${BACKPRESSURE_TIME_DIFF} seconds" + eval $PSQL_CMD + fi + + # Extract and process log lines + while IFS= read -r line; do + METRIC_NAME="" + # Match each desired line and extract the relevant information + if [[ "$line" =~ COPY,\ INDEX,\ CONSTRAINTS,\ VACUUM.* ]]; then + METRIC_NAME="COPY, INDEX, CONSTRAINTS, VACUUM (wall clock)" + elif [[ "$line" =~ COPY\ \(cumulative\).* ]]; then + METRIC_NAME="COPY (cumulative)" + elif [[ "$line" =~ CREATE\ INDEX\ \(cumulative\).* ]]; then + METRIC_NAME="CREATE INDEX (cumulative)" + elif [[ "$line" =~ CONSTRAINTS\ \(cumulative\).* ]]; then + METRIC_NAME="CONSTRAINTS (cumulative)" + elif [[ "$line" =~ Finalize\ Schema.* ]]; then + METRIC_NAME="Finalize Schema" + elif [[ "$line" =~ Total\ Wall\ Clock\ Duration.* ]]; then + METRIC_NAME="Total Wall Clock Duration" + fi + + # If a metric was matched, insert it into the performance database + if [ -n "$METRIC_NAME" ]; then + DURATION=$(echo "$line" | grep -oP '\d+h\d+m|\d+s|\d+ms|\d{1,2}h\d{1,2}m|\d+\.\d+s' | head -n 1) + 
METRIC_VALUE=$(convert_to_seconds "$DURATION") + PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \" + INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp) + VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', '${METRIC_NAME}', ${METRIC_VALUE}, 'seconds', 'lower_is_better', now()); + \"" + echo "Inserting ${METRIC_NAME} with value ${METRIC_VALUE} seconds" + eval $PSQL_CMD + fi + done < "$LOG_FILE" + + - name: Delete Neon Project + if: ${{ always() && matrix.target_project == 'new_empty_project' }} + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Delete Neon Branch for large tenant + if: ${{ always() && matrix.target_project == 'large_existing_project' }} + uses: ./.github/actions/neon-branch-delete + with: + project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }} + branch_id: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 287c9ea281..cd5a665402 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -201,6 +201,8 @@ jobs: REPORT_URL: ${{ steps.upload-stats.outputs.report-url }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 script: | const { REPORT_URL, SHA } = process.env diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml new file mode 100644 index 0000000000..137faa7abc --- /dev/null +++ b/.github/workflows/pre-merge-checks.yml @@ -0,0 +1,94 @@ +name: Pre-merge checks + +on: + merge_group: + branches: + - main + +defaults: + run: + shell: bash -euxo 
pipefail {0} + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. +permissions: {} + +jobs: + get-changed-files: + runs-on: ubuntu-22.04 + outputs: + python-changed: ${{ steps.python-src.outputs.any_changed }} + steps: + - uses: actions/checkout@v4 + - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + id: python-src + with: + files: | + .github/workflows/pre-merge-checks.yml + **/**.py + poetry.lock + pyproject.toml + + - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES + env: + PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }} + run: | + echo "${PYTHON_CHANGED_FILES}" + + check-build-tools-image: + if: needs.get-changed-files.outputs.python-changed == 'true' + needs: [ get-changed-files ] + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + + check-codestyle-python: + if: needs.get-changed-files.outputs.python-changed == 'true' + needs: [ get-changed-files, build-build-tools-image ] + uses: ./.github/workflows/_check-codestyle-python.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + secrets: inherit + + # To get items from the merge queue merged into main we need to satisfy "Status checks that are required". 
+ # Currently we require 2 jobs (checks with exact name): + # - conclusion + # - neon-cloud-e2e + conclusion: + if: always() + permissions: + statuses: write # for `github.repos.createCommitStatus(...)` + needs: + - get-changed-files + - check-codestyle-python + runs-on: ubuntu-22.04 + steps: + - name: Create fake `neon-cloud-e2e` check + uses: actions/github-script@v7 + with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 + script: | + const { repo, owner } = context.repo; + const targetUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`; + + await github.rest.repos.createCommitStatus({ + owner: owner, + repo: repo, + sha: context.sha, + context: `neon-cloud-e2e`, + state: `success`, + target_url: targetUrl, + description: `fake check for merge queue`, + }); + + - name: Fail the job if any of the dependencies do not succeed or skipped + run: exit 1 + if: | + (contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true') + || contains(needs.*.result, 'failure') + || contains(needs.*.result, 'cancelled') diff --git a/.github/workflows/report-workflow-stats-batch.yml b/.github/workflows/report-workflow-stats-batch.yml new file mode 100644 index 0000000000..98e394a3c2 --- /dev/null +++ b/.github/workflows/report-workflow-stats-batch.yml @@ -0,0 +1,29 @@ +name: Report Workflow Stats Batch + +on: + schedule: + - cron: '*/15 * * * *' + - cron: '25 0 * * *' + +jobs: + gh-workflow-stats-batch: + name: GitHub Workflow Stats Batch + runs-on: ubuntu-22.04 + permissions: + actions: read + steps: + - name: Export Workflow Run for the past 2 hours + uses: neondatabase/gh-workflow-stats-action@v0.2.1 + with: + db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} + db_table: "gh_workflow_stats_batch_neon" + gh_token: ${{ secrets.GITHUB_TOKEN }} + duration: '2h' + - name: Export Workflow Run for the past 24 hours + if: github.event.schedule == '25 0 
* * *' + uses: neondatabase/gh-workflow-stats-action@v0.2.1 + with: + db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} + db_table: "gh_workflow_stats_batch_neon" + gh_token: ${{ secrets.GITHUB_TOKEN }} + duration: '24h' diff --git a/.github/workflows/report-workflow-stats.yml b/.github/workflows/report-workflow-stats.yml index 6abeff7695..0d135a257c 100644 --- a/.github/workflows/report-workflow-stats.yml +++ b/.github/workflows/report-workflow-stats.yml @@ -23,6 +23,7 @@ on: - Test Postgres client libraries - Trigger E2E Tests - cleanup caches by a branch + - Pre-merge checks types: [completed] jobs: diff --git a/Cargo.lock b/Cargo.lock index 9c2a0b455e..64231ed11c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1229,12 +1229,15 @@ dependencies = [ "flate2", "futures", "hyper 0.14.30", + "metrics", "nix 0.27.1", "notify", "num_cpus", + "once_cell", "opentelemetry", "opentelemetry_sdk", "postgres", + "prometheus", "regex", "remote_storage", "reqwest 0.12.4", @@ -4743,6 +4746,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rustls 0.22.4", + "rustls-native-certs 0.7.0", "rustls-pemfile 2.1.1", "rustls-pki-types", "serde", @@ -5146,6 +5150,7 @@ dependencies = [ "chrono", "clap", "crc32c", + "criterion", "desim", "fail", "futures", @@ -5153,6 +5158,7 @@ dependencies = [ "http 1.1.0", "humantime", "hyper 0.14.30", + "itertools 0.10.5", "metrics", "once_cell", "parking_lot 0.12.1", diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 93f1e48afa..c1190b13f4 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -1,12 +1,66 @@ ARG DEBIAN_VERSION=bullseye -FROM debian:${DEBIAN_VERSION}-slim +FROM debian:bookworm-slim AS pgcopydb_builder +ARG DEBIAN_VERSION + +RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ + set -e && \ + apt update && \ + apt install -y --no-install-recommends \ + ca-certificates wget gpg && \ + wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o 
/usr/share/keyrings/postgresql-keyring.gpg && \ + echo "deb [signed-by=/usr/share/keyrings/postgresql-keyring.gpg] http://apt.postgresql.org/pub/repos/apt bookworm-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \ + apt-get update && \ + apt install -y --no-install-recommends \ + build-essential \ + autotools-dev \ + libedit-dev \ + libgc-dev \ + libpam0g-dev \ + libreadline-dev \ + libselinux1-dev \ + libxslt1-dev \ + libssl-dev \ + libkrb5-dev \ + zlib1g-dev \ + liblz4-dev \ + libpq5 \ + libpq-dev \ + libzstd-dev \ + postgresql-16 \ + postgresql-server-dev-16 \ + postgresql-common \ + python3-sphinx && \ + wget -O /tmp/pgcopydb.tar.gz https://github.com/dimitri/pgcopydb/archive/refs/tags/v0.17.tar.gz && \ + mkdir /tmp/pgcopydb && \ + tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \ + cd /tmp/pgcopydb && \ + make -s clean && \ + make -s -j12 install && \ + libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \ + mkdir -p /pgcopydb/lib && \ + cp "$libpq_path" /pgcopydb/lib/; \ + else \ + # copy command below will fail if we don't have dummy files, so we create them for other debian versions + mkdir -p /usr/lib/postgresql/16/bin && touch /usr/lib/postgresql/16/bin/pgcopydb && \ + mkdir -p mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \ + fi + +FROM debian:${DEBIAN_VERSION}-slim AS build_tools ARG DEBIAN_VERSION # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home SHELL ["/bin/bash", "-c"] +RUN mkdir -p /pgcopydb/bin && \ + mkdir -p /pgcopydb/lib && \ + chmod -R 755 /pgcopydb && \ + chown -R nonroot:nonroot /pgcopydb + +COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb +COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 + # System deps # # 'gdb' is included so that we get backtraces of core dumps produced in @@ -38,7 +92,7 @@ RUN set -e \ libseccomp-dev \ libsqlite3-dev \ libssl-dev \ - $([[ "${DEBIAN_VERSION}" = "bullseye" ]] && 
libstdc++-10-dev || libstdc++-11-dev) \ + $([[ "${DEBIAN_VERSION}" = "bullseye" ]] && echo libstdc++-10-dev || echo libstdc++-11-dev) \ libtool \ libxml2-dev \ libxmlsec1-dev \ @@ -235,7 +289,13 @@ RUN whoami \ && cargo --version --verbose \ && rustup --version --verbose \ && rustc --version --verbose \ - && clang --version + && clang --version + +RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ + LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \ +else \ + echo "pgcopydb is not available for ${DEBIAN_VERSION}"; \ +fi # Set following flag to check in Makefile if its running in Docker RUN touch /home/nonroot/.docker_build diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index f070f66c0a..32405ece86 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -559,8 +559,8 @@ RUN case "${PG_VERSION}" in \ export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ ;; \ "v17") \ - export TIMESCALEDB_VERSION=2.17.0 \ - export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \ + export TIMESCALEDB_VERSION=2.17.1 \ + export TIMESCALEDB_CHECKSUM=6277cf43f5695e23dae1c5cfeba00474d730b66ed53665a84b787a6bb1a57e28 \ ;; \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ @@ -624,16 +624,12 @@ FROM build-deps AS pg-cron-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -# 1.6.4 available, supports v17 # This is an experimental extension that we do not support on prod yet. # !Do not remove! # We set it in shared_preload_libraries and computes will fail to start if library is not found. ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ - echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \ + echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -1475,6 +1471,8 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter +COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml + COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml diff --git a/compute/etc/postgres_exporter.yml b/compute/etc/postgres_exporter.yml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql index 459c586d18..d97d625d4c 100644 --- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql @@ -1 +1 @@ -SELECT neon.backpressure_throttling_time()::float8 / 1000 AS throttled; +SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled; diff --git 
a/compute/patches/pg_anon.patch b/compute/patches/pg_anon.patch index 15dfd3c5a0..e2b4b292e4 100644 --- a/compute/patches/pg_anon.patch +++ b/compute/patches/pg_anon.patch @@ -1,3 +1,45 @@ +commit 00aa659afc9c7336ab81036edec3017168aabf40 +Author: Heikki Linnakangas +Date: Tue Nov 12 16:59:19 2024 +0200 + + Temporarily disable test that depends on timezone + +diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out +index 23ef5fa..9e60deb 100644 +--- a/ext-src/pg_anon-src/tests/expected/generalization.out ++++ b/ext-src/pg_anon-src/tests/expected/generalization.out +@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century'); + ["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST") + (1 row) + +-SELECT anon.generalize_tstzrange('19041107','millennium'); +- generalize_tstzrange +------------------------------------------------------------------ +- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST") +-(1 row) +- ++-- temporarily disabled, see: ++-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485 ++--SELECT anon.generalize_tstzrange('19041107','millennium'); + -- generalize_daterange + SELECT anon.generalize_daterange('19041107'); + generalize_daterange +diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql +index b868344..b4fc977 100644 +--- a/ext-src/pg_anon-src/tests/sql/generalization.sql ++++ b/ext-src/pg_anon-src/tests/sql/generalization.sql +@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month'); + SELECT anon.generalize_tstzrange('19041107','year'); + SELECT anon.generalize_tstzrange('19041107','decade'); + SELECT anon.generalize_tstzrange('19041107','century'); +-SELECT anon.generalize_tstzrange('19041107','millennium'); ++-- temporarily disabled, see: ++-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485 ++--SELECT 
anon.generalize_tstzrange('19041107','millennium'); + + -- generalize_daterange + SELECT anon.generalize_daterange('19041107'); + commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f Author: Alexey Masterov Date: Fri May 31 06:34:26 2024 +0000 diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 79f894c289..ac9f5c6904 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -26,7 +26,7 @@ commands: - name: postgres-exporter user: nobody sysvInitAction: respawn - shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' - name: sql-exporter user: nobody sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index ff04b9e4c6..0d178e1c24 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -26,7 +26,7 @@ commands: - name: postgres-exporter user: nobody sysvInitAction: respawn - shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' - name: sql-exporter user: nobody sysvInitAction: respawn diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 91e0b9d5b8..0bf4ed53d6 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -18,9 +18,11 @@ clap.workspace = true flate2.workspace = true futures.workspace = true hyper0 = { workspace = true, features = ["full"] } +metrics.workspace = true nix.workspace = true notify.workspace = true num_cpus.workspace = true 
+once_cell.workspace = true opentelemetry.workspace = true opentelemetry_sdk.workspace = true postgres.workspace = true @@ -39,6 +41,7 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true +prometheus.workspace = true compute_api.workspace = true utils.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d3e42fe618..0a8cb14058 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -364,11 +364,29 @@ impl ComputeNode { let pageserver_connect_micros = start_time.elapsed().as_micros() as u64; let basebackup_cmd = match lsn { - Lsn(0) => format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id), - _ => format!( - "basebackup {} {} {} --gzip", - spec.tenant_id, spec.timeline_id, lsn - ), + Lsn(0) => { + if spec.spec.mode != ComputeMode::Primary { + format!( + "basebackup {} {} --gzip --replica", + spec.tenant_id, spec.timeline_id + ) + } else { + format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id) + } + } + _ => { + if spec.spec.mode != ComputeMode::Primary { + format!( + "basebackup {} {} {} --gzip --replica", + spec.tenant_id, spec.timeline_id, lsn + ) + } else { + format!( + "basebackup {} {} {} --gzip", + spec.tenant_id, spec.timeline_id, lsn + ) + } + } }; let copyreader = client.copy_out(basebackup_cmd.as_str())?; diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 479100eb89..d4e413034e 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -73,6 +73,19 @@ pub fn write_postgres_conf( )?; } + // Locales + if cfg!(target_os = "macos") { + writeln!(file, "lc_messages='C'")?; + writeln!(file, "lc_monetary='C'")?; + writeln!(file, "lc_time='C'")?; + writeln!(file, "lc_numeric='C'")?; + } else { + writeln!(file, "lc_messages='C.UTF-8'")?; + writeln!(file, "lc_monetary='C.UTF-8'")?; + writeln!(file, "lc_time='C.UTF-8'")?; + writeln!(file, "lc_numeric='C.UTF-8'")?; + 
} + match spec.mode { ComputeMode::Primary => {} ComputeMode::Static(lsn) => { diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index af35f71bf2..3677582c11 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -9,6 +9,7 @@ use crate::catalog::SchemaDumpError; use crate::catalog::{get_database_schema, get_dbs_and_roles}; use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; +use crate::installed_extensions; use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest}; use compute_api::responses::{ ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError, @@ -19,6 +20,8 @@ use anyhow::Result; use hyper::header::CONTENT_TYPE; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; +use metrics::Encoder; +use metrics::TextEncoder; use tokio::task; use tracing::{debug, error, info, warn}; use tracing_utils::http::OtelName; @@ -65,6 +68,28 @@ async fn routes(req: Request, compute: &Arc) -> Response { + debug!("serving /metrics GET request"); + + let mut buffer = vec![]; + let metrics = installed_extensions::collect(); + let encoder = TextEncoder::new(); + encoder.encode(&metrics, &mut buffer).unwrap(); + + match Response::builder() + .status(StatusCode::OK) + .header(CONTENT_TYPE, encoder.format_type()) + .body(Body::from(buffer)) + { + Ok(response) => response, + Err(err) => { + let msg = format!("error handling /metrics request: {err}"); + error!(msg); + render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR) + } + } + } // Collect Postgres current usage insights (&Method::GET, "/insights") => { info!("serving /insights GET request"); diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 11eee6ccfd..7b9a62c545 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ 
b/compute_tools/src/http/openapi_spec.yaml @@ -37,6 +37,21 @@ paths: schema: $ref: "#/components/schemas/ComputeMetrics" + /metrics + get: + tags: + - Info + summary: Get compute node metrics in text format. + description: "" + operationId: getComputeMetrics + responses: + 200: + description: ComputeMetrics + content: + text/plain: + schema: + type: string + description: Metrics in text format. /insights: get: tags: diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 877f99bff7..6dd55855db 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,4 +1,5 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; +use metrics::proto::MetricFamily; use std::collections::HashMap; use std::collections::HashSet; use tracing::info; @@ -8,6 +9,10 @@ use anyhow::Result; use postgres::{Client, NoTls}; use tokio::task; +use metrics::core::Collector; +use metrics::{register_uint_gauge_vec, UIntGaugeVec}; +use once_cell::sync::Lazy; + /// We don't reuse get_existing_dbs() just for code clarity /// and to make database listing query here more explicit. 
/// @@ -59,6 +64,12 @@ pub async fn get_installed_extensions(connstr: Url) -> Result Result Result<()> { "[NEON_EXT_STAT] {}", serde_json::to_string(&result).expect("failed to serialize extensions list") ); - Ok(()) } + +static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "installed_extensions", + "Number of databases where the version of extension is installed", + &["extension_name", "version"] + ) + .expect("failed to define a metric") +}); + +pub fn collect() -> Vec { + INSTALLED_EXTENSIONS.collect() +} diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 48438adf43..c4063bbd1a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -944,6 +944,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, other: Default::default(), + // Typical developer machines use disks with slow fsync, and we don't care + // about data integrity: disable disk syncs. 
+ no_sync: true, } }) .collect(), diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 9dc2a0c36b..032c88a829 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -225,6 +225,7 @@ pub struct PageServerConf { pub listen_http_addr: String, pub pg_auth_type: AuthType, pub http_auth_type: AuthType, + pub no_sync: bool, } impl Default for PageServerConf { @@ -235,6 +236,7 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, + no_sync: false, } } } @@ -249,6 +251,8 @@ pub struct NeonLocalInitPageserverConf { pub listen_http_addr: String, pub pg_auth_type: AuthType, pub http_auth_type: AuthType, + #[serde(default, skip_serializing_if = "std::ops::Not::not")] + pub no_sync: bool, #[serde(flatten)] pub other: HashMap, } @@ -261,6 +265,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf { listen_http_addr, pg_auth_type, http_auth_type, + no_sync, other: _, } = conf; Self { @@ -269,6 +274,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf { listen_http_addr: listen_http_addr.clone(), pg_auth_type: *pg_auth_type, http_auth_type: *http_auth_type, + no_sync: *no_sync, } } } @@ -569,6 +575,8 @@ impl LocalEnv { listen_http_addr: String, pg_auth_type: AuthType, http_auth_type: AuthType, + #[serde(default)] + no_sync: bool, } let config_toml_path = dentry.path().join("pageserver.toml"); let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str( @@ -591,6 +599,7 @@ impl LocalEnv { listen_http_addr, pg_auth_type, http_auth_type, + no_sync, } = config_toml; let IdentityTomlSubset { id: identity_toml_id, @@ -607,6 +616,7 @@ impl LocalEnv { listen_http_addr, pg_auth_type, http_auth_type, + no_sync, }; pageservers.push(conf); } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index eab76e14c3..ae5e22ddc6 100644 --- a/control_plane/src/pageserver.rs +++ 
b/control_plane/src/pageserver.rs @@ -273,6 +273,7 @@ impl PageServerNode { ) })?; let args = vec!["-D", datadir_path_str]; + background_process::start_process( "pageserver", &datadir, diff --git a/docs/rfcs/038-aux-file-v2.md b/docs/rfcs/038-aux-file-v2.md index 9c3c336008..dc8c5d8fc4 100644 --- a/docs/rfcs/038-aux-file-v2.md +++ b/docs/rfcs/038-aux-file-v2.md @@ -91,7 +91,7 @@ generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace. There are two places we need to read the aux files from the pageserver: * On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that. -* We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API will attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error. +* We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API used to always attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error. Furthermore, as aux file reads usually need all layer files intersecting with that key range within the branch and cover a big keyspace, it incurs large overhead for tracking keyspaces that have not been read. Therefore, for sparse keyspaces, we [do not track](https://github.com/neondatabase/neon/pull/9631) `ummapped_keyspace`. 
## Compaction and Image Layer Generation diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 00cc426c3c..f48c1febb5 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -64,6 +64,7 @@ pub struct ConfigToml { #[serde(with = "humantime_serde")] pub wal_redo_timeout: Duration, pub superuser: String, + pub locale: String, pub page_cache_size: usize, pub max_file_descriptors: usize, pub pg_distrib_dir: Option, @@ -106,6 +107,8 @@ pub struct ConfigToml { pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, pub virtual_file_io_mode: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub no_sync: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -274,6 +277,11 @@ pub mod defaults { pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; + pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { + "C" + } else { + "C.UTF-8" + }; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; @@ -324,6 +332,7 @@ impl Default for ConfigToml { wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) .expect("cannot parse default wal redo timeout")), superuser: (DEFAULT_SUPERUSER.to_string()), + locale: DEFAULT_LOCALE.to_string(), page_cache_size: (DEFAULT_PAGE_CACHE_SIZE), max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS), pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir() @@ -389,6 +398,7 @@ impl Default for ConfigToml { l0_flush: None, virtual_file_io_mode: None, tenant_config: TenantConfigToml::default(), + no_sync: None, } } } diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs index b80ed2f203..5c3f3deb82 100644 --- a/libs/pageserver_api/src/record.rs +++ b/libs/pageserver_api/src/record.rs @@ -80,18 +80,18 @@ impl NeonWalRecord { } 
#[cfg(feature = "testing")] - pub fn wal_clear() -> Self { + pub fn wal_clear(s: impl AsRef) -> Self { Self::Test { - append: "".to_string(), + append: s.as_ref().to_string(), clear: true, will_init: false, } } #[cfg(feature = "testing")] - pub fn wal_init() -> Self { + pub fn wal_init(s: impl AsRef) -> Self { Self::Test { - append: "".to_string(), + append: s.as_ref().to_string(), clear: true, will_init: true, } diff --git a/libs/postgres_ffi/src/wal_generator.rs b/libs/postgres_ffi/src/wal_generator.rs index 97968c269b..dc679eea33 100644 --- a/libs/postgres_ffi/src/wal_generator.rs +++ b/libs/postgres_ffi/src/wal_generator.rs @@ -1,10 +1,10 @@ -use std::ffi::CStr; +use std::ffi::{CStr, CString}; use bytes::{Bytes, BytesMut}; use crc32c::crc32c_append; use utils::lsn::Lsn; -use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC}; +use super::bindings::{RmgrId, XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC}; use super::xlog_utils::{ XlLogicalMessage, XLOG_RECORD_CRC_OFFS, XLOG_SIZE_OF_XLOG_RECORD, XLP_BKP_REMOVABLE, XLP_FIRST_IS_CONTRECORD, @@ -16,11 +16,65 @@ use crate::pg_constants::{ }; use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; -/// Generates binary WAL records for use in tests and benchmarks. Currently only generates logical -/// messages (effectively noops) with a fixed payload. It is used as an iterator which yields -/// encoded bytes for a single WAL record, including internal page headers if it spans pages. -/// Concatenating the bytes will yield a complete, well-formed WAL, which can be chunked at segment -/// boundaries if desired. Not optimized for performance. +/// A WAL record payload. Will be prefixed by an XLogRecord header when encoded. +pub struct Record { + pub rmid: RmgrId, + pub info: u8, + pub data: Bytes, +} + +impl Record { + /// Encodes the WAL record including an XLogRecord header. 
prev_lsn is the start position of + /// the previous record in the WAL -- this is ignored by the Safekeeper, but not Postgres. + pub fn encode(&self, prev_lsn: Lsn) -> Bytes { + // Prefix data with block ID and length. + let data_header = Bytes::from(match self.data.len() { + 0 => vec![], + 1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, self.data.len() as u8], + 256.. => { + let len_bytes = (self.data.len() as u32).to_le_bytes(); + [&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat() + } + }); + + // Construct the WAL record header. + let mut header = XLogRecord { + xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + self.data.len()) as u32, + xl_xid: 0, + xl_prev: prev_lsn.into(), + xl_info: self.info, + xl_rmid: self.rmid, + __bindgen_padding_0: [0; 2], + xl_crc: 0, // see below + }; + + // Compute the CRC checksum for the data, and the header up to the CRC field. + let mut crc = 0; + crc = crc32c_append(crc, &data_header); + crc = crc32c_append(crc, &self.data); + crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]); + header.xl_crc = crc; + + // Encode the final header and record. + let header = header.encode().unwrap(); + + [header, data_header, self.data.clone()].concat().into() + } +} + +/// Generates WAL record payloads. +/// +/// TODO: currently only provides LogicalMessageGenerator for trivial noop messages. Add a generator +/// that creates a table and inserts rows. +pub trait RecordGenerator: Iterator {} + +impl> RecordGenerator for I {} + +/// Generates binary WAL for use in tests and benchmarks. The provided record generator constructs +/// the WAL records. It is used as an iterator which yields encoded bytes for a single WAL record, +/// including internal page headers if it spans pages. Concatenating the bytes will yield a +/// complete, well-formed WAL, which can be chunked at segment boundaries if desired. Not optimized +/// for performance. /// /// The WAL format is version-dependant (see e.g. 
`XLOG_PAGE_MAGIC`), so make sure to import this /// for the appropriate Postgres version (e.g. `postgres_ffi::v17::wal_generator::WalGenerator`). @@ -31,10 +85,10 @@ use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; /// | Segment 1 | Segment 2 | Segment 3 | /// | Page 1 | Page 2 | Page 3 | Page 4 | Page 5 | Page 6 | Page 7 | Page 8 | Page 9 | /// | R1 | R2 |R3| R4 | R5 | R6 | R7 | R8 | -/// -/// TODO: support generating actual tables and rows. #[derive(Default)] -pub struct WalGenerator { +pub struct WalGenerator { + /// Generates record payloads for the WAL. + pub record_generator: R, /// Current LSN to append the next record at. /// /// Callers can modify this (and prev_lsn) to restart generation at a different LSN, but should @@ -46,73 +100,35 @@ pub struct WalGenerator { pub prev_lsn: Lsn, } -impl WalGenerator { - // For now, hardcode the message payload. - // TODO: support specifying the payload size. - const PREFIX: &CStr = c"prefix"; - const MESSAGE: &[u8] = b"message"; - - // Hardcode the sys, timeline, and DB IDs. We can make them configurable if we care about them. +impl WalGenerator { + // Hardcode the sys and timeline ID. We can make them configurable if we care about them. const SYS_ID: u64 = 0; const TIMELINE_ID: u32 = 1; - const DB_ID: u32 = 0; - /// Creates a new WAL generator, which emits logical message records (noops). - pub fn new() -> Self { - Self::default() + /// Creates a new WAL generator with the given record generator. + pub fn new(record_generator: R) -> WalGenerator { + Self { + record_generator, + lsn: Lsn(0), + prev_lsn: Lsn(0), + } } - /// Encodes a logical message (basically a noop), with the given prefix and message. 
- pub(crate) fn encode_logical_message(prefix: &CStr, message: &[u8]) -> Bytes { - let prefix = prefix.to_bytes_with_nul(); - let header = XlLogicalMessage { - db_id: Self::DB_ID, - transactional: 0, - prefix_size: prefix.len() as u64, - message_size: message.len() as u64, - }; - [&header.encode(), prefix, message].concat().into() + /// Appends a record with an arbitrary payload at the current LSN, then increments the LSN. + /// Returns the WAL bytes for the record, including page headers and padding, and the start LSN. + fn append_record(&mut self, record: Record) -> (Lsn, Bytes) { + let record = record.encode(self.prev_lsn); + let record = Self::insert_pages(record, self.lsn); + let record = Self::pad_record(record, self.lsn); + let lsn = self.lsn; + self.prev_lsn = self.lsn; + self.lsn += record.len() as u64; + (lsn, record) } - /// Encode a WAL record with the given payload data (e.g. a logical message). - pub(crate) fn encode_record(data: Bytes, rmid: u8, info: u8, prev_lsn: Lsn) -> Bytes { - // Prefix data with block ID and length. - let data_header = Bytes::from(match data.len() { - 0 => vec![], - 1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, data.len() as u8], - 256.. => { - let len_bytes = (data.len() as u32).to_le_bytes(); - [&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat() - } - }); - - // Construct the WAL record header. - let mut header = XLogRecord { - xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + data.len()) as u32, - xl_xid: 0, - xl_prev: prev_lsn.into(), - xl_info: info, - xl_rmid: rmid, - __bindgen_padding_0: [0; 2], - xl_crc: 0, // see below - }; - - // Compute the CRC checksum for the data, and the header up to the CRC field. - let mut crc = 0; - crc = crc32c_append(crc, &data_header); - crc = crc32c_append(crc, &data); - crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]); - header.xl_crc = crc; - - // Encode the final header and record. 
- let header = header.encode().unwrap(); - - [header, data_header, data].concat().into() - } - - /// Injects page headers on 8KB page boundaries. Takes the current LSN position where the record + /// Inserts page headers on 8KB page boundaries. Takes the current LSN position where the record /// is to be appended. - fn encode_pages(record: Bytes, mut lsn: Lsn) -> Bytes { + fn insert_pages(record: Bytes, mut lsn: Lsn) -> Bytes { // Fast path: record fits in current page, and the page already has a header. if lsn.remaining_in_block() as usize >= record.len() && lsn.block_offset() > 0 { return record; @@ -173,31 +189,71 @@ impl WalGenerator { } [record, Bytes::from(vec![0; padding])].concat().into() } - - /// Generates a record with an arbitrary payload at the current LSN, then increments the LSN. - pub fn generate_record(&mut self, data: Bytes, rmid: u8, info: u8) -> Bytes { - let record = Self::encode_record(data, rmid, info, self.prev_lsn); - let record = Self::encode_pages(record, self.lsn); - let record = Self::pad_record(record, self.lsn); - self.prev_lsn = self.lsn; - self.lsn += record.len() as u64; - record - } - - /// Generates a logical message at the current LSN. Can be used to construct arbitrary messages. - pub fn generate_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> Bytes { - let data = Self::encode_logical_message(prefix, message); - self.generate_record(data, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE) - } } -/// Generate WAL records as an iterator. -impl Iterator for WalGenerator { +/// Generates WAL records as an iterator. +impl Iterator for WalGenerator { type Item = (Lsn, Bytes); fn next(&mut self) -> Option { - let lsn = self.lsn; - let record = self.generate_logical_message(Self::PREFIX, Self::MESSAGE); - Some((lsn, record)) + let record = self.record_generator.next()?; + Some(self.append_record(record)) + } +} + +/// Generates logical message records (effectively noops) with a fixed message. 
+pub struct LogicalMessageGenerator { + prefix: CString, + message: Vec, +} + +impl LogicalMessageGenerator { + const DB_ID: u32 = 0; // hardcoded for now + const RM_ID: RmgrId = RM_LOGICALMSG_ID; + const INFO: u8 = XLOG_LOGICAL_MESSAGE; + + /// Creates a new LogicalMessageGenerator. + pub fn new(prefix: &CStr, message: &[u8]) -> Self { + Self { + prefix: prefix.to_owned(), + message: message.to_owned(), + } + } + + /// Encodes a logical message. + fn encode(prefix: &CStr, message: &[u8]) -> Bytes { + let prefix = prefix.to_bytes_with_nul(); + let header = XlLogicalMessage { + db_id: Self::DB_ID, + transactional: 0, + prefix_size: prefix.len() as u64, + message_size: message.len() as u64, + }; + [&header.encode(), prefix, message].concat().into() + } +} + +impl Iterator for LogicalMessageGenerator { + type Item = Record; + + fn next(&mut self) -> Option { + Some(Record { + rmid: Self::RM_ID, + info: Self::INFO, + data: Self::encode(&self.prefix, &self.message), + }) + } +} + +impl WalGenerator { + /// Convenience method for appending a WAL record with an arbitrary logical message at the + /// current WAL LSN position. Returns the start LSN and resulting WAL bytes. 
+ pub fn append_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> (Lsn, Bytes) { + let record = Record { + rmid: LogicalMessageGenerator::RM_ID, + info: LogicalMessageGenerator::INFO, + data: LogicalMessageGenerator::encode(prefix, message), + }; + self.append_record(record) } } diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 78a965174f..852b20eace 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -12,9 +12,9 @@ use super::bindings::{ CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, }; -use super::wal_generator::WalGenerator; +use super::wal_generator::LogicalMessageGenerator; use super::PG_MAJORVERSION; -use crate::pg_constants::{self, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE}; +use crate::pg_constants; use crate::PG_TLI; use crate::{uint32, uint64, Oid}; use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; @@ -493,12 +493,10 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Bytes { // This function can take untrusted input, so discard any NUL bytes in the prefix string. let prefix = CString::new(prefix.replace('\0', "")).expect("no NULs"); let message = message.as_bytes(); - WalGenerator::encode_record( - WalGenerator::encode_logical_message(&prefix, message), - RM_LOGICALMSG_ID, - XLOG_LOGICAL_MESSAGE, - Lsn(0), - ) + LogicalMessageGenerator::new(&prefix, message) + .next() + .unwrap() + .encode(Lsn(0)) } #[cfg(test)] diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs index 17790e9f70..ec9f868998 100644 --- a/libs/remote_storage/src/error.rs +++ b/libs/remote_storage/src/error.rs @@ -15,6 +15,9 @@ pub enum DownloadError { /// /// Concurrency control is not timed within timeout. Timeout, + /// Some integrity/consistency check failed during download. 
This is used during + /// timeline loads to cancel the load of a tenant if some timeline detects fatal corruption. + Fatal(String), /// The file was found in the remote storage, but the download failed. Other(anyhow::Error), } @@ -29,6 +32,7 @@ impl std::fmt::Display for DownloadError { DownloadError::Unmodified => write!(f, "File was not modified"), DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), DownloadError::Timeout => write!(f, "timeout"), + DownloadError::Fatal(why) => write!(f, "Fatal read error: {why}"), DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), } } @@ -41,7 +45,7 @@ impl DownloadError { pub fn is_permanent(&self) -> bool { use DownloadError::*; match self { - BadInput(_) | NotFound | Unmodified | Cancelled => true, + BadInput(_) | NotFound | Unmodified | Fatal(_) | Cancelled => true, Timeout | Other(_) => false, } } diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index 316ec8ed0d..93448369a0 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euxo pipefail @@ -6,9 +6,44 @@ PG_BIN=$1 WAL_PATH=$2 DATA_DIR=$3 PORT=$4 +PG_VERSION=$5 SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-) + +# The way that initdb is invoked must match how the pageserver runs initdb. 
+function initdb_with_args { + local cmd=( + "$PG_BIN"/initdb + -E utf8 + -U cloud_admin + -D "$DATA_DIR" + --locale 'C.UTF-8' + --lc-collate 'C.UTF-8' + --lc-ctype 'C.UTF-8' + --lc-messages 'C.UTF-8' + --lc-monetary 'C.UTF-8' + --lc-numeric 'C.UTF-8' + --lc-time 'C.UTF-8' + --sysid="$SYSID" + ) + + case "$PG_VERSION" in + 14) + # Postgres 14 and below didn't support --locale-provider + ;; + 15 | 16) + cmd+=(--locale-provider 'libc') + ;; + *) + # Postgres 17 added the builtin provider + cmd+=(--locale-provider 'builtin') + ;; + esac + + eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}" +} + rm -fr "$DATA_DIR" -env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID" +initdb_with_args echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-) diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 5bd6f4bedc..f7acc61ac1 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -40,6 +40,11 @@ pub enum Scope { /// Allows access to storage controller APIs used by the scrubber, to interrogate the state /// of a tenant & post scrub results. Scrubber, + + /// This scope is used for communication with other storage controller instances. + /// At the time of writing, this is only used for the step down request. + #[serde(rename = "controller_peer")] + ControllerPeer, } /// JWT payload. See docs/authentication.md for the format diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index b97c6c7a45..5241ab183c 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -123,15 +123,27 @@ pub async fn fsync_async_opt( Ok(()) } -/// Like postgres' durable_rename, renames file issuing fsyncs do make it -/// durable. After return, file and rename are guaranteed to be persisted. 
+/// Like postgres' durable_rename, renames a file and issues fsyncs to make it durable. After +/// returning, both the file and rename are guaranteed to be persisted. Both paths must be on the +/// same file system. /// -/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make -/// contents durable; 2) its directory entry to make rename durable 3) again to -/// already renamed file, which is not required by standards but postgres does -/// it, let's stick to that. Postgres additionally fsyncs newpath *before* -/// rename if it exists to ensure that at least one of the files survives, but -/// current callers don't need that. +/// Unlike postgres, it only fsyncs 1) the file to make contents durable, and 2) the directory to +/// make the rename durable. This sequence ensures the target file will never be incomplete. +/// +/// Postgres also: +/// +/// * Fsyncs the target file, if it exists, before the rename, to ensure either the new or existing +/// file survives a crash. Current callers don't need this as it should already be fsynced if +/// durability is needed. +/// +/// * Fsyncs the file after the rename. This can be required with certain OSes or file systems (e.g. +/// NFS), but not on Linux with most common file systems like ext4 (which we currently use). +/// +/// An audit of 8 other databases found that none fsynced the file after a rename: +/// +/// +/// eBPF probes confirmed that this is sufficient with ext4, XFS, and ZFS, but possibly not Btrfs: +/// /// /// virtual_file.rs has similar code, but it doesn't use vfs. /// @@ -149,9 +161,6 @@ pub async fn durable_rename( // Time to do the real deal. tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?; - // Postgres'ish fsync of renamed file. 
- fsync_async_opt(new_path.as_ref(), do_fsync).await?; - // Now fsync the parent let parent = match new_path.as_ref().parent() { Some(p) => p, diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 524f3604a1..f188165600 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -138,6 +138,11 @@ impl Lsn { self.0.checked_sub(other).map(Lsn) } + /// Subtract a number, saturating at numeric bounds instead of overflowing. + pub fn saturating_sub>(self, other: T) -> Lsn { + Lsn(self.0.saturating_sub(other.into())) + } + /// Subtract a number, returning the difference as i128 to avoid overflow. pub fn widening_sub>(self, other: T) -> i128 { let other: u64 = other.into(); diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 9dbb6ecedf..6b739d85a7 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -35,6 +35,15 @@ pub fn overlaps_with(a: &Range, b: &Range) -> bool { !(a.end <= b.start || b.end <= a.start) } +/// Whether a fully contains b, example as below +/// ```plain +/// | a | +/// | b | +/// ``` +pub fn fully_contains(a: &Range, b: &Range) -> bool { + a.start <= b.start && a.end >= b.end +} + pub fn union_to_keyspace(a: &mut CompactionKeySpace, b: CompactionKeySpace) { let x = std::mem::take(a); let mut all_ranges_iter = [x.into_iter(), b.into_iter()] diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 5c931fcfdb..4075427ab4 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -19,7 +19,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Infra - | Scope::Scrubber, + | Scope::Scrubber + | Scope::ControllerPeer, _, ) => Err(AuthError( format!( diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 782122139e..fe2a31167d 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -154,13 +154,17 @@ 
fn main() -> anyhow::Result<()> { }, }; - let started = Instant::now(); - syncfs(dirfd)?; - let elapsed = started.elapsed(); - info!( - elapsed_ms = elapsed.as_millis(), - "made tenant directory contents durable" - ); + if conf.no_sync { + info!("Skipping syncfs on startup"); + } else { + let started = Instant::now(); + syncfs(dirfd)?; + let elapsed = started.elapsed(); + info!( + elapsed_ms = elapsed.as_millis(), + "made tenant directory contents durable" + ); + } } // Initialize up failpoints support diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 06d4326459..b694a43599 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -69,6 +69,7 @@ pub struct PageServerConf { pub wal_redo_timeout: Duration, pub superuser: String, + pub locale: String, pub page_cache_size: usize, pub max_file_descriptors: usize, @@ -178,6 +179,9 @@ pub struct PageServerConf { /// Direct IO settings pub virtual_file_io_mode: virtual_file::IoMode, + + /// Optionally disable disk syncs (unsafe!) 
+ pub no_sync: bool, } /// Token for authentication to safekeepers @@ -298,6 +302,7 @@ impl PageServerConf { wait_lsn_timeout, wal_redo_timeout, superuser, + locale, page_cache_size, max_file_descriptors, pg_distrib_dir, @@ -332,6 +337,7 @@ impl PageServerConf { concurrent_tenant_size_logical_size_queries, virtual_file_io_engine, tenant_config, + no_sync, } = config_toml; let mut conf = PageServerConf { @@ -344,6 +350,7 @@ impl PageServerConf { wait_lsn_timeout, wal_redo_timeout, superuser, + locale, page_cache_size, max_file_descriptors, http_auth_type, @@ -409,6 +416,7 @@ impl PageServerConf { .map(crate::l0_flush::L0FlushConfig::from) .unwrap_or_default(), virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), + no_sync: no_sync.unwrap_or(false), }; // ------------------------------------------------------------ diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 72eb3e7ade..dde9c5dd0b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2002,9 +2002,9 @@ async fn timeline_offload_handler( "timeline has attached children".into(), )); } - if !timeline.can_offload() { + if let (false, reason) = timeline.can_offload() { return Err(ApiError::PreconditionFailed( - "Timeline::can_offload() returned false".into(), + format!("Timeline::can_offload() check failed: {}", reason) .into(), )); } offload_timeline(&tenant, &timeline) @@ -2169,6 +2169,21 @@ async fn timeline_detach_ancestor_handler( let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download); let ctx = &ctx; + // Flush the upload queues of all timelines before detaching ancestor. We do the same thing again + // during shutdown. This early upload ensures the pageserver does not need to upload too many + // things and creates downtime during timeline reloads. 
+ for timeline in tenant.list_timelines() { + timeline + .remote_client + .wait_completion() + .await + .map_err(|e| { + ApiError::PreconditionFailed(format!("cannot drain upload queue: {e}").into()) + })?; + } + + tracing::info!("all timeline upload queues are drained"); + let timeline = tenant.get_timeline(timeline_id, true)?; let progress = timeline diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7b106569a4..7c1abbf3e2 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -45,7 +45,7 @@ use wal_decoder::serialized_batch::SerializedValueBatch; pub const MAX_AUX_FILE_DELTAS: usize = 1024; /// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached. -pub const MAX_AUX_FILE_V2_DELTAS: usize = 64; +pub const MAX_AUX_FILE_V2_DELTAS: usize = 16; #[derive(Debug)] pub enum LsnForTimestamp { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d45c99a41b..61bb1fe40c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1433,6 +1433,12 @@ impl Tenant { info!(%timeline_id, "index_part not found on remote"); continue; } + Err(DownloadError::Fatal(why)) => { + // If, while loading one remote timeline, we saw an indication that our generation + // number is likely invalid, then we should not load the whole tenant. + error!(%timeline_id, "Fatal error loading timeline: {why}"); + anyhow::bail!(why.to_string()); + } Err(e) => { // Some (possibly ephemeral) error happened during index_part download. 
// Pretend the timeline exists to not delete the timeline directory, @@ -2493,7 +2499,8 @@ impl Tenant { timelines_to_compact_or_offload = timelines .iter() .filter_map(|(timeline_id, timeline)| { - let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload()); + let (is_active, (can_offload, _)) = + (timeline.is_active(), timeline.can_offload()); let has_no_unoffloaded_children = { !timelines .iter() @@ -4779,10 +4786,12 @@ async fn run_initdb( let _permit = INIT_DB_SEMAPHORE.acquire().await; - let initdb_command = tokio::process::Command::new(&initdb_bin_path) + let mut initdb_command = tokio::process::Command::new(&initdb_bin_path); + initdb_command .args(["--pgdata", initdb_target_dir.as_ref()]) .args(["--username", &conf.superuser]) .args(["--encoding", "utf8"]) + .args(["--locale", &conf.locale]) .arg("--no-instructions") .arg("--no-sync") .env_clear() @@ -4792,15 +4801,27 @@ async fn run_initdb( // stdout invocation produces the same output every time, we don't need it .stdout(std::process::Stdio::null()) // we would be interested in the stderr output, if there was any - .stderr(std::process::Stdio::piped()) - .spawn()?; + .stderr(std::process::Stdio::piped()); + + // Before version 14, only the libc provide was available. + if pg_version > 14 { + // Version 17 brought with it a builtin locale provider which only provides + // C and C.UTF-8. While being safer for collation purposes since it is + // guaranteed to be consistent throughout a major release, it is also more + // performant. + let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" }; + + initdb_command.args(["--locale-provider", locale_provider]); + } + + let initdb_proc = initdb_command.spawn()?; // Ideally we'd select here with the cancellation token, but the problem is that // we can't safely terminate initdb: it launches processes of its own, and killing // initdb doesn't kill them. 
After we return from this function, we want the target // directory to be able to be cleaned up. // See https://github.com/neondatabase/neon/issues/6385 - let initdb_output = initdb_command.wait_with_output().await?; + let initdb_output = initdb_proc.wait_with_output().await?; if !initdb_output.status.success() { return Err(InitdbError::Failed( initdb_output.status, @@ -7742,13 +7763,13 @@ mod tests { ( get_key(3), Lsn(0x20), - Value::WalRecord(NeonWalRecord::wal_clear()), + Value::WalRecord(NeonWalRecord::wal_clear("c")), ), (get_key(4), Lsn(0x10), Value::Image("0x10".into())), ( get_key(4), Lsn(0x20), - Value::WalRecord(NeonWalRecord::wal_init()), + Value::WalRecord(NeonWalRecord::wal_init("i")), ), ]; let image1 = vec![(get_key(1), "0x10".into())]; @@ -7897,8 +7918,30 @@ mod tests { #[cfg(feature = "testing")] #[tokio::test] - async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?; + async fn test_simple_bottom_most_compaction_deltas_1() -> anyhow::Result<()> { + test_simple_bottom_most_compaction_deltas_helper( + "test_simple_bottom_most_compaction_deltas_1", + false, + ) + .await + } + + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_simple_bottom_most_compaction_deltas_2() -> anyhow::Result<()> { + test_simple_bottom_most_compaction_deltas_helper( + "test_simple_bottom_most_compaction_deltas_2", + true, + ) + .await + } + + #[cfg(feature = "testing")] + async fn test_simple_bottom_most_compaction_deltas_helper( + test_name: &'static str, + use_delta_bottom_layer: bool, + ) -> anyhow::Result<()> { + let harness = TenantHarness::create(test_name).await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -7929,6 +7972,16 @@ mod tests { let img_layer = (0..10) .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) .collect_vec(); + // or, delta layer at 0x10 if `use_delta_bottom_layer` is true + let 
delta4 = (0..10) + .map(|id| { + ( + get_key(id), + Lsn(0x08), + Value::WalRecord(NeonWalRecord::wal_init(format!("value {id}@0x10"))), + ) + }) + .collect_vec(); let delta1 = vec![ ( @@ -7982,21 +8035,61 @@ mod tests { ), ]; - let tline = tenant - .create_test_timeline_with_layers( - TIMELINE_ID, - Lsn(0x10), - DEFAULT_PG_VERSION, - &ctx, - vec![ - DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), - DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), - DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), - ], // delta layers - vec![(Lsn(0x10), img_layer)], // image layers - Lsn(0x50), - ) - .await?; + let tline = if use_delta_bottom_layer { + tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x08), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x08)..Lsn(0x10), + delta4, + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x48), + delta1, + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x48), + delta2, + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x48)..Lsn(0x50), + delta3, + ), + ], // delta layers + vec![], // image layers + Lsn(0x50), + ) + .await? + } else { + tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x48), + delta1, + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x48), + delta2, + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x48)..Lsn(0x50), + delta3, + ), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await? 
+ }; { // Update GC info let mut guard = tline.gc_info.write().unwrap(); @@ -8106,7 +8199,7 @@ mod tests { ( key, Lsn(0x10), - Value::Image(Bytes::copy_from_slice(b"0x10")), + Value::WalRecord(NeonWalRecord::wal_init("0x10")), ), ( key, @@ -8168,7 +8261,7 @@ mod tests { Lsn(0x20), KeyLogAtLsn(vec![( Lsn(0x20), - Value::Image(Bytes::copy_from_slice(b"0x10;0x20")), + Value::Image(Bytes::from_static(b"0x10;0x20")), )]), ), ( @@ -9150,7 +9243,7 @@ mod tests { let will_init = will_init_keys.contains(&i); if will_init { - delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init()))); + delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init("")))); expected_key_values.insert(key, "".to_string()); } else { @@ -9208,6 +9301,23 @@ mod tests { Ok(()) } + fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering { + ( + k1.is_delta, + k1.key_range.start, + k1.key_range.end, + k1.lsn_range.start, + k1.lsn_range.end, + ) + .cmp(&( + k2.is_delta, + k2.key_range.start, + k2.key_range.end, + k2.lsn_range.start, + k2.lsn_range.end, + )) + } + async fn inspect_and_sort( tline: &Arc, filter: Option>, @@ -9216,25 +9326,30 @@ mod tests { if let Some(filter) = filter { all_layers.retain(|layer| overlaps_with(&layer.key_range, &filter)); } - all_layers.sort_by(|k1, k2| { - ( - k1.is_delta, - k1.key_range.start, - k1.key_range.end, - k1.lsn_range.start, - k1.lsn_range.end, - ) - .cmp(&( - k2.is_delta, - k2.key_range.start, - k2.key_range.end, - k2.lsn_range.start, - k2.lsn_range.end, - )) - }); + all_layers.sort_by(sort_layer_key); all_layers } + #[cfg(feature = "testing")] + fn check_layer_map_key_eq( + mut left: Vec, + mut right: Vec, + ) { + left.sort_by(sort_layer_key); + right.sort_by(sort_layer_key); + if left != right { + eprintln!("---LEFT---"); + for left in left.iter() { + eprintln!("{}", left); + } + eprintln!("---RIGHT---"); + for right in right.iter() { + eprintln!("{}", right); + } + assert_eq!(left, right); 
+ } + } + #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_partial_bottom_most_compaction() -> anyhow::Result<()> { @@ -9327,127 +9442,206 @@ mod tests { let cancel = CancellationToken::new(); - // Do a partial compaction on key range 0..4, we should generate a image layer; no other layers - // can be removed because they might be used for other key ranges. + // Do a partial compaction on key range 0..2 tline - .partial_compact_with_gc(Some(get_key(0)..get_key(4)), &cancel, EnumSet::new(), &ctx) + .partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; - assert_eq!( + check_layer_map_key_eq( all_layers, vec![ + // newly-generated image layer for the partial compaction range 0-2 PersistentLayerKey { - key_range: get_key(0)..get_key(4), + key_range: get_key(0)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false + is_delta: false, }, PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), - is_delta: false + is_delta: false, }, + // delta1 is split and the second part is rewritten PersistentLayerKey { - key_range: get_key(1)..get_key(4), + key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(5)..get_key(7), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), - is_delta: true - } - ] + is_delta: true, + }, + ], ); - // Do a partial compaction on key range 4..10 + // Do a partial compaction on key range 2..4 tline - .partial_compact_with_gc(Some(get_key(4)..get_key(10)), &cancel, EnumSet::new(), &ctx) + .partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; - assert_eq!( 
+ check_layer_map_key_eq( all_layers, vec![ PersistentLayerKey { - key_range: get_key(0)..get_key(4), + key_range: get_key(0)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false + is_delta: false, }, PersistentLayerKey { - // if (in the future) GC kicks in, this layer will be removed key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), - is_delta: false + is_delta: false, }, + // image layer generated for the compaction range 2-4 PersistentLayerKey { - key_range: get_key(4)..get_key(10), + key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false + is_delta: false, }, + // we have key2/key3 above the retain_lsn, so we still need this delta layer PersistentLayerKey { - key_range: get_key(1)..get_key(4), + key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(5)..get_key(7), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), - is_delta: true - } - ] + is_delta: true, + }, + ], + ); + + // Do a partial compaction on key range 4..9 + tline + .partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + PersistentLayerKey { + key_range: get_key(0)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(2)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(2)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true, + }, + // image layer generated for this compaction range + PersistentLayerKey { + 
key_range: get_key(4)..get_key(9), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x48)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + // Do a partial compaction on key range 9..10 + tline + .partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + PersistentLayerKey { + key_range: get_key(0)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(2)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(2)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true, + }, + PersistentLayerKey { + key_range: get_key(4)..get_key(9), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + // image layer generated for the compaction range + PersistentLayerKey { + key_range: get_key(9)..get_key(10), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x48)..Lsn(0x50), + is_delta: true, + }, + ], ); // Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones. 
tline - .partial_compact_with_gc(Some(get_key(0)..get_key(10)), &cancel, EnumSet::new(), &ctx) + .partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; - assert_eq!( + check_layer_map_key_eq( all_layers, vec![ - PersistentLayerKey { - key_range: get_key(0)..get_key(4), - lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false - }, + // aha, we removed all unnecessary image/delta layers and got a very clean layer map! PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false + is_delta: false, }, PersistentLayerKey { - key_range: get_key(4)..get_key(10), - lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false - }, - PersistentLayerKey { - key_range: get_key(1)..get_key(4), + key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true - }, - PersistentLayerKey { - key_range: get_key(5)..get_key(7), - lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), - is_delta: true - } - ] + is_delta: true, + }, + ], ); Ok(()) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index a4c458b737..4fc9d740c8 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1959,7 +1959,7 @@ impl TenantManager { attempt.before_reset_tenant(); let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, ShutdownMode::Hard).await { + match tenant.shutdown(progress, ShutdownMode::Flush).await { Ok(()) => { slot_guard.drop_old_value().expect("it was just shutdown"); } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 0aa8d61036..600583f6b5 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -574,12 
+574,18 @@ impl RemoteTimelineClient { if latest_index_generation > index_generation { // Unexpected! Why are we loading such an old index if a more recent one exists? - tracing::warn!( + // We will refuse to proceed, as there is no reasonable scenario where this should happen, but + // there _is_ a clear bug/corruption scenario where it would happen (controller sets the generation + // backwards). + tracing::error!( ?index_generation, ?latest_index_generation, ?latest_index_mtime, "Found a newer index while loading an old one" ); + return Err(DownloadError::Fatal( + "Index age exceeds threshold and a newer index exists".into(), + )); } } @@ -2201,6 +2207,18 @@ impl RemoteTimelineClient { inner.initialized_mut()?; Ok(UploadQueueAccessor { inner }) } + + pub(crate) fn no_pending_work(&self) -> bool { + let inner = self.upload_queue.lock().unwrap(); + match &*inner { + UploadQueue::Uninitialized + | UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => true, + UploadQueue::Stopped(UploadQueueStopped::Deletable(x)) => { + x.upload_queue_for_deletion.no_pending_work() + } + UploadQueue::Initialized(x) => x.no_pending_work(), + } + } } pub(crate) struct UploadQueueAccessor<'a> { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 664c00a6b1..fec8a0a16c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -653,6 +653,10 @@ impl DeltaLayerWriter { }) } + pub fn is_empty(&self) -> bool { + self.inner.as_ref().unwrap().num_keys == 0 + } + /// /// Append a key-value pair to the file. 
/// diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index ccfcf68e8f..8660be1fcc 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -1,4 +1,4 @@ -use std::ops::Range; +use std::{ops::Range, sync::Arc}; use anyhow::bail; use pageserver_api::{ @@ -9,7 +9,10 @@ use utils::lsn::Lsn; use pageserver_api::value::Value; -use super::merge_iterator::MergeIterator; +use super::{ + merge_iterator::{MergeIterator, MergeIteratorItem}, + PersistentLayerKey, +}; /// A filter iterator over merge iterators (and can be easily extended to other types of iterators). /// @@ -48,10 +51,10 @@ impl<'a> FilterIterator<'a> { }) } - pub async fn next(&mut self) -> anyhow::Result> { - while let Some(item) = self.inner.next().await? { + async fn next_inner(&mut self) -> anyhow::Result> { + while let Some(item) = self.inner.next_inner::().await? { while self.current_filter_idx < self.retain_key_filters.len() - && item.0 >= self.retain_key_filters[self.current_filter_idx].end + && item.key_lsn_value().0 >= self.retain_key_filters[self.current_filter_idx].end { // [filter region] [filter region] [filter region] // ^ item @@ -68,7 +71,7 @@ impl<'a> FilterIterator<'a> { // ^ current filter (nothing) return Ok(None); } - if self.retain_key_filters[self.current_filter_idx].contains(&item.0) { + if self.retain_key_filters[self.current_filter_idx].contains(&item.key_lsn_value().0) { // [filter region] [filter region] [filter region] // ^ item // ^ current filter @@ -81,6 +84,16 @@ impl<'a> FilterIterator<'a> { } Ok(None) } + + pub async fn next(&mut self) -> anyhow::Result> { + self.next_inner().await + } + + pub async fn next_with_trace( + &mut self, + ) -> anyhow::Result)>> { + self.next_inner().await + } } #[cfg(test)] diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 
2ce26ed2eb..af6112d535 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -67,6 +67,8 @@ pub struct InMemoryLayer { /// The above fields never change, except for `end_lsn`, which is only set once. /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, + + estimated_in_mem_size: AtomicU64, } impl std::fmt::Debug for InMemoryLayer { @@ -543,6 +545,10 @@ impl InMemoryLayer { Ok(inner.file.len()) } + pub fn estimated_in_mem_size(&self) -> u64 { + self.estimated_in_mem_size.load(AtomicOrdering::Relaxed) + } + /// Create a new, empty, in-memory layer pub async fn create( conf: &'static PageServerConf, @@ -572,6 +578,7 @@ impl InMemoryLayer { file, resource_units: GlobalResourceUnits::new(), }), + estimated_in_mem_size: AtomicU64::new(0), }) } @@ -642,6 +649,12 @@ impl InMemoryLayer { // because this case is unexpected, and we would like tests to fail if this happens. warn!("Key {} at {} written twice at same LSN", key, lsn); } + self.estimated_in_mem_size.fetch_add( + (std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::()) as u64, + AtomicOrdering::Relaxed, + ); } inner.resource_units.maybe_publish_size(new_size); diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 980202f12c..19cfcb0867 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -1,6 +1,7 @@ use std::{ cmp::Ordering, collections::{binary_heap, BinaryHeap}, + sync::Arc, }; use anyhow::bail; @@ -13,10 +14,11 @@ use pageserver_api::value::Value; use super::{ delta_layer::{DeltaLayerInner, DeltaLayerIterator}, image_layer::{ImageLayerInner, ImageLayerIterator}, + PersistentLayerDesc, PersistentLayerKey, }; #[derive(Clone, Copy)] -enum LayerRef<'a> { +pub(crate) enum LayerRef<'a> { Image(&'a ImageLayerInner), Delta(&'a DeltaLayerInner), 
} @@ -62,18 +64,20 @@ impl LayerIterRef<'_> { /// 1. Unified iterator for image and delta layers. /// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge). /// 3. Lazy creation of the real delta/image iterator. -enum IteratorWrapper<'a> { +pub(crate) enum IteratorWrapper<'a> { NotLoaded { ctx: &'a RequestContext, first_key_lower_bound: (Key, Lsn), layer: LayerRef<'a>, + source_desc: Arc, }, Loaded { iter: PeekableLayerIterRef<'a>, + source_desc: Arc, }, } -struct PeekableLayerIterRef<'a> { +pub(crate) struct PeekableLayerIterRef<'a> { iter: LayerIterRef<'a>, peeked: Option<(Key, Lsn, Value)>, // None == end } @@ -151,6 +155,12 @@ impl<'a> IteratorWrapper<'a> { layer: LayerRef::Image(image_layer), first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()), ctx, + source_desc: PersistentLayerKey { + key_range: image_layer.key_range().clone(), + lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer.lsn()), + is_delta: false, + } + .into(), } } @@ -162,12 +172,18 @@ impl<'a> IteratorWrapper<'a> { layer: LayerRef::Delta(delta_layer), first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start), ctx, + source_desc: PersistentLayerKey { + key_range: delta_layer.key_range().clone(), + lsn_range: delta_layer.lsn_range().clone(), + is_delta: true, + } + .into(), } } fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> { match self { - Self::Loaded { iter } => iter + Self::Loaded { iter, .. 
} => iter .peek() .as_ref() .map(|(key, lsn, val)| (key, *lsn, Some(val))), @@ -191,6 +207,7 @@ impl<'a> IteratorWrapper<'a> { ctx, first_key_lower_bound, layer, + source_desc, } = self else { unreachable!() @@ -206,7 +223,10 @@ impl<'a> IteratorWrapper<'a> { ); } } - *self = Self::Loaded { iter }; + *self = Self::Loaded { + iter, + source_desc: source_desc.clone(), + }; Ok(()) } @@ -220,11 +240,19 @@ impl<'a> IteratorWrapper<'a> { /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. async fn next(&mut self) -> anyhow::Result> { - let Self::Loaded { iter } = self else { + let Self::Loaded { iter, .. } = self else { panic!("must load the iterator before using") }; iter.next().await } + + /// Get the persistent layer key corresponding to this iterator + fn trace_source(&self) -> Arc { + match self { + Self::Loaded { source_desc, .. } => source_desc.clone(), + Self::NotLoaded { source_desc, .. } => source_desc.clone(), + } + } } /// A merge iterator over delta/image layer iterators. 
@@ -242,6 +270,32 @@ pub struct MergeIterator<'a> { heap: BinaryHeap>, } +pub(crate) trait MergeIteratorItem { + fn new(item: (Key, Lsn, Value), iterator: &IteratorWrapper<'_>) -> Self; + + fn key_lsn_value(&self) -> &(Key, Lsn, Value); +} + +impl MergeIteratorItem for (Key, Lsn, Value) { + fn new(item: (Key, Lsn, Value), _: &IteratorWrapper<'_>) -> Self { + item + } + + fn key_lsn_value(&self) -> &(Key, Lsn, Value) { + self + } +} + +impl MergeIteratorItem for ((Key, Lsn, Value), Arc) { + fn new(item: (Key, Lsn, Value), iter: &IteratorWrapper<'_>) -> Self { + (item, iter.trace_source().clone()) + } + + fn key_lsn_value(&self) -> &(Key, Lsn, Value) { + &self.0 + } +} + impl<'a> MergeIterator<'a> { pub fn create( deltas: &[&'a DeltaLayerInner], @@ -260,7 +314,7 @@ impl<'a> MergeIterator<'a> { } } - pub async fn next(&mut self) -> anyhow::Result> { + pub(crate) async fn next_inner(&mut self) -> anyhow::Result> { while let Some(mut iter) = self.heap.peek_mut() { if !iter.is_loaded() { // Once we load the iterator, we can know the real first key-value pair in the iterator. @@ -275,10 +329,22 @@ impl<'a> MergeIterator<'a> { binary_heap::PeekMut::pop(iter); continue; }; - return Ok(Some(item)); + return Ok(Some(R::new(item, &iter))); } Ok(None) } + + /// Get the next key-value pair from the iterator. + pub async fn next(&mut self) -> anyhow::Result> { + self.next_inner().await + } + + /// Get the next key-value pair from the iterator, and trace where the key comes from. 
+ pub async fn next_with_trace( + &mut self, + ) -> anyhow::Result)>> { + self.next_inner().await + } } #[cfg(test)] @@ -496,7 +562,7 @@ mod tests { ( get_key(0), Lsn(0x10), - Value::WalRecord(NeonWalRecord::wal_init()), + Value::WalRecord(NeonWalRecord::wal_init("")), ), ( get_key(0), @@ -506,7 +572,7 @@ mod tests { ( get_key(5), Lsn(0x10), - Value::WalRecord(NeonWalRecord::wal_init()), + Value::WalRecord(NeonWalRecord::wal_init("")), ), ( get_key(5), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ee823beca8..09ddb19765 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,6 +23,7 @@ use handle::ShardTimelineId; use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::{ + config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD, key::{ KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, @@ -852,6 +853,10 @@ pub(crate) enum ShutdownMode { /// While we are flushing, we continue to accept read I/O for LSNs ingested before /// the call to [`Timeline::shutdown`]. FreezeAndFlush, + /// Only flush the layers to the remote storage without freezing any open layers. This is the + /// mode used by ancestor detach and any other operations that reloads a tenant but not increasing + /// the generation number. + Flush, /// Shut down immediately, without waiting for any open layers to flush. Hard, } @@ -1565,12 +1570,16 @@ impl Timeline { /// /// This is neccessary but not sufficient for offloading of the timeline as it might have /// child timelines that are not offloaded yet. 
- pub(crate) fn can_offload(&self) -> bool { + pub(crate) fn can_offload(&self) -> (bool, &'static str) { if self.remote_client.is_archived() != Some(true) { - return false; + return (false, "the timeline is not archived"); + } + if !self.remote_client.no_pending_work() { + // if the remote client is still processing some work, we can't offload + return (false, "the upload queue is not drained yet"); } - true + (true, "ok") } /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending @@ -1678,11 +1687,6 @@ impl Timeline { pub(crate) async fn shutdown(&self, mode: ShutdownMode) { debug_assert_current_span_has_tenant_and_timeline_id(); - let try_freeze_and_flush = match mode { - ShutdownMode::FreezeAndFlush => true, - ShutdownMode::Hard => false, - }; - // Regardless of whether we're going to try_freeze_and_flush // or not, stop ingesting any more data. Walreceiver only provides // cancellation but no "wait until gone", because it uses the Timeline::gate. @@ -1704,7 +1708,7 @@ impl Timeline { // ... and inform any waiters for newer LSNs that there won't be any. self.last_record_lsn.shutdown(); - if try_freeze_and_flush { + if let ShutdownMode::FreezeAndFlush = mode { if let Some((open, frozen)) = self .layers .read() @@ -1746,6 +1750,20 @@ impl Timeline { warn!("failed to freeze and flush: {e:#}"); } } + + // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but + // we also do a final check here to ensure that the queue is empty. 
+ if !self.remote_client.no_pending_work() { + warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + } + } + + if let ShutdownMode::Flush = mode { + // drain the upload queue + self.remote_client.shutdown().await; + if !self.remote_client.no_pending_work() { + warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + } } // Signal any subscribers to our cancellation token to drop out @@ -3488,18 +3506,37 @@ impl Timeline { let timer = self.metrics.flush_time_histo.start_timer(); + let num_frozen_layers; + let frozen_layer_total_size; let layer_to_flush = { let guard = self.layers.read().await; let Ok(lm) = guard.layer_map() else { info!("dropping out of flush loop for timeline shutdown"); return; }; + num_frozen_layers = lm.frozen_layers.len(); + frozen_layer_total_size = lm + .frozen_layers + .iter() + .map(|l| l.estimated_in_mem_size()) + .sum::(); lm.frozen_layers.front().cloned() // drop 'layers' lock to allow concurrent reads and writes }; let Some(layer_to_flush) = layer_to_flush else { break Ok(()); }; + if num_frozen_layers + > std::cmp::max( + self.get_compaction_threshold(), + DEFAULT_COMPACTION_THRESHOLD, + ) + && frozen_layer_total_size >= /* 128 MB */ 128000000 + { + tracing::warn!( + "too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes", + ); + } match self.flush_frozen_layer(layer_to_flush, ctx).await { Ok(this_layer_to_lsn) => { flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn); @@ -4090,6 +4127,7 @@ impl Timeline { ) -> Result { // Metadata keys image layer creation. 
let mut reconstruct_state = ValuesReconstructState::default(); + let begin = Instant::now(); let data = self .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) .await?; @@ -4106,14 +4144,11 @@ impl Timeline { (new_data, total_kb_retrieved / 1024, total_keys_retrieved) }; let delta_files_accessed = reconstruct_state.get_delta_layers_visited(); + let elapsed = begin.elapsed(); let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; - debug!( - trigger_generation, - delta_files_accessed, - total_kb_retrieved, - total_keys_retrieved, - "generate metadata images" + info!( + "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", elapsed.as_secs_f64() ); if !trigger_generation && mode == ImageLayerCreationMode::Try { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 01c2803881..e6ef1aae2b 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,7 +4,7 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. -use std::collections::{BinaryHeap, HashSet}; +use std::collections::{BinaryHeap, HashMap, HashSet}; use std::ops::{Deref, Range}; use std::sync::Arc; @@ -56,7 +56,7 @@ use pageserver_api::value::Value; use utils::lsn::Lsn; -use pageserver_compaction::helpers::overlaps_with; +use pageserver_compaction::helpers::{fully_contains, overlaps_with}; use pageserver_compaction::interface::*; use super::CompactionError; @@ -64,6 +64,23 @@ use super::CompactionError; /// Maximum number of deltas before generating an image layer in bottom-most compaction. 
const COMPACTION_DELTA_THRESHOLD: usize = 5; +pub struct GcCompactionJobDescription { + /// All layers to read in the compaction job + selected_layers: Vec, + /// GC cutoff of the job + gc_cutoff: Lsn, + /// LSNs to retain for the job + retain_lsns_below_horizon: Vec, + /// Maximum layer LSN processed in this compaction + max_layer_lsn: Lsn, + /// Only compact layers overlapping with this range + compaction_key_range: Range, + /// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap. + /// This field is here solely for debugging. The field will not be read once the compaction + /// description is generated. + rewrite_layers: Vec>, +} + /// The result of bottom-most compaction for a single key at each LSN. #[derive(Debug)] #[cfg_attr(test, derive(PartialEq))] @@ -1722,7 +1739,8 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.partial_compact_with_gc(None, cancel, flags, ctx).await + self.partial_compact_with_gc(Key::MIN..Key::MAX, cancel, flags, ctx) + .await } /// An experimental compaction building block that combines compaction with garbage collection. @@ -1732,12 +1750,15 @@ impl Timeline { /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon, /// and create delta layers with all deltas >= gc horizon. /// - /// If `key_range`, it will only compact the keys within the range, aka partial compaction. This functionality - /// is not complete yet, and if it is set, only image layers will be generated. - /// + /// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction. + /// Partial compaction will read and process all layers overlapping with the key range, even if it might + /// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained + /// within the key range will be rewritten to ensure they do not overlap with the delta layers. 
Providing + /// Key::MIN..Key::MAX to the function indicates a full compaction, though technically, `Key::MAX` is not + /// part of the range. pub(crate) async fn partial_compact_with_gc( self: &Arc, - compaction_key_range: Option>, + compaction_key_range: Range, cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, @@ -1762,9 +1783,8 @@ impl Timeline { .await?; let dry_run = flags.contains(CompactFlags::DryRun); - let partial_compaction = compaction_key_range.is_some(); - if let Some(ref compaction_key_range) = compaction_key_range { + if compaction_key_range == (Key::MIN..Key::MAX) { info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end); } else { info!("running enhanced gc bottom-most compaction, dry_run={dry_run}"); @@ -1780,7 +1800,7 @@ impl Timeline { // The layer selection has the following properties: // 1. If a layer is in the selection, all layers below it are in the selection. // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. - let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = if !partial_compaction { + let job_desc = { let guard = self.layers.read().await; let layers = guard.layer_map()?; let gc_info = self.gc_info.read().unwrap(); @@ -1810,9 +1830,21 @@ impl Timeline { }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key // layers to compact.
+ let mut rewrite_layers = Vec::new(); for desc in layers.iter_historic_layers() { - if desc.get_lsn_range().end <= max_layer_lsn { + if desc.get_lsn_range().end <= max_layer_lsn + && overlaps_with(&desc.get_key_range(), &compaction_key_range) + { + // If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range, + // even if it might contain extra keys selected_layers.push(guard.get_from_desc(&desc)); + // If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine + // to overlap image layers) + if desc.is_delta() + && !fully_contains(&compaction_key_range, &desc.get_key_range()) + { + rewrite_layers.push(desc); + } } } if selected_layers.is_empty() { @@ -1820,82 +1852,59 @@ impl Timeline { return Ok(()); } retain_lsns_below_horizon.sort(); - (selected_layers, gc_cutoff, retain_lsns_below_horizon) - } else { - // In case of partial compaction, we currently only support generating image layers, and therefore, - // we pick all layers that are below the lowest retain_lsn and does not intersect with any of the layers. 
- let guard = self.layers.read().await; - let layers = guard.layer_map()?; - let gc_info = self.gc_info.read().unwrap(); - let mut min_lsn = gc_info.cutoffs.select_min(); - for (lsn, _, _) in &gc_info.retain_lsns { - if lsn < &min_lsn { - min_lsn = *lsn; - } + GcCompactionJobDescription { + selected_layers, + gc_cutoff, + retain_lsns_below_horizon, + max_layer_lsn, + compaction_key_range, + rewrite_layers, } - for lsn in gc_info.leases.keys() { - if lsn < &min_lsn { - min_lsn = *lsn; - } - } - let mut selected_layers = Vec::new(); - drop(gc_info); - // |-------| |-------| |-------| - // | Delta | | Delta | | Delta | -- min_lsn could be intersecting with the layers - // |-------| |-------| |-------| <- we want to pick all the layers below min_lsn, so that - // | Delta | | Delta | | Delta | ...we can remove them after compaction - // |-------| |-------| |-------| - // Pick all the layers intersect or below the min_lsn, get the largest LSN in the selected layers. - let Some(compaction_key_range) = compaction_key_range.as_ref() else { - unreachable!() - }; - for desc in layers.iter_historic_layers() { - if desc.get_lsn_range().end <= min_lsn - && overlaps_with(&desc.key_range, compaction_key_range) - { - selected_layers.push(guard.get_from_desc(&desc)); - } - } - if selected_layers.is_empty() { - info!("no layers to compact with gc"); - return Ok(()); - } - (selected_layers, min_lsn, Vec::new()) }; let lowest_retain_lsn = if self.ancestor_timeline.is_some() { - if partial_compaction { - warn!("partial compaction cannot run on child branches (for now)"); - return Ok(()); - } Lsn(self.ancestor_lsn.0 + 1) } else { - let res = retain_lsns_below_horizon + let res = job_desc + .retain_lsns_below_horizon .first() .copied() - .unwrap_or(gc_cutoff); + .unwrap_or(job_desc.gc_cutoff); if cfg!(debug_assertions) { assert_eq!( res, - retain_lsns_below_horizon + job_desc + .retain_lsns_below_horizon .iter() .min() .copied() - .unwrap_or(gc_cutoff) + .unwrap_or(job_desc.gc_cutoff) ); 
} res }; info!( - "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}", - layer_selection.len(), - gc_cutoff, - lowest_retain_lsn + "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}", + job_desc.selected_layers.len(), + job_desc.rewrite_layers.len(), + job_desc.max_layer_lsn, + job_desc.gc_cutoff, + lowest_retain_lsn, + job_desc.compaction_key_range.start, + job_desc.compaction_key_range.end ); - self.check_compaction_space(&layer_selection).await?; + for layer in &job_desc.selected_layers { + debug!("read layer: {}", layer.layer_desc().key()); + } + for layer in &job_desc.rewrite_layers { + debug!("rewrite layer: {}", layer.key()); + } + + self.check_compaction_space(&job_desc.selected_layers) + .await?; // Generate statistics for the compaction - for layer in &layer_selection { + for layer in &job_desc.selected_layers { let desc = layer.layer_desc(); if desc.is_delta() { stat.visit_delta_layer(desc.file_size()); @@ -1906,25 +1915,25 @@ impl Timeline { // Step 1: construct a k-merge iterator over all layers. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. - let layer_names: Vec = layer_selection + let layer_names = job_desc + .selected_layers .iter() .map(|layer| layer.layer_desc().layer_name()) .collect_vec(); if let Some(err) = check_valid_layermap(&layer_names) { - bail!("cannot run gc-compaction because {}", err); + warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err); } // The maximum LSN we are processing in this compaction loop - let end_lsn = layer_selection + let end_lsn = job_desc + .selected_layers .iter() .map(|l| l.layer_desc().lsn_range.end) .max() .unwrap(); - // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized - // as an L0 layer. 
let mut delta_layers = Vec::new(); let mut image_layers = Vec::new(); let mut downloaded_layers = Vec::new(); - for layer in &layer_selection { + for layer in &job_desc.selected_layers { let resident_layer = layer.download_and_keep_resident().await?; downloaded_layers.push(resident_layer); } @@ -1943,8 +1952,8 @@ impl Timeline { dense_ks, sparse_ks, )?; - // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas. - // Data of the same key. + + // Step 2: Produce images+deltas. let mut accumulated_values = Vec::new(); let mut last_key: Option = None; @@ -1956,10 +1965,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - compaction_key_range - .as_ref() - .map(|x| x.start) - .unwrap_or(Key::MIN), + job_desc.compaction_key_range.start, lowest_retain_lsn, self.get_compaction_target_size(), ctx, @@ -1979,6 +1985,13 @@ impl Timeline { ) .await?; + #[derive(Default)] + struct RewritingLayers { + before: Option, + after: Option, + } + let mut delta_layer_rewriters = HashMap::, RewritingLayers>::new(); + /// Returns None if there is no ancestor branch. Throw an error when the key is not found. /// /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image @@ -2004,10 +2017,51 @@ impl Timeline { // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. - while let Some((key, lsn, val)) = merge_iter.next().await? { + while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? 
{ if cancel.is_cancelled() { return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error } + if !job_desc.compaction_key_range.contains(&key) { + if !desc.is_delta { + continue; + } + let rewriter = delta_layer_rewriters.entry(desc.clone()).or_default(); + let rewriter = if key < job_desc.compaction_key_range.start { + if rewriter.before.is_none() { + rewriter.before = Some( + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + desc.key_range.start, + desc.lsn_range.clone(), + ctx, + ) + .await?, + ); + } + rewriter.before.as_mut().unwrap() + } else if key >= job_desc.compaction_key_range.end { + if rewriter.after.is_none() { + rewriter.after = Some( + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + job_desc.compaction_key_range.end, + desc.lsn_range.clone(), + ctx, + ) + .await?, + ); + } + rewriter.after.as_mut().unwrap() + } else { + unreachable!() + }; + rewriter.put_value(key, lsn, val, ctx).await?; + continue; + } match val { Value::Image(_) => stat.visit_image_key(&val), Value::WalRecord(_) => stat.visit_wal_key(&val), @@ -2018,35 +2072,27 @@ impl Timeline { } accumulated_values.push((key, lsn, val)); } else { - let last_key = last_key.as_mut().unwrap(); - stat.on_unique_key_visited(); - let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range { - !compaction_key_range.contains(last_key) - } else { - false - }; - if !skip_adding_key { - let retention = self - .generate_key_retention( - *last_key, - &accumulated_values, - gc_cutoff, - &retain_lsns_below_horizon, - COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, *last_key, ctx).await?, - ) - .await?; - // Put the image into the image layer. Currently we have a single big layer for the compaction. 
- retention - .pipe_to( - *last_key, - &mut delta_layer_writer, - image_layer_writer.as_mut(), - &mut stat, - ctx, - ) - .await?; - } + let last_key: &mut Key = last_key.as_mut().unwrap(); + stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction + let retention = self + .generate_key_retention( + *last_key, + &accumulated_values, + job_desc.gc_cutoff, + &job_desc.retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, *last_key, ctx).await?, + ) + .await?; + retention + .pipe_to( + *last_key, + &mut delta_layer_writer, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) + .await?; accumulated_values.clear(); *last_key = key; accumulated_values.push((key, lsn, val)); @@ -2057,35 +2103,43 @@ impl Timeline { let last_key = last_key.expect("no keys produced during compaction"); stat.on_unique_key_visited(); - let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range { - !compaction_key_range.contains(&last_key) - } else { - false - }; - if !skip_adding_key { - let retention = self - .generate_key_retention( - last_key, - &accumulated_values, - gc_cutoff, - &retain_lsns_below_horizon, - COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, last_key, ctx).await?, - ) - .await?; - // Put the image into the image layer. Currently we have a single big layer for the compaction. 
- retention - .pipe_to( - last_key, - &mut delta_layer_writer, - image_layer_writer.as_mut(), - &mut stat, - ctx, - ) - .await?; - } + let retention = self + .generate_key_retention( + last_key, + &accumulated_values, + job_desc.gc_cutoff, + &job_desc.retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, last_key, ctx).await?, + ) + .await?; + retention + .pipe_to( + last_key, + &mut delta_layer_writer, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) + .await?; // end: move the above part to the loop body + let mut rewrote_delta_layers = Vec::new(); + for (key, writers) in delta_layer_rewriters { + if let Some(delta_writer_before) = writers.before { + let (desc, path) = delta_writer_before + .finish(job_desc.compaction_key_range.start, ctx) + .await?; + let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + rewrote_delta_layers.push(layer); + } + if let Some(delta_writer_after) = writers.after { + let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?; + let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + rewrote_delta_layers.push(layer); + } + } + let discard = |key: &PersistentLayerKey| { let key = key.clone(); async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await } @@ -2093,10 +2147,7 @@ impl Timeline { let produced_image_layers = if let Some(writer) = image_layer_writer { if !dry_run { - let end_key = compaction_key_range - .as_ref() - .map(|x| x.end) - .unwrap_or(Key::MAX); + let end_key = job_desc.compaction_key_range.end; writer .finish_with_discard_fn(self, ctx, end_key, discard) .await? @@ -2117,10 +2168,8 @@ impl Timeline { Vec::new() }; - if partial_compaction && !produced_delta_layers.is_empty() { - bail!("implementation error: partial compaction should not be producing delta layers (for now)"); - } - + // TODO: make image/delta/rewrote_delta layers generation atomic. 
At this point, we already generated resident layers, and if + // compaction is cancelled at this point, we might have some layers that are not cleaned up. let mut compact_to = Vec::new(); let mut keep_layers = HashSet::new(); let produced_delta_layers_len = produced_delta_layers.len(); @@ -2128,52 +2177,84 @@ impl Timeline { for action in produced_delta_layers { match action { BatchWriterResult::Produced(layer) => { + if cfg!(debug_assertions) { + info!("produced delta layer: {}", layer.layer_desc().key()); + } stat.produce_delta_layer(layer.layer_desc().file_size()); compact_to.push(layer); } BatchWriterResult::Discarded(l) => { + if cfg!(debug_assertions) { + info!("discarded delta layer: {}", l); + } keep_layers.insert(l); stat.discard_delta_layer(); } } } + for layer in &rewrote_delta_layers { + debug!( + "produced rewritten delta layer: {}", + layer.layer_desc().key() + ); + } + compact_to.extend(rewrote_delta_layers); for action in produced_image_layers { match action { BatchWriterResult::Produced(layer) => { + debug!("produced image layer: {}", layer.layer_desc().key()); stat.produce_image_layer(layer.layer_desc().file_size()); compact_to.push(layer); } BatchWriterResult::Discarded(l) => { + debug!("discarded image layer: {}", l); keep_layers.insert(l); stat.discard_image_layer(); } } } - let mut layer_selection = layer_selection; - layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); - if let Some(ref compaction_key_range) = compaction_key_range { - // Partial compaction might select more data than it processes, e.g., if - // the compaction_key_range only partially overlaps: - // - // [---compaction_key_range---] - // [---A----][----B----][----C----][----D----] - // - // A,B,C,D are all in the `layer_selection`. The created image layers contain - // whatever is needed from B, C, and from `----]` of A, and from `[--` of D. - // - // In contrast, `[--A-` and `--D----]` have not been processed, so, we must - // keep that data. 
- // - // The solution for now is to keep A and D completely. - // (layer_selection is what we'll remove from the layer map, so, - // retain what is _not_ fully covered by compaction_key_range). - layer_selection.retain(|x| { - let key_range = &x.layer_desc().key_range; - key_range.start >= compaction_key_range.start - && key_range.end <= compaction_key_range.end - }); + + let mut layer_selection = job_desc.selected_layers; + + // Partial compaction might select more data than it processes, e.g., if + // the compaction_key_range only partially overlaps: + // + // [---compaction_key_range---] + // [---A----][----B----][----C----][----D----] + // + // For delta layers, we will rewrite the layers so that it is cut exactly at + // the compaction key range, so we can always discard them. However, for image + // layers, as we do not rewrite them for now, we need to handle them differently. + // Assume image layers A, B, C, D are all in the `layer_selection`. + // + // The created image layers contain whatever is needed from B, C, and from + // `----]` of A, and from `[---` of D. + // + // In contrast, `[---A` and `D----]` have not been processed, so, we must + // keep that data. + // + // The solution for now is to keep A and D completely if they are image layers. + // (layer_selection is what we'll remove from the layer map, so, retain what + // is _not_ fully covered by compaction_key_range). + for layer in &layer_selection { + if !layer.layer_desc().is_delta() { + if !overlaps_with( + &layer.layer_desc().key_range, + &job_desc.compaction_key_range, + ) { + bail!("violated constraint: image layer outside of compaction key range"); + } + if !fully_contains( + &job_desc.compaction_key_range, + &layer.layer_desc().key_range, + ) { + keep_layers.insert(layer.layer_desc().key()); + } + } } + layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + info!( "gc-compaction statistics: {}", serde_json::to_string(&stat)? 
@@ -2192,6 +2273,7 @@ impl Timeline { // Step 3: Place back to the layer map. { + // TODO: sanity check if the layer map is valid (i.e., should not have overlaps) let mut guard = self.layers.write().await; guard .open_mut()? diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index cccf24e303..1394843467 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -47,21 +47,18 @@ pub(crate) async fn offload_timeline( match is_archived { Some(true) => (), Some(false) => { - tracing::warn!(?is_archived, "tried offloading a non-archived timeline"); + tracing::warn!("tried offloading a non-archived timeline"); return Err(OffloadError::NotArchived); } None => { // This is legal: calls to this function can race with the timeline shutting down - tracing::info!( - ?is_archived, - "tried offloading a timeline whose remote storage is not initialized" - ); + tracing::info!("tried offloading a timeline whose remote storage is not initialized"); return Err(OffloadError::Cancelled); } } // Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
- timeline.shutdown(super::ShutdownMode::Hard).await; + timeline.shutdown(super::ShutdownMode::Flush).await; // TODO extend guard mechanism above with method // to make deletions possible while offloading is in progress diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index d712d8bf5e..78601d87af 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -253,6 +253,10 @@ pub(crate) fn apply_in_neon( use bytes::BufMut; if *will_init { assert!(*clear, "init record must be clear to ensure correctness"); + assert!( + page.is_empty(), + "init record must be the first entry to ensure correctness" + ); } if *clear { page.clear(); diff --git a/pgxn/neon/logical_replication_monitor.c b/pgxn/neon/logical_replication_monitor.c index 2de429b83d..1badbbed21 100644 --- a/pgxn/neon/logical_replication_monitor.c +++ b/pgxn/neon/logical_replication_monitor.c @@ -1,7 +1,8 @@ +#include #include #include -#include #include +#include #include "postgres.h" @@ -21,17 +22,35 @@ static int logical_replication_max_snap_files = 300; +/* + * According to Chi (shyzh), the pageserver _should_ be good with 10 MB worth of + * snapshot files. Let's use 8 MB since 8 is a power of 2. + */ +static int logical_replication_max_logicalsnapdir_size = 8000; + +/* + * A primitive description of a logical snapshot file including the LSN of the + * file and its size. + */ +typedef struct SnapDesc { + XLogRecPtr lsn; + off_t sz; +} SnapDesc; + PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); +/* + * Sorts an array of snapshot descriptors by their LSN. 
+ */ static int -LsnDescComparator(const void *a, const void *b) +SnapDescComparator(const void *a, const void *b) { - XLogRecPtr lsn1 = *((const XLogRecPtr *) a); - XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + const SnapDesc *desc1 = a; + const SnapDesc *desc2 = b; - if (lsn1 < lsn2) + if (desc1->lsn < desc2->lsn) return 1; - else if (lsn1 == lsn2) + else if (desc1->lsn == desc2->lsn) return 0; else return -1; @@ -43,28 +62,39 @@ LsnDescComparator(const void *a, const void *b) * slots having lower restart_lsn should be dropped. */ static XLogRecPtr -get_num_snap_files_lsn_threshold(void) +get_snapshots_cutoff_lsn(void) { - DIR *dirdesc; - struct dirent *de; - char *snap_path = "pg_logical/snapshots/"; - int lsns_allocated = 1024; - int lsns_num = 0; - XLogRecPtr *lsns; - XLogRecPtr cutoff; +/* PG 18 has a constant defined for this, PG_LOGICAL_SNAPSHOTS_DIR */ +#define SNAPDIR "pg_logical/snapshots" - if (logical_replication_max_snap_files < 0) + DIR *dirdesc; + int dirdesc_fd; + struct dirent *de; + size_t snapshot_index = 0; + SnapDesc *snapshot_descriptors; + size_t descriptors_allocated = 1024; + XLogRecPtr cutoff = 0; + off_t logicalsnapdir_size = 0; + const int logical_replication_max_logicalsnapdir_size_bytes = logical_replication_max_logicalsnapdir_size * 1000; + + if (logical_replication_max_snap_files < 0 && logical_replication_max_logicalsnapdir_size < 0) return 0; - lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated); + snapshot_descriptors = palloc(sizeof(*snapshot_descriptors) * descriptors_allocated); + + dirdesc = AllocateDir(SNAPDIR); + dirdesc_fd = dirfd(dirdesc); + if (dirdesc_fd == -1) + ereport(ERROR, errmsg("failed to get a file descriptor for " SNAPDIR ": %m")); /* find all .snap files and get their lsns */ - dirdesc = AllocateDir(snap_path); - while ((de = ReadDir(dirdesc, snap_path)) != NULL) + while ((de = ReadDir(dirdesc, SNAPDIR)) != NULL) { - XLogRecPtr lsn; uint32 hi; uint32 lo; + struct stat st; + XLogRecPtr lsn; + SnapDesc *desc; if 
(strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) @@ -79,28 +109,69 @@ get_num_snap_files_lsn_threshold(void) lsn = ((uint64) hi) << 32 | lo; elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn)); - if (lsns_allocated == lsns_num) + + if (fstatat(dirdesc_fd, de->d_name, &st, 0) == -1) + ereport(ERROR, errmsg("failed to get the size of " SNAPDIR "/%s: %m", de->d_name)); + + if (descriptors_allocated == snapshot_index) { - lsns_allocated *= 2; - lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated); + descriptors_allocated *= 2; + snapshot_descriptors = repalloc(snapshot_descriptors, sizeof(*snapshot_descriptors) * descriptors_allocated); } - lsns[lsns_num++] = lsn; + + desc = &snapshot_descriptors[snapshot_index++]; + desc->lsn = lsn; + desc->sz = st.st_size; } - /* sort by lsn desc */ - qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator); - /* and take cutoff at logical_replication_max_snap_files */ - if (logical_replication_max_snap_files > lsns_num) - cutoff = 0; - /* have less files than cutoff */ - else + + qsort(snapshot_descriptors, snapshot_index, sizeof(*snapshot_descriptors), SnapDescComparator); + + /* Are there more snapshot files than specified? */ + if (logical_replication_max_snap_files <= snapshot_index) { - cutoff = lsns[logical_replication_max_snap_files - 1]; - elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d", - LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files); + cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn; + elog(LOG, + "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d", + LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files); } - pfree(lsns); + + /* Is the size of the logical snapshots directory larger than specified? 
+ * + * It's possible we could hit both thresholds, so remove any extra files + * first, and then truncate based on size of the remaining files. + */ + if (logicalsnapdir_size > logical_replication_max_logicalsnapdir_size_bytes) + { + /* Unfortunately, iterating the directory does not guarantee any order + * so we can't cache an index in the preceding loop. + */ + + off_t sz; + const XLogRecPtr original = cutoff; + + sz = snapshot_descriptors[0].sz; + for (size_t i = 1; i < logical_replication_max_snap_files; ++i) + { + if (sz > logical_replication_max_logicalsnapdir_size_bytes) + { + cutoff = snapshot_descriptors[i - 1].lsn; + break; + } + + sz += snapshot_descriptors[i].sz; + } + + if (cutoff != original) + elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB", + LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size); + } + + pfree(snapshot_descriptors); FreeDir(dirdesc); + return cutoff; + +#undef SNAPDIR } void @@ -118,6 +189,16 @@ InitLogicalReplicationMonitor(void) 0, NULL, NULL, NULL); + DefineCustomIntVariable( + "neon.logical_replication_max_logicalsnapdir_size", + "Maximum allowed size of the pg_logical/snapshots directory (KB). When exceeded, slots are dropped until the limit is met. -1 disables the limit.", + NULL, + &logical_replication_max_logicalsnapdir_size, + 8000, -1, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_KB, + NULL, NULL, NULL); + memset(&bgw, 0, sizeof(bgw)); bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; @@ -162,7 +243,7 @@ LogicalSlotsMonitorMain(Datum main_arg) * If there are too many .snap files, just drop all logical slots to * prevent aux files bloat. 
*/ - cutoff_lsn = get_num_snap_files_lsn_threshold(); + cutoff_lsn = get_snapshots_cutoff_lsn(); if (cutoff_lsn > 0) { for (int i = 0; i < max_replication_slots; i++) diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index b575712dbe..5854a7ef0f 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -611,6 +611,17 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun recptr = startptr; nbytes = count; +/* Try to read directly from WAL buffers first. */ +#if PG_MAJORVERSION_NUM >= 17 + { + Size rbytes; + rbytes = WALReadFromBuffers(p, recptr, nbytes, tli); + recptr += rbytes; + nbytes -= rbytes; + p += rbytes; + } +#endif + while (nbytes > 0) { uint32 startoff; diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index d2a6104c74..e89ffdb628 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1361,29 +1361,35 @@ SendAppendRequests(Safekeeper *sk) if (sk->active_state == SS_ACTIVE_READ_WAL) { char *errmsg; + int req_len; req = &sk->appendRequest; + req_len = req->endLsn - req->beginLsn; - switch (wp->api.wal_read(sk, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req->endLsn - req->beginLsn, - &errmsg)) + /* We send zero sized AppendRequests as heartbeats; don't wal_read for these.
*/ + if (req_len > 0) { - case NEON_WALREAD_SUCCESS: - break; - case NEON_WALREAD_WOULDBLOCK: - return true; - case NEON_WALREAD_ERROR: - wp_log(WARNING, "WAL reading for node %s:%s failed: %s", - sk->host, sk->port, errmsg); - ShutdownConnection(sk); - return false; - default: - Assert(false); + switch (wp->api.wal_read(sk, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req_len, + &errmsg)) + { + case NEON_WALREAD_SUCCESS: + break; + case NEON_WALREAD_WOULDBLOCK: + return true; + case NEON_WALREAD_ERROR: + wp_log(WARNING, "WAL reading for node %s:%s failed: %s", + sk->host, sk->port, errmsg); + ShutdownConnection(sk); + return false; + default: + Assert(false); + } } - sk->outbuf.len += req->endLsn - req->beginLsn; + sk->outbuf.len += req_len; writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len); diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 706941c3f0..86444084ff 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1489,33 +1489,11 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, { NeonWALReadResult res; -#if PG_MAJORVERSION_NUM >= 17 - if (!sk->wp->config->syncSafekeepers) - { - Size rbytes; - rbytes = WALReadFromBuffers(buf, startptr, count, - walprop_pg_get_timeline_id()); - - startptr += rbytes; - count -= rbytes; - } -#endif - - if (count == 0) - { - res = NEON_WALREAD_SUCCESS; - } - else - { - Assert(count > 0); - - /* Now read the remaining WAL from the WAL file */ - res = NeonWALRead(sk->xlogreader, - buf, - startptr, - count, - walprop_pg_get_timeline_id()); - } + res = NeonWALRead(sk->xlogreader, + buf, + startptr, + count, + walprop_pg_get_timeline_id()); if (res == NEON_WALREAD_SUCCESS) { diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index efd336dbea..1665d6361a 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -60,7 +60,7 @@ prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = 
"0.1", path = "../libs/remote_storage/" } -reqwest.workspace = true +reqwest = { workspace = true, features = ["rustls-tls-native-roots"] } reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 83c3617612..bfc674139b 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -7,8 +7,11 @@ use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; use reqwest::{redirect, Client}; +use reqwest_retry::policies::ExponentialBackoff; +use reqwest_retry::RetryTransientMiddleware; use serde::de::Visitor; use serde::{Deserialize, Deserializer}; +use serde_json::value::RawValue; use signature::Verifier; use thiserror::Error; use tokio::time::Instant; @@ -16,7 +19,7 @@ use tokio::time::Instant; use crate::auth::backend::ComputeCredentialKeys; use crate::context::RequestMonitoring; use crate::control_plane::errors::GetEndpointJwksError; -use crate::http::parse_json_body_with_limit; +use crate::http::read_body_with_limit; use crate::intern::RoleNameInt; use crate::types::{EndpointId, RoleName}; @@ -28,6 +31,10 @@ const MAX_RENEW: Duration = Duration::from_secs(3600); const MAX_JWK_BODY_SIZE: usize = 64 * 1024; const JWKS_USER_AGENT: &str = "neon-proxy"; +const JWKS_CONNECT_TIMEOUT: Duration = Duration::from_secs(2); +const JWKS_FETCH_TIMEOUT: Duration = Duration::from_secs(5); +const JWKS_FETCH_RETRIES: u32 = 3; + /// How to get the JWT auth rules pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { fn fetch_auth_rules( @@ -55,7 +62,7 @@ pub(crate) struct AuthRule { } pub struct JwkCache { - client: reqwest::Client, + client: reqwest_middleware::ClientWithMiddleware, map: DashMap<(EndpointId, RoleName), Arc>, } @@ -117,6 +124,14 @@ impl Default for JwkCacheEntryLock { } } +#[derive(Deserialize)] +struct JwkSet<'a> { + /// we parse into raw-value because not all keys in 
a JWKS are ones + /// we can parse directly, so we parse them lazily. + #[serde(borrow)] + keys: Vec<&'a RawValue>, +} + impl JwkCacheEntryLock { async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { JwkRenewalPermit::acquire_permit(self).await @@ -130,7 +145,7 @@ impl JwkCacheEntryLock { &self, _permit: JwkRenewalPermit<'_>, ctx: &RequestMonitoring, - client: &reqwest::Client, + client: &reqwest_middleware::ClientWithMiddleware, endpoint: EndpointId, auth_rules: &F, ) -> Result, JwtError> { @@ -154,22 +169,73 @@ impl JwkCacheEntryLock { let req = client.get(rule.jwks_url.clone()); // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`. // TODO(conrad): We need to filter out URLs that point to local resources. Public internet only. - match req.send().await.and_then(|r| r.error_for_status()) { + match req.send().await.and_then(|r| { + r.error_for_status() + .map_err(reqwest_middleware::Error::Reqwest) + }) { // todo: should we re-insert JWKs if we want to keep this JWKs URL? // I expect these failures would be quite sparse. Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"), Ok(r) => { let resp: http::Response = r.into(); - match parse_json_body_with_limit::( - resp.into_body(), - MAX_JWK_BODY_SIZE, - ) - .await + + let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE) + .await { + Ok(bytes) => bytes, + Err(e) => { + tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs"); + continue; + } + }; + + match serde_json::from_slice::(&bytes) { Err(e) => { tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs"); } Ok(jwks) => { + // size_of::<&RawValue>() == 16 + // size_of::() == 288 + // better to not pre-allocate this as it might be pretty large - especially if it has many + // keys we don't want or need. + // trivial 'attack': `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}` + // this would consume 8MiB just like that! 
+ let mut keys = vec![]; + let mut failed = 0; + for key in jwks.keys { + match serde_json::from_str::(key.get()) { + Ok(key) => { + // if `use` (called `cls` in rust) is specified to be something other than signing, + // we can skip storing it. + if key + .prm + .cls + .as_ref() + .is_some_and(|c| *c != jose_jwk::Class::Signing) + { + continue; + } + + keys.push(key); + } + Err(e) => { + tracing::debug!(url=?rule.jwks_url, failed=?e, "could not decode JWK"); + failed += 1; + } + } + } + keys.shrink_to_fit(); + + if failed > 0 { + tracing::warn!(url=?rule.jwks_url, failed, "could not decode JWKs"); + } + + if keys.is_empty() { + tracing::warn!(url=?rule.jwks_url, "no valid JWKs found inside the response body"); + continue; + } + + let jwks = jose_jwk::JwkSet { keys }; key_sets.insert( rule.id, KeySet { @@ -179,7 +245,7 @@ impl JwkCacheEntryLock { }, ); } - } + }; } } } @@ -196,7 +262,7 @@ impl JwkCacheEntryLock { async fn get_or_update_jwk_cache( self: &Arc, ctx: &RequestMonitoring, - client: &reqwest::Client, + client: &reqwest_middleware::ClientWithMiddleware, endpoint: EndpointId, fetch: &F, ) -> Result, JwtError> { @@ -250,7 +316,7 @@ impl JwkCacheEntryLock { self: &Arc, ctx: &RequestMonitoring, jwt: &str, - client: &reqwest::Client, + client: &reqwest_middleware::ClientWithMiddleware, endpoint: EndpointId, role_name: &RoleName, fetch: &F, @@ -369,8 +435,19 @@ impl Default for JwkCache { let client = Client::builder() .user_agent(JWKS_USER_AGENT) .redirect(redirect::Policy::none()) + .tls_built_in_native_certs(true) + .connect_timeout(JWKS_CONNECT_TIMEOUT) + .timeout(JWKS_FETCH_TIMEOUT) .build() - .expect("using &str and standard redirect::Policy"); + .expect("client config should be valid"); + + // Retry up to 3 times with increasing intervals between attempts. 
+ let retry_policy = ExponentialBackoff::builder().build_with_max_retries(JWKS_FETCH_RETRIES); + + let client = reqwest_middleware::ClientBuilder::new(client) + .with(RetryTransientMiddleware::new_with_policy(retry_policy)) + .build(); + JwkCache { client, map: DashMap::default(), @@ -1209,4 +1286,63 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL } } } + + #[tokio::test] + async fn check_jwk_keycloak_regression() { + let (rs, valid_jwk) = new_rsa_jwk(RS1, "rs1".into()); + let valid_jwk = serde_json::to_value(valid_jwk).unwrap(); + + // This is valid, but we cannot parse it as we have no support for encryption JWKs, only signature based ones. + // This is taken directly from keycloak. + let invalid_jwk = serde_json::json! { + { + "kid": "U-Jc9xRli84eNqRpYQoIPF-GNuRWV3ZvAIhziRW2sbQ", + "kty": "RSA", + "alg": "RSA-OAEP", + "use": "enc", + "n": "yypYWsEKmM_wWdcPnSGLSm5ytw1WG7P7EVkKSulcDRlrM6HWj3PR68YS8LySYM2D9Z-79oAdZGKhIfzutqL8rK1vS14zDuPpAM-RWY3JuQfm1O_-1DZM8-07PmVRegP5KPxsKblLf_My8ByH6sUOIa1p2rbe2q_b0dSTXYu1t0dW-cGL5VShc400YymvTwpc-5uYNsaVxZajnB7JP1OunOiuCJ48AuVp3PqsLzgoXqlXEB1ZZdch3xT3bxaTtNruGvG4xmLZY68O_T3yrwTCNH2h_jFdGPyXdyZToCMSMK2qSbytlfwfN55pT9Vv42Lz1YmoB7XRjI9aExKPc5AxFw", + "e": "AQAB", + "x5c": [ + 
"MIICmzCCAYMCBgGS41E6azANBgkqhkiG9w0BAQsFADARMQ8wDQYDVQQDDAZtYXN0ZXIwHhcNMjQxMDMxMTYwMTQ0WhcNMzQxMDMxMTYwMzI0WjARMQ8wDQYDVQQDDAZtYXN0ZXIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDLKlhawQqYz/BZ1w+dIYtKbnK3DVYbs/sRWQpK6VwNGWszodaPc9HrxhLwvJJgzYP1n7v2gB1kYqEh/O62ovysrW9LXjMO4+kAz5FZjcm5B+bU7/7UNkzz7Ts+ZVF6A/ko/GwpuUt/8zLwHIfqxQ4hrWnatt7ar9vR1JNdi7W3R1b5wYvlVKFzjTRjKa9PClz7m5g2xpXFlqOcHsk/U66c6K4InjwC5Wnc+qwvOCheqVcQHVll1yHfFPdvFpO02u4a8bjGYtljrw79PfKvBMI0faH+MV0Y/Jd3JlOgIxIwrapJvK2V/B83nmlP1W/jYvPViagHtdGMj1oTEo9zkDEXAgMBAAEwDQYJKoZIhvcNAQELBQADggEBAECYX59+Q9v6c9sb6Q0/C6IgLWG2nVCgVE1YWwIzz+68WrhlmNCRuPjY94roB+tc2tdHbj+Nh3LMzJk7L1KCQoW1+LPK6A6E8W9ad0YPcuw8csV2pUA3+H56exQMH0fUAPQAU7tXWvnQ7otcpV1XA8afn/NTMTsnxi9mSkor8MLMYQ3aeRyh1+LAchHBthWiltqsSUqXrbJF59u5p0ghquuKcWR3TXsA7klGYBgGU5KAJifr9XT87rN0bOkGvbeWAgKvnQnjZwxdnLqTfp/pRY/PiJJHhgIBYPIA7STGnMPjmJ995i34zhnbnd8WHXJA3LxrIMqLW/l8eIdvtM1w8KI=" + ], + "x5t": "QhfzMMnuAfkReTgZ1HtrfyOeeZs", + "x5t#S256": "cmHDUdKgLiRCEN28D5FBy9IJLFmR7QWfm77SLhGTCTU" + } + }; + + let jwks = serde_json::json! 
{{ "keys": [invalid_jwk, valid_jwk ] }}; + let jwks_addr = jwks_server(move |path| match path { + "/" => Some(serde_json::to_vec(&jwks).unwrap()), + _ => None, + }) + .await; + + let role_name = RoleName::from("anonymous"); + let role = RoleNameInt::from(&role_name); + + let rules = vec![AuthRule { + id: "foo".to_owned(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: None, + role_names: vec![role], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let endpoint = EndpointId::from("ep"); + + let token = new_rsa_jwt("rs1".into(), rs); + + jwk_cache + .check_jwt( + &RequestMonitoring::test(), + endpoint.clone(), + &role_name, + &fetch, + &token, + ) + .await + .unwrap(); + } } diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index f1b632e704..b1642cedb3 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -6,7 +6,6 @@ pub mod health_server; use std::time::Duration; -use anyhow::bail; use bytes::Bytes; use http::Method; use http_body_util::BodyExt; @@ -16,7 +15,7 @@ use reqwest_middleware::RequestBuilder; pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; pub(crate) use reqwest_retry::policies::ExponentialBackoff; pub(crate) use reqwest_retry::RetryTransientMiddleware; -use serde::de::DeserializeOwned; +use thiserror::Error; use crate::metrics::{ConsoleRequest, Metrics}; use crate::url::ApiUrl; @@ -122,10 +121,19 @@ impl Endpoint { } } -pub(crate) async fn parse_json_body_with_limit( +#[derive(Error, Debug)] +pub(crate) enum ReadBodyError { + #[error("Content length exceeds limit of {limit} bytes")] + BodyTooLarge { limit: usize }, + + #[error(transparent)] + Read(#[from] reqwest::Error), +} + +pub(crate) async fn read_body_with_limit( mut b: impl Body + Unpin, limit: usize, -) -> anyhow::Result { +) -> Result, ReadBodyError> { // We could use `b.limited().collect().await.to_bytes()` here // but this ends up being slightly more efficient as far as I can tell. 
@@ -133,20 +141,20 @@ pub(crate) async fn parse_json_body_with_limit( // in reqwest, this value is influenced by the Content-Length header. let lower_bound = match usize::try_from(b.size_hint().lower()) { Ok(bound) if bound <= limit => bound, - _ => bail!("Content length exceeds limit of {limit} bytes"), + _ => return Err(ReadBodyError::BodyTooLarge { limit }), }; let mut bytes = Vec::with_capacity(lower_bound); while let Some(frame) = b.frame().await.transpose()? { if let Ok(data) = frame.into_data() { if bytes.len() + data.len() > limit { - bail!("Content length exceeds limit of {limit} bytes") + return Err(ReadBodyError::BodyTooLarge { limit }); } bytes.extend_from_slice(&data); } } - Ok(serde_json::from_slice::(&bytes)?) + Ok(bytes) } #[cfg(test)] diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index f95d645c23..ad7e1d2771 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -1,12 +1,6 @@ // rustc lints/lint groups // https://doc.rust-lang.org/rustc/lints/groups.html -#![deny( - deprecated, - future_incompatible, - let_underscore, - nonstandard_style, - rust_2024_compatibility -)] +#![deny(deprecated, future_incompatible, let_underscore, nonstandard_style)] #![warn(clippy::all, clippy::pedantic, clippy::cargo)] // List of denied lints from the clippy::restriction group. 
// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 00a8ac4768..61c39c32c9 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -16,8 +16,7 @@ use super::http_conn_pool::ClientDataHttp; use super::local_conn_pool::ClientDataLocal; use crate::auth::backend::ComputeUserInfo; use crate::context::RequestMonitoring; -use crate::control_plane::messages::ColdStartInfo; -use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::types::{DbName, EndpointCacheKey, RoleName}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 56be70abec..a1d4473b01 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -7,7 +7,6 @@ use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use rand::Rng; -use std::result::Result::Ok; use tokio::net::TcpStream; use tracing::{debug, error, info, info_span, Instrument}; diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index ec08d02240..85561e4aff 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -61,8 +61,14 @@ utils.workspace = true workspace_hack.workspace = true [dev-dependencies] +criterion.workspace = true +itertools.workspace = true walproposer.workspace = true rand.workspace = true desim.workspace = true tracing.workspace = true tracing-subscriber = { workspace = true, features = ["json"] } + +[[bench]] +name = "receive_wal" +harness = false diff --git a/safekeeper/benches/README.md b/safekeeper/benches/README.md new file mode 100644 index 0000000000..4119cc8d6e --- /dev/null +++ b/safekeeper/benches/README.md @@ -0,0 
+1,22 @@ +## Safekeeper Benchmarks + +To run benchmarks: + +```sh +# All benchmarks. +cargo bench --package safekeeper + +# Specific file. +cargo bench --package safekeeper --bench receive_wal + +# Specific benchmark. +cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false + +# List available benchmarks. +cargo bench --package safekeeper --benches -- --list +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. \ No newline at end of file diff --git a/safekeeper/benches/benchutils.rs b/safekeeper/benches/benchutils.rs new file mode 100644 index 0000000000..4e8dc58c49 --- /dev/null +++ b/safekeeper/benches/benchutils.rs @@ -0,0 +1,102 @@ +use std::sync::Arc; + +use camino_tempfile::Utf8TempDir; +use safekeeper::rate_limit::RateLimiter; +use safekeeper::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory}; +use safekeeper::state::{TimelinePersistentState, TimelineState}; +use safekeeper::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; +use safekeeper::timelines_set::TimelinesSet; +use safekeeper::wal_backup::remote_timeline_path; +use safekeeper::{control_file, wal_storage, SafeKeeperConf}; +use tokio::fs::create_dir_all; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; + +/// A Safekeeper benchmarking environment. Uses a tempdir for storage, removed on drop. +pub struct Env { + /// Whether to enable fsync. + pub fsync: bool, + /// Benchmark directory. Deleted when dropped. + pub tempdir: Utf8TempDir, +} + +impl Env { + /// Creates a new benchmarking environment in a temporary directory. fsync controls whether to + /// enable fsyncing. 
+ pub fn new(fsync: bool) -> anyhow::Result { + let tempdir = camino_tempfile::tempdir()?; + Ok(Self { fsync, tempdir }) + } + + /// Constructs a Safekeeper config for the given node ID. + fn make_conf(&self, node_id: NodeId) -> SafeKeeperConf { + let mut conf = SafeKeeperConf::dummy(); + conf.my_id = node_id; + conf.no_sync = !self.fsync; + conf.workdir = self.tempdir.path().join(format!("safekeeper-{node_id}")); + conf + } + + /// Constructs a Safekeeper with the given node and tenant/timeline ID. + /// + /// TODO: we should support using in-memory storage, to measure non-IO costs. This would be + /// easier if SafeKeeper used trait objects for storage rather than generics. It's also not + /// currently possible to construct a timeline using non-file storage since StateSK only accepts + /// SafeKeeper. + pub async fn make_safekeeper( + &self, + node_id: NodeId, + ttid: TenantTimelineId, + ) -> anyhow::Result> { + let conf = self.make_conf(node_id); + + let timeline_dir = get_timeline_dir(&conf, &ttid); + create_dir_all(&timeline_dir).await?; + + let mut pstate = TimelinePersistentState::empty(); + pstate.tenant_id = ttid.tenant_id; + pstate.timeline_id = ttid.timeline_id; + + let wal = wal_storage::PhysicalStorage::new(&ttid, &timeline_dir, &pstate, conf.no_sync)?; + let ctrl = + control_file::FileStorage::create_new(&timeline_dir, pstate, conf.no_sync).await?; + let state = TimelineState::new(ctrl); + let mut safekeeper = SafeKeeper::new(state, wal, conf.my_id)?; + + // Emulate an initial election. + safekeeper + .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected { + term: 1, + start_streaming_at: Lsn(0), + term_history: TermHistory(vec![(1, Lsn(0)).into()]), + timeline_start_lsn: Lsn(0), + })) + .await?; + + Ok(safekeeper) + } + + /// Constructs a timeline, including a new Safekeeper with the given node ID, and spawns its + /// manager task. 
+ pub async fn make_timeline( + &self, + node_id: NodeId, + ttid: TenantTimelineId, + ) -> anyhow::Result> { + let conf = self.make_conf(node_id); + let timeline_dir = get_timeline_dir(&conf, &ttid); + let remote_path = remote_timeline_path(&ttid)?; + + let safekeeper = self.make_safekeeper(node_id, ttid).await?; + let shared_state = SharedState::new(StateSK::Loaded(safekeeper)); + + let timeline = Timeline::new(ttid, &timeline_dir, &remote_path, shared_state); + timeline.bootstrap( + &mut timeline.write_shared_state().await, + &conf, + Arc::new(TimelinesSet::default()), // ignored for now + RateLimiter::new(0, 0), + ); + Ok(timeline) + } +} diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs new file mode 100644 index 0000000000..e32d7526ca --- /dev/null +++ b/safekeeper/benches/receive_wal.rs @@ -0,0 +1,341 @@ +//! WAL ingestion benchmarks. + +#[path = "benchutils.rs"] +mod benchutils; + +use std::io::Write as _; + +use benchutils::Env; +use camino_tempfile::tempfile; +use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; +use itertools::Itertools as _; +use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; +use safekeeper::receive_wal::{self, WalAcceptor}; +use safekeeper::safekeeper::{ + AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, +}; +use tokio::io::AsyncWriteExt as _; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; + +const KB: usize = 1024; +const MB: usize = 1024 * KB; +const GB: usize = 1024 * MB; + +// Register benchmarks with Criterion. +criterion_group!( + benches, + bench_process_msg, + bench_wal_acceptor, + bench_wal_acceptor_throughput, + bench_file_write +); +criterion_main!(benches); + +/// Benchmarks SafeKeeper::process_msg() as time per message and throughput. Each message is an +/// AppendRequest with a single WAL record containing an XlLogicalMessage of varying size. 
When +/// measuring throughput, only the logical message payload is considered, excluding +/// segment/page/record headers. +fn bench_process_msg(c: &mut Criterion) { + let mut g = c.benchmark_group("process_msg"); + for fsync in [false, true] { + for commit in [false, true] { + for size in [8, KB, 8 * KB, 128 * KB, MB] { + // Kind of weird to change the group throughput per benchmark, but it's the only way + // to vary it per benchmark. It works. + g.throughput(criterion::Throughput::Bytes(size as u64)); + g.bench_function(format!("fsync={fsync}/commit={commit}/size={size}"), |b| { + run_bench(b, size, fsync, commit).unwrap() + }); + } + } + } + + // The actual benchmark. If commit is true, advance the commit LSN on every message. + fn run_bench(b: &mut Bencher, size: usize, fsync: bool, commit: bool) -> anyhow::Result<()> { + let runtime = tokio::runtime::Builder::new_current_thread() // single is fine, sync IO only + .enable_all() + .build()?; + + // Construct the payload. The prefix counts towards the payload (including NUL terminator). + let prefix = c"p"; + let prefixlen = prefix.to_bytes_with_nul().len(); + assert!(size >= prefixlen); + let message = vec![0; size - prefixlen]; + + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message)); + + // Set up the Safekeeper. + let env = Env::new(fsync)?; + let mut safekeeper = + runtime.block_on(env.make_safekeeper(NodeId(1), TenantTimelineId::generate()))?; + + b.iter_batched_ref( + // Pre-construct WAL records and requests. Criterion will batch them. 
+ || { + let (lsn, record) = walgen.next().expect("endless WAL"); + ProposerAcceptorMessage::AppendRequest(AppendRequest { + h: AppendRequestHeader { + term: 1, + term_start_lsn: Lsn(0), + begin_lsn: lsn, + end_lsn: lsn + record.len() as u64, + commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record + truncate_lsn: Lsn(0), + proposer_uuid: [0; 16], + }, + wal_data: record, + }) + }, + // Benchmark message processing (time per message). + |msg| { + runtime + .block_on(safekeeper.process_msg(msg)) + .expect("message failed") + }, + BatchSize::SmallInput, // automatically determine a batch size + ); + Ok(()) + } +} + +/// Benchmarks WalAcceptor message processing time by sending it a batch of WAL records and waiting +/// for it to confirm that the last LSN has been flushed to storage. We pipeline a bunch of messages +/// instead of measuring each individual message to amortize costs (e.g. fsync), which is more +/// realistic. Records are XlLogicalMessage with a tiny payload (~64 bytes per record including +/// headers). Records are pre-constructed to avoid skewing the benchmark. +/// +/// TODO: add benchmarks with in-memory storage, see comment on `Env::make_safekeeper()`: +fn bench_wal_acceptor(c: &mut Criterion) { + let mut g = c.benchmark_group("wal_acceptor"); + for fsync in [false, true] { + for n in [1, 100, 10000] { + g.bench_function(format!("fsync={fsync}/n={n}"), |b| { + run_bench(b, n, fsync).unwrap() + }); + } + } + + /// The actual benchmark. n is the number of WAL records to send in a pipelined batch. + fn run_bench(b: &mut Bencher, n: usize, fsync: bool) -> anyhow::Result<()> { + let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded + + let env = Env::new(fsync)?; + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message")); + + // Create buffered channels that can fit all requests, to avoid blocking on channels. 
+ let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(n); + let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(n); + + // Spawn the WalAcceptor task. + runtime.block_on(async { + // TODO: WalAcceptor doesn't actually need a full timeline, only + // Safekeeper::process_msg(). Consider decoupling them to simplify the setup. + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate()) + .await? + .wal_residence_guard() + .await?; + WalAcceptor::spawn(tli, msg_rx, reply_tx, Some(0)); + anyhow::Ok(()) + })?; + + b.iter_batched( + // Pre-construct a batch of WAL records and requests. + || { + walgen + .take(n) + .map(|(lsn, record)| AppendRequest { + h: AppendRequestHeader { + term: 1, + term_start_lsn: Lsn(0), + begin_lsn: lsn, + end_lsn: lsn + record.len() as u64, + commit_lsn: Lsn(0), + truncate_lsn: Lsn(0), + proposer_uuid: [0; 16], + }, + wal_data: record, + }) + .collect_vec() + }, + // Benchmark batch ingestion (time per batch). + |reqs| { + runtime.block_on(async { + let final_lsn = reqs.last().unwrap().h.end_lsn; + // Stuff all the messages into the buffered channel to pipeline them. + for req in reqs { + let msg = ProposerAcceptorMessage::AppendRequest(req); + msg_tx.send(msg).await.expect("send failed"); + } + // Wait for the last message to get flushed. + while let Some(reply) = reply_rx.recv().await { + if let AcceptorProposerMessage::AppendResponse(resp) = reply { + if resp.flush_lsn >= final_lsn { + return; + } + } + } + panic!("disconnected") + }) + }, + BatchSize::PerIteration, // only run one request batch at a time + ); + Ok(()) + } +} + +/// Benchmarks WalAcceptor throughput by sending 1 GB of data with varying message sizes and waiting +/// for the last LSN to be flushed to storage. Only the actual message payload counts towards +/// throughput, headers are excluded and considered overhead. Records are XlLogicalMessage. +/// +/// To avoid running out of memory, messages are constructed during the benchmark. 
+fn bench_wal_acceptor_throughput(c: &mut Criterion) { + const VOLUME: usize = GB; // NB: excludes message/page/segment headers and padding + + let mut g = c.benchmark_group("wal_acceptor_throughput"); + g.sample_size(10); + g.throughput(criterion::Throughput::Bytes(VOLUME as u64)); + + for fsync in [false, true] { + for commit in [false, true] { + for size in [KB, 8 * KB, 128 * KB, MB] { + assert_eq!(VOLUME % size, 0, "volume must be divisible by size"); + let count = VOLUME / size; + g.bench_function(format!("fsync={fsync}/commit={commit}/size={size}"), |b| { + run_bench(b, count, size, fsync, commit).unwrap() + }); + } + } + } + + /// The actual benchmark. size is the payload size per message, count is the number of messages. + /// If commit is true, advance the commit LSN on each message. + fn run_bench( + b: &mut Bencher, + count: usize, + size: usize, + fsync: bool, + commit: bool, + ) -> anyhow::Result<()> { + let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded + + // Construct the payload. The prefix counts towards the payload (including NUL terminator). + let prefix = c"p"; + let prefixlen = prefix.to_bytes_with_nul().len(); + assert!(size >= prefixlen); + let message = vec![0; size - prefixlen]; + + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message)); + + // Construct and spawn the WalAcceptor task. + let env = Env::new(fsync)?; + + let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE); + let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE); + + runtime.block_on(async { + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate()) + .await? + .wal_residence_guard() + .await?; + WalAcceptor::spawn(tli, msg_rx, reply_tx, Some(0)); + anyhow::Ok(()) + })?; + + // Ingest the WAL. 
+ b.iter(|| { + runtime.block_on(async { + let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest { + h: AppendRequestHeader { + term: 1, + term_start_lsn: Lsn(0), + begin_lsn: lsn, + end_lsn: lsn + record.len() as u64, + commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record + truncate_lsn: Lsn(0), + proposer_uuid: [0; 16], + }, + wal_data: record, + }); + + // Send requests. + for req in reqgen { + _ = reply_rx.try_recv(); // discard any replies, to avoid blocking + let msg = ProposerAcceptorMessage::AppendRequest(req); + msg_tx.send(msg).await.expect("send failed"); + } + + // Wait for last message to get flushed. + while let Some(reply) = reply_rx.recv().await { + if let AcceptorProposerMessage::AppendResponse(resp) = reply { + if resp.flush_lsn >= walgen.lsn { + return; + } + } + } + panic!("disconnected") + }) + }); + Ok(()) + } +} + +/// Benchmarks OS write throughput by appending blocks of a given size to a file. This is intended +/// to compare Tokio and stdlib writes, and give a baseline for optimal WAL throughput. +fn bench_file_write(c: &mut Criterion) { + let mut g = c.benchmark_group("file_write"); + + for kind in ["stdlib", "tokio"] { + for fsync in [false, true] { + for size in [8, KB, 8 * KB, 128 * KB, MB] { + // Kind of weird to change the group throughput per benchmark, but it's the only way to + // vary it per benchmark. It works. 
+ g.throughput(criterion::Throughput::Bytes(size as u64)); + g.bench_function( + format!("{kind}/fsync={fsync}/size={size}"), + |b| match kind { + "stdlib" => run_bench_stdlib(b, size, fsync).unwrap(), + "tokio" => run_bench_tokio(b, size, fsync).unwrap(), + name => panic!("unknown kind {name}"), + }, + ); + } + } + } + + fn run_bench_stdlib(b: &mut Bencher, size: usize, fsync: bool) -> anyhow::Result<()> { + let mut file = tempfile()?; + let buf = vec![0u8; size]; + + b.iter(|| { + file.write_all(&buf).unwrap(); + file.flush().unwrap(); + if fsync { + file.sync_data().unwrap(); + } + }); + + Ok(()) + } + + fn run_bench_tokio(b: &mut Bencher, size: usize, fsync: bool) -> anyhow::Result<()> { + let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded + + let mut file = tokio::fs::File::from_std(tempfile()?); + let buf = vec![0u8; size]; + + b.iter(|| { + runtime.block_on(async { + file.write_all(&buf).await.unwrap(); + file.flush().await.unwrap(); + if fsync { + file.sync_data().await.unwrap(); + } + }) + }); + + Ok(()) + } +} diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index fdd0830b02..81c79fae30 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -20,7 +20,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< | Scope::PageServerApi | Scope::GenerationsApi | Scope::Infra - | Scope::Scrubber, + | Scope::Scrubber + | Scope::ControllerPeer, _, ) => Err(AuthError( format!( diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index b1cddaf062..6d68b6b59b 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -112,9 +112,7 @@ impl SafeKeeperConf { } impl SafeKeeperConf { - #[cfg(test)] - #[allow(unused)] - fn dummy() -> Self { + pub fn dummy() -> Self { SafeKeeperConf { workdir: Utf8PathBuf::from("./"), no_sync: false, diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index bb56e923f8..bbd2f86898 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ 
-55,7 +55,7 @@ pub static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_flush_wal_seconds", - "Seconds spent syncing WAL to a disk", + "Seconds spent syncing WAL to a disk (excluding segment initialization)", DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_flush_wal_seconds histogram") diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index cf41d7a0ab..f4983d44d0 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -979,7 +979,8 @@ where self.wal_store.flush_wal().await?; } - // Update commit_lsn. + // Update commit_lsn. It will be flushed to the control file regularly by the timeline + // manager, off of the WAL ingest hot path. if msg.h.commit_lsn != Lsn(0) { self.update_commit_lsn(msg.h.commit_lsn).await?; } @@ -992,15 +993,6 @@ where self.state.inmem.peer_horizon_lsn = max(self.state.inmem.peer_horizon_lsn, msg.h.truncate_lsn); - // Update truncate and commit LSN in control file. - // To avoid negative impact on performance of extra fsync, do it only - // when commit_lsn delta exceeds WAL segment size. 
- if self.state.commit_lsn + (self.state.server.wal_seg_size as u64) - < self.state.inmem.commit_lsn - { - self.state.flush().await?; - } - trace!( "processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}", msg.wal_data.len(), diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 0826a148ec..941b7e67d0 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -4,6 +4,7 @@ use std::{cmp::max, ops::Deref}; use anyhow::{bail, Result}; +use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::TimelineTermBumpResponse; use serde::{Deserialize, Serialize}; use utils::{ @@ -138,14 +139,13 @@ impl TimelinePersistentState { }) } - #[cfg(test)] pub fn empty() -> Self { TimelinePersistentState::new( &TenantTimelineId::empty(), ServerInfo { pg_version: 170000, /* Postgres server version (major * 10000) */ system_id: 0, /* Postgres system identifier */ - wal_seg_size: 16 * 1024 * 1024, + wal_seg_size: WAL_SEGMENT_SIZE as u32, }, vec![], Lsn::INVALID, diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index f0113978c4..85add6bfea 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -2,7 +2,7 @@ //! to glue together SafeKeeper and all other background services. 
use anyhow::{anyhow, bail, Result}; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use remote_storage::RemotePath; use safekeeper_api::models::TimelineTermBumpResponse; use serde::{Deserialize, Serialize}; @@ -108,16 +108,11 @@ pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>; pub struct WriteGuardSharedState<'a> { tli: Arc, guard: RwLockWriteGuard<'a, SharedState>, - skip_update: bool, } impl<'a> WriteGuardSharedState<'a> { fn new(tli: Arc, guard: RwLockWriteGuard<'a, SharedState>) -> Self { - WriteGuardSharedState { - tli, - guard, - skip_update: false, - } + WriteGuardSharedState { tli, guard } } } @@ -159,12 +154,10 @@ impl Drop for WriteGuardSharedState<'_> { } }); - if !self.skip_update { - // send notification about shared state update - self.tli.shared_state_version_tx.send_modify(|old| { - *old += 1; - }); - } + // send notification about shared state update + self.tli.shared_state_version_tx.send_modify(|old| { + *old += 1; + }); } } @@ -325,8 +318,17 @@ pub struct SharedState { } impl SharedState { + /// Creates a new SharedState. + pub fn new(sk: StateSK) -> Self { + Self { + sk, + peers_info: PeersInfo(vec![]), + wal_removal_on_hold: false, + } + } + /// Restore SharedState from control file. If file doesn't exist, bails out. - fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { + pub fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { let timeline_dir = get_timeline_dir(conf, ttid); let control_store = control_file::FileStorage::restore_new(&timeline_dir, conf.no_sync)?; if control_store.server.wal_seg_size == 0 { @@ -352,11 +354,7 @@ impl SharedState { } }; - Ok(Self { - sk, - peers_info: PeersInfo(vec![]), - wal_removal_on_hold: false, - }) + Ok(Self::new(sk)) } pub(crate) fn get_wal_seg_size(&self) -> usize { @@ -480,11 +478,13 @@ pub struct Timeline { } impl Timeline { - /// Load existing timeline from disk. 
- pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result> { - let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); - - let shared_state = SharedState::restore(conf, &ttid)?; + /// Constructs a new timeline. + pub fn new( + ttid: TenantTimelineId, + timeline_dir: &Utf8Path, + remote_path: &RemotePath, + shared_state: SharedState, + ) -> Arc { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(shared_state.sk.state().commit_lsn); let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from(( @@ -494,10 +494,11 @@ impl Timeline { let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); let walreceivers = WalReceivers::new(); - let remote_path = remote_timeline_path(&ttid)?; - Ok(Arc::new(Timeline { + + Arc::new(Self { ttid, - remote_path, + remote_path: remote_path.to_owned(), + timeline_dir: timeline_dir.to_owned(), commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, @@ -508,13 +509,28 @@ impl Timeline { walsenders: WalSenders::new(walreceivers.clone()), walreceivers, cancel: CancellationToken::default(), - timeline_dir: get_timeline_dir(conf, &ttid), manager_ctl: ManagerCtl::new(), broker_active: AtomicBool::new(false), wal_backup_active: AtomicBool::new(false), last_removed_segno: AtomicU64::new(0), mgr_status: AtomicStatus::new(), - })) + }) + } + + /// Load existing timeline from disk. + pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result> { + let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); + + let shared_state = SharedState::restore(conf, &ttid)?; + let timeline_dir = get_timeline_dir(conf, &ttid); + let remote_path = remote_timeline_path(&ttid)?; + + Ok(Timeline::new( + ttid, + &timeline_dir, + &remote_path, + shared_state, + )) } /// Initialize fresh timeline on disk and start background tasks. 
If init @@ -1128,13 +1144,13 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result { /// Get a path to the tenant directory. If you just need to get a timeline directory, /// use WalResidentTimeline::get_timeline_dir instead. -pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf { +pub fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf { conf.workdir.join(tenant_id.to_string()) } /// Get a path to the timeline directory. If you need to read WAL files from disk, /// use WalResidentTimeline::get_timeline_dir instead. This function does not check /// timeline eviction status and WAL files might not be present on disk. -pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf { +pub fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf { get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string()) } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 79200fff8d..e9fed21bf5 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -515,7 +515,12 @@ impl Manager { return; } - if state.cfile_last_persist_at.elapsed() > self.conf.control_file_save_interval { + if state.cfile_last_persist_at.elapsed() > self.conf.control_file_save_interval + // If the control file's commit_lsn lags more than one segment behind the current + // commit_lsn, flush immediately to limit recovery time in case of a crash. We don't do + // this on the WAL ingest hot path since it incurs fsync latency. 
+ || state.commit_lsn.saturating_sub(state.cfile_commit_lsn).0 >= self.wal_seg_size as u64 + { let mut write_guard = self.tli.write_shared_state().await; // it should be done in the background because it blocks manager task, but flush() should // be fast enough not to be a problem now diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 33b8bfe28e..11f372bceb 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -31,7 +31,6 @@ use crate::state::TimelinePersistentState; use crate::wal_backup::{read_object, remote_timeline_path}; use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::XLogFileName; -use postgres_ffi::XLOG_BLCKSZ; use pq_proto::SystemId; use utils::{id::TenantTimelineId, lsn::Lsn}; @@ -223,6 +222,15 @@ impl PhysicalStorage { ) } + /// Call fsync if config requires so. + async fn fsync_file(&mut self, file: &File) -> Result<()> { + if !self.no_sync { + self.metrics + .observe_flush_seconds(time_io_closure(file.sync_all()).await?); + } + Ok(()) + } + /// Call fdatasync if config requires so. async fn fdatasync_file(&mut self, file: &File) -> Result<()> { if !self.no_sync { @@ -249,6 +257,9 @@ impl PhysicalStorage { // Try to open existing partial file Ok((file, true)) } else { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["initialize_segment"]) + .start_timer(); // Create and fill new partial file // // We're using fdatasync during WAL writing, so file size must not @@ -256,14 +267,16 @@ impl PhysicalStorage { // half initialized segment, first bake it under tmp filename and // then rename. 
let tmp_path = self.timeline_dir.join("waltmp"); - let mut file = File::create(&tmp_path) + let file = File::create(&tmp_path) .await .with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?; - write_zeroes(&mut file, self.wal_seg_size).await?; + fail::fail_point!("sk-zero-segment", |_| { + info!("sk-zero-segment failpoint hit"); + Err(anyhow::anyhow!("failpoint: sk-zero-segment")) + }); + file.set_len(self.wal_seg_size as u64).await?; - // Note: this doesn't get into observe_flush_seconds metric. But - // segment init should be separate metric, if any. if let Err(e) = durable_rename(&tmp_path, &wal_file_partial_path, !self.no_sync).await { // Probably rename succeeded, but fsync of it failed. Remove // the file then to avoid using it. @@ -486,12 +499,12 @@ impl Storage for PhysicalStorage { // Remove all segments after the given LSN. remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno).await?; - let (mut file, is_partial) = self.open_or_create(segno).await?; + let (file, is_partial) = self.open_or_create(segno).await?; // Fill end with zeroes - file.seek(SeekFrom::Start(xlogoff as u64)).await?; - write_zeroes(&mut file, self.wal_seg_size - xlogoff).await?; - self.fdatasync_file(&file).await?; + file.set_len(xlogoff as u64).await?; + file.set_len(self.wal_seg_size as u64).await?; + self.fsync_file(&file).await?; if !is_partial { // Make segment partial once again @@ -751,25 +764,6 @@ impl WalReader { } } -/// Zero block for filling created WAL segments. -const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; - -/// Helper for filling file with zeroes. 
-async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { - fail::fail_point!("sk-write-zeroes", |_| { - info!("write_zeroes hit failpoint"); - Err(anyhow::anyhow!("failpoint: sk-write-zeroes")) - }); - - while count >= XLOG_BLCKSZ { - file.write_all(ZERO_BLOCK).await?; - count -= XLOG_BLCKSZ; - } - file.write_all(&ZERO_BLOCK[0..count]).await?; - file.flush().await?; - Ok(()) -} - /// Helper function for opening WAL segment `segno` in `dir`. Returns file and /// whether it is .partial. pub(crate) async fn open_wal_file( diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index f70cd65dfc..aefb3919a1 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -1,7 +1,7 @@ use std::{ffi::CStr, sync::Arc}; use parking_lot::{Mutex, MutexGuard}; -use postgres_ffi::v16::wal_generator::WalGenerator; +use postgres_ffi::v16::wal_generator::{LogicalMessageGenerator, WalGenerator}; use utils::lsn::Lsn; use super::block_storage::BlockStorage; @@ -18,7 +18,7 @@ impl DiskWalProposer { internal_available_lsn: Lsn(0), prev_lsn: Lsn(0), disk: BlockStorage::new(), - wal_generator: WalGenerator::new(), + wal_generator: WalGenerator::new(LogicalMessageGenerator::new(c"", &[])), }), }) } @@ -36,7 +36,7 @@ pub struct State { // actual WAL storage disk: BlockStorage, // WAL record generator - wal_generator: WalGenerator, + wal_generator: WalGenerator, } impl State { @@ -64,7 +64,7 @@ impl State { /// Inserts a logical record in the WAL at the current LSN. 
pub fn insert_logical_message(&mut self, prefix: &CStr, msg: &[u8]) { - let record = self.wal_generator.generate_logical_message(prefix, msg); + let (_, record) = self.wal_generator.append_logical_message(prefix, msg); self.disk.write(self.internal_available_lsn.into(), &record); self.prev_lsn = self.internal_available_lsn; self.internal_available_lsn += record.len() as u64; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index f6ea1aedc6..9b5d4caf31 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1033,7 +1033,7 @@ async fn handle_update_preferred_azs(req: Request) -> Result) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::ControllerPeer)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py index 8ebaf61e5e..be16be81de 100644 --- a/test_runner/fixtures/auth_tokens.py +++ b/test_runner/fixtures/auth_tokens.py @@ -45,3 +45,4 @@ class TokenScope(str, Enum): SAFEKEEPER_DATA = "safekeeperdata" TENANT = "tenant" SCRUBBER = "scrubber" + INFRA = "infra" diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 74fe39ef53..d3419bd8b1 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -80,7 +80,13 @@ class PgBenchRunResult: ): stdout_lines = stdout.splitlines() + number_of_clients = 0 + number_of_threads = 0 + number_of_transactions_actually_processed = 0 + latency_average = 0.0 latency_stddev = None + tps = 0.0 + scale = 0 # we know significant parts of these values from test input # but to be precise take them from output diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 2195ae8225..85b6e7a3b8 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py 
@@ -8,7 +8,7 @@ from contextlib import _GeneratorContextManager, contextmanager # Type-related stuff from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, final import pytest from _pytest.fixtures import FixtureRequest @@ -70,12 +70,12 @@ class PgCompare(ABC): @contextmanager @abstractmethod - def record_pageserver_writes(self, out_name: str): + def record_pageserver_writes(self, out_name: str) -> Iterator[None]: pass @contextmanager @abstractmethod - def record_duration(self, out_name: str): + def record_duration(self, out_name: str) -> Iterator[None]: pass @contextmanager @@ -105,6 +105,7 @@ class PgCompare(ABC): return results +@final class NeonCompare(PgCompare): """PgCompare interface for the neon stack.""" @@ -206,6 +207,7 @@ class NeonCompare(PgCompare): return self.zenbenchmark.record_duration(out_name) +@final class VanillaCompare(PgCompare): """PgCompare interface for vanilla postgres.""" @@ -271,6 +273,7 @@ class VanillaCompare(PgCompare): return self.zenbenchmark.record_duration(out_name) +@final class RemoteCompare(PgCompare): """PgCompare interface for a remote postgres instance.""" diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index ea8291c1e0..db3723b7cc 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -46,3 +46,8 @@ class EndpointHttpClient(requests.Session): ) res.raise_for_status() return res.json() + + def metrics(self) -> str: + res = self.get(f"http://localhost:{self.port}/metrics") + res.raise_for_status() + return res.text diff --git a/test_runner/fixtures/h2server.py b/test_runner/fixtures/h2server.py index 92783e1fb2..e890b2bcf1 100644 --- a/test_runner/fixtures/h2server.py +++ b/test_runner/fixtures/h2server.py @@ -4,11 +4,14 @@ https://python-hyper.org/projects/hyper-h2/en/stable/asyncio-example.html auth-broker -> local-proxy needs a h2 connection, so we need a h2 server :) """ +from __future__ import 
annotations + import asyncio import collections import io import json from collections.abc import AsyncIterable +from typing import TYPE_CHECKING, final import pytest_asyncio from h2.config import H2Configuration @@ -25,34 +28,45 @@ from h2.events import ( ) from h2.exceptions import ProtocolError, StreamClosedError from h2.settings import SettingCodes +from typing_extensions import override + +if TYPE_CHECKING: + from typing import Any, Optional + RequestData = collections.namedtuple("RequestData", ["headers", "data"]) +@final class H2Server: - def __init__(self, host, port) -> None: + def __init__(self, host: str, port: int) -> None: self.host = host self.port = port +@final class H2Protocol(asyncio.Protocol): def __init__(self): config = H2Configuration(client_side=False, header_encoding="utf-8") self.conn = H2Connection(config=config) - self.transport = None - self.stream_data = {} - self.flow_control_futures = {} + self.transport: Optional[asyncio.Transport] = None + self.stream_data: dict[int, RequestData] = {} + self.flow_control_futures: dict[int, asyncio.Future[Any]] = {} - def connection_made(self, transport: asyncio.Transport): # type: ignore[override] + @override + def connection_made(self, transport: asyncio.BaseTransport): + assert isinstance(transport, asyncio.Transport) self.transport = transport self.conn.initiate_connection() self.transport.write(self.conn.data_to_send()) - def connection_lost(self, _exc): + @override + def connection_lost(self, exc: Optional[Exception]): for future in self.flow_control_futures.values(): future.cancel() self.flow_control_futures = {} + @override def data_received(self, data: bytes): assert self.transport is not None try: @@ -77,7 +91,7 @@ class H2Protocol(asyncio.Protocol): self.window_updated(event.stream_id, event.delta) elif isinstance(event, RemoteSettingsChanged): if SettingCodes.INITIAL_WINDOW_SIZE in event.changed_settings: - self.window_updated(None, 0) + self.window_updated(0, 0) 
self.transport.write(self.conn.data_to_send()) @@ -123,7 +137,7 @@ class H2Protocol(asyncio.Protocol): else: stream_data.data.write(data) - def stream_reset(self, stream_id): + def stream_reset(self, stream_id: int): """ A stream reset was sent. Stop sending data. """ @@ -131,7 +145,7 @@ class H2Protocol(asyncio.Protocol): future = self.flow_control_futures.pop(stream_id) future.cancel() - async def send_data(self, data, stream_id): + async def send_data(self, data: bytes, stream_id: int): """ Send data according to the flow control rules. """ @@ -161,7 +175,7 @@ class H2Protocol(asyncio.Protocol): self.transport.write(self.conn.data_to_send()) data = data[chunk_size:] - async def wait_for_flow_control(self, stream_id): + async def wait_for_flow_control(self, stream_id: int): """ Waits for a Future that fires when the flow control window is opened. """ @@ -169,7 +183,7 @@ class H2Protocol(asyncio.Protocol): self.flow_control_futures[stream_id] = f await f - def window_updated(self, stream_id, delta): + def window_updated(self, stream_id: int, delta): """ A window update frame was received. Unblock some number of flow control Futures. 
diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 89c1f324b4..9de6681beb 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -5,6 +5,8 @@ from typing import TYPE_CHECKING, cast, final import requests +from fixtures.log_helper import log + if TYPE_CHECKING: from typing import Any, Literal, Optional @@ -30,7 +32,11 @@ class NeonAPI: kwargs["headers"] = {} kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}" - return requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs) + resp = requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs) + log.debug("%s %s returned a %d: %s", method, endpoint, resp.status_code, resp.text) + resp.raise_for_status() + + return resp def create_project( self, @@ -66,8 +72,6 @@ class NeonAPI: json=data, ) - assert resp.status_code == 201 - return cast("dict[str, Any]", resp.json()) def get_project_details(self, project_id: str) -> dict[str, Any]: @@ -79,7 +83,7 @@ class NeonAPI: "Content-Type": "application/json", }, ) - assert resp.status_code == 200 + return cast("dict[str, Any]", resp.json()) def delete_project( @@ -95,8 +99,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def start_endpoint( @@ -112,8 +114,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def suspend_endpoint( @@ -129,8 +129,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def restart_endpoint( @@ -146,8 +144,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def create_endpoint( @@ -178,8 +174,6 @@ class NeonAPI: json=data, ) - assert resp.status_code == 201 - return cast("dict[str, Any]", resp.json()) def get_connection_uri( @@ -206,8 +200,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", 
resp.json()) def get_branches(self, project_id: str) -> dict[str, Any]: @@ -219,8 +211,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def get_endpoints(self, project_id: str) -> dict[str, Any]: @@ -232,8 +222,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def get_operations(self, project_id: str) -> dict[str, Any]: @@ -246,8 +234,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def wait_for_operation_to_finish(self, project_id: str): diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e4d6e6da5d..990db1aed0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -286,7 +286,7 @@ class PgProtocol: return self.safe_psql_many([query], **kwargs)[0] def safe_psql_many( - self, queries: Iterable[str], log_query=True, **kwargs: Any + self, queries: Iterable[str], log_query: bool = True, **kwargs: Any ) -> list[list[tuple[Any, ...]]]: """ Execute queries against the node and return all rows. @@ -306,7 +306,7 @@ class PgProtocol: result.append(cur.fetchall()) return result - def safe_psql_scalar(self, query, log_query=True) -> Any: + def safe_psql_scalar(self, query: str, log_query: bool = True) -> Any: """ Execute query returning single row with single column. 
""" @@ -1065,6 +1065,9 @@ class NeonEnv: "http_auth_type": http_auth_type, # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override` "availability_zone": "us-east-2a", + # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids + # the pageserver taking a long time to start up due to syncfs flushing other tests' data + "no_sync": True, } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine @@ -1782,7 +1785,7 @@ class NeonStorageController(MetricsGetter, LogUtils): self.request( "PUT", f"{self.api}/control/v1/node/{node_id}/drain", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) def cancel_node_drain(self, node_id): @@ -1790,7 +1793,7 @@ class NeonStorageController(MetricsGetter, LogUtils): self.request( "DELETE", f"{self.api}/control/v1/node/{node_id}/drain", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) def node_fill(self, node_id): @@ -1798,7 +1801,7 @@ class NeonStorageController(MetricsGetter, LogUtils): self.request( "PUT", f"{self.api}/control/v1/node/{node_id}/fill", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) def cancel_node_fill(self, node_id): @@ -1806,14 +1809,14 @@ class NeonStorageController(MetricsGetter, LogUtils): self.request( "DELETE", f"{self.api}/control/v1/node/{node_id}/fill", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) def node_status(self, node_id): response = self.request( "GET", f"{self.api}/control/v1/node/{node_id}", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) return response.json() @@ -1829,7 +1832,7 @@ class NeonStorageController(MetricsGetter, LogUtils): response = self.request( "GET", f"{self.api}/control/v1/node", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) return 
response.json() @@ -1857,7 +1860,7 @@ class NeonStorageController(MetricsGetter, LogUtils): shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, tenant_config: Optional[dict[Any, Any]] = None, - placement_policy: Optional[Union[dict[Any, Any] | str]] = None, + placement_policy: Optional[Union[dict[Any, Any], str]] = None, ): """ Use this rather than pageserver_api() when you need to include shard parameters @@ -4177,9 +4180,15 @@ class Safekeeper(LogUtils): return self def assert_no_errors(self): - assert not self.log_contains("manager task finished prematurely") - assert not self.log_contains("error while acquiring WalResidentTimeline guard") - assert not self.log_contains("timeout while acquiring WalResidentTimeline guard") + not_allowed = [ + "manager task finished prematurely", + "error while acquiring WalResidentTimeline guard", + "timeout while acquiring WalResidentTimeline guard", + "invalid xlog page header:", + "WAL record crc mismatch at", + ] + for na in not_allowed: + assert not self.log_contains(na) def append_logical_message( self, tenant_id: TenantId, timeline_id: TimelineId, request: dict[str, Any] diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index fa85563e35..d05704c8e0 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -93,6 +93,8 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*WARN.*path=/v1/utilization .*request was dropped before completing", # Can happen during shutdown ".*scheduling deletion on drop failed: queue is in state Stopped.*", + # Too many frozen layers error is normal during intensive benchmarks + ".*too many frozen layers.*", ) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 57a5d6875e..d1a9b5921a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -316,7 +316,7 @@ class 
PageserverHttpClient(requests.Session, MetricsGetter): def tenant_location_conf( self, tenant_id: Union[TenantId, TenantShardId], - location_conf=dict[str, Any], + location_conf: dict[str, Any], flush_ms=None, lazy: Optional[bool] = None, ): diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 4c4306be9e..ac7497ee6c 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -56,6 +56,8 @@ def wait_for_upload( lsn: Lsn, ): """waits for local timeline upload up to specified lsn""" + + current_lsn = Lsn(0) for i in range(20): current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: @@ -203,6 +205,8 @@ def wait_for_last_record_lsn( lsn: Lsn, ) -> Lsn: """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" + + current_lsn = Lsn(0) for i in range(1000): current_lsn = last_record_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index d950f2356d..60221573eb 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -112,7 +112,7 @@ def compatibility_snapshot_dir() -> Iterator[Path]: @pytest.fixture(scope="session") -def compatibility_neon_binpath() -> Optional[Iterator[Path]]: +def compatibility_neon_binpath() -> Iterator[Optional[Path]]: if os.getenv("REMOTE_ENV"): return comp_binpath = None @@ -133,7 +133,7 @@ def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: @pytest.fixture(scope="session") -def compatibility_pg_distrib_dir() -> Optional[Iterator[Path]]: +def compatibility_pg_distrib_dir() -> Iterator[Optional[Path]]: compat_distrib_dir = None if env_compat_postgres_bin := os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR"): compat_distrib_dir = Path(env_compat_postgres_bin).resolve() diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 
01f0245665..4feab52c43 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -1,10 +1,8 @@ from __future__ import annotations import enum -import os from typing import TYPE_CHECKING -import pytest from typing_extensions import override if TYPE_CHECKING: @@ -18,12 +16,15 @@ This fixture is used to determine which version of Postgres to use for tests. # Inherit PgVersion from str rather than int to make it easier to pass as a command-line argument # TODO: use enum.StrEnum for Python >= 3.11 -@enum.unique class PgVersion(str, enum.Enum): V14 = "14" V15 = "15" V16 = "16" V17 = "17" + + # Default Postgres Version for tests that don't really depend on Postgres itself + DEFAULT = V16 + # Instead of making version an optional parameter in methods, we can use this fake entry # to explicitly rely on the default server version (could be different from pg_version fixture value) NOT_SET = "<-POSTRGRES VERSION IS NOT SET->" @@ -59,27 +60,3 @@ class PgVersion(str, enum.Enum): # Make mypy happy # See https://github.com/python/mypy/issues/3974 return None - - -DEFAULT_VERSION: PgVersion = PgVersion.V16 - - -def skip_on_postgres(version: PgVersion, reason: str): - return pytest.mark.skipif( - PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version, - reason=reason, - ) - - -def xfail_on_postgres(version: PgVersion, reason: str): - return pytest.mark.xfail( - PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version, - reason=reason, - ) - - -def run_only_on_default_postgres(reason: str): - return pytest.mark.skipif( - PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION, - reason=reason, - ) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 01b7cf1026..96a651f0db 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -25,6 +25,7 @@ from fixtures.pageserver.common_types import ( parse_delta_layer, parse_image_layer, ) 
+from fixtures.pg_version import PgVersion if TYPE_CHECKING: from collections.abc import Iterable @@ -37,6 +38,7 @@ if TYPE_CHECKING: Fn = TypeVar("Fn", bound=Callable[..., Any]) + COMPONENT_BINARIES = { "storage_controller": ("storage_controller",), "storage_broker": ("storage_broker",), @@ -519,7 +521,7 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str This is essentially: lines=$(comm -3 \ - <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \ | wc -l) [ "$lines" = "0" ] @@ -643,3 +645,40 @@ def allpairs_versions(): ) ids.append(f"combination_{''.join(cur_id)}") return {"argnames": "combination", "argvalues": tuple(argvalues), "ids": ids} + + +def skip_on_postgres(version: PgVersion, reason: str): + return pytest.mark.skipif( + PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version, + reason=reason, + ) + + +def xfail_on_postgres(version: PgVersion, reason: str): + return pytest.mark.xfail( + PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version, + reason=reason, + ) + + +def run_only_on_default_postgres(reason: str): + return pytest.mark.skipif( + PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is not PgVersion.DEFAULT, + reason=reason, + ) + + +def skip_in_debug_build(reason: str): + return pytest.mark.skipif( + os.getenv("BUILD_TYPE", "debug") == "debug", + reason=reason, + ) + + +def skip_on_ci(reason: str): + # `CI` variable is always set to `true` on GitHub + # Ref: https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables + return pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason=reason, + ) diff --git 
a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index c038fc3fd2..3dbbb197f4 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,7 +1,6 @@ from __future__ import annotations import json -import os from pathlib import Path from typing import TYPE_CHECKING @@ -14,7 +13,7 @@ from fixtures.neon_fixtures import ( PgBin, wait_for_last_flush_lsn, ) -from fixtures.utils import get_scale_for_db, humantime_to_ms +from fixtures.utils import get_scale_for_db, humantime_to_ms, skip_on_ci from performance.pageserver.util import ( setup_pageserver_with_tenants, @@ -38,9 +37,8 @@ if TYPE_CHECKING: @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) @pytest.mark.parametrize("n_tenants", [500]) @pytest.mark.timeout(10000) -@pytest.mark.skipif( - os.getenv("CI", "false") == "true", - reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +@skip_on_ci( + "This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI" ) def test_pageserver_characterize_throughput_with_n_tenants( neon_env_builder: NeonEnvBuilder, @@ -66,9 +64,8 @@ def test_pageserver_characterize_throughput_with_n_tenants( @pytest.mark.parametrize("n_clients", [1, 64]) @pytest.mark.parametrize("n_tenants", [1]) @pytest.mark.timeout(2400) -@pytest.mark.skipif( - os.getenv("CI", "false") == "true", - reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +@skip_on_ci( + "This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI" ) def 
test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( neon_env_builder: NeonEnvBuilder, diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index 743604a381..d571fab6b5 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -2,11 +2,13 @@ from __future__ import annotations from contextlib import closing from io import BufferedReader, RawIOBase -from typing import Optional +from typing import Optional, final from fixtures.compare_fixtures import PgCompare +from typing_extensions import override +@final class CopyTestData(RawIOBase): def __init__(self, rows: int): self.rows = rows @@ -14,6 +16,7 @@ class CopyTestData(RawIOBase): self.linebuf: Optional[bytes] = None self.ptr = 0 + @override def readable(self): return True diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 91d7e3446e..050c09c1e5 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -149,12 +149,16 @@ def test_subscriber_lag( check_pgbench_still_running(pub_workload, "pub") check_pgbench_still_running(sub_workload, "sub") - with ( - psycopg2.connect(pub_connstr) as pub_conn, - psycopg2.connect(sub_connstr) as sub_conn, - ): - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + pub_conn.close() + sub_conn.close() log.info(f"Replica lagged behind master by {lag} seconds") zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) @@ -206,6 +210,7 @@ def test_publisher_restart( sub_conn = 
psycopg2.connect(sub_connstr) pub_conn.autocommit = True sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") pub_exists = len(pub_cur.fetchall()) != 0 @@ -222,6 +227,7 @@ def test_publisher_restart( sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() sub_conn.close() @@ -248,12 +254,17 @@ def test_publisher_restart( ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env, ) - with ( - psycopg2.connect(pub_connstr) as pub_conn, - psycopg2.connect(sub_connstr) as sub_conn, - ): - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - lag = measure_logical_replication_lag(sub_cur, pub_cur) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + pub_conn.close() + sub_conn.close() log.info(f"Replica lagged behind master by {lag} seconds") zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) @@ -288,58 +299,56 @@ def test_snap_files( env = benchmark_project_pub.pgbench_env connstr = benchmark_project_pub.connstr - with psycopg2.connect(connstr) as conn: - conn.autocommit = True - with conn.cursor() as cur: - cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'") - is_super = cast("bool", cur.fetchall()[0][0]) - assert is_super, "This benchmark won't work if we don't have superuser" + conn = psycopg2.connect(connstr) + conn.autocommit = True + + with conn.cursor() as cur: + cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'") + is_super = cast("bool", cur.fetchall()[0][0]) + assert is_super, "This benchmark won't work if we 
don't have superuser" + + conn.close() pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=env) conn = psycopg2.connect(connstr) conn.autocommit = True - cur = conn.cursor() - cur.execute("ALTER SYSTEM SET neon.logical_replication_max_snap_files = -1") - with psycopg2.connect(connstr) as conn: - conn.autocommit = True - with conn.cursor() as cur: - cur.execute("SELECT pg_reload_conf()") - - with psycopg2.connect(connstr) as conn: - conn.autocommit = True - with conn.cursor() as cur: - cur.execute( - """ - DO $$ - BEGIN - IF EXISTS ( - SELECT 1 - FROM pg_replication_slots - WHERE slot_name = 'slotter' - ) THEN - PERFORM pg_drop_replication_slot('slotter'); - END IF; - END $$; + with conn.cursor() as cur: + cur.execute( """ - ) - cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')") + DO $$ + BEGIN + IF EXISTS ( + SELECT 1 + FROM pg_replication_slots + WHERE slot_name = 'slotter' + ) THEN + PERFORM pg_drop_replication_slot('slotter'); + END IF; + END $$; + """ + ) + cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')") + + conn.close() workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env) try: start = time.time() prev_measurement = time.time() while time.time() - start < test_duration_min * 60: - with psycopg2.connect(connstr) as conn: - with conn.cursor() as cur: - cur.execute( - "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s" - ) - check_pgbench_still_running(workload) - cur.execute( - "SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())" - ) + conn = psycopg2.connect(connstr) + conn.autocommit = True + + with conn.cursor() as cur: + cur.execute( + "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s" + ) + check_pgbench_still_running(workload) + cur.execute("SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())") + + conn.close() # 
Measure storage if time.time() - prev_measurement > test_interval_min * 60: diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index 8b368977df..d56f6dce09 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -102,15 +102,21 @@ def test_ro_replica_lag( check_pgbench_still_running(master_workload) check_pgbench_still_running(replica_workload) time.sleep(sync_interval_min * 60) + + conn_master = psycopg2.connect(master_connstr) + conn_replica = psycopg2.connect(replica_connstr) + conn_master.autocommit = True + conn_replica.autocommit = True + with ( - psycopg2.connect(master_connstr) as conn_master, - psycopg2.connect(replica_connstr) as conn_replica, + conn_master.cursor() as cur_master, + conn_replica.cursor() as cur_replica, ): - with ( - conn_master.cursor() as cur_master, - conn_replica.cursor() as cur_replica, - ): - lag = measure_replication_lag(cur_master, cur_replica) + lag = measure_replication_lag(cur_master, cur_replica) + + conn_master.close() + conn_replica.close() + log.info(f"Replica lagged behind master by {lag} seconds") zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) finally: @@ -219,11 +225,15 @@ def test_replication_start_stop( pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10"], env=master_env) # Sync replicas - with psycopg2.connect(master_connstr) as conn_master: - with conn_master.cursor() as cur_master: - for i in range(num_replicas): - conn_replica = psycopg2.connect(replica_connstr[i]) - measure_replication_lag(cur_master, conn_replica.cursor()) + conn_master = psycopg2.connect(master_connstr) + conn_master.autocommit = True + + with conn_master.cursor() as cur_master: + for i in range(num_replicas): + conn_replica = psycopg2.connect(replica_connstr[i]) + measure_replication_lag(cur_master, conn_replica.cursor()) + + conn_master.close() master_pgbench = 
pg_bin.run_nonblocking( [ @@ -277,17 +287,22 @@ def test_replication_start_stop( time.sleep(configuration_test_time_sec) - with psycopg2.connect(master_connstr) as conn_master: - with conn_master.cursor() as cur_master: - for ireplica in range(num_replicas): - replica_conn = psycopg2.connect(replica_connstr[ireplica]) - lag = measure_replication_lag(cur_master, replica_conn.cursor()) - zenbenchmark.record( - f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER - ) - log.info( - f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}" - ) + conn_master = psycopg2.connect(master_connstr) + conn_master.autocommit = True + + with conn_master.cursor() as cur_master: + for ireplica in range(num_replicas): + replica_conn = psycopg2.connect(replica_connstr[ireplica]) + lag = measure_replication_lag(cur_master, replica_conn.cursor()) + zenbenchmark.record( + f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER + ) + log.info( + f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}" + ) + + conn_master.close() + master_pgbench.terminate() except Exception as e: error_occurred = True diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index 6d1565c5e5..fccfbc7f09 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -8,7 +8,7 @@ from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.http import TimelineCreate406 -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, skip_in_debug_build # Test the GC implementation when running with branching. 
@@ -48,10 +48,8 @@ from fixtures.utils import query_scalar # Because the delta layer D covering lsn1 is corrupted, creating a branch # starting from lsn1 should return an error as follows: # could not find data for key ... at LSN ..., for request at LSN ... -def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): - if build_type == "debug": - pytest.skip("times out in debug builds") - +@skip_in_debug_build("times out in debug builds") +def test_branch_and_gc(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http_client = env.pageserver.http_client() diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 420055ac3a..370df3c379 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -2,7 +2,6 @@ from __future__ import annotations import enum import json -import os import time from typing import TYPE_CHECKING @@ -13,7 +12,7 @@ from fixtures.neon_fixtures import ( generate_uploads_and_deletions, ) from fixtures.pageserver.http import PageserverApiException -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload if TYPE_CHECKING: @@ -32,7 +31,7 @@ AGGRESIVE_COMPACTION_TENANT_CONF = { } -@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@skip_in_debug_build("only run with release build") def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): """ This is a smoke test that compaction kicks in. 
The workload repeatedly churns diff --git a/test_runner/regress/test_compute_locales.py b/test_runner/regress/test_compute_locales.py new file mode 100644 index 0000000000..00ef32fb5e --- /dev/null +++ b/test_runner/regress/test_compute_locales.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from collections.abc import Sequence + + from fixtures.neon_fixtures import NeonEnv + + +def test_default_locales(neon_simple_env: NeonEnv): + """ + Test that the default locales for compute databases is C.UTF-8. + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + domain_locales = cast( + "Sequence[str]", + endpoint.safe_psql( + "SELECT current_setting('lc_messages') AS lc_messages," + + "current_setting('lc_monetary') AS lc_monetary," + + "current_setting('lc_numeric') AS lc_numeric," + + "current_setting('lc_time') AS lc_time" + )[0], + ) + for dl in domain_locales: + assert dl == "C.UTF-8" + + # Postgres 15 added the locale providers + if env.pg_version < PgVersion.V15: + results = cast( + "Sequence[str]", + endpoint.safe_psql( + "SELECT datcollate, datctype FROM pg_database WHERE datname = current_database()" + )[0], + ) + + datcollate = results[0] + datctype = results[1] + else: + results = cast( + "Sequence[str]", + endpoint.safe_psql( + "SELECT datlocprovider, datcollate, datctype FROM pg_database WHERE datname = current_database()" + )[0], + ) + datlocprovider = results[0] + datcollate = results[1] + datctype = results[2] + + if env.pg_version >= PgVersion.V17: + assert datlocprovider == "b", "The locale provider is not builtin" + else: + assert datlocprovider == "c", "The locale provider is not libc" + + assert datcollate == "C.UTF-8" + assert datctype == "C.UTF-8" diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 0134f80769..b2e19ad713 100644 --- 
a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -12,6 +12,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pg_version import PgVersion +from fixtures.utils import skip_on_postgres from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -41,17 +42,14 @@ def neon_env_builder_local( return neon_env_builder +@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") +@skip_on_postgres(PgVersion.V17, reason="TODO: PG17 extension building") def test_remote_extensions( httpserver: HTTPServer, neon_env_builder_local: NeonEnvBuilder, httpserver_listen_address, pg_version, ): - if pg_version == PgVersion.V16: - pytest.skip("TODO: PG16 extension building") - if pg_version == PgVersion.V17: - pytest.skip("TODO: PG17 extension building") - # setup mock http server # that expects request for anon.tar.zst # and returns the requested file diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py index 646dac8e6e..2916748925 100644 --- a/test_runner/regress/test_ingestion_layer_size.py +++ b/test_runner/regress/test_ingestion_layer_size.py @@ -4,25 +4,22 @@ from collections.abc import Iterable from dataclasses import dataclass from typing import TYPE_CHECKING -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo -from fixtures.utils import human_bytes +from fixtures.utils import human_bytes, skip_in_debug_build if TYPE_CHECKING: from typing import Union -def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, build_type: str): +@skip_in_debug_build("debug run is unnecessarily slow") +def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): """ Build a non-small GIN index 
which includes similarly batched up images in WAL stream as does pgvector to show that we no longer create oversized layers. """ - if build_type == "debug": - pytest.skip("debug run is unnecessarily slow") - minimum_initdb_size = 20 * 1024**2 checkpoint_distance = 32 * 1024**2 minimum_good_layer_size = checkpoint_distance * 0.9 diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py index 4700db85ee..54ce7c8340 100644 --- a/test_runner/regress/test_installed_extensions.py +++ b/test_runner/regress/test_installed_extensions.py @@ -1,6 +1,14 @@ -from logging import info +from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv +import time +from logging import info +from typing import TYPE_CHECKING + +from fixtures.log_helper import log +from fixtures.metrics import parse_metrics + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_installed_extensions(neon_simple_env: NeonEnv): @@ -85,3 +93,52 @@ def test_installed_extensions(neon_simple_env: NeonEnv): assert ext["n_databases"] == 2 ext["versions"].sort() assert ext["versions"] == ["1.2", "1.3"] + + # check that /metrics endpoint is available + # ensure that we see the metric before and after restart + res = client.metrics() + info("Metrics: %s", res) + m = parse_metrics(res) + neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"}) + assert len(neon_m) == 1 + for sample in neon_m: + assert sample.value == 2 + neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"}) + assert len(neon_m) == 1 + for sample in neon_m: + assert sample.value == 1 + + endpoint.stop() + endpoint.start() + + timeout = 10 + while timeout > 0: + try: + res = client.metrics() + timeout = -1 + if len(parse_metrics(res).query_all("installed_extensions")) < 4: + # Assume that not all metrics that are collected yet + time.sleep(1) + timeout -= 1 + continue + except Exception: + 
log.exception("failed to get metrics, assume they are not collected yet") + time.sleep(1) + timeout -= 1 + continue + + assert ( + len(parse_metrics(res).query_all("installed_extensions")) >= 4 + ), "Not all metrics are collected" + + info("After restart metrics: %s", res) + m = parse_metrics(res) + neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"}) + assert len(neon_m) == 1 + for sample in neon_m: + assert sample.value == 1 + + neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"}) + assert len(neon_m) == 1 + for sample in neon_m: + assert sample.value == 1 diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index a08d522fc2..d9043fef7f 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -2,7 +2,6 @@ from __future__ import annotations import os -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -10,12 +9,18 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pg_version import PgVersion +from fixtures.utils import skip_on_postgres +@skip_on_postgres( + PgVersion.V14, + reason="pg_log_standby_snapshot() function is available since Postgres 16", +) +@skip_on_postgres( + PgVersion.V15, + reason="pg_log_standby_snapshot() function is available since Postgres 16", +) def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg): - if neon_env_builder.pg_version != PgVersion.V16: - pytest.skip("pg_log_standby_snapshot() function is available only in PG16") - env = neon_env_builder.init_start( initial_tenant_conf={ "gc_period": "0s", diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index c49ac6893e..2eb38c49b2 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -2,7 +2,6 @@ from __future__ import 
annotations import time -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -12,17 +11,13 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import wait_for_upload from fixtures.remote_storage import RemoteStorageKind +from fixtures.utils import skip_in_debug_build # Crates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway) # and then download them back. -def test_basic_eviction( - neon_env_builder: NeonEnvBuilder, - build_type: str, -): - if build_type == "debug": - pytest.skip("times out in debug builds") - +@skip_in_debug_build("times out in debug builds") +def test_basic_eviction(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start( diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index 9a3fdd835d..f6fbdcabfd 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -5,8 +5,7 @@ import uuid import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.pg_version import run_only_on_default_postgres -from fixtures.utils import wait_until +from fixtures.utils import run_only_on_default_postgres, wait_until @pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"]) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 30027463df..df83ca1c44 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,24 +4,31 @@ import time from functools import partial from random import choice from string import ascii_lowercase +from typing import TYPE_CHECKING, cast -from fixtures.common_types import Lsn +from fixtures.common_types 
import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( - NeonEnv, - NeonEnvBuilder, - PgProtocol, logical_replication_sync, wait_for_last_flush_lsn, ) from fixtures.utils import wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + NeonEnvBuilder, + PgProtocol, + VanillaPostgres, + ) + def random_string(n: int): return "".join([choice(ascii_lowercase) for _ in range(n)]) -def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): +def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): env = neon_simple_env tenant_id = env.initial_tenant @@ -160,10 +167,10 @@ COMMIT; # Test that neon.logical_replication_max_snap_files works -def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): - def slot_removed(ep): +def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): + def slot_removed(ep: Endpoint): assert ( - endpoint.safe_psql( + ep.safe_psql( "select count(*) from pg_replication_slots where slot_name = 'stale_slot'" )[0][0] == 0 @@ -254,7 +261,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of # Tests that walsender correctly blocks until WAL is downloaded from safekeepers -def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): +def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg: VanillaPostgres): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() @@ -336,13 +343,13 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of # # Most pages start with a contrecord, so we don't do anything special # to ensure that. 
-def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): +def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): env = neon_simple_env env.create_branch("init") endpoint = env.endpoints.create_start("init") - tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0] - timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] + tenant_id = TenantId(cast("str", endpoint.safe_psql("show neon.tenant_id")[0][0])) + timeline_id = TimelineId(cast("str", endpoint.safe_psql("show neon.timeline_id")[0][0])) cur = endpoint.connect().cursor() cur.execute("create table t(key int, value text)") @@ -380,7 +387,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): # logical replication bug as such, but without logical replication, # records passed ot the WAL redo process are never large enough to hit # the bug. -def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): +def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): env = neon_simple_env env.create_branch("init") @@ -522,15 +529,20 @@ def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn: because for some WAL records like vacuum subscriber won't get any data at all. 
""" - publisher_flush_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + publisher_flush_lsn = Lsn( + cast("str", publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + ) def check_caughtup(): - res = publisher.safe_psql( - """ + res = cast( + "tuple[str, str, str]", + publisher.safe_psql( + """ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication sr, pg_replication_slots s where s.active_pid = sr.pid and s.slot_type = 'logical'; """ - )[0] + )[0], + ) sent_lsn, flush_lsn, curr_publisher_flush_lsn = Lsn(res[0]), Lsn(res[1]), Lsn(res[2]) log.info( f"sent_lsn={sent_lsn}, flush_lsn={flush_lsn}, publisher_flush_lsn={curr_publisher_flush_lsn}, waiting flush_lsn to reach {publisher_flush_lsn}" @@ -545,7 +557,7 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication # flush_lsn reporting to publisher. Without this, subscriber may ack too far, # losing data on restart because publisher implicitly advances positition given # in START_REPLICATION to the confirmed_flush_lsn of the slot. -def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): +def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): env = neon_simple_env # use vanilla as publisher to allow writes on it when safekeeper is down vanilla_pg.configure( @@ -593,7 +605,7 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): # logical_replication_wait_flush_lsn_sync is expected to hang while # safekeeper is down. 
vanilla_pg.safe_psql("checkpoint;") - assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000 + assert cast("int", sub.safe_psql_scalar("SELECT count(*) FROM t")) == 1000 # restart subscriber and ensure it can catch up lost tail again sub.stop(mode="immediate") diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 783fb813cf..72db72f2b9 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os import subprocess from pathlib import Path from typing import cast @@ -15,7 +14,7 @@ from fixtures.neon_fixtures import ( parse_project_git_version_output, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pg_version import PgVersion, skip_on_postgres +from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build def helper_compare_timeline_list( @@ -195,10 +194,8 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): res.check_returncode() -@skip_on_postgres(PgVersion.V14, reason="does not use postgres") -@pytest.mark.skipif( - os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works" -) +@run_only_on_default_postgres(reason="does not use postgres") +@skip_in_debug_build("unit test for test support, either build works") def test_parse_project_git_version_output_positive(): commit = "b6f77b5816cf1dba12a3bc8747941182ce220846" @@ -217,10 +214,8 @@ def test_parse_project_git_version_output_positive(): assert parse_project_git_version_output(example) == commit -@skip_on_postgres(PgVersion.V14, reason="does not use postgres") -@pytest.mark.skipif( - os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works" -) +@run_only_on_default_postgres(reason="does not use postgres") +@skip_in_debug_build("unit test for test support, either build works") def test_parse_project_git_version_output_local_docker(): """ Makes 
sure the tests don't accept the default version in Dockerfile one gets without providing @@ -234,10 +229,8 @@ def test_parse_project_git_version_output_local_docker(): assert input in str(e) -@skip_on_postgres(PgVersion.V14, reason="does not use postgres") -@pytest.mark.skipif( - os.environ.get("BUILD_TYPE") == "debug", reason="cli api sanity, either build works" -) +@run_only_on_default_postgres(reason="does not use postgres") +@skip_in_debug_build("unit test for test support, either build works") def test_binaries_version_parses(neon_binpath: Path): """ Ensures that we can parse the actual outputs of --version from a set of binaries. diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 11ebb81023..4f59efb8b3 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -35,9 +35,10 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.remote_storage import ( + LocalFsStorage, RemoteStorageKind, ) -from fixtures.utils import wait_until +from fixtures.utils import run_only_on_default_postgres, wait_until from fixtures.workload import Workload if TYPE_CHECKING: @@ -656,6 +657,7 @@ def test_upgrade_generationless_local_file_paths( workload.write_rows(1000) attached_pageserver = env.get_tenant_pageserver(tenant_id) + assert attached_pageserver is not None secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[ 0 ] @@ -727,3 +729,68 @@ def test_upgrade_generationless_local_file_paths( ) # We should download into the same local path we started with assert os.path.exists(victim_path) + + +@run_only_on_default_postgres("Only tests index logic") +def test_old_index_time_threshold( + neon_env_builder: NeonEnvBuilder, +): + """ + Exercise pageserver's detection of trying to load an ancient non-latest index. 
+ (see https://github.com/neondatabase/neon/issues/6951) + """ + + # Run with local_fs because we will interfere with mtimes by local filesystem access + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(32) + + # Remember generation 1's index path + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + index_path = env.pageserver_remote_storage.index_path(tenant_id, timeline_id) + + # Increment generation by detaching+attaching, and write+flush some data to get a new remote index + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + workload.churn_rows(32) + + # A new index should have been written + assert env.pageserver_remote_storage.index_path(tenant_id, timeline_id) != index_path + + # Hack the mtime on the generation 1 index + log.info(f"Setting old mtime on {index_path}") + os.utime(index_path, times=(time.time(), time.time() - 30 * 24 * 3600)) + env.pageserver.allowed_errors.extend( + [ + ".*Found a newer index while loading an old one.*", + ".*Index age exceeds threshold and a newer index exists.*", + ] + ) + + # Detach from storage controller + attach in an old generation directly on the pageserver. 
+ workload.stop() + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.reconcile_until_idle() + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"}) + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy") + + # The controller would not do this (attach in an old generation): we are doing it to simulate + # a hypothetical profound bug in the controller. + env.pageserver.http_client().tenant_location_conf( + tenant_id, {"generation": 1, "mode": "AttachedSingle", "tenant_conf": {}} + ) + + # The pageserver should react to this situation by refusing to attach the tenant and putting + # it into Broken state + env.pageserver.allowed_errors.append(".*tenant is broken.*") + with pytest.raises( + PageserverApiException, + match="tenant is broken: Index age exceeds threshold and a newer index exists", + ): + env.pageserver.http_client().timeline_detail(tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index c0eb598891..200a323a3a 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import os import time from typing import TYPE_CHECKING @@ -16,7 +15,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until if TYPE_CHECKING: from typing import Optional @@ -227,12 +226,9 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): assert get_dirty_bytes(env) >= dirty_after_write -@pytest.mark.skipif( - # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is - # prohibitively slow in 
debug mode - os.getenv("BUILD_TYPE") == "debug", - reason="Avoid running bulkier ingest tests in debug mode", -) +# We have to use at least ~100MB of data to hit the lowest limit we can configure, which is +# prohibitively slow in debug mode +@skip_in_debug_build("Avoid running bulkier ingest tests in debug mode") def test_total_size_limit(neon_env_builder: NeonEnvBuilder): """ Test that checkpoints are done based on total ephemeral layer size, even if no one timeline is diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index f7c42fc893..fb6050689c 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -8,7 +8,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.remote_storage import s3_storage -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until # Test restarting page server, while safekeeper and compute node keep @@ -155,12 +155,8 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # safekeeper and compute node keep running. @pytest.mark.timeout(540) @pytest.mark.parametrize("shard_count", [None, 4]) -def test_pageserver_chaos( - neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int] -): - if build_type == "debug": - pytest.skip("times out in debug builds") - +@skip_in_debug_build("times out in debug builds") +def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): # same rationale as with the immediate stop; we might leave orphan layers behind. 
neon_env_builder.disable_scrub_on_exit() neon_env_builder.enable_pageserver_remote_storage(s3_storage()) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 705b4ff054..d4aef96735 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -17,7 +17,7 @@ from fixtures.pageserver.utils import ( wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -765,7 +765,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): assert download_rate < expect_download_rate * 2 -@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@skip_in_debug_build("only run with release build") @pytest.mark.parametrize("via_controller", [True, False]) def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool): """ diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index b97fccddf5..f4698191eb 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -3,7 +3,6 @@ # from __future__ import annotations -import os from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import TYPE_CHECKING, cast @@ -19,6 +18,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import s3_storage +from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: from typing import Optional @@ -329,7 +329,7 @@ def test_sql_regress( post_checks(env, test_output_dir, DBNAME, endpoint) 
-@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@skip_in_debug_build("only run with release build") def test_tx_abort_with_many_relations( neon_env_builder: NeonEnvBuilder, ): diff --git a/test_runner/regress/test_physical_and_logical_replicaiton.py b/test_runner/regress/test_physical_and_logical_replicaiton.py new file mode 100644 index 0000000000..ad2d0871b8 --- /dev/null +++ b/test_runner/regress/test_physical_and_logical_replicaiton.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +import time + +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync + + +def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonEnv, vanilla_pg): + """Test read replica of a primary which has a logical replication publication""" + env = neon_simple_env + + n_records = 100000 + + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))") + p_cur.execute("create publication pub1 for table t") + + # start subscriber to primary + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE t(pk bigint primary key, payload text)") + connstr = primary.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + time.sleep(1) + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + ) + + s_con = secondary.connect() + s_cur = s_con.cursor() + + for pk in range(n_records): + p_cur.execute("insert into t (pk) values (%s)", (pk,)) + + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == n_records + + logical_replication_sync(vanilla_pg, primary) + assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records + + # Check that LR slot is not copied to replica + s_cur.execute("select count(*) 
from pg_replication_slots") + assert s_cur.fetchall()[0][0] == 0 + + +def test_aux_not_logged_at_replica(neon_simple_env: NeonEnv, vanilla_pg): + """Test that AUX files are not saved at replica""" + env = neon_simple_env + + n_records = 20000 + + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))") + p_cur.execute("create publication pub1 for table t") + + # start subscriber + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE t(pk bigint primary key, payload text)") + connstr = primary.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + for pk in range(n_records): + p_cur.execute("insert into t (pk) values (%s)", (pk,)) + + # LR snapshot is stored each 15 seconds + time.sleep(16) + + # start replica + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + ) + + s_con = secondary.connect() + s_cur = s_con.cursor() + + logical_replication_sync(vanilla_pg, primary) + + assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == n_records + + vanilla_pg.stop() + secondary.stop() + primary.stop() + assert not secondary.log_contains("cannot make new WAL entries during recovery") diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py index 071ca7c54e..ea01252ce4 100644 --- a/test_runner/regress/test_proxy_websockets.py +++ b/test_runner/regress/test_proxy_websockets.py @@ -37,7 +37,7 @@ async def test_websockets(static_proxy: NeonProxy): startup_message.extend(b"\0") length = (4 + len(startup_message)).to_bytes(4, byteorder="big") - await websocket.send([length, startup_message]) + await websocket.send([length, bytes(startup_message)]) 
startup_response = await websocket.recv() assert isinstance(startup_response, bytes) diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index e81e7dad76..8e7c01f950 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -30,7 +30,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup from fixtures.pg_version import PgVersion -from fixtures.utils import query_scalar, wait_until +from fixtures.utils import query_scalar, skip_on_postgres, wait_until CREATE_SUBXACTS_FUNC = """ create or replace function create_subxacts(n integer) returns void as $$ @@ -137,6 +137,12 @@ def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv): assert secondary_cur.fetchone() == (1,) +@skip_on_postgres( + PgVersion.V14, reason="pg_log_standby_snapshot() function is available since Postgres 16" +) +@skip_on_postgres( + PgVersion.V15, reason="pg_log_standby_snapshot() function is available since Postgres 16" +) def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version): """ Test that starting a replica works right after the primary has @@ -149,9 +155,6 @@ def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version): """ env = neon_simple_env - if env.pg_version == PgVersion.V14 or env.pg_version == PgVersion.V15: - pytest.skip("pg_log_standby_snapshot() function is available only in PG16") - primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") primary_conn = primary.connect() primary_cur = primary_conn.cursor() diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 3a249bbdb4..0a4a53356d 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -20,7 +20,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.utils import 
assert_prefix_empty, assert_prefix_not_empty from fixtures.remote_storage import s3_storage -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload from pytest_httpserver import HTTPServer from typing_extensions import override @@ -256,6 +256,7 @@ def test_sharding_split_compaction( # Cleanup part 1: while layers are still in PITR window, we should only drop layers that are fully redundant for shard in shards: ps = env.get_tenant_pageserver(shard) + assert ps is not None # Invoke compaction: this should drop any layers that don't overlap with the shard's key stripes detail_before = ps.http_client().timeline_detail(shard, timeline_id) @@ -852,12 +853,9 @@ def test_sharding_split_stripe_size( wait_until(10, 1, assert_restart_notification) -@pytest.mark.skipif( - # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're - # validating in this test don't benefit much from debug assertions. - os.getenv("BUILD_TYPE") == "debug", - reason="Avoid running bulkier ingest tests in debug mode", -) +# The quantity of data isn't huge, but debug can be _very_ slow, and the things we're +# validating in this test don't benefit much from debug assertions. 
+@skip_in_debug_build("Avoid running bulkier ingest tests in debug mode") def test_sharding_ingest_layer_sizes( neon_env_builder: NeonEnvBuilder, ): diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index c8de292588..2c3d79b18a 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -36,11 +36,12 @@ from fixtures.pageserver.utils import ( remote_storage_delete_key, timeline_delete_wait_completed, ) -from fixtures.pg_version import PgVersion, run_only_on_default_postgres +from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.storage_controller_proxy import StorageControllerProxy from fixtures.utils import ( + run_only_on_default_postgres, run_pg_bench_small, subprocess_capture, wait_until, @@ -1237,6 +1238,7 @@ def test_storage_controller_tenant_deletion( # Assert attachments all have local content for shard_id in shard_ids: pageserver = env.get_tenant_pageserver(shard_id) + assert pageserver is not None assert pageserver.tenant_dir(shard_id).exists() # Assert all shards have some content in remote storage @@ -2745,6 +2747,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB # Upload but don't compact origin_pageserver = env.get_tenant_pageserver(tenant_id) + assert origin_pageserver is not None dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0] origin_pageserver.http_client().timeline_checkpoint( tenant_id, timeline_id, wait_until_uploaded=True, compact=False diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 05db0fe977..11ad2173ae 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -245,6 +245,7 @@ def test_scrubber_physical_gc_ancestors( 
workload.write_rows(100, upload=False) for shard in shards: ps = env.get_tenant_pageserver(shard) + assert ps is not None log.info(f"Waiting for shard {shard} on pageserver {ps.id}") ps.http_client().timeline_checkpoint( shard, timeline_id, compact=False, wait_until_uploaded=True @@ -270,6 +271,7 @@ def test_scrubber_physical_gc_ancestors( workload.churn_rows(100) for shard in shards: ps = env.get_tenant_pageserver(shard) + assert ps is not None ps.http_client().timeline_compact(shard, timeline_id, force_image_layer_creation=True) ps.http_client().timeline_gc(shard, timeline_id, 0) @@ -336,12 +338,15 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder # Issue a deletion queue flush so that the parent shard can't leave behind layers # that will look like unexpected garbage to the scrubber - env.get_tenant_pageserver(tenant_id).http_client().deletion_queue_flush(execute=True) + ps = env.get_tenant_pageserver(tenant_id) + assert ps is not None + ps.http_client().deletion_queue_flush(execute=True) new_shard_count = 4 shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) for shard in shards: ps = env.get_tenant_pageserver(shard) + assert ps is not None log.info(f"Waiting for shard {shard} on pageserver {ps.id}") ps.http_client().timeline_checkpoint( shard, timeline_id, compact=False, wait_until_uploaded=True diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index b41f1709bd..8b733da0c6 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -21,7 +20,7 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.pg_version import PgVersion -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until def 
test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): @@ -279,7 +278,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa size_debug_file.write(size_debug) -@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@skip_in_debug_build("only run with release build") def test_single_branch_get_tenant_size_grows( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion ): @@ -315,6 +314,7 @@ def test_single_branch_get_tenant_size_grows( tenant_id: TenantId, timeline_id: TimelineId, ) -> tuple[Lsn, int]: + size = 0 consistent = False size_debug = None @@ -360,7 +360,7 @@ def test_single_branch_get_tenant_size_grows( collected_responses.append(("CREATE", current_lsn, size)) batch_size = 100 - + prev_size = 0 for i in range(3): with endpoint.cursor() as cur: cur.execute( diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 03cb79fc1d..5a499ea98b 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -427,7 +427,7 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): env.pageserver.start() for f in futs: - f.result(timeout=10) + f.result(timeout=30) # The tenant should end up active wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1) diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 5f211ec4d4..68e9385035 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -146,6 +146,7 @@ def test_threshold_based_eviction( out += [f" {remote} {layer.layer_file_name}"] return "\n".join(out) + stable_for: float = 0 observation_window = 8 * eviction_threshold consider_stable_when_no_change_for_seconds = 3 * eviction_threshold poll_interval = eviction_threshold / 3 diff --git 
a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 3e9812c38a..d3839e3d2c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -1,15 +1,22 @@ from __future__ import annotations +import json +from typing import Optional + import pytest from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverApiException -from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty -from fixtures.remote_storage import s3_storage +from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty, list_prefix +from fixtures.remote_storage import S3Storage, s3_storage from fixtures.utils import wait_until +from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, +) @pytest.mark.parametrize("shard_count", [0, 4]) @@ -369,3 +376,146 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel neon_env_builder.pageserver_remote_storage, prefix=f"tenants/{str(tenant_id)}/", ) + + +@pytest.mark.parametrize("offload_child", ["offload", "offload-corrupt", "archive", None]) +def test_timeline_retain_lsn(neon_env_builder: NeonEnvBuilder, offload_child: Optional[str]): + """ + Ensure that retain_lsn functionality for timelines works, both for offloaded and non-offloaded ones + """ + if offload_child == "offload-corrupt": + # Our corruption code only works with S3 compatible storage + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, root_timeline_id = env.create_tenant( + conf={ + # small checkpointing and compaction targets to ensure we generate many 
upload operations + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, + # set small image creation thresholds so that gc deletes data + "image_creation_threshold": 2, + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # Disable pitr, we only want the latest lsn + "pitr_interval": "0s", + # Don't rely on endpoint lsn leases + "lsn_lease_length": "0s", + } + ) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(v int, key serial primary key, t text default 'data_content')", + "SELECT setseed(0.4321)", + "INSERT INTO foo SELECT v FROM (SELECT generate_series(1,2048), (random() * 409600)::int as v) as random_numbers", + ] + ) + pre_branch_sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200") + log.info(f"Pre branch sum: {pre_branch_sum}") + last_flush_lsn_upload(env, endpoint, tenant_id, root_timeline_id) + + # Create a branch and write some additional data to the parent + child_timeline_id = env.create_branch("test_archived_branch", tenant_id) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + # Do some churn of the data. This is important so that we can overwrite image layers. 
+ for i in range(10): + endpoint.safe_psql_many( + [ + f"SELECT setseed(0.23{i})", + "UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 2", + "UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 1", + "UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 0", + ] + ) + post_branch_sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200") + log.info(f"Post branch sum: {post_branch_sum}") + last_flush_lsn_upload(env, endpoint, tenant_id, root_timeline_id) + + if offload_child is not None: + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + child_timeline_id, + ) + assert leaf_detail["is_archived"] is True + if "offload" in offload_child: + ps_http.timeline_offload(tenant_id, child_timeline_id) + + # Do a restart to get rid of any in-memory objects (we only init gc info once, at attach) + env.pageserver.stop() + if offload_child == "offload-corrupt": + assert isinstance(env.pageserver_remote_storage, S3Storage) + listing = list_prefix( + env.pageserver_remote_storage, f"tenants/{str(tenant_id)}/tenant-manifest" + ) + objects: list[ObjectTypeDef] = listing.get("Contents", []) + assert len(objects) > 0 + remote_key: str = str(objects[0].get("Key", [])) + local_path = str(env.repo_dir / "tenant-manifest.json") + + log.info(f"Downloading {remote_key} -> {local_path}") + env.pageserver_remote_storage.client.download_file( + env.pageserver_remote_storage.bucket_name, remote_key, local_path + ) + + log.info(f"Corrupting {local_path}") + with open(local_path) as manifest_json_file: + manifest_json = json.load(manifest_json_file) + for offloaded_timeline in manifest_json["offloaded_timelines"]: + offloaded_timeline["ancestor_retain_lsn"] = None + with open(local_path, "w") as manifest_json_file: + json.dump(manifest_json, manifest_json_file) + + log.info(f"Uploading {local_path} -> {remote_key}") + 
env.pageserver_remote_storage.client.upload_file( + local_path, env.pageserver_remote_storage.bucket_name, remote_key + ) + # The point of our earlier efforts was to provoke these + env.pageserver.allowed_errors.extend( + [ + ".*initial size calculation failed: PageRead.MissingKey.could not find data for key.*", + ".*page_service_conn_main.*could not find data for key.*", + ] + ) + env.pageserver.start() + + # Do an aggressive gc and compaction of the parent branch + ps_http.timeline_gc(tenant_id=tenant_id, timeline_id=root_timeline_id, gc_horizon=0) + ps_http.timeline_checkpoint( + tenant_id, + root_timeline_id, + force_l0_compaction=True, + force_repartition=True, + wait_until_uploaded=True, + compact=True, + ) + + if offload_child is not None: + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + # Now, after unarchival, the child timeline should still have its data accessible (or corrupted) + if offload_child == "offload-corrupt": + with pytest.raises(RuntimeError, match=".*failed to get basebackup.*"): + env.endpoints.create_start( + "test_archived_branch", tenant_id=tenant_id, basebackup_request_tries=1 + ) + else: + with env.endpoints.create_start("test_archived_branch", tenant_id=tenant_id) as endpoint: + sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200") + assert sum == pre_branch_sum diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 0e8519e07b..ef0eb05612 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -869,8 +869,17 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): assert count == 10000 -@pytest.mark.parametrize("mode", ["delete_timeline", "delete_tenant"]) -@pytest.mark.parametrize("sharded", [False, True]) +@pytest.mark.parametrize( + "mode, sharded", + [ + ("delete_timeline", False), +
("delete_timeline", True), + ("delete_tenant", False), + # the shared/exclusive lock for tenant is blocking this: + # timeline detach ancestor takes shared, delete tenant takes exclusive + # ("delete_tenant", True) + ], +) def test_timeline_detach_ancestor_interrupted_by_deletion( neon_env_builder: NeonEnvBuilder, mode: str, sharded: bool ): @@ -885,11 +894,6 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( - shutdown winning over complete, see test_timeline_is_deleted_before_timeline_detach_ancestor_completes """ - if sharded and mode == "delete_tenant": - # the shared/exclusive lock for tenant is blocking this: - # timeline detach ancestor takes shared, delete tenant takes exclusive - pytest.skip("tenant deletion while timeline ancestor detach is underway cannot happen") - shard_count = 2 if sharded else 1 neon_env_builder.num_pageservers = shard_count diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 157390c01c..0676b3dd9a 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -54,6 +54,8 @@ from fixtures.utils import ( PropagatingThread, get_dir_size, query_scalar, + run_only_on_default_postgres, + skip_in_debug_build, start_in_background, wait_until, ) @@ -1506,15 +1508,10 @@ class SafekeeperEnv: port=port.http, auth_token=None, ) - try: - safekeeper_process = start_in_background( - cmd, safekeeper_dir, "safekeeper.log", safekeeper_client.check_status - ) - return safekeeper_process - except Exception as e: - log.error(e) - safekeeper_process.kill() - raise Exception(f"Failed to start safekepeer as {cmd}, reason: {e}") from e + safekeeper_process = start_in_background( + cmd, safekeeper_dir, "safekeeper.log", safekeeper_client.check_status + ) + return safekeeper_process def get_safekeeper_connstrs(self): assert self.safekeepers is not None, "safekeepers are not initialized" @@ -2109,10 +2106,9 @@ def 
test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): # The only way to verify this without manipulating time is to sleep for a while. # In this test we sleep for 60 seconds, so this test takes at least 1 minute to run. # This is longer than most other tests, we run it only for v16 to save CI resources. +@run_only_on_default_postgres("run only on release build to save CI resources") +@skip_in_debug_build("run only on release build to save CI resources") def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): - if os.environ.get("PYTEST_CURRENT_TEST", "").find("[debug-pg16]") == -1: - pytest.skip("run only on debug postgres v16 to save CI resources") - neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 92306469f8..d3e989afa8 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -14,6 +14,7 @@ from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper from fixtures.remote_storage import RemoteStorageKind +from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: from typing import Optional @@ -602,7 +603,7 @@ async def run_segment_init_failure(env: NeonEnv): sk = env.safekeepers[0] sk_http = sk.http_client() - sk_http.configure_failpoints([("sk-write-zeroes", "return")]) + sk_http.configure_failpoints([("sk-zero-segment", "return")]) conn = await ep.connect_async() ep.safe_psql("select pg_switch_wal()") # jump to the segment boundary # next insertion should hang until failpoint is disabled. 
@@ -760,10 +761,8 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat # The test takes more than default 5 minutes on Postgres 16, # see https://github.com/neondatabase/neon/issues/5305 @pytest.mark.timeout(600) +@skip_in_debug_build("times out in debug builds") def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, build_type: str): - if build_type == "debug": - pytest.skip("times out in debug builds") - neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 05b6ad8a9b..c8e51fde13 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -64,6 +64,7 @@ def test_wal_restore( ), str(data_dir), str(port), + env.pg_version, ] ) restored.start() @@ -127,6 +128,7 @@ def test_wal_restore_initdb( ), str(data_dir), str(port), + env.pg_version, ] ) restored.start() diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 2199b83fb7..c5e0d642ef 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 2199b83fb72680001ce0f43bf6187a21dfb8f45d +Subproject commit c5e0d642efb02e4bfedc283b0a7707fe6c79cc89 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 22e580fe9f..1feff6b60f 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 22e580fe9ffcea7e02592110b1c9bf426d83cada +Subproject commit 1feff6b60f07cb71b665d0f5ead71a4320a71743 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index e131a9c027..b0b693ea29 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit e131a9c027b202ce92bd7b9cf2569d48a6f9948e +Subproject commit b0b693ea298454e95e6b154780d1fd586a244dfd diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 9ad2f3c5c3..aa2e29f2b6 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 
9ad2f3c5c37c08069a01c1e3f6b7cf275437e0cb +Subproject commit aa2e29f2b6952140dfe51876bbd11054acae776f diff --git a/vendor/revisions.json b/vendor/revisions.json index 18bde18359..a1f2bc5dd1 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.0", - "9ad2f3c5c37c08069a01c1e3f6b7cf275437e0cb" + "17.1", + "aa2e29f2b6952140dfe51876bbd11054acae776f" ], "v16": [ - "16.4", - "e131a9c027b202ce92bd7b9cf2569d48a6f9948e" + "16.5", + "b0b693ea298454e95e6b154780d1fd586a244dfd" ], "v15": [ - "15.8", - "22e580fe9ffcea7e02592110b1c9bf426d83cada" + "15.9", + "1feff6b60f07cb71b665d0f5ead71a4320a71743" ], "v14": [ - "14.13", - "2199b83fb72680001ce0f43bf6187a21dfb8f45d" + "14.14", + "c5e0d642efb02e4bfedc283b0a7707fe6c79cc89" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 02deecd385..ae4018a884 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -64,7 +64,7 @@ rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } -reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } +reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "rustls-tls-native-roots", "stream"] } rustls = { version = "0.23", default-features = false, features = ["logging", "ring", "std", "tls12"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] }