diff --git a/.dockerignore b/.dockerignore index 0667d8870e..2c78951923 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,18 +1,18 @@ -**/.git/ -**/__pycache__ -**/.pytest_cache +* -.git -target -tmp_check -tmp_install -tmp_check_cli -test_output -.vscode -.neon -integration_tests/.neon -.mypy_cache - -Dockerfile -.dockerignore +!Cargo.toml +!Cargo.lock +!Makefile +!.cargo/ +!.config/ +!control_plane/ +!compute_tools/ +!libs/ +!pageserver/ +!pgxn/ +!proxy/ +!safekeeper/ +!vendor/postgres/ +!workspace_hack/ +!neon_local/ diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000000..3afa4b683c --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1 @@ +4c2bb43775947775401cbb9d774823c5723a91f8 diff --git a/.github/ISSUE_TEMPLATE/bug-template.md b/.github/ISSUE_TEMPLATE/bug-template.md new file mode 100644 index 0000000000..d33eec3cde --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-template.md @@ -0,0 +1,23 @@ +--- +name: Bug Template +about: Used for describing bugs +title: '' +labels: t/bug +assignees: '' + +--- + +## Steps to reproduce + + +## Expected result + + +## Actual result + + +## Environment + + +## Logs, links +- diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md new file mode 100644 index 0000000000..33ad7b1ef5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -0,0 +1,25 @@ +--- +name: Epic Template +about: A set of related tasks contributing towards a specific outcome, comprising + more than 1 week of work. 
+title: 'Epic: ' +labels: t/Epic +assignees: '' + +--- + +## Motivation + + +## DoD + + +## Implementation ideas + + +## Tasks +- [ ] + + +## Other related tasks and Epics +- diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md new file mode 100644 index 0000000000..6f86114060 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md @@ -0,0 +1,20 @@ +## Release 202Y-MM-DD + +**NB: this PR must be merged only by 'Create a merge commit'!** + +### Checklist when preparing for release +- [ ] Read or refresh [the release flow guide](https://github.com/neondatabase/cloud/wiki/Release:-general-flow) +- [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to roll out the release. Any blockers? +- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan? + + + +### Checklist after release +- [ ] Based on the merged commits write release notes and open a PR into the `website` repo ([example](https://github.com/neondatabase/website/pull/120/files)) +- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel +- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true) +- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some) +- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1) +- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time) + + diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml new file mode 100644 index 0000000000..34761f8df1 --- /dev/null +++ 
b/.github/actions/allure-report/action.yml @@ -0,0 +1,217 @@ +name: 'Create Allure report' +description: 'Create and publish Allure report' + +inputs: + action: + description: 'generate or store' + required: true + build_type: + description: '`build_type` from run-python-test-set action' + required: true + test_selection: + description: '`test_selection` from run-python-test-set action' + required: false + +runs: + using: "composite" + steps: + - name: Validate input parameters + shell: bash -euxo pipefail {0} + run: | + if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then + echo 2>&1 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only" + exit 1 + fi + + if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then + echo 2>&1 "inputs.test_selection must be set for 'store' action" + exit 2 + fi + + - name: Calculate key + id: calculate-key + shell: bash -euxo pipefail {0} + run: | + # TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key + + pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) + if [ "${pr_number}" != "null" ]; then + key=pr-${pr_number} + elif [ "${GITHUB_REF}" = "refs/heads/main" ]; then + # Shortcut for a special branch + key=main + else + key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -c "[:alnum:]._-" "-") + fi + echo "::set-output name=KEY::${key}" + + - uses: actions/setup-java@v3 + if: ${{ inputs.action == 'generate' }} + with: + distribution: 'temurin' + java-version: '17' + + - name: Install Allure + if: ${{ inputs.action == 'generate' }} + shell: bash -euxo pipefail {0} + run: | + if ! 
which allure; then + ALLURE_ZIP=allure-${ALLURE_VERSION}.zip + wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP} + echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c + unzip -q ${ALLURE_ZIP} + echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH + rm -f ${ALLURE_ZIP} + fi + env: + ALLURE_VERSION: 2.19.0 + ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464 + + - name: Upload Allure results + if: ${{ inputs.action == 'store' }} + env: + REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }} + RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }} + TEST_OUTPUT: /tmp/test_output + BUCKET: neon-github-public-dev + shell: bash -euxo pipefail {0} + run: | + # Add metadata + cat < $TEST_OUTPUT/allure/results/executor.json + { + "name": "GitHub Actions", + "type": "github", + "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html", + "buildOrder": ${GITHUB_RUN_ID}, + "buildName": "GitHub Actions Run #${{ github.run_number }}/${GITHUB_RUN_ATTEMPT}", + "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}", + "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html", + "reportName": "Allure Report" + } + EOF + cat < $TEST_OUTPUT/allure/results/environment.properties + TEST_SELECTION=${{ inputs.test_selection }} + BUILD_TYPE=${{ inputs.build_type }} + EOF + + ARCHIVE="${GITHUB_RUN_ID}-${{ inputs.test_selection }}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst" + ZSTD_NBTHREADS=0 + + tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd . 
+ aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}" + + # Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this + - name: Acquire Allure lock + if: ${{ inputs.action == 'generate' }} + shell: bash -euxo pipefail {0} + env: + LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt + BUCKET: neon-github-public-dev + run: | + LOCK_TIMEOUT=300 # seconds + + for _ in $(seq 1 5); do + for i in $(seq 1 ${LOCK_TIMEOUT}); do + LOCK_ADDED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true) + # `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS) + if [ -z "${LOCK_ADDED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ADDED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then + break + fi + sleep 1 + done + echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" > lock.txt + aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}" + + # A double-check that exactly WE have acquired the lock + aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt + if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then + break + fi + done + + - name: Generate and publish final Allure report + if: ${{ inputs.action == 'generate' }} + id: generate-report + env: + REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }} + RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }} + TEST_OUTPUT: /tmp/test_output + BUCKET: neon-github-public-dev + shell: bash -euxo pipefail {0} + run: | + # Get previously uploaded data for this run + ZSTD_NBTHREADS=0 + + s3_filepaths=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/${GITHUB_RUN_ID}- | jq --raw-output '.Contents[].Key') + if [ -z "$s3_filepaths" ]; then + # There's no 
previously uploaded data for this run + exit 0 + fi + for s3_filepath in ${s3_filepaths}; do + aws s3 cp --only-show-errors "s3://${BUCKET}/${s3_filepath}" "${TEST_OUTPUT}/allure/" + + archive=${TEST_OUTPUT}/allure/$(basename $s3_filepath) + mkdir -p ${archive%.tar.zst} + tar -xf ${archive} -C ${archive%.tar.zst} + rm -f ${archive} + done + + # Get history trend + aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${TEST_OUTPUT}/allure/latest/history" || true + + # Generate report + allure generate --clean --output $TEST_OUTPUT/allure/report $TEST_OUTPUT/allure/* + + # Replace a logo link with a redirect to the latest version of the report + sed -i 's| ./index.html + + + + Redirecting to ${REPORT_URL} + + EOF + aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" + + echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY} + echo "::set-output name=REPORT_URL::${REPORT_URL}" + + - name: Release Allure lock + if: ${{ inputs.action == 'generate' && always() }} + shell: bash -euxo pipefail {0} + env: + LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt + BUCKET: neon-github-public-dev + run: | + aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0 + + if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then + aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" + fi + + - uses: actions/github-script@v6 + if: ${{ inputs.action == 'generate' && always() }} + env: + REPORT_URL: ${{ steps.generate-report.outputs.REPORT_URL }} + BUILD_TYPE: ${{ inputs.build_type }} + SHA: ${{ github.event.pull_request.head.sha || github.sha }} + with: + script: | + const { REPORT_URL, BUILD_TYPE, SHA } = process.env + + await github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: `${SHA}`, + state: 'success', + target_url: `${REPORT_URL}`, + context: `Allure report / ${BUILD_TYPE}`, 
+ }) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 3900f93ee4..2344fba13c 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -3,11 +3,11 @@ description: 'Runs a Neon python test set, performing all the required preparati inputs: build_type: - description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".' + description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug", or "remote" for the remote cluster' required: true rust_toolchain: description: 'Rust toolchain version to fetch the caches' - required: true + required: false test_selection: description: 'A python test suite to run' required: true @@ -24,7 +24,7 @@ inputs: required: false default: 'true' save_perf_report: - description: 'Whether to upload the performance report' + description: 'Whether to upload the performance report, if true PERF_TEST_RESULT_CONNSTR env variable should be set' required: false default: 'false' run_with_real_s3: @@ -52,6 +52,7 @@ runs: using: "composite" steps: - name: Get Neon artifact + if: inputs.build_type != 'remote' uses: ./.github/actions/download with: name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact @@ -78,7 +79,6 @@ runs: - name: Run pytest env: NEON_BIN: /tmp/neon/bin - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install TEST_OUTPUT: /tmp/test_output # this variable will be embedded in perf test report # and is needed to distinguish different environments @@ -88,6 +88,12 @@ runs: AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} shell: bash -euxo pipefail {0} run: | + export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} + + if [ "${BUILD_TYPE}" = "remote" ]; then + export REMOTE_ENV=1 + fi + PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)" rm -rf $PERF_REPORT_DIR @@ -119,6 +125,13 @@ runs: 
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) elif [[ "${{ inputs.build_type }}" == "release" ]]; then cov_prefix=() + else + cov_prefix=() + fi + + # Wake up the cluster if we use remote neon instance + if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then + ${POSTGRES_DISTRIB_DIR}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" fi # Run the tests. @@ -131,11 +144,12 @@ runs: # -n4 uses four processes to run tests via pytest-xdist # -s is not used to prevent pytest from capturing output, because tests are running # in parallel and logs are mixed between different tests + mkdir -p $TEST_OUTPUT/allure/results "${cov_prefix[@]}" ./scripts/pytest \ --junitxml=$TEST_OUTPUT/junit.xml \ + --alluredir=$TEST_OUTPUT/allure/results \ --tb=short \ --verbose \ - -m "not remote_cluster" \ -rA $TEST_SELECTION $EXTRA_PARAMS if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then @@ -146,17 +160,10 @@ runs: fi fi - - name: Delete all data but logs - shell: bash -euxo pipefail {0} + - name: Create Allure report if: always() - run: | - du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! 
-name "*.metrics" -delete - du -sh /tmp/test_output/* - - - name: Upload python test logs - if: always() - uses: ./.github/actions/upload + uses: ./.github/actions/allure-report with: - name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs - path: /tmp/test_output/ + action: store + build_type: ${{ inputs.build_type }} + test_selection: ${{ inputs.test_selection }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 8080d6b7db..4c58dda6b6 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -106,7 +106,7 @@ jobs: mkdir -p perf-report-staging # Set --sparse-ordering option of pytest-order plugin to ensure tests are running in order of appears in the file, # it's important for test_perf_pgbench.py::test_pgbench_remote_* tests - ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --skip-interfering-proc-check --out-dir perf-report-staging --timeout 5400 + ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --out-dir perf-report-staging --timeout 5400 - name: Submit result env: @@ -128,9 +128,9 @@ jobs: env: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: "10gb" - REMOTE_ENV: "1" POSTGRES_DISTRIB_DIR: /usr TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote strategy: fail-fast: false @@ -138,23 +138,15 @@ jobs: connstr: [ BENCHMARK_CAPTEST_CONNSTR, BENCHMARK_RDS_CONNSTR ] runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2817580636 + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned + options: --init timeout-minutes: 360 # 6h steps: - uses: actions/checkout@v3 - - name: Cache poetry deps - id: cache_poetry - uses: actions/cache@v3 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps 
- run: ./scripts/pysync - - name: Calculate platform id: calculate-platform env: @@ -173,50 +165,56 @@ jobs: - name: Install Deps run: | - echo "deb http://apt.postgresql.org/pub/repos/apt focal-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list - wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - sudo apt -y update - sudo apt install -y postgresql-14 postgresql-client-14 + sudo apt install -y postgresql-14 - name: Benchmark init + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: true + extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init env: PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} - run: | - mkdir -p perf-report-captest - - psql $BENCHMARK_CONNSTR -c "SELECT 1;" - ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_init -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600 - - - name: Benchmark simple-update - env: - PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} - BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} - run: | - psql $BENCHMARK_CONNSTR -c "SELECT 1;" - ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_simple_update -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600 - - - name: Benchmark select-only - env: - PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} - BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} - run: | - psql $BENCHMARK_CONNSTR -c "SELECT 1;" - ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_select_only -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600 - - - name: Submit result - env: VIP_VAP_ACCESS_TOKEN: "${{ 
secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - run: | - REPORT_FROM=$(realpath perf-report-captest) REPORT_TO=staging scripts/generate_and_push_perf_report.sh - - name: Upload logs - if: always() - uses: ./.github/actions/upload + - name: Benchmark simple-update + uses: ./.github/actions/run-python-test-set with: - name: bench-captest-${{ steps.calculate-platform.outputs.PLATFORM }} - path: /tmp/test_output/ + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: true + extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update + env: + PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} + BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Benchmark select-only + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: true + extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only + env: + PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }} + BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Create Allure report + uses: ./.github/actions/allure-report + with: + action: generate + build_type: ${{ env.BUILD_TYPE }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6b76b6e5fc..a3314738fa 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -95,11 +95,11 @@ jobs: if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage 
--profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" CARGO_FEATURES="" - CARGO_FLAGS="" + CARGO_FLAGS="--locked" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" CARGO_FEATURES="--features profiling" - CARGO_FLAGS="--release $CARGO_FEATURES" + CARGO_FLAGS="--locked --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV @@ -121,8 +121,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - name: Cache postgres build id: cache_pg @@ -136,6 +136,10 @@ jobs: run: mold -run make postgres -j$(nproc) shell: bash -euxo pipefail {0} + - name: Build neon extensions + run: mold -run make neon-pg-ext -j$(nproc) + shell: bash -euxo pipefail {0} + - name: Run cargo build run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests @@ -202,7 +206,7 @@ jobs: if: matrix.build_type == 'debug' uses: ./.github/actions/save-coverage-data - pg_regress-tests: + regress-tests: runs-on: dev container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned @@ -220,42 +224,13 @@ jobs: submodules: true fetch-depth: 2 - - name: Pytest regress tests + - name: Pytest regression tests uses: ./.github/actions/run-python-test-set with: build_type: ${{ matrix.build_type }} rust_toolchain: ${{ matrix.rust_toolchain }} - test_selection: batch_pg_regress + test_selection: regress needs_postgres_source: true - - - name: Merge and upload coverage data - if: matrix.build_type == 'debug' - uses: ./.github/actions/save-coverage-data - - 
other-tests: - runs-on: dev - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned - options: --init - needs: [ build-neon ] - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - rust_toolchain: [ 1.58 ] - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 2 - - - name: Pytest other tests - uses: ./.github/actions/run-python-test-set - with: - build_type: ${{ matrix.build_type }} - rust_toolchain: ${{ matrix.rust_toolchain }} - test_selection: batch_others run_with_real_s3: true real_s3_bucket: ci-tests-s3 real_s3_region: us-west-2 @@ -298,12 +273,35 @@ jobs: # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones + merge-allure-report: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ regress-tests, benchmarks ] + if: always() + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + + - name: Create Allure report + uses: ./.github/actions/allure-report + with: + action: generate + build_type: ${{ matrix.build_type }} + coverage-report: runs-on: dev container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ other-tests, pg_regress-tests ] + needs: [ regress-tests ] strategy: fail-fast: false matrix: @@ -325,7 +323,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + key: v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download @@ -460,19 +458,18 @@ jobs: - name: Configure ECR login run: echo "{\"credsStore\":\"ecr-login\"}" > 
/kaniko/.docker/config.json - - name: Kaniko build compute node - working-directory: ./vendor/postgres/ - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + - name: Kaniko build compute node with extensions + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID promote-images: runs-on: dev - needs: [ neon-image, compute-tools-image, compute-node-image ] + needs: [ neon-image, compute-node-image, compute-tools-image ] if: github.event_name != 'workflow_dispatch' container: amazon/aws-cli strategy: fail-fast: false matrix: - name: [ neon, compute-tools, compute-node ] + name: [ neon, compute-node, compute-tools ] steps: - name: Promote image to latest @@ -489,18 +486,6 @@ jobs: run: | go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 - -# - name: Get build tag -# run: | -# if [[ "$GITHUB_REF_NAME" == "main" ]]; then -# echo "::set-output name=tag::$(git rev-list --count HEAD)" -# elif [[ "$GITHUB_REF_NAME" == "release" ]]; then -# echo "::set-output name=tag::release-$(git rev-list --count HEAD)" -# else -# echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " -# echo "::set-output name=tag::$GITHUB_RUN_ID" -# fi -# id: build-tag - name: Configure ECR login run: | @@ -516,6 +501,9 @@ jobs: - name: Pull compute node image from ECR run: crane pull 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node + - name: Pull rust image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust + - name: Configure docker login run: | # ECR Credential Helper & Docker Hub don't work together in config, hence reset @@ -531,6 +519,9 @@ jobs: - name: Push compute node image to Docker Hub run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}} + - name: Push rust image to Docker Hub + run: crane push rust neondatabase/rust:pinned + - name: Add latest tag to images if: | (github.ref_name == 'main' || github.ref_name == 'release') && @@ -567,7 +558,7 @@ jobs: #container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly - needs: [ push-docker-hub, calculate-deploy-targets, tag ] + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' @@ -622,7 +613,7 @@ jobs: runs-on: dev container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, calculate-deploy-targets, tag ] + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 6f13a38dea..b64ea8a01f 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -65,7 +65,7 @@ jobs: - name: Cache postgres build id: cache_pg - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | tmp_install/ @@ -81,6 +81,9 @@ jobs: if: steps.cache_pg.outputs.cache-hit != 'true' run: make postgres + - name: Build neon extensions + run: make neon-pg-ext + # Plain configure output can contain weird errors like 'error: C compiler cannot create executables' # and the real cause will be inside config.log - name: Print configure logs in case of failure @@ -94,20 +97,20 @@ jobs: - name: Cache cargo deps id: cache_cargo - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | ~/.cargo/registry !~/.cargo/registry/src ~/.cargo/git target - key: v2-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} + key: v3-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} - name: Run cargo clippy run: ./run_clippy.sh - name: Ensure all project builds - run: cargo build --all --all-targets + run: cargo build --locked --all --all-targets check-codestyle-python: runs-on: [ self-hosted, Linux, k8s-runner ] @@ -128,8 +131,14 @@ jobs: - name: Install Python deps run: ./scripts/pysync - - name: Run yapf to ensure code format - run: poetry run yapf --recursive --diff . + - name: Run isort to ensure code format + run: poetry run isort --diff --check . + + - name: Run black to ensure code format + run: poetry run black --diff --check . + + - name: Run flake8 to ensure code format + run: poetry run flake8 . 
- name: Run mypy to check types run: poetry run mypy . diff --git a/.github/workflows/notifications.yml b/.github/workflows/notifications.yml deleted file mode 100644 index 55dc979896..0000000000 --- a/.github/workflows/notifications.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: Send Notifications - -on: - push: - branches: [ main ] - -jobs: - send-notifications: - timeout-minutes: 30 - name: send commit notifications - runs-on: ubuntu-latest - - steps: - - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: true - fetch-depth: 2 - - - name: Form variables for notification message - id: git_info_grab - run: | - git_stat=$(git show --stat=50) - git_stat="${git_stat//'%'/'%25'}" - git_stat="${git_stat//$'\n'/'%0A'}" - git_stat="${git_stat//$'\r'/'%0D'}" - git_stat="${git_stat// / }" # space -> 'Space En', as github tends to eat ordinary spaces - echo "::set-output name=git_stat::$git_stat" - echo "::set-output name=sha_short::$(git rev-parse --short HEAD)" - echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})" - - - name: Send notification - uses: appleboy/telegram-action@master - with: - to: ${{ secrets.TELEGRAM_TO }} - token: ${{ secrets.TELEGRAM_TOKEN }} - format: markdown - args: | - *@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }}) - - ``` - ${{ steps.git_info_grab.outputs.git_stat }} - ``` - diff --git a/.yapfignore b/.yapfignore deleted file mode 100644 index 149428e452..0000000000 --- a/.yapfignore +++ /dev/null @@ -1,10 +0,0 @@ -# This file is only read when `yapf` is run from this directory. -# Hence we only top-level directories here to avoid confusion. 
-# See source code for the exact file format: https://github.com/google/yapf/blob/c6077954245bc3add82dafd853a1c7305a6ebd20/yapf/yapflib/file_resources.py#L40-L43 -vendor/ -target/ -tmp_install/ -__pycache__/ -test_output/ -.neon/ -.git/ diff --git a/Cargo.lock b/Cargo.lock index a70b2b7dc9..2e300e46f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -48,9 +48,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.58" +version = "1.0.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb07d2053ccdbe10e2af2995a2f116c1330396493dc1269f6a91d0ae82e19704" +checksum = "c91f1f46651137be86f3a2b9a8359f9ab421d04d941c62b5982e1ca21113adf9" dependencies = [ "backtrace", ] @@ -77,7 +77,7 @@ dependencies = [ "num-traits", "rusticata-macros", "thiserror", - "time 0.3.11", + "time 0.3.12", ] [[package]] @@ -126,9 +126,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.56" +version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96cf8829f67d2eab0b2dfa42c5d0ef737e0724e4a82b01b3e292456202b19716" +checksum = "76464446b8bc32758d7e88ee1a804d9914cd9b1cb264c029899680b0be29826f" dependencies = [ "proc-macro2", "quote", @@ -166,7 +166,7 @@ dependencies = [ "http", "http-body", "hyper", - "itoa 1.0.2", + "itoa 1.0.3", "matchit", "memchr", "mime", @@ -298,9 +298,9 @@ checksum = "37ccbd214614c6783386c1af30caf03192f17891059cecc394b4fb119e363de3" [[package]] name = "bytemuck" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c53dfa917ec274df8ed3c572698f381a24eef2efba9492d797301b72b6db408a" +checksum = "a5377c8865e74a160d21f29c2d40669f53286db6eab59b88540cbb12ffc8b835" [[package]] name = "byteorder" @@ -310,9 +310,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.1.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" +checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db" dependencies = [ "serde", ] @@ -386,9 +386,9 @@ dependencies = [ [[package]] name = "clap" -version = "3.2.12" +version = "3.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab8b79fe3946ceb4a0b1c080b4018992b8d27e9ff363644c1c9b6387c854614d" +checksum = "a3dbbb6653e7c55cc8595ad3e1f7be8f32aba4eb7ff7f0fd1163d4f3d137c0a9" dependencies = [ "atty", "bitflags", @@ -455,7 +455,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 3.2.12", + "clap 3.2.16", "env_logger", "hyper", "log", @@ -601,9 +601,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.5" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c02a4d71819009c192cf4872265391563fd6a84c81ff2c0f2a7026ca4c1d85c" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" dependencies = [ "cfg-if", "crossbeam-utils", @@ -611,9 +611,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -622,9 +622,9 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07db9d94cbd326813772c968ccd25999e5f8ae22f4f8d1b11effa37ef6ce281d" +checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1" dependencies = [ "autocfg", "cfg-if", @@ -636,9 +636,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "7d82ee10ce34d7bc12c2122495e7593a9c41347ecdd64185af4ecf72cb1a7f83" +checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" dependencies = [ "cfg-if", "once_cell", @@ -917,9 +917,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3fcf0cee53519c866c09b5de1f6c56ff9d647101f81c1964fa632e148896cdf" +checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" dependencies = [ "instant", ] @@ -1086,9 +1086,9 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.5" +version = "0.14.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd48d33ec7f05fbfa152300fdad764757cbded343c1aa1cff2fbaf4134851803" +checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" dependencies = [ "typenum", "version_check", @@ -1164,20 +1164,14 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" -[[package]] -name = "hashbrown" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" -dependencies = [ - "ahash", -] - [[package]] name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +dependencies = [ + "ahash", +] [[package]] name = "heck" @@ -1245,7 +1239,7 @@ checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" dependencies = [ "bytes", "fnv", - "itoa 1.0.2", + "itoa 1.0.3", ] [[package]] @@ -1308,7 +1302,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 1.0.2", + "itoa 1.0.3", "pin-project-lite", "socket2", "tokio", @@ 
-1379,7 +1373,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" dependencies = [ "autocfg", - "hashbrown 0.12.3", + "hashbrown", ] [[package]] @@ -1391,7 +1385,7 @@ dependencies = [ "ahash", "atty", "indexmap", - "itoa 1.0.2", + "itoa 1.0.3", "lazy_static", "log", "num-format", @@ -1432,15 +1426,15 @@ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" [[package]] name = "itoa" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d" +checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754" [[package]] name = "js-sys" -version = "0.3.58" +version = "0.3.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3fac17f7123a73ca62df411b1bf727ccc805daa070338fda671c86dac1bdc27" +checksum = "258451ab10b34f8af53416d1fdab72c22e805f0c92a1136d59470ec0b11138b2" dependencies = [ "wasm-bindgen", ] @@ -1482,9 +1476,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.126" +version = "0.2.127" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" +checksum = "505e71a4706fa491e9b1b55f51b95d4037d0821ee40131190475f692b35b009b" [[package]] name = "libloading" @@ -1659,7 +1653,7 @@ name = "neon_local" version = "0.1.0" dependencies = [ "anyhow", - "clap 3.2.12", + "clap 3.2.16", "comfy-table", "control_plane", "git-version", @@ -1854,7 +1848,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 3.2.12", + "clap 3.2.16", "close_fds", "const_format", "crc32c", @@ -2111,7 +2105,6 @@ dependencies = [ "bindgen", "byteorder", "bytes", - "chrono", "crc32c", "env_logger", "hex", @@ -2155,9 +2148,9 @@ checksum = 
"eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" [[package]] name = "prettyplease" -version = "0.1.16" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da6ffbe862780245013cb1c0a48c4e44b7d665548088f91f6b90876d0625e4c2" +checksum = "697ae720ee02011f439e0701db107ffe2916d83f718342d65d7f8bf7b8a5fee9" dependencies = [ "proc-macro2", "syn", @@ -2171,9 +2164,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.40" +version = "1.0.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd96a1e8ed2596c337f8eae5f24924ec83f5ad5ab21ea8e455d3566c69fbcaf7" +checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab" dependencies = [ "unicode-ident", ] @@ -2271,13 +2264,14 @@ dependencies = [ "base64", "bstr", "bytes", - "clap 3.2.12", + "clap 3.2.16", "futures", "git-version", - "hashbrown 0.11.2", + "hashbrown", "hex", "hmac 0.12.1", "hyper", + "itertools", "md5", "metrics", "once_cell", @@ -2289,7 +2283,7 @@ dependencies = [ "routerify", "rstest", "rustls", - "rustls-pemfile 0.2.1", + "rustls-pemfile", "scopeguard", "serde", "serde_json", @@ -2315,20 +2309,11 @@ dependencies = [ "memchr", ] -[[package]] -name = "quickcheck" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" -dependencies = [ - "rand", -] - [[package]] name = "quote" -version = "1.0.20" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bcdf212e9776fbcb2d23ab029360416bb1706b1aea2d1a5ba002727cbcab804" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" dependencies = [ "proc-macro2", ] @@ -2411,9 +2396,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.13" +version = "0.2.16" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f25bc4c7e55e0b0b7a1d43fb893f4fa1361d0abe38b9ce4f323c2adfe6ef42" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ "bitflags", ] @@ -2508,7 +2493,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rustls", - "rustls-pemfile 1.0.0", + "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", @@ -2699,18 +2684,9 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "0.2.1" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eebeaeb360c87bfb72e84abdb3447159c0eaececf1bef2aecd65a8be949d1c9" -dependencies = [ - "base64", -] - -[[package]] -name = "rustls-pemfile" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7522c9de787ff061458fe9a829dc790a3f5b22dc571694fc5883f448b94d9a9" +checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55" dependencies = [ "base64", ] @@ -2726,15 +2702,15 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.8" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24c8ad4f0c00e1eb5bc7614d236a7f1300e3dbd76b68cac8e06fb00b015ad8d8" +checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8" [[package]] name = "ryu" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695" +checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" [[package]] name = "safekeeper" @@ -2744,7 +2720,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "clap 3.2.12", + "clap 3.2.16", "const_format", "crc32c", "daemonize", @@ -2835,15 +2811,15 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.12" +version = "1.0.13" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2333e6df6d6598f2b1974829f853c2b4c5f4a6e503c10af918081aa6f8564e1" +checksum = "93f6841e709003d68bb2deee8c343572bf446003ec20a583e76f7b15cebf3711" [[package]] name = "serde" -version = "1.0.139" +version = "1.0.142" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0171ebb889e45aa68b44aee0859b3eede84c6f5f5c228e6f140c0b2a0a46cad6" +checksum = "e590c437916fb6b221e1d00df6e3294f3fccd70ca7e92541c475d6ed6ef5fee2" dependencies = [ "serde_derive", ] @@ -2860,9 +2836,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.139" +version = "1.0.142" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc1d3230c1de7932af58ad8ffbe1d784bd55efd5a9d84ac24f69c72d83543dfb" +checksum = "34b5b8d809babe02f538c2cfec6f2c1ed10804c0e5a6a041a049a4f5588ccc2e" dependencies = [ "proc-macro2", "quote", @@ -2871,11 +2847,11 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.82" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82c2c1fdcd807d1098552c5b9a36e425e42e9fbd7c6a37a8425f390f781f7fa7" +checksum = "38dd04e3c8279e75b31ef29dbdceebfe5ad89f4d0937213c53f7d49d01b3d5a7" dependencies = [ - "itoa 1.0.2", + "itoa 1.0.3", "ryu", "serde", ] @@ -2887,7 +2863,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", - "itoa 1.0.2", + "itoa 1.0.3", "ryu", "serde", ] @@ -2992,7 +2968,7 @@ dependencies = [ "num-bigint", "num-traits", "thiserror", - "time 0.3.11", + "time 0.3.12", ] [[package]] @@ -3003,9 +2979,12 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "slab" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32" +checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +dependencies = [ + "autocfg", +] [[package]] name = "smallvec" @@ -3113,9 +3092,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.98" +version = "1.0.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd" +checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13" dependencies = [ "proc-macro2", "quote", @@ -3191,18 +3170,18 @@ checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" [[package]] name = "thiserror" -version = "1.0.31" +version = "1.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a" +checksum = "f5f6586b7f764adc0231f4c79be7b920e766bb2f3e51b3661cdb263828f19994" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.31" +version = "1.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a" +checksum = "12bafc5b54507e0149cdf1b145a5d80ab80a90bcd9275df43d4fff68460f6c21" dependencies = [ "proc-macro2", "quote", @@ -3231,14 +3210,14 @@ dependencies = [ [[package]] name = "time" -version = "0.3.11" +version = "0.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72c91f41dcb2f096c05f0873d667dceec1087ce5bcf984ec8ffb19acddbb3217" +checksum = "74b7cc93fc23ba97fde84f7eea56c55d1ba183f495c6715defdfc7b9cb8c870f" dependencies = [ - "itoa 1.0.2", + "itoa 1.0.3", + "js-sys", "libc", "num_threads", - "quickcheck", "time-macros", ] @@ -3275,9 +3254,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.20.0" +version = "1.20.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "57aec3cfa4c296db7255446efb4928a6be304b431a806216105542a67b6ca82e" +checksum = "7a8325f63a7d4774dd041e363b2409ed1c5cbbd0f867795e661df066b2b0a581" dependencies = [ "autocfg", "bytes", @@ -3607,9 +3586,9 @@ checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" [[package]] name = "unicode-ident" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7" +checksum = "c4f5b37a154999a8f3f98cc23a628d850e154479cd94decf3414696e12e31aaf" [[package]] name = "unicode-normalization" @@ -3679,7 +3658,7 @@ dependencies = [ "rand", "routerify", "rustls", - "rustls-pemfile 0.2.1", + "rustls-pemfile", "rustls-split", "serde", "serde_json", @@ -3728,7 +3707,7 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 3.2.12", + "clap 3.2.16", "env_logger", "log", "once_cell", @@ -3772,9 +3751,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.81" +version = "0.2.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c53b543413a17a202f4be280a7e5c62a1c69345f5de525ee64f8cfdbc954994" +checksum = "fc7652e3f6c4706c8d9cd54832c4a4ccb9b5336e2c3bd154d5cccfbf1c1f5f7d" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -3782,13 +3761,13 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.81" +version = "0.2.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5491a68ab4500fa6b4d726bd67408630c3dbe9c4fe7bda16d5c82a1fd8c7340a" +checksum = "662cd44805586bd52971b9586b1df85cdbbd9112e4ef4d8f41559c334dc6ac3f" dependencies = [ "bumpalo", - "lazy_static", "log", + "once_cell", "proc-macro2", "quote", "syn", @@ -3797,9 +3776,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.31" +version 
= "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de9a9cec1733468a8c657e57fa2413d2ae2c0129b95e87c5b72b8ace4d13f31f" +checksum = "fa76fb221a1f8acddf5b54ace85912606980ad661ac7a503b4570ffd3a624dad" dependencies = [ "cfg-if", "js-sys", @@ -3809,9 +3788,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.81" +version = "0.2.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c441e177922bc58f1e12c022624b6216378e5febc2f0533e41ba443d505b80aa" +checksum = "b260f13d3012071dfb1512849c033b1925038373aea48ced3012c09df952c602" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3819,9 +3798,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.81" +version = "0.2.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d94ac45fcf608c1f45ef53e748d35660f168490c10b23704c7779ab8f5c3048" +checksum = "5be8e654bdd9b79216c2929ab90721aa82faf65c48cdf08bdc4e7f51357b80da" dependencies = [ "proc-macro2", "quote", @@ -3832,15 +3811,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.81" +version = "0.2.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a89911bd99e5f3659ec4acf9c4d93b0a90fe4a2a11f15328472058edc5261be" +checksum = "6598dd0bd3c7d51095ff6531a5b23e02acdc81804e30d8f07afb77b7215a140a" [[package]] name = "web-sys" -version = "0.3.58" +version = "0.3.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fed94beee57daf8dd7d51f2b15dc2bcde92d7a72304cdf662a4371008b71b90" +checksum = "ed055ab27f941423197eb86b2035720b1a3ce40504df082cac2ecc6ed73335a1" dependencies = [ "js-sys", "wasm-bindgen", @@ -3965,6 +3944,7 @@ version = "0.1.0" dependencies = [ "ahash", "anyhow", + "bstr", "bytes", "chrono", "clap 2.34.0", @@ -3974,7 +3954,7 @@ dependencies = [ "futures-task", "futures-util", "generic-array", - "hashbrown 0.11.2", + "hashbrown", "hex", 
"hyper", "indexmap", @@ -3989,11 +3969,12 @@ dependencies = [ "prost", "rand", "regex", + "regex-automata", "regex-syntax", "scopeguard", "serde", "syn", - "time 0.3.11", + "time 0.3.12", "tokio", "tokio-util", "tracing", @@ -4015,7 +3996,7 @@ dependencies = [ "oid-registry", "rusticata-macros", "thiserror", - "time 0.3.11", + "time 0.3.12", ] [[package]] @@ -4044,6 +4025,6 @@ dependencies = [ [[package]] name = "zeroize" -version = "1.5.6" +version = "1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20b578acffd8516a6c3f2a1bdefc1ec37e547bb4e0fb8b6b01a4cafc886b4442" +checksum = "c394b5bd0c6f669e7275d9c20aa90ae064cb22e75a1cad54e1b34088034b149f" diff --git a/Dockerfile b/Dockerfile index 17aa0025e8..aa31e227da 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,27 @@ +### Creates a storage Docker image with postgres, pageserver, safekeeper and proxy binaries. +### The image itself is mainly used as a container for the binaries and for starting e2e tests with custom parameters. +### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used +### inside this image in the real deployments. +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG IMAGE=rust +ARG TAG=pinned + # Build Postgres -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS pg-build +FROM $REPOSITORY/$IMAGE:$TAG AS pg-build WORKDIR /home/nonroot -COPY vendor/postgres vendor/postgres -COPY Makefile Makefile +COPY --chown=nonroot vendor/postgres vendor/postgres +COPY --chown=nonroot pgxn pgxn +COPY --chown=nonroot Makefile Makefile ENV BUILD_TYPE release RUN set -e \ - && mold -run make -j $(nproc) -s postgres \ + && mold -run make -j $(nproc) -s neon-pg-ext \ && rm -rf tmp_install/build \ && tar -C tmp_install -czf /home/nonroot/postgres_install.tar.gz . 
# Build zenith binaries -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS build +FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local @@ -32,7 +41,7 @@ COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ - && mold -run cargo build --release \ +&& mold -run cargo build --locked --release \ && cachepot -s # Build final image @@ -58,7 +67,18 @@ COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy COPY --from=pg-build /home/nonroot/tmp_install/ /usr/local/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ +# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. +# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values. +RUN mkdir -p /data/.neon/ && chown -R zenith:zenith /data/.neon/ \ + && /usr/local/bin/pageserver -D /data/.neon/ --init \ + -c "id=1234" \ + -c "broker_endpoints=['http://etcd:2379']" \ + -c "pg_distrib_dir='/usr/local'" \ + -c "listen_pg_addr='0.0.0.0:6400'" \ + -c "listen_http_addr='0.0.0.0:9898'" + VOLUME ["/data"] USER zenith EXPOSE 6400 -CMD ["pageserver"] +EXPOSE 9898 +CMD ["/bin/bash"] diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node new file mode 100644 index 0000000000..2e031b17da --- /dev/null +++ b/Dockerfile.compute-node @@ -0,0 +1,114 @@ +ARG TAG=pinned +# apparently, ARGs don't get replaced in RUN commands in kaniko +# ARG POSTGIS_VERSION=3.3.0 +# ARG PLV8_VERSION=3.1.4 + +FROM debian:bullseye-slim AS build-deps +RUN apt update && \ + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ + libcurl4-openssl-dev libossp-uuid-dev + +# Build Postgres from the neon postgres repository. 
+FROM build-deps AS pg-build +COPY vendor/postgres postgres +RUN cd postgres && \ + ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ + # Install headers + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install + +# Build PostGIS from the upstream PostGIS mirror. PostGIS compiles against neon postgres sources without changes. +# Perhaps we could even use the upstream binaries, compiled against vanilla Postgres, but it would require some +# investigation to check that it works, and also keeps working in the future. So for now, we compile our own binaries. +FROM build-deps AS postgis-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +RUN apt update && \ + apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget + +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ + tar xvzf postgis-3.3.0.tar.gz && \ + cd postgis-3.3.0 && \ + ./autogen.sh && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + ./configure && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + cd extensions/postgis && \ + make clean && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control + +# Build plv8 +FROM build-deps AS plv8-build +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +RUN apt update && \ + apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config 
libc++-dev libc++abi-dev libglib2.0-dev + +# https://github.com/plv8/plv8/issues/475 +# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary +RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + apt update && \ + apt install -y --no-install-recommends -t testing binutils + +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ + tar xvzf v3.1.4.tar.gz && \ + cd plv8-3.1.4 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + rm -rf /plv8-* && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control + +# compile neon extensions +FROM build-deps AS neon-pg-ext-build +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY pgxn/ pgxn/ + +RUN make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/neon \ + -s install + +# Compile and run the Neon-specific `compute_ctl` binary +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools +USER nonroot +# Copy entire project to get Cargo.* files with proper dependencies for the whole project +COPY --chown=nonroot . . 
+RUN cd compute_tools && cargo build --locked --release + +# Put it all together into the final image +FROM debian:bullseye-slim +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ + chown -R postgres:postgres /var/db/postgres && \ + chmod 0750 /var/db/postgres/compute && \ + echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig + +# TODO: Check if we can make the extension setup more modular versus a linear build +# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc# +COPY --from=neon-pg-ext-build --chown=postgres /usr/local/pgsql /usr/local +COPY --from=compute-tools --chown=postgres /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl + +RUN apt update && \ + apt install -y libreadline-dev libossp-uuid-dev gdal-bin libgdal-dev libprotobuf-c-dev && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Debian bullseye provides GLIBC 2.31 when 2.34 is necessary as we compiled plv8 with that version +RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + apt update && \ + apt install -y --no-install-recommends -t testing binutils && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# "temporary" symlink for old control-plane +RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl + +USER postgres +ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/Dockerfile.compute-node.legacy b/Dockerfile.compute-node.legacy new file mode 100644 index 0000000000..ba34e2486f --- /dev/null +++ b/Dockerfile.compute-node.legacy @@ -0,0 +1,87 @@ +# +# Legacy version of the Dockerfile for the compute node. +# Used by e2e CI. Building Dockerfile.compute-node will take +# unreasonable ammount of time without v2 runners. 
+# +# TODO: remove once cloud repo CI is moved to v2 runners. +# + + +# Allow specifiyng different compute-tools tag and image repo, so we are +# able to use different images +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG IMAGE=compute-tools +ARG TAG=latest + +# +# Image with pre-built tools +# +FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps +# Only to get ready compute_ctl binary as deppendency + +# +# Image with Postgres build deps +# +FROM debian:buster-slim AS build-deps + +RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ + libcurl4-openssl-dev libossp-uuid-dev + +# +# Image with built Postgres +# +FROM build-deps AS pg-build + +# Add user postgres +RUN adduser postgres +RUN mkdir /pg && chown postgres:postgres /pg + +# Copy source files +COPY ./vendor/postgres /pg/ +COPY ./pgxn /pg/ + +# Build and install Postgres locally +RUN mkdir /pg/compute_build && cd /pg/compute_build && \ + ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \ + # Install main binaries and contribs + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ + # Install headers + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install + +# Install neon contrib +RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install + +USER postgres +WORKDIR /pg + +# +# Final compute node image to be exported +# +FROM debian:buster-slim + +# libreadline-dev is required to run psql +RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev + +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ + chown -R postgres:postgres 
/var/db/postgres && \ + chmod 0750 /var/db/postgres/compute + +# Copy ready Postgres binaries +COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local + +# Copy binaries from compute-tools +COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl + +# XXX: temporary symlink for compatibility with old control-plane +RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl + +# Add postgres shared objects to the search path +RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig + +USER postgres + +ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 05393021c2..8231cd0ebb 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,6 +1,10 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS rust-build +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG IMAGE=rust +ARG TAG=pinned + +FROM $REPOSITORY/$IMAGE:$TAG AS rust-build WORKDIR /home/nonroot # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. @@ -16,7 +20,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev COPY . . 
RUN set -e \ - && mold -run cargo build -p compute_tools --release \ + && mold -run cargo build -p compute_tools --locked --release \ && cachepot -s # Final image that only has one binary diff --git a/Makefile b/Makefile index fc75e9fc5e..9d7e1497e5 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,7 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 # Top level Makefile to build Zenith and PostgreSQL # .PHONY: all -all: zenith postgres +all: zenith postgres neon-pg-ext ### Zenith Rust bits # @@ -87,25 +87,39 @@ postgres: postgres-configure \ postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers` +@echo "Compiling PostgreSQL" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install - +@echo "Compiling contrib/neon" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install - +@echo "Compiling contrib/neon_test_utils" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install + +@echo "Compiling libpq" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq install +@echo "Compiling pg_buffercache" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install +@echo "Compiling pageinspect" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install - .PHONY: postgres-clean postgres-clean: $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq clean + +neon-pg-ext: postgres + +@echo "Compiling neon" + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \ + -C $(ROOT_PROJECT_DIR)/pgxn/neon install + +@echo "Compiling neon_test_utils" + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \ + -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils install + +.PHONY: neon-pg-ext-clean + $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean + $(MAKE) -C 
$(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean # This doesn't remove the effects of 'configure'. .PHONY: clean clean: cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean $(CARGO_CMD_PREFIX) cargo clean + cd pgxn/neon && $(MAKE) clean + cd pgxn/neon_test_utils && $(MAKE) clean # This removes everything .PHONY: distclean diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1e812f2aa0..58469b1c97 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -178,6 +178,7 @@ impl ComputeNode { .args(&["--sync-safekeepers"]) .env("PGDATA", &self.pgdata) // we cannot use -D in this mode .stdout(Stdio::piped()) + .stderr(Stdio::piped()) .spawn() .expect("postgres --sync-safekeepers failed to start"); @@ -187,10 +188,13 @@ impl ComputeNode { let sync_output = sync_handle .wait_with_output() .expect("postgres --sync-safekeepers failed"); + if !sync_output.status.success() { anyhow::bail!( - "postgres --sync-safekeepers exited with non-zero status: {}", + "postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}, stderr: {}", sync_output.status, + String::from_utf8(sync_output.stdout).expect("postgres --sync-safekeepers exited, and stdout is not utf-8"), + String::from_utf8(sync_output.stderr).expect("postgres --sync-safekeepers exited, and stderr is not utf-8"), ); } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 207d09d76b..ac065fa60c 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -62,9 +62,16 @@ impl GenericOption { /// Represent `GenericOption` as configuration option. 
pub fn to_pg_setting(&self) -> String { if let Some(val) = &self.value { + let name = match self.name.as_str() { + "safekeepers" => "neon.safekeepers", + "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout", + "wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout", + it => it, + }; + match self.vartype.as_ref() { - "string" => format!("{} = '{}'", self.name, val), - _ => format!("{} = {}", self.name, val), + "string" => format!("{} = '{}'", name, val), + _ => format!("{} = {}", name, val), } } else { self.name.to_owned() diff --git a/compute_tools/tests/cluster_spec.json b/compute_tools/tests/cluster_spec.json index bdd6e60a69..c29416d9c4 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -85,7 +85,7 @@ "vartype": "bool" }, { - "name": "safekeepers", + "name": "neon.safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501", "vartype": "string" }, @@ -181,7 +181,6 @@ } ] }, - "delta_operations": [ { "action": "delete_db", diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 1f2e188398..bae944440e 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -28,7 +28,7 @@ mod pg_helpers_tests { assert_eq!( spec.cluster.settings.as_pg_settings(), - "fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 
port=6400'" + "fsync = off\nwal_level = replica\nhot_standby = on\nneon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" ); } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index e78f96074e..57b5e1e10a 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -150,7 +150,7 @@ impl PostgresNode { let port: u16 = conf.parse_field("port", &context)?; let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?; let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?; - let uses_wal_proposer = conf.get("safekeepers").is_some(); + let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); // parse recovery_target_lsn, if any let recovery_target_lsn: Option = @@ -341,7 +341,7 @@ impl PostgresNode { .map(|sk| format!("localhost:{}", sk.pg_port)) .collect::>() .join(","); - conf.append("safekeepers", &safekeepers); + conf.append("neon.safekeepers", &safekeepers); } else { // We only use setup without safekeepers for tests, // and don't care about data durability on pageserver, diff --git a/docs/glossary.md b/docs/glossary.md index 665596c68d..25c66828c0 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -92,6 +92,7 @@ The layer map tracks what layers exist in a timeline. ### Layered repository Neon repository implementation that keeps data in layers. 
+ ### LSN The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log. @@ -125,6 +126,26 @@ TODO: use this name consistently in remote storage code. Now `disk_consistent_ls * `ancestor_lsn` - LSN of the branch point (the LSN at which this branch was created) TODO: add table that describes mapping between PostgreSQL (compute), safekeeper and pageserver LSNs. + +### Logical size + +The pageserver tracks the "logical size" of a timeline. It is the +total size of all relations in all Postgres databases on the +timeline. It includes all user and system tables, including their FSM +and VM forks. But it does not include SLRUs, twophase files or any +other such data or metadata that lives outside relations. + +The logical size is calculated by the pageserver, and is sent to +PostgreSQL via feedback messages to the safekeepers. PostgreSQL uses +the logical size to enforce the size limit in the free tier. The +logical size is also shown to users in the web console. + +The logical size is not affected by branches or the physical layout of +layer files in the pageserver. If you have a database with 1 GB +logical size and you create a branch of it, both branches will have 1 +GB logical size, even though the branch is copy-on-write and won't +consume any extra physical disk space until you make changes to it. + ### Page (block) The basic structure used to store relation data. All pages are of the same size. diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 39f7be89a0..88f4b0e559 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -112,11 +112,13 @@ Run `poetry shell` to activate the virtual environment. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`. ### Obligatory checks -We force code formatting via `yapf` and type hints via `mypy`. -Run the following commands in the repository's root (next to `setup.cfg`): +We force code formatting via `black`, `isort` and type hints via `mypy`. 
+Run the following commands in the repository's root (next to `pyproject.toml`): ```bash -poetry run yapf -ri . # All code is reformatted +poetry run isort . # Imports are reformatted +poetry run black . # All code is reformatted +poetry run flake8 . # Python linter poetry run mypy . # Ensure there are no typing errors ``` @@ -125,7 +127,7 @@ Otherwise it will not find its configuration. Also consider: -* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any. +* Running `pycodestyle` (or a linter of your choice) and fixing possible defects, if any. * Adding more type hints to your code to avoid `Any`. ### Changing dependencies diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 0118701a7e..5b9ecb7394 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" edition = "2021" [dependencies] -chrono = "0.4.19" rand = "0.8.3" regex = "1.4.5" bytes = "1.0.1" diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs index 04ef346d88..1de1d367e0 100644 --- a/libs/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -1,8 +1,8 @@ //! //! Common utilities for dealing with PostgreSQL non-relation files. //! -use crate::transaction_id_precedes; use super::pg_constants; +use crate::transaction_id_precedes; use bytes::BytesMut; use log::*; diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index 0e1c9567cb..b509fc87a5 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -8,9 +8,9 @@ //! to look deeper into the WAL records to also understand which blocks they modify, the code //! for that is in pageserver/src/walrecord.rs //! 
+use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; use super::pg_constants; use super::xlog_utils::*; -use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use crc32c::*; use log::*; @@ -170,6 +170,7 @@ impl WalStreamDecoder { } State::SkippingEverything { .. } => {} } + // now read page contents match &mut self.state { State::WaitingForRecord => { // need to have at least the xl_tot_len field @@ -194,8 +195,8 @@ impl WalStreamDecoder { return Ok(Some(self.complete_record(recordbuf)?)); } else { // Need to assemble the record from pieces. Remember the size of the - // record, and loop back. On next iteration, we will reach the 'else' - // branch below, and copy the part of the record that was on this page + // record, and loop back. On next iterations, we will reach the branch + // below, and copy the part of the record that was on this or next page(s) // to 'recordbuf'. Subsequent iterations will skip page headers, and // append the continuations from the next pages to 'recordbuf'. self.state = State::ReassemblingRecord { diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 07f8cb08aa..d5ad2f8633 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -42,19 +42,13 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; -pub trait RemoteObjectName { - // Needed to retrieve last component for RemoteObjectId. - // In other words a file name - fn object_name(&self) -> Option<&str>; -} - /// Storage (potentially remote) API to manage its state. /// This storage tries to be unaware of any layered repository context, /// providing basic CRUD operations for storage files. 
#[async_trait::async_trait] pub trait RemoteStorage: Send + Sync { /// A way to uniquely reference a file in the remote storage. - type RemoteObjectId: RemoteObjectName; + type RemoteObjectId; /// Attempts to derive the storage path out of the local path, if the latter is correct. fn remote_object_id(&self, local_path: &Path) -> anyhow::Result; @@ -71,7 +65,7 @@ pub trait RemoteStorage: Send + Sync { /// so this method doesnt need to. async fn list_prefixes( &self, - prefix: Option, + prefix: Option<&Self::RemoteObjectId>, ) -> anyhow::Result>; /// Streams the local file contents into remote into the remote storage entry. @@ -163,6 +157,13 @@ impl GenericRemoteStorage { } } } + + pub fn as_local(&self) -> Option<&LocalFs> { + match self { + Self::Local(local_fs) => Some(local_fs), + _ => None, + } + } } /// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 07b04084b9..ddf6c01759 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -5,7 +5,6 @@ //! volume is mounted to the local FS. 
use std::{ - borrow::Cow, future::Future, path::{Path, PathBuf}, pin::Pin, @@ -18,16 +17,10 @@ use tokio::{ }; use tracing::*; -use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectName}; +use crate::{path_with_suffix_extension, Download, DownloadError}; use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; -impl RemoteObjectName for PathBuf { - fn object_name(&self) -> Option<&str> { - self.file_stem().and_then(|n| n.to_str()) - } -} - pub struct LocalFs { working_directory: PathBuf, storage_root: PathBuf, @@ -113,13 +106,10 @@ impl RemoteStorage for LocalFs { async fn list_prefixes( &self, - prefix: Option, + prefix: Option<&Self::RemoteObjectId>, ) -> anyhow::Result> { - let path = match prefix { - Some(prefix) => Cow::Owned(prefix), - None => Cow::Borrowed(&self.storage_root), - }; - get_all_files(path.as_ref(), false).await + let path = prefix.unwrap_or(&self.storage_root); + get_all_files(path, false).await } async fn upload( @@ -150,8 +140,7 @@ impl RemoteStorage for LocalFs { ); let from_size_bytes = from_size_bytes as u64; - // Require to read 1 byte more than the expected to check later, that the stream and its size match. - let mut buffer_to_read = from.take(from_size_bytes + 1); + let mut buffer_to_read = from.take(from_size_bytes); let bytes_read = io::copy(&mut buffer_to_read, &mut destination) .await @@ -162,17 +151,15 @@ impl RemoteStorage for LocalFs { ) })?; + if bytes_read < from_size_bytes { + bail!("Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes"); + } + // Check if there is any extra data after the given size. + let mut from = buffer_to_read.into_inner(); + let extra_read = from.read(&mut [1]).await?; ensure!( - bytes_read == from_size_bytes, - "Provided stream has actual size {} fthat is smaller than the given stream size {}", - bytes_read, - from_size_bytes - ); - - ensure!( - buffer_to_read.read(&mut [0]).await? 
== 0, - "Provided stream has bigger size than the given stream size {}", - from_size_bytes + extra_read == 0, + "Provided stream was larger than expected: expected {from_size_bytes} bytes", ); destination.flush().await.with_context(|| { @@ -609,6 +596,34 @@ mod fs_tests { Ok(()) } + #[tokio::test] + async fn upload_file_negatives() -> anyhow::Result<()> { + let storage = create_storage()?; + + let id = storage.remote_object_id(&storage.working_directory.join("dummy"))?; + let content = std::io::Cursor::new(b"12345"); + + // Check that you get an error if the size parameter doesn't match the actual + // size of the stream. + storage + .upload(content.clone(), 0, &id, None) + .await + .expect_err("upload with zero size succeeded"); + storage + .upload(content.clone(), 4, &id, None) + .await + .expect_err("upload with too short size succeeded"); + storage + .upload(content.clone(), 6, &id, None) + .await + .expect_err("upload with too large size succeeded"); + + // Correct size is 5, this should succeed. 
+ storage.upload(content, 5, &id, None).await?; + + Ok(()) + } + fn create_storage() -> anyhow::Result { LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned()) } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 1b241fe4ed..db31200c36 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -19,9 +19,7 @@ use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; use tracing::debug; -use crate::{ - strip_path_prefix, Download, DownloadError, RemoteObjectName, RemoteStorage, S3Config, -}; +use crate::{strip_path_prefix, Download, DownloadError, RemoteStorage, S3Config}; use super::StorageMetadata; @@ -96,6 +94,23 @@ const S3_PREFIX_SEPARATOR: char = '/'; pub struct S3ObjectKey(String); impl S3ObjectKey { + /// Turn a/b/c or a/b/c/ into c + pub fn object_name(&self) -> Option<&str> { + // corner case, char::to_string is not const, thats why this is more verbose than it needs to be + // see https://github.com/rust-lang/rust/issues/88674 + if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR { + return None; + } + + if self.0.ends_with(S3_PREFIX_SEPARATOR) { + self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1) + } else { + self.0 + .rsplit_once(S3_PREFIX_SEPARATOR) + .map(|(_, last)| last) + } + } + fn key(&self) -> &str { &self.0 } @@ -119,25 +134,6 @@ impl S3ObjectKey { } } -impl RemoteObjectName for S3ObjectKey { - /// Turn a/b/c or a/b/c/ into c - fn object_name(&self) -> Option<&str> { - // corner case, char::to_string is not const, thats why this is more verbose than it needs to be - // see https://github.com/rust-lang/rust/issues/88674 - if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR { - return None; - } - - if self.0.ends_with(S3_PREFIX_SEPARATOR) { - self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1) - } else { - self.0 - .rsplit_once(S3_PREFIX_SEPARATOR) - .map(|(_, last)| last) - } - } -} - /// AWS S3 storage. 
pub struct S3Bucket { workdir: PathBuf, @@ -316,11 +312,11 @@ impl RemoteStorage for S3Bucket { /// Note: it wont include empty "directories" async fn list_prefixes( &self, - prefix: Option, + prefix: Option<&Self::RemoteObjectId>, ) -> anyhow::Result> { // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix - .map(|p| p.0) + .map(|p| p.0.clone()) .or_else(|| self.prefix_in_bucket.clone()) .map(|mut p| { // required to end with a separator diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index e3e78ec68f..28ad658de4 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -39,7 +39,7 @@ bytes = "1.0.1" hex-literal = "0.3" tempfile = "3.2" criterion = "0.3" -rustls-pemfile = "0.2.1" +rustls-pemfile = "1" [[bench]] name = "benchmarks" diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 1b011bb73a..fa7a37adf1 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -8,6 +8,9 @@ pub mod lsn; /// SeqWait allows waiting for a future sequence number to arrive pub mod seqwait; +/// A simple Read-Copy-Update implementation. +pub mod simple_rcu; + /// append only ordered map implemented with a Vec pub mod vec_map; diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 4d873bd5ac..604eb75aaf 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -163,14 +163,9 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool { false } -// Truncate 0 from C string in Bytes and stringify it (returns slice, no allocations) -// PG protocol strings are always C strings. -fn cstr_to_str(b: &Bytes) -> Result<&str> { - let without_null = if b.last() == Some(&0) { - &b[..b.len() - 1] - } else { - &b[..] - }; +// Cast a byte slice to a string slice, dropping null terminator if there's one. 
+fn cstr_to_str(bytes: &[u8]) -> Result<&str> { + let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); std::str::from_utf8(without_null).map_err(|e| e.into()) } @@ -423,9 +418,9 @@ impl PostgresBackend { self.state = ProtoState::Established; } - FeMessage::Query(m) => { + FeMessage::Query(body) => { // remove null terminator - let query_string = cstr_to_str(&m.body)?; + let query_string = cstr_to_str(&body)?; trace!("got query {:?}", query_string); // xxx distinguish fatal and recoverable errors? diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index 3f14acd50d..dde76039d7 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -7,11 +7,14 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_protocol::PG_EPOCH; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::future::Future; -use std::io::{self, Cursor}; -use std::str; -use std::time::{Duration, SystemTime}; +use std::{ + borrow::Cow, + collections::HashMap, + future::Future, + io::{self, Cursor}, + str, + time::{Duration, SystemTime}, +}; use tokio::io::AsyncReadExt; use tracing::{trace, warn}; @@ -25,8 +28,10 @@ pub const TEXT_OID: Oid = 25; #[derive(Debug)] pub enum FeMessage { StartupPacket(FeStartupPacket), - Query(FeQueryMessage), // Simple query - Parse(FeParseMessage), // Extended query protocol + // Simple query. + Query(Bytes), + // Extended query protocol. + Parse(FeParseMessage), Describe(FeDescribeMessage), Bind(FeBindMessage), Execute(FeExecuteMessage), @@ -51,7 +56,67 @@ pub enum FeStartupPacket { }, } -pub type StartupMessageParams = HashMap; +#[derive(Debug)] +pub struct StartupMessageParams { + params: HashMap, +} + +impl StartupMessageParams { + /// Get parameter's value by its name. 
+ pub fn get(&self, name: &str) -> Option<&str> { + self.params.get(name).map(|s| s.as_str()) + } + + /// Split command-line options according to PostgreSQL's logic, + /// taking into account all escape sequences but leaving them as-is. + /// [`None`] means that there's no `options` in [`Self`]. + pub fn options_raw(&self) -> Option> { + // See `postgres: pg_split_opts`. + let mut last_was_escape = false; + let iter = self + .get("options")? + .split(move |c: char| { + // We split by non-escaped whitespace symbols. + let should_split = c.is_ascii_whitespace() && !last_was_escape; + last_was_escape = c == '\\' && !last_was_escape; + should_split + }) + .filter(|s| !s.is_empty()); + + Some(iter) + } + + /// Split command-line options according to PostgreSQL's logic, + /// applying all escape sequences (using owned strings as needed). + /// [`None`] means that there's no `options` in [`Self`]. + pub fn options_escaped(&self) -> Option>> { + // See `postgres: pg_split_opts`. + let iter = self.options_raw()?.map(|s| { + let mut preserve_next_escape = false; + let escape = |c| { + // We should remove '\\' unless it's preceded by '\\'. + let should_remove = c == '\\' && !preserve_next_escape; + preserve_next_escape = should_remove; + should_remove + }; + + match s.contains('\\') { + true => Cow::Owned(s.replace(escape, "")), + false => Cow::Borrowed(s), + } + }); + + Some(iter) + } + + // This function is mostly useful in tests. 
+ #[doc(hidden)] + pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self { + Self { + params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(), + } + } +} #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct CancelKeyData { @@ -69,11 +134,6 @@ impl Distribution for Standard { } } -#[derive(Debug)] -pub struct FeQueryMessage { - pub body: Bytes, -} - // We only support the simple case of Parse on unnamed prepared statement and // no params #[derive(Debug)] @@ -89,7 +149,7 @@ pub struct FeDescribeMessage { // we only support unnamed prepared stmt and portal #[derive(Debug)] -pub struct FeBindMessage {} +pub struct FeBindMessage; // we only support unnamed prepared stmt or portal #[derive(Debug)] @@ -100,7 +160,7 @@ pub struct FeExecuteMessage { // we only support unnamed prepared stmt and portal #[derive(Debug)] -pub struct FeCloseMessage {} +pub struct FeCloseMessage; /// Retry a read on EINTR /// @@ -163,22 +223,20 @@ impl FeMessage { Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), Err(e) => return Err(e.into()), }; - let len = retry_read!(stream.read_u32().await)?; - // The message length includes itself, so it better be at least 4 - let bodylen = len + // The message length includes itself, so it better be at least 4. + let len = retry_read!(stream.read_u32().await)? 
.checked_sub(4) - .context("invalid message length: parsing u32")?; + .context("invalid message length")?; - // Read message body - let mut body_buf: Vec = vec![0; bodylen as usize]; - stream.read_exact(&mut body_buf).await?; + let body = { + let mut buffer = vec![0u8; len as usize]; + stream.read_exact(&mut buffer).await?; + Bytes::from(buffer) + }; - let body = Bytes::from(body_buf); - - // Parse it match tag { - b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body }))), + b'Q' => Ok(Some(FeMessage::Query(body))), b'P' => Ok(Some(FeParseMessage::parse(body)?)), b'D' => Ok(Some(FeDescribeMessage::parse(body)?)), b'E' => Ok(Some(FeExecuteMessage::parse(body)?)), @@ -242,9 +300,9 @@ impl FeStartupPacket { stream.read_exact(params_bytes.as_mut()).await?; // Parse params depending on request code - let most_sig_16_bits = request_code >> 16; - let least_sig_16_bits = request_code & ((1 << 16) - 1); - let message = match (most_sig_16_bits, least_sig_16_bits) { + let req_hi = request_code >> 16; + let req_lo = request_code & ((1 << 16) - 1); + let message = match (req_hi, req_lo) { (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { ensure!(params_len == 8, "expected 8 bytes for CancelRequest params"); let mut cursor = Cursor::new(params_bytes); @@ -253,173 +311,115 @@ impl FeStartupPacket { cancel_key: cursor.read_i32().await?, }) } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => FeStartupPacket::SslRequest, + (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { + // Requested upgrade to SSL (aka TLS) + FeStartupPacket::SslRequest + } (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => { + // Requested upgrade to GSSAPI FeStartupPacket::GssEncRequest } (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { bail!("Unrecognized request code {}", unrecognized_code) } + // TODO bail if protocol major_version is not 3? (major_version, minor_version) => { - // TODO bail if protocol major_version is not 3? 
- // Parse null-terminated (String) pairs of param name / param value - let params_str = str::from_utf8(&params_bytes).unwrap(); - let mut params_tokens = params_str.split('\0'); - let mut params: HashMap = HashMap::new(); - while let Some(name) = params_tokens.next() { - let value = params_tokens + // Parse pairs of null-terminated strings (key, value). + // See `postgres: ProcessStartupPacket, build_startup_packet`. + let mut tokens = str::from_utf8(&params_bytes) + .context("StartupMessage params: invalid utf-8")? + .strip_suffix('\0') // drop packet's own null terminator + .context("StartupMessage params: missing null terminator")? + .split_terminator('\0'); + + let mut params = HashMap::new(); + while let Some(name) = tokens.next() { + let value = tokens .next() - .context("expected even number of params in StartupMessage")?; - if name == "options" { - // parsing options arguments "...&options=%3D+=..." - // '%3D' is '=' and '+' is ' ' + .context("StartupMessage params: key without value")?; - // Note: we allow users that don't have SNI capabilities, - // to pass a special keyword argument 'project' - // to be used to determine the cluster name by the proxy. - - //TODO: write unit test for this and refactor in its own function. 
- for cmdopt in value.split(' ') { - let nameval: Vec<&str> = cmdopt.split('=').collect(); - if nameval.len() == 2 { - params.insert(nameval[0].to_string(), nameval[1].to_string()); - } - } - } else { - params.insert(name.to_string(), value.to_string()); - } + params.insert(name.to_owned(), value.to_owned()); } + FeStartupPacket::StartupMessage { major_version, minor_version, - params, + params: StartupMessageParams { params }, } } }; + Ok(Some(FeMessage::StartupPacket(message))) }) } } impl FeParseMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { - let _pstmt_name = read_null_terminated(&mut buf)?; - let query_string = read_null_terminated(&mut buf)?; - let nparams = buf.get_i16(); - + fn parse(mut buf: Bytes) -> anyhow::Result { // FIXME: the rust-postgres driver uses a named prepared statement // for copy_out(). We're not prepared to handle that correctly. For // now, just ignore the statement name, assuming that the client never // uses more than one prepared statement at a time. 
- /* - if !pstmt_name.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "named prepared statements not implemented in Parse", - )); - } - */ - if nparams != 0 { - bail!("query params not implemented"); - } + let _pstmt_name = read_cstr(&mut buf)?; + let query_string = read_cstr(&mut buf)?; + let nparams = buf.get_i16(); + + ensure!(nparams == 0, "query params not implemented"); Ok(FeMessage::Parse(FeParseMessage { query_string })) } } impl FeDescribeMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> anyhow::Result { let kind = buf.get_u8(); - let _pstmt_name = read_null_terminated(&mut buf)?; + let _pstmt_name = read_cstr(&mut buf)?; // FIXME: see FeParseMessage::parse - /* - if !pstmt_name.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "named prepared statements not implemented in Describe", - )); - } - */ - - if kind != b'S' { - bail!("only prepared statmement Describe is implemented"); - } + ensure!( + kind == b'S', + "only prepared statemement Describe is implemented" + ); Ok(FeMessage::Describe(FeDescribeMessage { kind })) } } impl FeExecuteMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { - let portal_name = read_null_terminated(&mut buf)?; + fn parse(mut buf: Bytes) -> anyhow::Result { + let portal_name = read_cstr(&mut buf)?; let maxrows = buf.get_i32(); - if !portal_name.is_empty() { - bail!("named portals not implemented"); - } - - if maxrows != 0 { - bail!("row limit in Execute message not supported"); - } + ensure!(portal_name.is_empty(), "named portals not implemented"); + ensure!(maxrows == 0, "row limit in Execute message not implemented"); Ok(FeMessage::Execute(FeExecuteMessage { maxrows })) } } impl FeBindMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { - let portal_name = read_null_terminated(&mut buf)?; - let _pstmt_name = read_null_terminated(&mut buf)?; - - if !portal_name.is_empty() { - bail!("named portals not implemented"); - } 
+ fn parse(mut buf: Bytes) -> anyhow::Result { + let portal_name = read_cstr(&mut buf)?; + let _pstmt_name = read_cstr(&mut buf)?; // FIXME: see FeParseMessage::parse - /* - if !pstmt_name.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "named prepared statements not implemented", - )); - } - */ + ensure!(portal_name.is_empty(), "named portals not implemented"); - Ok(FeMessage::Bind(FeBindMessage {})) + Ok(FeMessage::Bind(FeBindMessage)) } } impl FeCloseMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> anyhow::Result { let _kind = buf.get_u8(); - let _pstmt_or_portal_name = read_null_terminated(&mut buf)?; + let _pstmt_or_portal_name = read_cstr(&mut buf)?; // FIXME: we do nothing with Close - - Ok(FeMessage::Close(FeCloseMessage {})) + Ok(FeMessage::Close(FeCloseMessage)) } } -fn read_null_terminated(buf: &mut Bytes) -> anyhow::Result { - let mut result = BytesMut::new(); - - loop { - if !buf.has_remaining() { - bail!("no null-terminator in string"); - } - - let byte = buf.get_u8(); - - if byte == 0 { - break; - } - result.put_u8(byte); - } - Ok(result.freeze()) -} - // Backend #[derive(Debug)] @@ -441,7 +441,7 @@ pub enum BeMessage<'a> { // None means column is NULL DataRow(&'a [Option<&'a [u8]>]), ErrorResponse(&'a str), - // single byte - used in response to SSLRequest/GSSENCRequest + /// Single byte - used in response to SSLRequest/GSSENCRequest. EncryptionResponse(bool), NoData, ParameterDescription, @@ -554,49 +554,22 @@ pub static SINGLE_COL_ROWDESC: BeMessage = BeMessage::RowDescription(&[RowDescri formatcode: 0, }]); -// Safe usize -> i32|i16 conversion, from rust-postgres -trait FromUsize: Sized { - fn from_usize(x: usize) -> Result; -} - -macro_rules! 
from_usize { - ($t:ty) => { - impl FromUsize for $t { - #[inline] - fn from_usize(x: usize) -> io::Result<$t> { - if x > <$t>::max_value() as usize { - Err(io::Error::new( - io::ErrorKind::InvalidInput, - "value too large to transmit", - )) - } else { - Ok(x as $t) - } - } - } - }; -} - -from_usize!(i32); - /// Call f() to write body of the message and prepend it with 4-byte len as /// prescribed by the protocol. -fn write_body(buf: &mut BytesMut, f: F) -> io::Result<()> -where - F: FnOnce(&mut BytesMut) -> io::Result<()>, -{ +fn write_body(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R { let base = buf.len(); buf.extend_from_slice(&[0; 4]); - f(buf)?; + let res = f(buf); - let size = i32::from_usize(buf.len() - base)?; + let size = i32::try_from(buf.len() - base).expect("message too big to transmit"); (&mut buf[base..]).put_slice(&size.to_be_bytes()); - Ok(()) + + res } /// Safe write of s into buf as cstring (String in the protocol). -pub fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { +fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { if s.contains(&0) { return Err(io::Error::new( io::ErrorKind::InvalidInput, @@ -608,15 +581,11 @@ pub fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { Ok(()) } -// Truncate 0 from C string in Bytes and stringify it (returns slice, no allocations) -// PG protocol strings are always C strings. -fn cstr_to_str(b: &Bytes) -> Result<&str> { - let without_null = if b.last() == Some(&0) { - &b[..b.len() - 1] - } else { - &b[..] 
- }; - std::str::from_utf8(without_null).map_err(|e| e.into()) +fn read_cstr(buf: &mut Bytes) -> anyhow::Result { + let pos = buf.iter().position(|x| *x == 0); + let result = buf.split_to(pos.context("missing terminator")?); + buf.advance(1); // drop the null terminator + Ok(result) } impl<'a> BeMessage<'a> { @@ -631,18 +600,14 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'R'); write_body(buf, |buf| { buf.put_i32(0); // Specifies that the authentication was successful. - Ok::<_, io::Error>(()) - }) - .unwrap(); // write into BytesMut can't fail + }); } BeMessage::AuthenticationCleartextPassword => { buf.put_u8(b'R'); write_body(buf, |buf| { buf.put_i32(3); // Specifies that clear text password is required. - Ok::<_, io::Error>(()) - }) - .unwrap(); // write into BytesMut can't fail + }); } BeMessage::AuthenticationMD5Password(salt) => { @@ -650,9 +615,7 @@ impl<'a> BeMessage<'a> { write_body(buf, |buf| { buf.put_i32(5); // Specifies that an MD5-encrypted password is required. buf.put_slice(&salt[..]); - Ok::<_, io::Error>(()) - }) - .unwrap(); // write into BytesMut can't fail + }); } BeMessage::AuthenticationSasl(msg) => { @@ -677,8 +640,7 @@ impl<'a> BeMessage<'a> { } } Ok::<_, io::Error>(()) - }) - .unwrap() + })?; } BeMessage::BackendKeyData(key_data) => { @@ -686,77 +648,64 @@ impl<'a> BeMessage<'a> { write_body(buf, |buf| { buf.put_i32(key_data.backend_pid); buf.put_i32(key_data.cancel_key); - Ok(()) - }) - .unwrap(); + }); } BeMessage::BindComplete => { buf.put_u8(b'2'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::CloseComplete => { buf.put_u8(b'3'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::CommandComplete(cmd) => { buf.put_u8(b'C'); - write_body(buf, |buf| { - write_cstr(cmd, buf)?; - Ok::<_, io::Error>(()) - })?; + write_body(buf, |buf| write_cstr(cmd, buf))?; } BeMessage::CopyData(data) => { buf.put_u8(b'd'); write_body(buf, |buf| { buf.put_slice(data); 
- Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::CopyDone => { buf.put_u8(b'c'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::CopyFail => { buf.put_u8(b'f'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::CopyInResponse => { buf.put_u8(b'G'); write_body(buf, |buf| { - buf.put_u8(1); /* copy_is_binary */ - buf.put_i16(0); /* numAttributes */ - Ok::<_, io::Error>(()) - }) - .unwrap(); + buf.put_u8(1); // copy_is_binary + buf.put_i16(0); // numAttributes + }); } BeMessage::CopyOutResponse => { buf.put_u8(b'H'); write_body(buf, |buf| { - buf.put_u8(0); /* copy_is_binary */ - buf.put_i16(0); /* numAttributes */ - Ok::<_, io::Error>(()) - }) - .unwrap(); + buf.put_u8(0); // copy_is_binary + buf.put_i16(0); // numAttributes + }); } BeMessage::CopyBothResponse => { buf.put_u8(b'W'); write_body(buf, |buf| { // doesn't matter, used only for replication - buf.put_u8(0); /* copy_is_binary */ - buf.put_i16(0); /* numAttributes */ - Ok::<_, io::Error>(()) - }) - .unwrap(); + buf.put_u8(0); // copy_is_binary + buf.put_i16(0); // numAttributes + }); } BeMessage::DataRow(vals) => { @@ -771,9 +720,7 @@ impl<'a> BeMessage<'a> { buf.put_i32(-1); } } - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } // ErrorResponse is a zero-terminated array of zero-terminated fields. @@ -788,18 +735,17 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'E'); write_body(buf, |buf| { buf.put_u8(b'S'); // severity - write_cstr(&Bytes::from("ERROR"), buf)?; + buf.put_slice(b"ERROR\0"); buf.put_u8(b'C'); // SQLSTATE error code - write_cstr(&Bytes::from("CXX000"), buf)?; + buf.put_slice(b"CXX000\0"); buf.put_u8(b'M'); // the message write_cstr(error_msg.as_bytes(), buf)?; buf.put_u8(0); // terminator Ok::<_, io::Error>(()) - }) - .unwrap(); + })?; } // NoticeResponse has the same format as ErrorResponse. 
From doc: "The frontend should display the @@ -812,23 +758,22 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'N'); write_body(buf, |buf| { buf.put_u8(b'S'); // severity - write_cstr(&Bytes::from("NOTICE"), buf)?; + buf.put_slice(b"NOTICE\0"); buf.put_u8(b'C'); // SQLSTATE error code - write_cstr(&Bytes::from("CXX000"), buf)?; + buf.put_slice(b"CXX000\0"); buf.put_u8(b'M'); // the message write_cstr(error_msg.as_bytes(), buf)?; buf.put_u8(0); // terminator Ok::<_, io::Error>(()) - }) - .unwrap(); + })?; } BeMessage::NoData => { buf.put_u8(b'n'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::EncryptionResponse(should_negotiate) => { @@ -853,9 +798,7 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'S'); write_body(buf, |buf| { buf.put_slice(&buffer[..cnt]); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::ParameterDescription => { @@ -863,23 +806,19 @@ impl<'a> BeMessage<'a> { write_body(buf, |buf| { // we don't support params, so always 0 buf.put_i16(0); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::ParseComplete => { buf.put_u8(b'1'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::ReadyForQuery => { buf.put_u8(b'Z'); write_body(buf, |buf| { buf.put_u8(b'I'); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::RowDescription(rows) => { @@ -907,9 +846,7 @@ impl<'a> BeMessage<'a> { buf.put_u64(body.wal_end); buf.put_i64(body.timestamp); buf.put_slice(body.data); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::KeepAlive(req) => { @@ -918,10 +855,8 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'k'); buf.put_u64(req.sent_ptr); buf.put_i64(req.timestamp); - buf.put_u8(if req.request_reply { 1u8 } else { 0u8 }); - Ok::<_, io::Error>(()) - }) - .unwrap(); + buf.put_u8(if req.request_reply { 1 } else { 0 }); + }); } } Ok(()) @@ -968,17 +903,17 @@ impl ReplicationFeedback { // value itself pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> 
{ buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys - write_cstr(&Bytes::from("current_timeline_size"), buf)?; + buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); buf.put_u64(self.current_timeline_size); - write_cstr(&Bytes::from("ps_writelsn"), buf)?; + buf.put_slice(b"ps_writelsn\0"); buf.put_i32(8); buf.put_u64(self.ps_writelsn); - write_cstr(&Bytes::from("ps_flushlsn"), buf)?; + buf.put_slice(b"ps_flushlsn\0"); buf.put_i32(8); buf.put_u64(self.ps_flushlsn); - write_cstr(&Bytes::from("ps_applylsn"), buf)?; + buf.put_slice(b"ps_applylsn\0"); buf.put_i32(8); buf.put_u64(self.ps_applylsn); @@ -988,7 +923,7 @@ impl ReplicationFeedback { .expect("failed to serialize pg_replytime earlier than PG_EPOCH") .as_micros() as i64; - write_cstr(&Bytes::from("ps_replytime"), buf)?; + buf.put_slice(b"ps_replytime\0"); buf.put_i32(8); buf.put_i64(timestamp); Ok(()) @@ -998,33 +933,30 @@ impl ReplicationFeedback { pub fn parse(mut buf: Bytes) -> ReplicationFeedback { let mut zf = ReplicationFeedback::empty(); let nfields = buf.get_u8(); - let mut i = 0; - while i < nfields { - i += 1; - let key_cstr = read_null_terminated(&mut buf).unwrap(); - let key = cstr_to_str(&key_cstr).unwrap(); - match key { - "current_timeline_size" => { + for _ in 0..nfields { + let key = read_cstr(&mut buf).unwrap(); + match key.as_ref() { + b"current_timeline_size" => { let len = buf.get_i32(); assert_eq!(len, 8); zf.current_timeline_size = buf.get_u64(); } - "ps_writelsn" => { + b"ps_writelsn" => { let len = buf.get_i32(); assert_eq!(len, 8); zf.ps_writelsn = buf.get_u64(); } - "ps_flushlsn" => { + b"ps_flushlsn" => { let len = buf.get_i32(); assert_eq!(len, 8); zf.ps_flushlsn = buf.get_u64(); } - "ps_applylsn" => { + b"ps_applylsn" => { let len = buf.get_i32(); assert_eq!(len, 8); zf.ps_applylsn = buf.get_u64(); } - "ps_replytime" => { + b"ps_replytime" => { let len = buf.get_i32(); assert_eq!(len, 8); let raw_time = buf.get_i64(); @@ -1037,8 +969,8 @@ impl ReplicationFeedback { 
_ => { let len = buf.get_i32(); warn!( - "ReplicationFeedback parse. unknown key {} of len {}. Skip it.", - key, len + "ReplicationFeedback parse. unknown key {} of len {len}. Skip it.", + String::from_utf8_lossy(key.as_ref()) ); buf.advance(len as usize); } @@ -1084,7 +1016,7 @@ mod tests { *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1; } - write_cstr(&Bytes::from("new_field_one"), &mut data).unwrap(); + data.put_slice(b"new_field_one\0"); data.put_i32(8); data.put_u64(42); @@ -1093,6 +1025,33 @@ mod tests { assert_eq!(zf, zf_parsed); } + #[test] + fn test_startup_message_params_options_escaped() { + fn split_options(params: &StartupMessageParams) -> Vec> { + params + .options_escaped() + .expect("options are None") + .collect() + } + + let make_params = |options| StartupMessageParams::new([("options", options)]); + + let params = StartupMessageParams::new([]); + assert!(matches!(params.options_escaped(), None)); + + let params = make_params(""); + assert!(split_options(&params).is_empty()); + + let params = make_params("foo"); + assert_eq!(split_options(&params), ["foo"]); + + let params = make_params(" foo bar "); + assert_eq!(split_options(&params), ["foo", "bar"]); + + let params = make_params("foo\\ bar \\ \\\\ baz\\ lol"); + assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]); + } + // Make sure that `read` is sync/async callable async fn _assert(stream: &mut (impl tokio::io::AsyncRead + Unpin)) { let _ = FeMessage::read(&mut [].as_ref()); diff --git a/libs/utils/src/seqwait_async.rs b/libs/utils/src/seqwait_async.rs index 09138e9dd4..f685e2b569 100644 --- a/libs/utils/src/seqwait_async.rs +++ b/libs/utils/src/seqwait_async.rs @@ -1,8 +1,8 @@ -/// -/// Async version of 'seqwait.rs' -/// -/// NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs. -/// +//! +//! Async version of 'seqwait.rs' +//! +//! NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs. +//!
#![warn(missing_docs)] diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs new file mode 100644 index 0000000000..24423815ab --- /dev/null +++ b/libs/utils/src/simple_rcu.rs @@ -0,0 +1,217 @@ +//! +//! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat +//! similar to a lock, but it allows readers to "hold on" to an old value of RCU +//! without blocking writers, and allows writing new values without blocking +//! readers. When you update the value, the new value is immediately visible +//! to new readers, but the update waits until all existing readers have +//! finished, so that no one sees the old value anymore. +//! +//! This implementation isn't wait-free; it uses an RwLock that is held for a +//! short duration when the value is read or updated. +//! +#![warn(missing_docs)] + +use std::ops::Deref; +use std::sync::mpsc::{sync_channel, Receiver, SyncSender}; +use std::sync::{Arc, Weak}; +use std::sync::{Mutex, RwLock, RwLockWriteGuard}; + +/// +/// Rcu allows multiple readers to read and hold onto a value without blocking +/// (for very long). Storing to the Rcu updates the value, making new readers +/// immediately see the new value, but it also waits for all current readers to +/// finish. +/// +pub struct Rcu { + inner: RwLock>, +} + +struct RcuInner { + current_cell: Arc>, + old_cells: Vec>>, +} + +/// +/// RcuCell holds one value. It can be the latest one, or an old one. +/// +struct RcuCell { + value: V, + + /// A dummy channel. We never send anything to this channel. The point is + /// that when the RcuCell is dropped, any cloned Senders will be notified + /// that the channel is closed. Updaters can use this to wait out until the + /// RcuCell has been dropped, i.e. until the old value is no longer in use. + /// + /// We never do anything with the receiver, we just need to hold onto it so + /// that the Senders will be notified when it's dropped. But because it's + /// not Sync, we need a Mutex on it.
+ watch: (SyncSender<()>, Mutex>), +} + +impl RcuCell { + fn new(value: V) -> Self { + let (watch_sender, watch_receiver) = sync_channel(0); + RcuCell { + value, + watch: (watch_sender, Mutex::new(watch_receiver)), + } + } +} + +impl Rcu { + /// Create a new `Rcu`, initialized to `starting_val` + pub fn new(starting_val: V) -> Self { + let inner = RcuInner { + current_cell: Arc::new(RcuCell::new(starting_val)), + old_cells: Vec::new(), + }; + Self { + inner: RwLock::new(inner), + } + } + + /// + /// Read current value. Any store() calls will block until the returned + /// guard object is dropped. + /// + pub fn read(&self) -> RcuReadGuard { + let current_cell = Arc::clone(&self.inner.read().unwrap().current_cell); + RcuReadGuard { cell: current_cell } + } + + /// + /// Lock the current value for updating. Returns a guard object that can be + /// used to read the current value, and to store a new value. + /// + /// Note: holding the write-guard blocks concurrent readers, so you should + /// finish the update and drop the guard quickly! + /// + pub fn write(&self) -> RcuWriteGuard<'_, V> { + let inner = self.inner.write().unwrap(); + RcuWriteGuard { inner } + } +} + +/// +/// Read guard returned by `read` +/// +pub struct RcuReadGuard { + cell: Arc>, +} + +impl Deref for RcuReadGuard { + type Target = V; + + fn deref(&self) -> &V { + &self.cell.value + } +} + +/// +/// Write guard returned by `write` +/// +pub struct RcuWriteGuard<'a, V> { + inner: RwLockWriteGuard<'a, RcuInner>, +} + +impl<'a, V> Deref for RcuWriteGuard<'a, V> { + type Target = V; + + fn deref(&self) -> &V { + &self.inner.current_cell.value + } +} + +impl<'a, V> RcuWriteGuard<'a, V> { + /// + /// Store a new value. The new value will be written to the Rcu immediately, + /// and will be immediately seen by any `read` calls that start afterwards. + /// But if there are any readers still holding onto the old value, or any + /// even older values, this will wait until they have been released.
+ /// + /// This will drop the write-guard before it starts waiting for the reads to + /// finish, so a new write operation can begin before this function returns. + /// + pub fn store(mut self, new_val: V) { + let new_cell = Arc::new(RcuCell::new(new_val)); + + let mut watches = Vec::new(); + { + let old = std::mem::replace(&mut self.inner.current_cell, new_cell); + self.inner.old_cells.push(Arc::downgrade(&old)); + + // cleanup old cells that no longer have any readers, and collect + // the watches for any that do. + self.inner.old_cells.retain(|weak| { + if let Some(cell) = weak.upgrade() { + watches.push(cell.watch.0.clone()); + true + } else { + false + } + }); + } + drop(self); + + // after all the old_cells are no longer in use, we're done + for w in watches.iter_mut() { + // This will block until the Receiver is closed. That happens when + // the RcuCell is dropped. + #[allow(clippy::single_match)] + match w.send(()) { + Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"), + Err(_) => { + // closed, which means that the cell has been dropped, and + // its value is no longer in use + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::{Arc, Mutex}; + use std::thread::{sleep, spawn}; + use std::time::Duration; + + #[test] + fn basic() { + let rcu = Arc::new(Rcu::new(1)); + let log = Arc::new(Mutex::new(Vec::new())); + + let a = rcu.read(); + assert_eq!(*a, 1); + log.lock().unwrap().push("one"); + + let (rcu_clone, log_clone) = (Arc::clone(&rcu), Arc::clone(&log)); + let thread = spawn(move || { + log_clone.lock().unwrap().push("store two start"); + let write_guard = rcu_clone.write(); + assert_eq!(*write_guard, 1); + write_guard.store(2); + log_clone.lock().unwrap().push("store two done"); + }); + // without this sleep the test can pass by accident if the writer is slow + sleep(Duration::from_secs(1)); + + // new read should see the new value + let b = rcu.read(); + assert_eq!(*b, 2); + + // old guard still sees the old
value + assert_eq!(*a, 1); + + // Release the old guard. This lets the store in the thread to finish. + log.lock().unwrap().push("release a"); + drop(a); + + thread.join().unwrap(); + + assert_eq!( + log.lock().unwrap().as_slice(), + &["one", "store two start", "release a", "store two done",] + ); + } +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 63a2263ae0..902765f424 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -15,7 +15,7 @@ failpoints = ["fail/failpoints"] chrono = "0.4.19" rand = "0.8.3" regex = "1.4.5" -bytes = { version = "1.0.1", features = ['serde'] } +bytes = "1.0.1" byteorder = "1.4.3" futures = "0.3.13" hex = "0.4.3" diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 33f072553f..864c5b8ac8 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -22,8 +22,8 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; +use crate::layered_repository::Timeline; use crate::reltag::{RelTag, SlruKind}; -use crate::DatadirTimeline; use postgres_ffi::v14::pg_constants; use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName}; @@ -36,13 +36,12 @@ use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. -pub struct Basebackup<'a, W, T> +pub struct Basebackup<'a, W> where W: Write, - T: DatadirTimeline, { ar: Builder>, - timeline: &'a Arc, + timeline: &'a Arc, pub lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, @@ -57,18 +56,17 @@ where // * When working without safekeepers. In this situation it is important to match the lsn // we are taking basebackup on with the lsn that is used in pageserver's walreceiver // to start the replication. 
-impl<'a, W, T> Basebackup<'a, W, T> +impl<'a, W> Basebackup<'a, W> where W: Write, - T: DatadirTimeline, { pub fn new( write: W, - timeline: &'a Arc, + timeline: &'a Arc, req_lsn: Option, prev_lsn: Option, full_backup: bool, - ) -> Result> { + ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). We include prev_record_lsn in the @@ -404,10 +402,9 @@ where } } -impl<'a, W, T> Drop for Basebackup<'a, W, T> +impl<'a, W> Drop for Basebackup<'a, W> where W: Write, - T: DatadirTimeline, { /// If the basebackup was not finished, prevent the Archive::drop() from /// writing the end-of-archive marker. diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 1a13147f42..7a33a548e7 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,6 +1,7 @@ //! Main entry point for the Page Server executable. -use std::{env, ops::ControlFlow, path::Path, str::FromStr}; +use remote_storage::GenericRemoteStorage; +use std::{env, ops::ControlFlow, path::Path, str::FromStr, sync::Arc}; use tracing::*; use anyhow::{bail, Context, Result}; @@ -298,7 +299,14 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() }; info!("Using auth: {:#?}", conf.auth_type); - let remote_index = tenant_mgr::init_tenant_mgr(conf)?; + let remote_storage = conf + .remote_storage_config + .as_ref() + .map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config)) + .transpose() + .context("Failed to init generic remote storage")? 
+ .map(Arc::new); + let remote_index = tenant_mgr::init_tenant_mgr(conf, remote_storage.as_ref().map(Arc::clone))?; // Spawn a new thread for the http endpoint // bind before launching separate thread so the error reported before startup exits @@ -310,7 +318,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() "http_endpoint_thread", true, move || { - let router = http::make_router(conf, auth_cloned, remote_index)?; + let router = http::make_router(conf, auth_cloned, remote_index, remote_storage)?; endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher()) }, )?; diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index a4f270580f..654f45a95d 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -129,9 +129,9 @@ pub struct LocalTimelineInfo { pub latest_gc_cutoff_lsn: Lsn, #[serde_as(as = "DisplayFromStr")] pub disk_consistent_lsn: Lsn, - pub current_logical_size: Option, // is None when timeline is Unloaded - pub current_physical_size: Option, // is None when timeline is Unloaded - pub current_logical_size_non_incremental: Option, + pub current_logical_size: Option, // is None when timeline is Unloaded + pub current_physical_size: Option, // is None when timeline is Unloaded + pub current_logical_size_non_incremental: Option, pub current_physical_size_non_incremental: Option, pub timeline_state: LocalTimelineState, @@ -150,6 +150,9 @@ pub struct RemoteTimelineInfo { pub awaits_download: bool, } +/// +/// This represents the output of the "timeline_detail" API call. 
+/// #[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1d0adec63d..ef18129504 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -11,10 +11,8 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, }; -use crate::layered_repository::{metadata::TimelineMetadata, LayeredTimeline}; -use crate::pgdatadir_mapping::DatadirTimeline; +use crate::layered_repository::{metadata::TimelineMetadata, Timeline}; use crate::repository::{LocalTimelineState, RepositoryTimeline}; -use crate::repository::{Repository, Timeline}; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant_config::TenantConfOpt; @@ -37,7 +35,7 @@ struct State { auth: Option>, remote_index: RemoteIndex, allowlist_routes: Vec, - remote_storage: Option, + remote_storage: Option>, } impl State { @@ -45,20 +43,12 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_index: RemoteIndex, + remote_storage: Option>, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() .map(|v| v.parse().unwrap()) .collect::>(); - // Note that this remote storage is created separately from the main one in the sync_loop. - // It's fine since it's stateless and some code duplication saves us from bloating the code around with generics. 
- let remote_storage = conf - .remote_storage_config - .as_ref() - .map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config)) - .transpose() - .context("Failed to init generic remote storage")?; - Ok(Self { conf, auth, @@ -85,7 +75,7 @@ fn get_config(request: &Request) -> &'static PageServerConf { // Helper functions to construct a LocalTimelineInfo struct for a timeline fn local_timeline_info_from_loaded_timeline( - timeline: &LayeredTimeline, + timeline: &Timeline, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> anyhow::Result { @@ -160,7 +150,7 @@ fn local_timeline_info_from_unloaded_timeline(metadata: &TimelineMetadata) -> Lo } fn local_timeline_info_from_repo_timeline( - repo_timeline: &RepositoryTimeline, + repo_timeline: &RepositoryTimeline, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> anyhow::Result { @@ -208,7 +198,6 @@ async fn status_handler(request: Request) -> Result, ApiErr async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; let request_data: TimelineCreateRequest = json_request(&mut request).await?; - check_permission(&request, Some(tenant_id))?; let new_timeline_info = tokio::task::spawn_blocking(move || { @@ -246,11 +235,12 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); let include_non_incremental_physical_size = query_param_present(&request, "include-non-incremental-physical-size"); + check_permission(&request, Some(tenant_id))?; + let local_timeline_infos = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = 
%tenant_id).entered(); list_local_timelines( @@ -301,13 +291,12 @@ fn query_param_present(request: &Request, param: &str) -> bool { async fn timeline_detail_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); let include_non_incremental_physical_size = query_param_present(&request, "include-non-incremental-physical-size"); + check_permission(&request, Some(tenant_id))?; let (local_timeline_info, remote_timeline_info) = async { // any error here will render local timeline as None @@ -371,7 +360,7 @@ async fn tenant_attach_handler(request: Request) -> Result, let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - info!("Handling tenant attach {}", tenant_id,); + info!("Handling tenant attach {}", tenant_id); tokio::task::spawn_blocking(move || { if tenant_mgr::get_tenant_state(tenant_id).is_some() { @@ -451,16 +440,8 @@ async fn gather_tenant_timelines_index_parts( tenant_id: ZTenantId, ) -> anyhow::Result>> { let index_parts = match state.remote_storage.as_ref() { - Some(GenericRemoteStorage::Local(local_storage)) => { - storage_sync::gather_tenant_timelines_index_parts(state.conf, local_storage, tenant_id) - .await - } - // FIXME here s3 storage contains its own limits, that are separate from sync storage thread ones - // because it is a different instance. We can move this limit to some global static - // or use one instance everywhere. 
- Some(GenericRemoteStorage::S3(s3_storage)) => { - storage_sync::gather_tenant_timelines_index_parts(state.conf, s3_storage, tenant_id) - .await + Some(storage) => { + storage_sync::gather_tenant_timelines_index_parts(state.conf, storage, tenant_id).await } None => return Ok(None), } @@ -480,9 +461,8 @@ async fn gather_tenant_timelines_index_parts( async fn timeline_delete_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; let state = get_state(&request); tokio::task::spawn_blocking(move || { @@ -521,7 +501,6 @@ async fn tenant_detach_handler(request: Request) -> Result, } async fn tenant_list_handler(request: Request) -> Result, ApiError> { - // check for management permission check_permission(&request, None)?; let state = get_state(&request); @@ -589,7 +568,6 @@ async fn tenant_status(request: Request) -> Result, ApiErro } async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { - // check for management permission check_permission(&request, None)?; let request_data: TenantCreateRequest = json_request(&mut request).await?; @@ -658,7 +636,6 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result, ApiError> { let request_data: TenantConfigRequest = json_request(&mut request).await?; let tenant_id = request_data.tenant_id; - // check for management permission check_permission(&request, Some(tenant_id))?; let mut tenant_conf: TenantConfOpt = Default::default(); @@ -721,6 +698,7 @@ pub fn make_router( conf: &'static PageServerConf, auth: Option>, remote_index: RemoteIndex, + remote_storage: Option>, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); @@ -737,7 +715,8 @@ pub fn 
make_router( Ok(router .data(Arc::new( - State::new(conf, auth, remote_index).context("Failed to initialize router state")?, + State::new(conf, auth, remote_index, remote_storage) + .context("Failed to initialize router state")?, )) .get("/v1/status", status_handler) .get("/v1/tenant", tenant_list_handler) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 729829c5e8..4cc3aafb0e 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -11,6 +11,7 @@ use bytes::Bytes; use tracing::*; use walkdir::WalkDir; +use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::walingest::WalIngest; @@ -39,9 +40,9 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result { /// This is currently only used to import a cluster freshly created by initdb. /// The code that deals with the checkpoint would not work right if the /// cluster was not shut down cleanly. -pub fn import_timeline_from_postgres_datadir( +pub fn import_timeline_from_postgres_datadir( path: &Path, - tline: &T, + tline: &Timeline, lsn: Lsn, ) -> Result<()> { let mut pg_control: Option = None; @@ -99,8 +100,8 @@ pub fn import_timeline_from_postgres_datadir( } // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. -fn import_rel( - modification: &mut DatadirModification, +fn import_rel( + modification: &mut DatadirModification, path: &Path, spcoid: Oid, dboid: Oid, @@ -178,8 +179,8 @@ fn import_rel( /// Import an SLRU segment file /// -fn import_slru( - modification: &mut DatadirModification, +fn import_slru( + modification: &mut DatadirModification, slru: SlruKind, path: &Path, mut reader: Reader, @@ -234,12 +235,7 @@ fn import_slru( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. 
-fn import_wal( - walpath: &Path, - tline: &T, - startpoint: Lsn, - endpoint: Lsn, -) -> Result<()> { +fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> { let mut waldecoder = WalStreamDecoder::new(startpoint); let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE); @@ -305,12 +301,12 @@ fn import_wal( Ok(()) } -pub fn import_basebackup_from_tar( - tline: &T, +pub fn import_basebackup_from_tar( + tline: &Timeline, reader: Reader, base_lsn: Lsn, ) -> Result<()> { - info!("importing base at {}", base_lsn); + info!("importing base at {base_lsn}"); let mut modification = tline.begin_modification(base_lsn); modification.init_empty()?; @@ -335,7 +331,11 @@ pub fn import_basebackup_from_tar( debug!("directory {:?}", file_path); } _ => { - panic!("tar::EntryType::?? {}", file_path.display()); + bail!( + "entry {} in backup tar archive is of unexpected type: {:?}", + file_path.display(), + header.entry_type() + ); } } } @@ -347,8 +347,8 @@ pub fn import_basebackup_from_tar( Ok(()) } -pub fn import_wal_from_tar( - tline: &T, +pub fn import_wal_from_tar( + tline: &Timeline, reader: Reader, start_lsn: Lsn, end_lsn: Lsn, @@ -388,7 +388,11 @@ pub fn import_wal_from_tar( continue; } _ => { - panic!("tar::EntryType::?? {}", file_path.display()); + bail!( + "entry {} in WAL tar archive is of unexpected type: {:?}", + file_path.display(), + header.entry_type() + ); } } }; @@ -428,14 +432,12 @@ pub fn import_wal_from_tar( Ok(()) } -pub fn import_file( - modification: &mut DatadirModification, +fn import_file( + modification: &mut DatadirModification, file_path: &Path, reader: Reader, len: usize, ) -> Result> { - debug!("looking at {:?}", file_path); - if file_path.starts_with("global") { let spcnode = pg_constants::GLOBALTABLESPACE_OID; let dbnode = 0; @@ -557,7 +559,10 @@ pub fn import_file( // this to import arbitrary postgres databases. 
bail!("Importing pg_tblspc is not implemented"); } else { - debug!("ignored"); + debug!( + "ignoring unrecognized file \"{}\" in tar archive", + file_path.display() + ); } Ok(None) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 6bf2e71852..73c30b51b8 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -13,6 +13,7 @@ use anyhow::{bail, ensure, Context, Result}; use tracing::*; +use utils::zid::ZTenantTimelineId; use std::cmp::min; use std::collections::hash_map::Entry; @@ -31,7 +32,8 @@ use crate::config::PageServerConf; use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline}; +use crate::repository::{GcResult, RepositoryTimeline}; +use crate::tenant_mgr::LocalTimelineUpdate; use crate::thread_mgr; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; @@ -61,13 +63,13 @@ mod timeline; use storage_layer::Layer; use timeline::LayeredTimelineEntry; -pub use timeline::LayeredTimeline; +pub use timeline::Timeline; // re-export this function so that page_cache.rs can use it. pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; // re-export for use in storage_sync.rs -pub use crate::layered_repository::timeline::save_metadata; +pub use crate::layered_repository::metadata::save_metadata; // re-export for use in walreceiver pub use crate::layered_repository::timeline::WalReceiverInfo; @@ -78,7 +80,7 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// /// Repository consists of multiple timelines. Keep them in a hash table. 
/// -pub struct LayeredRepository { +pub struct Repository { // Global pageserver config parameters pub conf: &'static PageServerConf, @@ -119,17 +121,22 @@ pub struct LayeredRepository { upload_layers: bool, } -/// Public interface -impl Repository for LayeredRepository { - type Timeline = LayeredTimeline; - - fn get_timeline(&self, timelineid: ZTimelineId) -> Option> { - let timelines = self.timelines.lock().unwrap(); - self.get_timeline_internal(timelineid, &timelines) +/// A repository corresponds to one .neon directory. One repository holds multiple +/// timelines, forked off from the same initial call to 'initdb'. +impl Repository { + /// Get Timeline handle for given zenith timeline ID. + /// This function is idempotent. It doesn't change internal state in any way. + pub fn get_timeline(&self, timelineid: ZTimelineId) -> Option> { + self.timelines + .lock() + .unwrap() + .get(&timelineid) + .cloned() .map(RepositoryTimeline::from) } - fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result> { + /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. + pub fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result> { let mut timelines = self.timelines.lock().unwrap(); match self.get_timeline_load_internal(timelineid, &mut timelines)? { Some(local_loaded_timeline) => Ok(local_loaded_timeline), @@ -140,7 +147,9 @@ impl Repository for LayeredRepository { } } - fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)> { + /// Lists timelines the repository contains. + /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. + pub fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)> { self.timelines .lock() .unwrap() @@ -154,11 +163,13 @@ impl Repository for LayeredRepository { .collect() } - fn create_empty_timeline( + /// Create a new, empty timeline. 
The caller is responsible for loading data into it + /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. + pub fn create_empty_timeline( &self, timeline_id: ZTimelineId, initdb_lsn: Lsn, - ) -> Result> { + ) -> Result> { let mut timelines = self.timelines.lock().unwrap(); let vacant_timeline_entry = match timelines.entry(timeline_id) { Entry::Occupied(_) => bail!("Timeline already exists"), @@ -174,9 +185,9 @@ impl Repository for LayeredRepository { crashsafe_dir::create_dir_all(timeline_path)?; let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - timeline::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; + save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; - let timeline = LayeredTimeline::new( + let timeline = Timeline::new( self.conf, Arc::clone(&self.tenant_conf), metadata, @@ -192,11 +203,16 @@ impl Repository for LayeredRepository { let timeline = Arc::new(timeline); vacant_timeline_entry.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline))); + crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { + id: ZTenantTimelineId::new(self.tenant_id(), timeline_id), + timeline: Arc::clone(&timeline), + }); + Ok(timeline) } /// Branch a timeline - fn branch_timeline( + pub fn branch_timeline( &self, src: ZTimelineId, dst: ZTimelineId, @@ -238,7 +254,8 @@ impl Repository for LayeredRepository { src_timeline .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) .context(format!( - "invalid branch start lsn: less than latest GC cutoff {latest_gc_cutoff_lsn}" + "invalid branch start lsn: less than latest GC cutoff {}", + *latest_gc_cutoff_lsn ))?; { let gc_info = src_timeline.gc_info.read().unwrap(); @@ -274,11 +291,11 @@ impl Repository for LayeredRepository { dst_prev, Some(src), start_lsn, - *src_timeline.latest_gc_cutoff_lsn.read().unwrap(), + *src_timeline.latest_gc_cutoff_lsn.read(), 
src_timeline.initdb_lsn, ); crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; - timeline::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; + save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata }); info!("branched timeline {} from {} at {}", dst, src, start_lsn); @@ -286,10 +303,16 @@ impl Repository for LayeredRepository { Ok(()) } - /// Public entry point to GC. All the logic is in the private - /// gc_iteration_internal function, this public facade just wraps it for - /// metrics collection. - fn gc_iteration( + /// perform one garbage collection iteration, removing old data files from disk. + /// this function is periodically called by gc thread. + /// also it can be explicitly requested through page server api 'do_gc' command. + /// + /// 'timelineid' specifies the timeline to GC, or None for all. + /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). + /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC + /// to make tests more deterministic. + /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? + pub fn gc_iteration( &self, target_timeline_id: Option, horizon: u64, @@ -307,7 +330,11 @@ impl Repository for LayeredRepository { }) } - fn compaction_iteration(&self) -> Result<()> { + /// Perform one compaction iteration. + /// This function is periodically called by compactor thread. + /// Also it can be explicitly requested per timeline through page server + /// api's 'compact' command. + pub fn compaction_iteration(&self) -> Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // compactions. 
We don't want to block everything else while the @@ -335,12 +362,11 @@ impl Repository for LayeredRepository { Ok(()) } - /// /// Flush all in-memory data to disk. /// - /// Used at shutdown. + /// Used at graceful shutdown. /// - fn checkpoint(&self) -> Result<()> { + pub fn checkpoint(&self) -> Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the // checkpoints. We don't want to block everything else while the @@ -370,7 +396,8 @@ impl Repository for LayeredRepository { Ok(()) } - fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> { + /// Removes timeline-related in-memory data + pub fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> { // in order to be retriable detach needs to be idempotent // (or at least to a point that each time the detach is called it can make progress) let mut timelines = self.timelines.lock().unwrap(); @@ -407,7 +434,9 @@ impl Repository for LayeredRepository { Ok(()) } - fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> { + /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. + /// See [`crate::remote_storage`] for more details about the synchronization. + pub fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> { debug!("attach timeline_id: {}", timeline_id,); match self.timelines.lock().unwrap().entry(timeline_id) { Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."), @@ -421,13 +450,14 @@ impl Repository for LayeredRepository { Ok(()) } - fn get_remote_index(&self) -> &RemoteIndex { + /// Allows to retrieve remote timeline index from the tenant. Used in walreceiver to grab remote consistent lsn. 
+ pub fn get_remote_index(&self) -> &RemoteIndex { &self.remote_index } } /// Private functions -impl LayeredRepository { +impl Repository { pub fn get_checkpoint_distance(&self) -> u64 { let tenant_conf = self.tenant_conf.read().unwrap(); tenant_conf @@ -517,49 +547,37 @@ impl LayeredRepository { tenant_conf.update(&new_tenant_conf); - LayeredRepository::persist_tenant_config(self.conf, self.tenant_id, *tenant_conf)?; + Repository::persist_tenant_config(self.conf, self.tenant_id, *tenant_conf)?; Ok(()) } - // Implementation of the public `get_timeline` function. - // Differences from the public: - // * interface in that the caller must already hold the mutex on the 'timelines' hashmap. - fn get_timeline_internal( - &self, - timelineid: ZTimelineId, - timelines: &HashMap, - ) -> Option { - timelines.get(&timelineid).cloned() - } - // Implementation of the public `get_timeline_load` function. // Differences from the public: // * interface in that the caller must already hold the mutex on the 'timelines' hashmap. fn get_timeline_load_internal( &self, - timelineid: ZTimelineId, + timeline_id: ZTimelineId, timelines: &mut HashMap, - ) -> anyhow::Result>> { - match timelines.get(&timelineid) { + ) -> anyhow::Result>> { + match timelines.get(&timeline_id) { Some(entry) => match entry { LayeredTimelineEntry::Loaded(local_timeline) => { - debug!("timeline {} found loaded into memory", &timelineid); + debug!("timeline {timeline_id} found loaded into memory"); return Ok(Some(Arc::clone(local_timeline))); } LayeredTimelineEntry::Unloaded { .. 
} => {} }, None => { - debug!("timeline {} not found", &timelineid); + debug!("timeline {timeline_id} not found"); return Ok(None); } }; debug!( - "timeline {} found on a local disk, but not loaded into the memory, loading", - &timelineid + "timeline {timeline_id} found on a local disk, but not loaded into the memory, loading" ); - let timeline = self.load_local_timeline(timelineid, timelines)?; + let timeline = self.load_local_timeline(timeline_id, timelines)?; let was_loaded = timelines.insert( - timelineid, + timeline_id, LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), ); ensure!( @@ -574,7 +592,7 @@ impl LayeredRepository { &self, timeline_id: ZTimelineId, timelines: &mut HashMap, - ) -> anyhow::Result> { + ) -> anyhow::Result> { let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) .context("failed to load metadata")?; let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -591,7 +609,7 @@ impl LayeredRepository { .map(LayeredTimelineEntry::Loaded); let _enter = info_span!("loading local timeline").entered(); - let timeline = LayeredTimeline::new( + let timeline = Timeline::new( self.conf, Arc::clone(&self.tenant_conf), metadata, @@ -605,7 +623,14 @@ impl LayeredRepository { .load_layer_map(disk_consistent_lsn) .context("failed to load layermap")?; - Ok(Arc::new(timeline)) + let timeline = Arc::new(timeline); + + crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach { + id: ZTenantTimelineId::new(self.tenant_id(), timeline_id), + timeline: Arc::clone(&timeline), + }); + + Ok(timeline) } pub fn new( @@ -615,8 +640,8 @@ impl LayeredRepository { tenant_id: ZTenantId, remote_index: RemoteIndex, upload_layers: bool, - ) -> LayeredRepository { - LayeredRepository { + ) -> Repository { + Repository { tenant_id, file_lock: RwLock::new(()), conf, @@ -632,9 +657,9 @@ impl LayeredRepository { /// Locate and load config pub fn load_tenant_config( conf: &'static PageServerConf, - tenantid: ZTenantId, + tenant_id: ZTenantId, ) -> 
anyhow::Result { - let target_config_path = TenantConf::path(conf, tenantid); + let target_config_path = TenantConf::path(conf, tenant_id); info!("load tenantconf from {}", target_config_path.display()); @@ -669,11 +694,11 @@ impl LayeredRepository { pub fn persist_tenant_config( conf: &'static PageServerConf, - tenantid: ZTenantId, + tenant_id: ZTenantId, tenant_conf: TenantConfOpt, ) -> anyhow::Result<()> { let _enter = info_span!("saving tenantconf").entered(); - let target_config_path = TenantConf::path(conf, tenantid); + let target_config_path = TenantConf::path(conf, tenant_id); info!("save tenantconf to {}", target_config_path.display()); let mut conf_content = r#"# This file contains a specific per-tenant's config. @@ -810,7 +835,7 @@ impl LayeredRepository { // compaction (both require `layer_removal_cs` lock), // but the GC iteration can run concurrently with branch creation. // - // See comments in [`LayeredRepository::branch_timeline`] for more information + // See comments in [`Repository::branch_timeline`] for more information // about why branch creation task can run concurrently with timeline's GC iteration. for timeline in gc_timelines { if thread_mgr::is_shutdown_requested() { @@ -886,22 +911,525 @@ pub fn load_metadata( }) } -/// -/// Tests that are specific to the layered storage format. -/// -/// There are more unit tests in repository.rs that work through the -/// Repository interface and are expected to work regardless of the -/// file format and directory layout. The test here are more low level. 
-/// #[cfg(test)] -pub mod tests { +pub mod repo_harness { + use bytes::{Bytes, BytesMut}; + use once_cell::sync::Lazy; + use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; + use std::{fs, path::PathBuf}; + use utils::lsn::Lsn; + + use crate::storage_sync::index::RemoteIndex; + use crate::{ + config::PageServerConf, + layered_repository::Repository, + repository::Key, + walrecord::ZenithWalRecord, + walredo::{WalRedoError, WalRedoManager}, + }; + + use super::*; + use crate::tenant_config::{TenantConf, TenantConfOpt}; + use hex_literal::hex; + use utils::zid::{ZTenantId, ZTimelineId}; + + pub const TIMELINE_ID: ZTimelineId = + ZTimelineId::from_array(hex!("11223344556677881122334455667788")); + pub const NEW_TIMELINE_ID: ZTimelineId = + ZTimelineId::from_array(hex!("AA223344556677881122334455667788")); + + /// Convenience function to create a page image with given string as the only content + #[allow(non_snake_case)] + pub fn TEST_IMG(s: &str) -> Bytes { + let mut buf = BytesMut::new(); + buf.extend_from_slice(s.as_bytes()); + buf.resize(64, 0); + + buf.freeze() + } + + static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); + + impl From for TenantConfOpt { + fn from(tenant_conf: TenantConf) -> Self { + Self { + checkpoint_distance: Some(tenant_conf.checkpoint_distance), + checkpoint_timeout: Some(tenant_conf.checkpoint_timeout), + compaction_target_size: Some(tenant_conf.compaction_target_size), + compaction_period: Some(tenant_conf.compaction_period), + compaction_threshold: Some(tenant_conf.compaction_threshold), + gc_horizon: Some(tenant_conf.gc_horizon), + gc_period: Some(tenant_conf.gc_period), + image_creation_threshold: Some(tenant_conf.image_creation_threshold), + pitr_interval: Some(tenant_conf.pitr_interval), + walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), + lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), + max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), + } + } + } + + pub struct RepoHarness<'a> { 
+ pub conf: &'static PageServerConf, + pub tenant_conf: TenantConf, + pub tenant_id: ZTenantId, + + pub lock_guard: ( + Option>, + Option>, + ), + } + + impl<'a> RepoHarness<'a> { + pub fn create(test_name: &'static str) -> Result { + Self::create_internal(test_name, false) + } + pub fn create_exclusive(test_name: &'static str) -> Result { + Self::create_internal(test_name, true) + } + fn create_internal(test_name: &'static str, exclusive: bool) -> Result { + let lock_guard = if exclusive { + (None, Some(LOCK.write().unwrap())) + } else { + (Some(LOCK.read().unwrap()), None) + }; + + let repo_dir = PageServerConf::test_repo_dir(test_name); + let _ = fs::remove_dir_all(&repo_dir); + fs::create_dir_all(&repo_dir)?; + + let conf = PageServerConf::dummy_conf(repo_dir); + // Make a static copy of the config. This can never be free'd, but that's + // OK in a test. + let conf: &'static PageServerConf = Box::leak(Box::new(conf)); + + let tenant_conf = TenantConf::dummy_conf(); + + let tenant_id = ZTenantId::generate(); + fs::create_dir_all(conf.tenant_path(&tenant_id))?; + fs::create_dir_all(conf.timelines_path(&tenant_id))?; + + Ok(Self { + conf, + tenant_conf, + tenant_id, + lock_guard, + }) + } + + pub fn load(&self) -> Repository { + self.try_load().expect("failed to load test repo") + } + + pub fn try_load(&self) -> Result { + let walredo_mgr = Arc::new(TestRedoManager); + + let repo = Repository::new( + self.conf, + TenantConfOpt::from(self.tenant_conf), + walredo_mgr, + self.tenant_id, + RemoteIndex::default(), + false, + ); + // populate repo with locally available timelines + for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) + .expect("should be able to read timelines dir") + { + let timeline_dir_entry = timeline_dir_entry.unwrap(); + let timeline_id: ZTimelineId = timeline_dir_entry + .path() + .file_name() + .unwrap() + .to_string_lossy() + .parse() + .unwrap(); + + repo.attach_timeline(timeline_id)?; + } + + Ok(repo) + } + + pub 
fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { + self.conf.timeline_path(timeline_id, &self.tenant_id) + } + } + + // Mock WAL redo manager that doesn't do much + pub struct TestRedoManager; + + impl WalRedoManager for TestRedoManager { + fn request_redo( + &self, + key: Key, + lsn: Lsn, + base_img: Option, + records: Vec<(Lsn, ZenithWalRecord)>, + ) -> Result { + let s = format!( + "redo for {} to get to {}, with {} and {} records", + key, + lsn, + if base_img.is_some() { + "base image" + } else { + "no base image" + }, + records.len() + ); + println!("{}", s); + + Ok(TEST_IMG(&s)) + } + } +} + +#[cfg(test)] +mod tests { use super::metadata::METADATA_FILE_NAME; use super::*; use crate::keyspace::KeySpaceAccum; - use crate::repository::repo_harness::*; + use crate::layered_repository::repo_harness::*; use crate::repository::{Key, Value}; + use bytes::BytesMut; + use hex_literal::hex; + use once_cell::sync::Lazy; use rand::{thread_rng, Rng}; + static TEST_KEY: Lazy = + Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001"))); + + #[test] + fn test_basic() -> Result<()> { + let repo = RepoHarness::create("test_basic")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + let writer = tline.writer(); + writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; + writer.finish_write(Lsn(0x10)); + drop(writer); + + let writer = tline.writer(); + writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; + writer.finish_write(Lsn(0x20)); + drop(writer); + + assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + + Ok(()) + } + + #[test] + fn no_duplicate_timelines() -> Result<()> { + let repo = RepoHarness::create("no_duplicate_timelines")?.load(); + let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + match 
repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) { + Ok(_) => panic!("duplicate timeline creation should fail"), + Err(e) => assert_eq!(e.to_string(), "Timeline already exists"), + } + + Ok(()) + } + + /// Convenience function to create a page image with given string as the only content + pub fn test_value(s: &str) -> Value { + let mut buf = BytesMut::new(); + buf.extend_from_slice(s.as_bytes()); + Value::Image(buf.freeze()) + } + + /// + /// Test branch creation + /// + #[test] + fn test_branch() -> Result<()> { + let repo = RepoHarness::create("test_branch")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let writer = tline.writer(); + use std::str::from_utf8; + + #[allow(non_snake_case)] + let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); + #[allow(non_snake_case)] + let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap(); + + // Insert a value on the timeline + writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?; + writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?; + writer.finish_write(Lsn(0x20)); + + writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?; + writer.finish_write(Lsn(0x30)); + writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?; + writer.finish_write(Lsn(0x40)); + + //assert_current_logical_size(&tline, Lsn(0x40)); + + // Branch the history, modify relation differently on the new timeline + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; + let newtline = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("Should have a local timeline"); + let new_writer = newtline.writer(); + new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?; + new_writer.finish_write(Lsn(0x40)); + + // Check page contents on both branches + assert_eq!( + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?, + "foo at 0x40" + ); + assert_eq!( + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?, + "bar at 0x40" 
+ ); + assert_eq!( + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?, + "foobar at 0x20" + ); + + //assert_current_logical_size(&tline, Lsn(0x40)); + + Ok(()) + } + + fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> Result<()> { + let mut lsn = start_lsn; + #[allow(non_snake_case)] + { + let writer = tline.writer(); + // Create a relation on the timeline + writer.put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; + writer.finish_write(lsn); + lsn += 0x10; + writer.put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; + writer.finish_write(lsn); + lsn += 0x10; + } + tline.checkpoint(CheckpointConfig::Forced)?; + { + let writer = tline.writer(); + writer.put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; + writer.finish_write(lsn); + lsn += 0x10; + writer.put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; + writer.finish_write(lsn); + } + tline.checkpoint(CheckpointConfig::Forced) + } + + #[test] + fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { + let repo = + RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; + + // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 + // FIXME: this doesn't actually remove any layer currently, given how the checkpointing + // and compaction works. But it does set the 'cutoff' point so that the cross check + // below should fail. 
+ repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + + // try to branch at lsn 25, should fail because we already garbage collected the data + match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + Ok(_) => panic!("branching should have failed"), + Err(err) => { + assert!(err.to_string().contains("invalid branch start lsn")); + assert!(err + .source() + .unwrap() + .to_string() + .contains("we might've already garbage collected needed data")) + } + } + + Ok(()) + } + + #[test] + fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> { + let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); + + repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; + // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 + match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + Ok(_) => panic!("branching should have failed"), + Err(err) => { + assert!(&err.to_string().contains("invalid branch start lsn")); + assert!(&err + .source() + .unwrap() + .to_string() + .contains("is earlier than latest GC horizon")); + } + } + + Ok(()) + } + + /* + // FIXME: This currently fails to error out. Calling GC doesn't currently + // remove the old value, we'd need to work a little harder + #[test] + fn test_prohibit_get_for_garbage_collected_data() -> Result<()> { + let repo = + RepoHarness::create("test_prohibit_get_for_garbage_collected_data")? 
+ .load(); + + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; + + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); + assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); + match tline.get(*TEST_KEY, Lsn(0x25)) { + Ok(_) => panic!("request for page should have failed"), + Err(err) => assert!(err.to_string().contains("not found at")), + } + Ok(()) + } + */ + + #[test] + fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { + let repo = + RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; + + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + let newtline = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("Should have a local timeline"); + // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); + + Ok(()) + } + #[test] + fn test_parent_keeps_data_forever_after_branching() -> Result<()> { + let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; + + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + let newtline = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("Should have a local timeline"); + + make_some_layers(newtline.as_ref(), Lsn(0x60))?; + + // run gc on parent + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + + // Check that the data is still accessible on the branch. 
+ assert_eq!( + newtline.get(*TEST_KEY, Lsn(0x50))?, + TEST_IMG(&format!("foo at {}", Lsn(0x40))) + ); + + Ok(()) + } + + #[test] + fn timeline_load() -> Result<()> { + const TEST_NAME: &str = "timeline_load"; + let harness = RepoHarness::create(TEST_NAME)?; + { + let repo = harness.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; + make_some_layers(tline.as_ref(), Lsn(0x8000))?; + tline.checkpoint(CheckpointConfig::Forced)?; + } + + let repo = harness.load(); + let tline = repo + .get_timeline(TIMELINE_ID) + .expect("cannot load timeline"); + assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); + + assert!(repo.get_timeline_load(TIMELINE_ID).is_ok()); + + let tline = repo + .get_timeline(TIMELINE_ID) + .expect("cannot load timeline"); + assert!(matches!(tline, RepositoryTimeline::Loaded(_))); + + Ok(()) + } + + #[test] + fn timeline_load_with_ancestor() -> Result<()> { + const TEST_NAME: &str = "timeline_load_with_ancestor"; + let harness = RepoHarness::create(TEST_NAME)?; + // create two timelines + { + let repo = harness.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + make_some_layers(tline.as_ref(), Lsn(0x20))?; + tline.checkpoint(CheckpointConfig::Forced)?; + + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + + let newtline = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("Should have a local timeline"); + + make_some_layers(newtline.as_ref(), Lsn(0x60))?; + tline.checkpoint(CheckpointConfig::Forced)?; + } + + // check that both of them are initially unloaded + let repo = harness.load(); + { + let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); + assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); + + let tline = repo + .get_timeline(NEW_TIMELINE_ID) + .expect("cannot get timeline"); + assert!(matches!(tline, RepositoryTimeline::Unloaded { .. 
})); + } + // load only child timeline + let _ = repo + .get_timeline_load(NEW_TIMELINE_ID) + .expect("cannot load timeline"); + + // check that both, child and ancestor are loaded + let tline = repo + .get_timeline(NEW_TIMELINE_ID) + .expect("cannot get timeline"); + assert!(matches!(tline, RepositoryTimeline::Loaded(_))); + + let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); + assert!(matches!(tline, RepositoryTimeline::Loaded(_))); + + Ok(()) + } + #[test] fn corrupt_metadata() -> Result<()> { const TEST_NAME: &str = "corrupt_metadata"; @@ -940,22 +1468,13 @@ pub mod tests { Ok(()) } - // Target file size in the unit tests. In production, the target - // file size is much larger, maybe 1 GB. But a small size makes it - // much faster to exercise all the logic for creating the files, - // garbage collection, compaction etc. - pub const TEST_FILE_SIZE: u64 = 4 * 1024 * 1024; - #[test] fn test_images() -> Result<()> { let repo = RepoHarness::create("test_images")?.load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - #[allow(non_snake_case)] - let TEST_KEY: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); - let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; + writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; writer.finish_write(Lsn(0x10)); drop(writer); @@ -963,7 +1482,7 @@ pub mod tests { tline.compact()?; let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; + writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -971,7 +1490,7 @@ pub mod tests { tline.compact()?; let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; + writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; writer.finish_write(Lsn(0x30)); drop(writer); @@ -979,18 
+1498,18 @@ pub mod tests { tline.compact()?; let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; + writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; writer.finish_write(Lsn(0x40)); drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; tline.compact()?; - assert_eq!(tline.get(TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); - assert_eq!(tline.get(TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30")); - assert_eq!(tline.get(TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40")); Ok(()) } diff --git a/pageserver/src/layered_repository/block_io.rs b/pageserver/src/layered_repository/block_io.rs index bc3bc082a0..5e32b8833a 100644 --- a/pageserver/src/layered_repository/block_io.rs +++ b/pageserver/src/layered_repository/block_io.rs @@ -157,7 +157,14 @@ where // Look up the right page let cache = page_cache::get(); loop { - match cache.read_immutable_buf(self.file_id, blknum) { + match cache + .read_immutable_buf(self.file_id, blknum) + .map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::Other, + format!("Failed to read immutable buf: {e:#}"), + ) + })? 
{ ReadBufResult::Found(guard) => break Ok(guard), ReadBufResult::NotFound(mut write_guard) => { // Read the page from disk into the buffer diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/layered_repository/ephemeral_file.rs index 1776946e7a..a1b2d68cd5 100644 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ b/pageserver/src/layered_repository/ephemeral_file.rs @@ -12,7 +12,7 @@ use once_cell::sync::Lazy; use std::cmp::min; use std::collections::HashMap; use std::fs::OpenOptions; -use std::io::{Error, ErrorKind}; +use std::io::{self, ErrorKind}; use std::ops::DerefMut; use std::path::PathBuf; use std::sync::{Arc, RwLock}; @@ -51,7 +51,7 @@ impl EphemeralFile { conf: &PageServerConf, tenantid: ZTenantId, timelineid: ZTimelineId, - ) -> Result { + ) -> Result { let mut l = EPHEMERAL_FILES.write().unwrap(); let file_id = l.next_file_id; l.next_file_id += 1; @@ -76,7 +76,7 @@ impl EphemeralFile { }) } - fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> { + fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> { let mut off = 0; while off < PAGE_SZ { let n = self @@ -96,10 +96,13 @@ impl EphemeralFile { Ok(()) } - fn get_buf_for_write(&self, blkno: u32) -> Result { + fn get_buf_for_write(&self, blkno: u32) -> Result { // Look up the right page let cache = page_cache::get(); - let mut write_guard = match cache.write_ephemeral_buf(self.file_id, blkno) { + let mut write_guard = match cache + .write_ephemeral_buf(self.file_id, blkno) + .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))? 
+ { WriteBufResult::Found(guard) => guard, WriteBufResult::NotFound(mut guard) => { // Read the page from disk into the buffer @@ -127,7 +130,7 @@ pub fn is_ephemeral_file(filename: &str) -> bool { } impl FileExt for EphemeralFile { - fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result { + fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result { // Look up the right page let blkno = (offset / PAGE_SZ as u64) as u32; let off = offset as usize % PAGE_SZ; @@ -137,7 +140,10 @@ impl FileExt for EphemeralFile { let mut write_guard; let cache = page_cache::get(); - let buf = match cache.read_ephemeral_buf(self.file_id, blkno) { + let buf = match cache + .read_ephemeral_buf(self.file_id, blkno) + .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))? + { ReadBufResult::Found(guard) => { read_guard = guard; read_guard.as_ref() @@ -158,7 +164,7 @@ impl FileExt for EphemeralFile { Ok(len) } - fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result { + fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result { // Look up the right page let blkno = (offset / PAGE_SZ as u64) as u32; let off = offset as usize % PAGE_SZ; @@ -166,7 +172,10 @@ impl FileExt for EphemeralFile { let mut write_guard; let cache = page_cache::get(); - let buf = match cache.write_ephemeral_buf(self.file_id, blkno) { + let buf = match cache + .write_ephemeral_buf(self.file_id, blkno) + .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))? 
+ { WriteBufResult::Found(guard) => { write_guard = guard; write_guard.deref_mut() @@ -190,7 +199,7 @@ impl FileExt for EphemeralFile { } impl BlobWriter for EphemeralFile { - fn write_blob(&mut self, srcbuf: &[u8]) -> Result { + fn write_blob(&mut self, srcbuf: &[u8]) -> Result { let pos = self.size; let mut blknum = (self.size / PAGE_SZ as u64) as u32; @@ -268,11 +277,11 @@ impl Drop for EphemeralFile { } } -pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> { +pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> { if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) { match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) { Ok(_) => Ok(()), - Err(e) => Err(std::io::Error::new( + Err(e) => Err(io::Error::new( ErrorKind::Other, format!( "failed to write back to ephemeral file at {} error: {}", @@ -282,7 +291,7 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Er )), } } else { - Err(std::io::Error::new( + Err(io::Error::new( ErrorKind::Other, "could not write back page, not found in ephemeral files hash", )) @@ -292,11 +301,14 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Er impl BlockReader for EphemeralFile { type BlockLease = page_cache::PageReadGuard<'static>; - fn read_blk(&self, blknum: u32) -> Result { + fn read_blk(&self, blknum: u32) -> Result { // Look up the right page let cache = page_cache::get(); loop { - match cache.read_ephemeral_buf(self.file_id, blknum) { + match cache + .read_ephemeral_buf(self.file_id, blknum) + .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))? 
+ { ReadBufResult::Found(guard) => return Ok(guard), ReadBufResult::NotFound(mut write_guard) => { // Read the page from disk into the buffer @@ -311,6 +323,10 @@ impl BlockReader for EphemeralFile { } } +fn to_io_error(e: anyhow::Error, context: &str) -> io::Error { + io::Error::new(ErrorKind::Other, format!("{context}: {e:#}")) +} + #[cfg(test)] mod tests { use super::*; @@ -322,7 +338,7 @@ mod tests { fn repo_harness( test_name: &str, - ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), Error> { + ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), io::Error> { let repo_dir = PageServerConf::test_repo_dir(test_name); let _ = fs::remove_dir_all(&repo_dir); let conf = PageServerConf::dummy_conf(repo_dir); @@ -339,7 +355,7 @@ mod tests { // Helper function to slurp contents of a file, starting at the current position, // into a string - fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result { + fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result { let mut buf = Vec::new(); buf.resize(len, 0u8); @@ -351,7 +367,7 @@ mod tests { } #[test] - fn test_ephemeral_files() -> Result<(), Error> { + fn test_ephemeral_files() -> Result<(), io::Error> { let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?; let file_a = EphemeralFile::create(conf, tenantid, timelineid)?; @@ -382,7 +398,7 @@ mod tests { } #[test] - fn test_ephemeral_blobs() -> Result<(), Error> { + fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenantid, timelineid) = repo_harness("ephemeral_blobs")?; let mut file = EphemeralFile::create(conf, tenantid, timelineid)?; diff --git a/pageserver/src/layered_repository/filename.rs b/pageserver/src/layered_repository/filename.rs index f088088277..5ebac2332d 100644 --- a/pageserver/src/layered_repository/filename.rs +++ b/pageserver/src/layered_repository/filename.rs @@ -10,7 +10,7 @@ use std::path::PathBuf; use utils::lsn::Lsn; -// Note: LayeredTimeline::load_layer_map() relies 
on this sort order +// Note: Timeline::load_layer_map() relies on this sort order #[derive(Debug, PartialEq, Eq, Clone)] pub struct DeltaFileName { pub key_range: Range, diff --git a/pageserver/src/layered_repository/metadata.rs b/pageserver/src/layered_repository/metadata.rs index 0b47f8d697..910dba4644 100644 --- a/pageserver/src/layered_repository/metadata.rs +++ b/pageserver/src/layered_repository/metadata.rs @@ -1,4 +1,4 @@ -//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`] +//! Every image of a certain timeline from [`crate::layered_repository::Repository`] //! has a metadata that needs to be stored persistently. //! //! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of @@ -6,10 +6,13 @@ //! //! The module contains all structs and related helper methods related to timeline metadata. +use std::fs::{File, OpenOptions}; +use std::io::Write; use std::path::PathBuf; -use anyhow::ensure; +use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; +use tracing::info_span; use utils::{ bin_ser::BeSer, lsn::Lsn, @@ -17,6 +20,7 @@ use utils::{ }; use crate::config::PageServerConf; +use crate::virtual_file::VirtualFile; use crate::STORAGE_FORMAT_VERSION; /// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic. @@ -30,7 +34,7 @@ pub const METADATA_FILE_NAME: &str = "metadata"; /// Metadata stored on disk for each timeline /// -/// The fields correspond to the values we hold in memory, in LayeredTimeline. +/// The fields correspond to the values we hold in memory, in Timeline. #[derive(Debug, Clone, PartialEq, Eq)] pub struct TimelineMetadata { hdr: TimelineMetadataHeader, @@ -65,17 +69,6 @@ struct TimelineMetadataBody { initdb_lsn: Lsn, } -/// Points to a place in pageserver's local directory, -/// where certain timeline's metadata file should be located. 
-pub fn metadata_path( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, -) -> PathBuf { - conf.timeline_path(&timelineid, &tenantid) - .join(METADATA_FILE_NAME) -} - impl TimelineMetadata { pub fn new( disk_consistent_lsn: Lsn, @@ -173,11 +166,57 @@ impl TimelineMetadata { } } +/// Points to a place in pageserver's local directory, +/// where certain timeline's metadata file should be located. +pub fn metadata_path( + conf: &'static PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, +) -> PathBuf { + conf.timeline_path(&timelineid, &tenantid) + .join(METADATA_FILE_NAME) +} + +/// Save timeline metadata to file +pub fn save_metadata( + conf: &'static PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, + data: &TimelineMetadata, + first_save: bool, +) -> anyhow::Result<()> { + let _enter = info_span!("saving metadata").entered(); + let path = metadata_path(conf, timelineid, tenantid); + // use OpenOptions to ensure file presence is consistent with first_save + let mut file = VirtualFile::open_with_options( + &path, + OpenOptions::new().write(true).create_new(first_save), + )?; + + let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; + + if file.write(&metadata_bytes)? 
!= metadata_bytes.len() { + bail!("Could not write all the metadata bytes in a single call"); + } + file.sync_all()?; + + // fsync the parent directory to ensure the directory entry is durable + if first_save { + let timeline_dir = File::open( + &path + .parent() + .expect("Metadata should always have a parent dir"), + )?; + timeline_dir.sync_all()?; + } + + Ok(()) +} + #[cfg(test)] mod tests { - use crate::repository::repo_harness::TIMELINE_ID; - use super::*; + use crate::layered_repository::repo_harness::TIMELINE_ID; #[test] fn metadata_serializes_correctly() { diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 6ef4915bdb..1a941affe5 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -9,14 +9,12 @@ use once_cell::sync::Lazy; use tracing::*; use std::cmp::{max, min, Ordering}; -use std::collections::{hash_map::Entry, HashMap, HashSet}; +use std::collections::{HashMap, HashSet}; use std::fs; -use std::fs::{File, OpenOptions}; -use std::io::Write; use std::ops::{Deref, Range}; use std::path::PathBuf; -use std::sync::atomic::{self, AtomicBool, AtomicIsize, Ordering as AtomicOrdering}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError}; +use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError}; use std::time::{Duration, Instant, SystemTime}; use metrics::{ @@ -32,7 +30,7 @@ use crate::layered_repository::{ image_layer::{ImageLayer, ImageLayerWriter}, inmemory_layer::InMemoryLayer, layer_map::{LayerMap, SearchResult}, - metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, + metadata::{save_metadata, TimelineMetadata, METADATA_FILE_NAME}, par_fsync, storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}, }; @@ -43,19 +41,18 @@ use crate::pgdatadir_mapping::BlockNumber; use 
crate::pgdatadir_mapping::LsnForTimestamp; use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; -use crate::DatadirTimeline; use postgres_ffi::v14::xlog_utils::to_pg_timestamp; use utils::{ lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, + simple_rcu::{Rcu, RcuReadGuard}, zid::{ZTenantId, ZTimelineId}, }; -use crate::repository::{GcResult, RepositoryTimeline, Timeline, TimelineWriter}; +use crate::repository::{GcResult, RepositoryTimeline}; use crate::repository::{Key, Value}; use crate::thread_mgr; -use crate::virtual_file::VirtualFile; use crate::walreceiver::IS_WAL_RECEIVER; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; @@ -140,6 +137,15 @@ static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_current_logical_size", + "Current logical size grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, // or in testing they estimate how much we would upload if we did. static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { @@ -160,7 +166,7 @@ static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { #[derive(Clone)] pub enum LayeredTimelineEntry { - Loaded(Arc), + Loaded(Arc), Unloaded { id: ZTimelineId, metadata: TimelineMetadata, @@ -191,7 +197,7 @@ impl LayeredTimelineEntry { } } - fn ensure_loaded(&self) -> anyhow::Result<&Arc> { + fn ensure_loaded(&self) -> anyhow::Result<&Arc> { match self { LayeredTimelineEntry::Loaded(timeline) => Ok(timeline), LayeredTimelineEntry::Unloaded { .. 
} => { @@ -213,7 +219,7 @@ impl LayeredTimelineEntry { } } -impl From for RepositoryTimeline { +impl From for RepositoryTimeline { fn from(entry: LayeredTimelineEntry) -> Self { match entry { LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _), @@ -235,6 +241,8 @@ struct TimelineMetrics { pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, pub current_physical_size_gauge: UIntGauge, + /// copy of LayeredTimeline.current_logical_size + pub current_logical_size_gauge: IntGauge, } impl TimelineMetrics { @@ -272,6 +280,9 @@ impl TimelineMetrics { let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); + let current_logical_size_gauge = CURRENT_LOGICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); TimelineMetrics { reconstruct_time_histo, @@ -284,11 +295,12 @@ impl TimelineMetrics { last_record_gauge, wait_lsn_time_histo, current_physical_size_gauge, + current_logical_size_gauge, } } } -pub struct LayeredTimeline { +pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -340,8 +352,8 @@ pub struct LayeredTimeline { upload_layers: AtomicBool, /// Ensures layers aren't frozen by checkpointer between - /// [`LayeredTimeline::get_layer_for_write`] and layer reads. - /// Locked automatically by [`LayeredTimelineWriter`] and checkpointer. + /// [`Timeline::get_layer_for_write`] and layer reads. + /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. write_lock: Mutex<()>, @@ -351,12 +363,12 @@ pub struct LayeredTimeline { /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other threads. - /// This lock is acquired in [`LayeredTimeline::gc`], [`LayeredTimeline::compact`], - /// and [`LayeredRepository::delete_timeline`]. 
+ /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], + /// and [`Repository::delete_timeline`]. layer_removal_cs: Mutex<()>, // Needed to ensure that we can't create a branch at a point that was already garbage collected - pub latest_gc_cutoff_lsn: RwLock, + pub latest_gc_cutoff_lsn: Rcu, // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. @@ -377,7 +389,32 @@ pub struct LayeredTimeline { repartition_threshold: u64, /// Current logical size of the "datadir", at the last LSN. - current_logical_size: AtomicIsize, + /// + /// Size shouldn't ever be negative, but this is signed for two reasons: + /// + /// 1. If we initialized the "baseline" size lazily, while we already + /// process incoming WAL, the incoming WAL records could decrement the + /// variable and temporarily make it negative. (This is just future-proofing; + /// the initialization is currently not done lazily.) + /// + /// 2. If there is a bug and we e.g. forget to increment it in some cases + /// when size grows, but remember to decrement it when it shrinks again, the + /// variable could go negative. In that case, it seems better to at least + /// try to keep tracking it, rather than clamp or overflow it. Note that + /// get_current_logical_size() will clamp the returned value to zero if it's + /// negative, and log an error. Could set it permanently to zero or some + /// special value to indicate "broken" instead, but this will do for now. + /// + /// Note that we also expose a copy of this value as a prometheus metric, + /// see `current_logical_size_gauge`. Use the `update_current_logical_size` + /// and `set_current_logical_size` functions to modify this, they will + /// also keep the prometheus metric in sync. + current_logical_size: AtomicI64, + // TODO we don't have a good, API to ensure on a compilation level + // that the timeline passes all initialization. 
+ // Hence we ensure that we init at least once for every timeline + // and keep this flag to avoid potentually long recomputes. + logical_size_initialized: AtomicBool, /// Information about the last processed message by the WAL receiver, /// or None if WAL receiver has not received anything for this timeline @@ -385,7 +422,7 @@ pub struct LayeredTimeline { pub last_received_wal: Mutex>, /// Relation size cache - rel_size_cache: RwLock>, + pub rel_size_cache: RwLock>, } pub struct WalReceiverInfo { @@ -394,46 +431,6 @@ pub struct WalReceiverInfo { pub last_received_msg_ts: u128, } -/// Inherit all the functions from DatadirTimeline, to provide the -/// functionality to store PostgreSQL relations, SLRUs, etc. in a -/// LayeredTimeline. -impl DatadirTimeline for LayeredTimeline { - fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { - let rel_size_cache = self.rel_size_cache.read().unwrap(); - if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { - if lsn >= *cached_lsn { - return Some(*nblocks); - } - } - None - } - - fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { - let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - match rel_size_cache.entry(tag) { - Entry::Occupied(mut entry) => { - let cached_lsn = entry.get_mut(); - if lsn >= cached_lsn.0 { - *cached_lsn = (lsn, nblocks); - } - } - Entry::Vacant(entry) => { - entry.insert((lsn, nblocks)); - } - } - } - - fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { - let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.insert(tag, (lsn, nblocks)); - } - - fn remove_cached_rel_size(&self, tag: &RelTag) { - let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.remove(tag); - } -} - /// /// Information about how much history needs to be retained, needed by /// Garbage Collection. 
@@ -464,45 +461,37 @@ pub struct GcInfo { } /// Public interface functions -impl Timeline for LayeredTimeline { - fn get_ancestor_lsn(&self) -> Lsn { +impl Timeline { + //------------------------------------------------------------------------------ + // Public GET functions + //------------------------------------------------------------------------------ + + /// Get the LSN where this branch was created + pub fn get_ancestor_lsn(&self) -> Lsn { self.ancestor_lsn } - fn get_ancestor_timeline_id(&self) -> Option { + /// Get the ancestor's timeline id + pub fn get_ancestor_timeline_id(&self) -> Option { self.ancestor_timeline .as_ref() .map(LayeredTimelineEntry::timeline_id) } - /// Wait until WAL has been received up to the given LSN. - fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { - // This should never be called from the WAL receiver thread, because that could lead - // to a deadlock. - ensure!( - !IS_WAL_RECEIVER.with(|c| c.get()), - "wait_lsn called by WAL receiver thread" - ); - - self.metrics.wait_lsn_time_histo.observe_closure_duration( - || self.last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .with_context(|| { - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - }))?; - - Ok(()) + /// Lock and get timeline's GC cuttof + pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { + self.latest_gc_cutoff_lsn.read() } - fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard { - self.latest_gc_cutoff_lsn.read().unwrap() - } - - /// Look up the value with the given a key - fn get(&self, key: Key, lsn: Lsn) -> Result { + /// Look up given page version. + /// + /// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction + /// above this needs to store suitable metadata to track what data exists with + /// what keys, in separate metadata entries. 
If a non-existent key is requested, + /// the Repository implementation may incorrectly return a value from an ancestor + /// branch, for example, or waste a lot of cycles chasing the non-existing key. + /// + pub fn get(&self, key: Key, lsn: Lsn) -> Result { // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed @@ -531,68 +520,31 @@ impl Timeline for LayeredTimeline { .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) } - /// Public entry point for checkpoint(). All the logic is in the private - /// checkpoint_internal function, this public facade just wraps it for - /// metrics collection. - fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { - match cconf { - CheckpointConfig::Flush => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers(true) - } - CheckpointConfig::Forced => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers(true)?; - self.compact() - } - } - } - - /// - /// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn. - /// - fn check_lsn_is_in_scope( - &self, - lsn: Lsn, - latest_gc_cutoff_lsn: &RwLockReadGuard, - ) -> Result<()> { - ensure!( - lsn >= **latest_gc_cutoff_lsn, - "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", - lsn, - **latest_gc_cutoff_lsn, - ); - Ok(()) - } - - fn get_last_record_lsn(&self) -> Lsn { + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. + pub fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last } - fn get_prev_record_lsn(&self) -> Lsn { + pub fn get_prev_record_lsn(&self) -> Lsn { self.last_record_lsn.load().prev } - fn get_last_record_rlsn(&self) -> RecordLsn { + /// Atomically get both last and prev. 
+ pub fn get_last_record_rlsn(&self) -> RecordLsn { self.last_record_lsn.load() } - fn get_disk_consistent_lsn(&self) -> Lsn { + pub fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } - fn writer<'a>(&'a self) -> Box { - Box::new(LayeredTimelineWriter { - tl: self, - _write_guard: self.write_lock.lock().unwrap(), - }) - } - - fn get_physical_size(&self) -> u64 { + /// Get the physical size of the timeline at the latest LSN + pub fn get_physical_size(&self) -> u64 { self.metrics.current_physical_size_gauge.get() } - fn get_physical_size_non_incremental(&self) -> anyhow::Result { + /// Get the physical size of the timeline at the latest LSN non incrementally + pub fn get_physical_size_non_incremental(&self) -> anyhow::Result { let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); // total size of layer files in the current timeline directory let mut total_physical_size = 0; @@ -611,9 +563,88 @@ impl Timeline for LayeredTimeline { Ok(total_physical_size) } + + /// + /// Wait until WAL has been received and processed up to this LSN. + /// + /// You should call this before any of the other get_* or list_* functions. Calling + /// those functions with an LSN that has been processed yet is an error. + /// + pub fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + // This should never be called from the WAL receiver thread, because that could lead + // to a deadlock. + ensure!( + !IS_WAL_RECEIVER.with(|c| c.get()), + "wait_lsn called by WAL receiver thread" + ); + + self.metrics.wait_lsn_time_histo.observe_closure_duration( + || self.last_record_lsn + .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) + .with_context(|| { + format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", + lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() + ) + }))?; + + Ok(()) + } + + /// Check that it is valid to request operations with that lsn. 
+ pub fn check_lsn_is_in_scope( + &self, + lsn: Lsn, + latest_gc_cutoff_lsn: &RcuReadGuard, + ) -> Result<()> { + ensure!( + lsn >= **latest_gc_cutoff_lsn, + "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", + lsn, + **latest_gc_cutoff_lsn, + ); + Ok(()) + } + + //------------------------------------------------------------------------------ + // Public PUT functions, to update the repository with new page versions. + // + // These are called by the WAL receiver to digest WAL records. + //------------------------------------------------------------------------------ + + /// Flush to disk all data that was written with the put_* functions + /// + /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't + /// know anything about them here in the repository. + pub fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { + match cconf { + CheckpointConfig::Flush => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true) + } + CheckpointConfig::Forced => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true)?; + self.compact() + } + } + } + + /// Mutate the timeline with a [`TimelineWriter`]. + /// + /// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter + /// is a generic type in this trait. 
But that doesn't currently work in + /// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html + pub fn writer(&self) -> TimelineWriter<'_> { + TimelineWriter { + tl: self, + _write_guard: self.write_lock.lock().unwrap(), + } + } } -impl LayeredTimeline { +// Private functions +impl Timeline { fn get_checkpoint_distance(&self) -> u64 { let tenant_conf = self.tenant_conf.read().unwrap(); tenant_conf @@ -662,8 +693,8 @@ impl LayeredTimeline { tenant_id: ZTenantId, walredo_mgr: Arc, upload_layers: bool, - ) -> LayeredTimeline { - let mut result = LayeredTimeline { + ) -> Timeline { + let mut result = Timeline { conf, tenant_conf, timeline_id, @@ -699,10 +730,11 @@ impl LayeredTimeline { pitr_cutoff: Lsn(0), }), - latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), + latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), - current_logical_size: AtomicIsize::new(0), + current_logical_size: AtomicI64::new(0), + logical_size_initialized: AtomicBool::new(false), partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), repartition_threshold: 0, @@ -807,6 +839,10 @@ impl LayeredTimeline { /// /// This can be a slow operation. pub fn init_logical_size(&self) -> Result<()> { + if self.logical_size_initialized.load(AtomicOrdering::Acquire) { + return Ok(()); + } + // Try a fast-path first: // Copy logical size from ancestor timeline if there has been no changes on this // branch, and no changes on the ancestor branch since the branch point. @@ -819,8 +855,7 @@ impl LayeredTimeline { // // Logical size 0 means that it was not initialized, so don't believe that. 
if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn { - self.current_logical_size - .store(ancestor_logical_size as isize, AtomicOrdering::SeqCst); + self.set_current_logical_size(ancestor_logical_size); debug!( "logical size copied from ancestor: {}", ancestor_logical_size @@ -834,8 +869,7 @@ impl LayeredTimeline { // Have to calculate it the hard way let last_lsn = self.get_last_record_lsn(); let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?; - self.current_logical_size - .store(logical_size as isize, AtomicOrdering::SeqCst); + self.set_current_logical_size(logical_size); debug!("calculated logical size the hard way: {}", logical_size); timer.stop_and_record(); @@ -844,10 +878,10 @@ impl LayeredTimeline { /// Retrieve current logical size of the timeline /// - /// NOTE: counted incrementally, includes ancestors, - pub fn get_current_logical_size(&self) -> usize { + /// NOTE: counted incrementally, includes ancestors. + pub fn get_current_logical_size(&self) -> u64 { let current_logical_size = self.current_logical_size.load(AtomicOrdering::Acquire); - match usize::try_from(current_logical_size) { + match u64::try_from(current_logical_size) { Ok(sz) => sz, Err(_) => { error!( @@ -859,6 +893,36 @@ impl LayeredTimeline { } } + /// Update current logical size, adding `delta' to the old value. + fn update_current_logical_size(&self, delta: i64) { + let new_size = self + .current_logical_size + .fetch_add(delta, AtomicOrdering::SeqCst); + + // Also set the value in the prometheus gauge. Note that + // there is a race condition here: if this is is called by two + // threads concurrently, the prometheus gauge might be set to + // one value while current_logical_size is set to the + // other. Currently, only initialization and the WAL receiver + // updates the logical size, and they don't run concurrently, + // so it cannot happen. 
And even if it did, it wouldn't be + // very serious, the metrics would just be slightly off until + // the next update. + self.metrics.current_logical_size_gauge.set(new_size); + } + + /// Set current logical size. + fn set_current_logical_size(&self, new_size: u64) { + self.current_logical_size + .store(new_size as i64, AtomicOrdering::SeqCst); + self.logical_size_initialized + .store(true, AtomicOrdering::SeqCst); + + // Also set the value in the prometheus gauge. Same race condition + // here as in `update_current_logical_size`. + self.metrics.current_logical_size_gauge.set(new_size as i64); + } + /// /// Get a handle to a Layer for reading. /// @@ -1014,7 +1078,7 @@ impl LayeredTimeline { Some((lsn, img)) } - fn get_ancestor_timeline(&self) -> Result> { + fn get_ancestor_timeline(&self) -> Result> { let ancestor = self .ancestor_timeline .as_ref() @@ -1135,7 +1199,7 @@ impl LayeredTimeline { /// Also flush after a period of time without new data -- it helps /// safekeepers to regard pageserver as caught up and suspend activity. /// - pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { + pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { let last_lsn = self.get_last_record_lsn(); let layers = self.layers.read().unwrap(); if let Some(open_layer) = &layers.open_layer { @@ -1314,7 +1378,7 @@ impl LayeredTimeline { ondisk_prev_record_lsn, ancestor_timelineid, self.ancestor_lsn, - *self.latest_gc_cutoff_lsn.read().unwrap(), + *self.latest_gc_cutoff_lsn.read(), self.initdb_lsn, ); @@ -1969,9 +2033,21 @@ impl LayeredTimeline { let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered(); - // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. - // See branch_timeline() for details. 
- *self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff; + // We need to ensure that no one tries to read page versions or create + // branches at a point before latest_gc_cutoff_lsn. See branch_timeline() + // for details. This will block until the old value is no longer in use. + // + // The GC cutoff should only ever move forwards. + { + let write_guard = self.latest_gc_cutoff_lsn.write(); + ensure!( + *write_guard <= new_gc_cutoff, + "Cannot move GC cutoff LSN backwards (was {}, new {})", + *write_guard, + new_gc_cutoff + ); + write_guard.store(new_gc_cutoff); + } info!("GC starting"); @@ -2117,7 +2193,7 @@ impl LayeredTimeline { key: Key, request_lsn: Lsn, mut data: ValueReconstructState, - ) -> Result { + ) -> anyhow::Result { // Perform WAL redo if needed data.records.reverse(); @@ -2167,13 +2243,15 @@ impl LayeredTimeline { if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); - cache.memorize_materialized_page( - self.tenant_id, - self.timeline_id, - key, - last_rec_lsn, - &img, - ); + cache + .memorize_materialized_page( + self.tenant_id, + self.timeline_id, + key, + last_rec_lsn, + &img, + ) + .context("Materialized page memoization failed")?; } Ok(img) @@ -2208,39 +2286,50 @@ fn layer_traversal_error( Err(msg_iter.fold(err, |err, msg| err.context(msg))) } -struct LayeredTimelineWriter<'a> { - tl: &'a LayeredTimeline, +/// Various functions to mutate the timeline. +// TODO Currently, Deref is used to allow easy access to read methods from this trait. +// This is probably considered a bad practice in Rust and should be fixed eventually, +// but will cause large code changes. 
+pub struct TimelineWriter<'a> { + tl: &'a Timeline, _write_guard: MutexGuard<'a, ()>, } -impl Deref for LayeredTimelineWriter<'_> { - type Target = dyn Timeline; +impl Deref for TimelineWriter<'_> { + type Target = Timeline; fn deref(&self) -> &Self::Target { self.tl } } -impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { - fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> { +impl<'a> TimelineWriter<'a> { + /// Put a new page version that can be constructed from a WAL record + /// + /// This will implicitly extend the relation, if the page is beyond the + /// current end-of-file. + pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> { self.tl.put_value(key, lsn, value) } - fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()> { + pub fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()> { self.tl.put_tombstone(key_range, lsn) } - /// + /// Track the end of the latest digested WAL record. /// Remember the (end of) last valid WAL record remembered in the timeline. /// - fn finish_write(&self, new_lsn: Lsn) { + /// Call this after you have finished writing all the WAL up to 'lsn'. + /// + /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for + /// the 'lsn' or anything older. The previous last record LSN is stored alongside + /// the latest and can be read. 
+ pub fn finish_write(&self, new_lsn: Lsn) { self.tl.finish_write(new_lsn); } - fn update_current_logical_size(&self, delta: isize) { - self.tl - .current_logical_size - .fetch_add(delta, AtomicOrdering::SeqCst); + pub fn update_current_logical_size(&self, delta: i64) { + self.tl.update_current_logical_size(delta) } } @@ -2263,39 +2352,3 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } - -/// Save timeline metadata to file -pub fn save_metadata( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - data: &TimelineMetadata, - first_save: bool, -) -> Result<()> { - let _enter = info_span!("saving metadata").entered(); - let path = metadata_path(conf, timelineid, tenantid); - // use OpenOptions to ensure file presence is consistent with first_save - let mut file = VirtualFile::open_with_options( - &path, - OpenOptions::new().write(true).create_new(first_save), - )?; - - let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; - - if file.write(&metadata_bytes)? 
!= metadata_bytes.len() { - bail!("Could not write all the metadata bytes in a single call"); - } - file.sync_all()?; - - // fsync the parent directory to ensure the directory entry is durable - if first_save { - let timeline_dir = File::open( - &path - .parent() - .expect("Metadata should always have a parent dir"), - )?; - timeline_dir.sync_all()?; - } - - Ok(()) -} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 47fd8a84cf..06c5f552a4 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -28,8 +28,6 @@ use tracing::info; use crate::thread_mgr::ThreadKind; use metrics::{register_int_gauge_vec, IntGaugeVec}; -use pgdatadir_mapping::DatadirTimeline; - /// Current storage format version /// /// This is embedded in the metadata file, and also in the header of all the diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 818eaf1b8f..27b1400243 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -45,6 +45,7 @@ use std::{ }, }; +use anyhow::Context; use once_cell::sync::OnceCell; use tracing::error; use utils::{ @@ -342,7 +343,7 @@ impl PageCache { key: Key, lsn: Lsn, img: &[u8], - ) { + ) -> anyhow::Result<()> { let cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { tenant_id, @@ -352,7 +353,7 @@ impl PageCache { lsn, }; - match self.lock_for_write(&cache_key) { + match self.lock_for_write(&cache_key)? { WriteBufResult::Found(write_guard) => { // We already had it in cache. Another thread must've put it there // concurrently. Check that it had the same contents that we @@ -364,17 +365,19 @@ impl PageCache { write_guard.mark_valid(); } } + + Ok(()) } // Section 1.2: Public interface functions for working with Ephemeral pages. 
- pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult { + pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result { let mut cache_key = CacheKey::EphemeralPage { file_id, blkno }; self.lock_for_read(&mut cache_key) } - pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> WriteBufResult { + pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result { let cache_key = CacheKey::EphemeralPage { file_id, blkno }; self.lock_for_write(&cache_key) @@ -402,7 +405,7 @@ impl PageCache { // Section 1.3: Public interface functions for working with immutable file pages. - pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult { + pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result { let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno }; self.lock_for_read(&mut cache_key) @@ -495,15 +498,16 @@ impl PageCache { /// } /// ``` /// - fn lock_for_read(&self, cache_key: &mut CacheKey) -> ReadBufResult { + fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result { loop { // First check if the key already exists in the cache. if let Some(read_guard) = self.try_lock_for_read(cache_key) { - return ReadBufResult::Found(read_guard); + return Ok(ReadBufResult::Found(read_guard)); } // Not found. Find a victim buffer - let (slot_idx, mut inner) = self.find_victim(); + let (slot_idx, mut inner) = + self.find_victim().context("Failed to find evict victim")?; // Insert mapping for this. At this point, we may find that another // thread did the same thing concurrently. 
In that case, we evicted @@ -526,10 +530,10 @@ impl PageCache { inner.dirty = false; slot.usage_count.store(1, Ordering::Relaxed); - return ReadBufResult::NotFound(PageWriteGuard { + return Ok(ReadBufResult::NotFound(PageWriteGuard { inner, valid: false, - }); + })); } } @@ -556,15 +560,16 @@ impl PageCache { /// /// Similar to lock_for_read(), but the returned buffer is write-locked and /// may be modified by the caller even if it's already found in the cache. - fn lock_for_write(&self, cache_key: &CacheKey) -> WriteBufResult { + fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result { loop { // First check if the key already exists in the cache. if let Some(write_guard) = self.try_lock_for_write(cache_key) { - return WriteBufResult::Found(write_guard); + return Ok(WriteBufResult::Found(write_guard)); } // Not found. Find a victim buffer - let (slot_idx, mut inner) = self.find_victim(); + let (slot_idx, mut inner) = + self.find_victim().context("Failed to find evict victim")?; // Insert mapping for this. At this point, we may find that another // thread did the same thing concurrently. In that case, we evicted @@ -587,10 +592,10 @@ impl PageCache { inner.dirty = false; slot.usage_count.store(1, Ordering::Relaxed); - return WriteBufResult::NotFound(PageWriteGuard { + return Ok(WriteBufResult::NotFound(PageWriteGuard { inner, valid: false, - }); + })); } } @@ -754,7 +759,7 @@ impl PageCache { /// Find a slot to evict. /// /// On return, the slot is empty and write-locked. 
- fn find_victim(&self) -> (usize, RwLockWriteGuard) { + fn find_victim(&self) -> anyhow::Result<(usize, RwLockWriteGuard)> { let iter_limit = self.slots.len() * 10; let mut iters = 0; loop { @@ -767,7 +772,7 @@ impl PageCache { let mut inner = match slot.inner.try_write() { Ok(inner) => inner, Err(TryLockError::Poisoned(err)) => { - panic!("buffer lock was poisoned: {:?}", err) + anyhow::bail!("buffer lock was poisoned: {err:?}") } Err(TryLockError::WouldBlock) => { // If we have looped through the whole buffer pool 10 times @@ -777,7 +782,7 @@ impl PageCache { // there are buffers in the pool. In practice, with a reasonably // large buffer pool it really shouldn't happen. if iters > iter_limit { - panic!("could not find a victim buffer to evict"); + anyhow::bail!("exceeded evict iter limit"); } continue; } @@ -804,7 +809,7 @@ impl PageCache { inner.dirty = false; inner.key = None; } - return (slot_idx, inner); + return Ok((slot_idx, inner)); } } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b63bb90be1..d59a82d488 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -17,24 +17,24 @@ use std::io::{self, Read}; use std::net::TcpListener; use std::str; use std::str::FromStr; -use std::sync::{Arc, RwLockReadGuard}; +use std::sync::Arc; use tracing::*; use utils::{ auth::{self, Claims, JwtAuth, Scope}, lsn::Lsn, postgres_backend::{self, is_socket_read_timed_out, AuthType, PostgresBackend}, pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, + simple_rcu::RcuReadGuard, zid::{ZTenantId, ZTimelineId}, }; use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; -use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp}; +use crate::layered_repository::Timeline; +use crate::pgdatadir_mapping::LsnForTimestamp; use crate::profiling::profpoint_start; use crate::reltag::RelTag; -use 
crate::repository::Repository; -use crate::repository::Timeline; use crate::tenant_mgr; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; @@ -495,22 +495,22 @@ impl PageServerHandler { PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME .with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]) .observe_closure_duration(|| { - self.handle_get_rel_exists_request(timeline.as_ref(), &req) + self.handle_get_rel_exists_request(&timeline, &req) }), PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME .with_label_values(&["get_rel_size", &tenant_id, &timeline_id]) .observe_closure_duration(|| { - self.handle_get_nblocks_request(timeline.as_ref(), &req) + self.handle_get_nblocks_request(&timeline, &req) }), PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME .with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]) .observe_closure_duration(|| { - self.handle_get_page_at_lsn_request(timeline.as_ref(), &req) + self.handle_get_page_at_lsn_request(&timeline, &req) }), PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME .with_label_values(&["get_db_size", &tenant_id, &timeline_id]) .observe_closure_duration(|| { - self.handle_db_size_request(timeline.as_ref(), &req) + self.handle_db_size_request(&timeline, &req) }), }; @@ -636,11 +636,11 @@ impl PageServerHandler { /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. - fn wait_or_get_last_lsn( - timeline: &T, + fn wait_or_get_last_lsn( + timeline: &Timeline, mut lsn: Lsn, latest: bool, - latest_gc_cutoff_lsn: &RwLockReadGuard, + latest_gc_cutoff_lsn: &RcuReadGuard, ) -> Result { if latest { // Latest page version was requested. 
If LSN is given, it is a hint @@ -684,9 +684,9 @@ impl PageServerHandler { Ok(lsn) } - fn handle_get_rel_exists_request( + fn handle_get_rel_exists_request( &self, - timeline: &T, + timeline: &Timeline, req: &PagestreamExistsRequest, ) -> Result { let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); @@ -701,9 +701,9 @@ impl PageServerHandler { })) } - fn handle_get_nblocks_request( + fn handle_get_nblocks_request( &self, - timeline: &T, + timeline: &Timeline, req: &PagestreamNblocksRequest, ) -> Result { let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered(); @@ -717,9 +717,9 @@ impl PageServerHandler { })) } - fn handle_db_size_request( + fn handle_db_size_request( &self, - timeline: &T, + timeline: &Timeline, req: &PagestreamDbSizeRequest, ) -> Result { let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered(); @@ -735,9 +735,9 @@ impl PageServerHandler { })) } - fn handle_get_page_at_lsn_request( + fn handle_get_page_at_lsn_request( &self, - timeline: &T, + timeline: &Timeline, req: &PagestreamGetPageRequest, ) -> Result { let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) @@ -745,7 +745,7 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; /* - // Add a 1s delay to some requests. The delayed causes the requests to + // Add a 1s delay to some requests. The delay helps the requests to // hit the race condition from github issue #1047 more easily. use rand::Rng; if rand::thread_rng().gen::() < 5 { @@ -1077,7 +1077,7 @@ impl postgres_backend::Handler for PageServerHandler { .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("do_gc ") { // Run GC immediately on given timeline. - // FIXME: This is just for tests. See test_runner/batch_others/test_gc.py. 
+ // FIXME: This is just for tests. See test_runner/regress/test_gc.py. // This probably should require special authentication or a global flag to // enable, I don't think we want to or need to allow regular clients to invoke // GC. diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 88fac0ad5a..0f0bb1ed53 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,8 +7,8 @@ //! Clarify that) //! use crate::keyspace::{KeySpace, KeySpaceAccum}; +use crate::layered_repository::Timeline; use crate::reltag::{RelTag, SlruKind}; -use crate::repository::Timeline; use crate::repository::*; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; @@ -18,7 +18,7 @@ use postgres_ffi::v14::xlog_utils::TimestampTz; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, HashSet}; +use std::collections::{hash_map, HashMap, HashSet}; use std::ops::Range; use tracing::{debug, trace, warn}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -35,23 +35,13 @@ pub enum LsnForTimestamp { } /// -/// This trait provides all the functionality to store PostgreSQL relations, SLRUs, +/// This impl provides all the functionality to store PostgreSQL relations, SLRUs, /// and other special kinds of files, in a versioned key-value store. The -/// Timeline trait provides the key-value store. +/// Timeline struct provides the key-value store. /// -/// This is a trait, so that we can easily include all these functions in a Timeline -/// implementation. You're not expected to have different implementations of this trait, -/// rather, this provides an interface and implementation, over Timeline. -/// -/// If you wanted to store other kinds of data in the Neon repository, e.g. -/// flat files or MySQL, you would create a new trait like this, with all the -/// functions that make sense for the kind of data you're storing. 
For flat files, -/// for example, you might have a function like "fn read(path, offset, size)". -/// We might also have that situation in the future, to support multiple PostgreSQL -/// versions, if there are big changes in how the data is organized in the data -/// directory, or if new special files are introduced. -/// -pub trait DatadirTimeline: Timeline { +/// This is a separate impl, so that we can easily include all these functions in a Timeline +/// implementation, and might be moved into a separate struct later. +impl Timeline { /// Start ingesting a WAL record, or other atomic modification of /// the timeline. /// @@ -75,7 +65,7 @@ pub trait DatadirTimeline: Timeline { /// functions of the timeline until you finish! And if you update the /// same page twice, the last update wins. /// - fn begin_modification(&self, lsn: Lsn) -> DatadirModification + pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification where Self: Sized, { @@ -93,7 +83,7 @@ pub trait DatadirTimeline: Timeline { //------------------------------------------------------------------------------ /// Look up given page version. 
- fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { + pub fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); let nblocks = self.get_rel_size(tag, lsn)?; @@ -110,7 +100,7 @@ pub trait DatadirTimeline: Timeline { } // Get size of a database in blocks - fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { let mut total_blocks = 0; let rels = self.list_rels(spcnode, dbnode, lsn)?; @@ -123,7 +113,7 @@ pub trait DatadirTimeline: Timeline { } /// Get size of a relation file - fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { + pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { @@ -151,7 +141,7 @@ pub trait DatadirTimeline: Timeline { } /// Does relation exist? - fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { + pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); // first try to lookup relation in cache @@ -169,7 +159,7 @@ pub trait DatadirTimeline: Timeline { } /// Get a list of all existing relations in given tablespace and database. - fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { + pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); let buf = self.get(key, lsn)?; @@ -187,7 +177,7 @@ pub trait DatadirTimeline: Timeline { } /// Look up given SLRU page version. 
- fn get_slru_page_at_lsn( + pub fn get_slru_page_at_lsn( &self, kind: SlruKind, segno: u32, @@ -199,14 +189,19 @@ pub trait DatadirTimeline: Timeline { } /// Get size of an SLRU segment - fn get_slru_segment_size(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { + pub fn get_slru_segment_size( + &self, + kind: SlruKind, + segno: u32, + lsn: Lsn, + ) -> Result { let key = slru_segment_size_to_key(kind, segno); let mut buf = self.get(key, lsn)?; Ok(buf.get_u32_le()) } /// Get size of an SLRU segment - fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { + pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); let buf = self.get(key, lsn)?; @@ -223,7 +218,7 @@ pub trait DatadirTimeline: Timeline { /// so it's not well defined which LSN you get if there were multiple commits /// "in flight" at that point in time. /// - fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { + pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; let max_lsn = self.get_last_record_lsn(); @@ -286,7 +281,7 @@ pub trait DatadirTimeline: Timeline { /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits /// with a smaller/larger timestamp. 
/// - fn is_latest_commit_timestamp_ge_than( + pub fn is_latest_commit_timestamp_ge_than( &self, search_timestamp: TimestampTz, probe_lsn: Lsn, @@ -317,7 +312,7 @@ pub trait DatadirTimeline: Timeline { } /// Get a list of SLRU segments - fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { + pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { // fetch directory entry let key = slru_dir_to_key(kind); @@ -327,14 +322,14 @@ pub trait DatadirTimeline: Timeline { Ok(dir.segments) } - fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { let key = relmap_file_key(spcnode, dbnode); let buf = self.get(key, lsn)?; Ok(buf) } - fn list_dbdirs(&self, lsn: Lsn) -> Result> { + pub fn list_dbdirs(&self, lsn: Lsn) -> Result> { // fetch directory entry let buf = self.get(DBDIR_KEY, lsn)?; let dir = DbDirectory::des(&buf)?; @@ -342,13 +337,13 @@ pub trait DatadirTimeline: Timeline { Ok(dir.dbdirs) } - fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { + pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { let key = twophase_file_key(xid); let buf = self.get(key, lsn)?; Ok(buf) } - fn list_twophase_files(&self, lsn: Lsn) -> Result> { + pub fn list_twophase_files(&self, lsn: Lsn) -> Result> { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn)?; let dir = TwoPhaseDirectory::des(&buf)?; @@ -356,11 +351,11 @@ pub trait DatadirTimeline: Timeline { Ok(dir.xids) } - fn get_control_file(&self, lsn: Lsn) -> Result { + pub fn get_control_file(&self, lsn: Lsn) -> Result { self.get(CONTROLFILE_KEY, lsn) } - fn get_checkpoint(&self, lsn: Lsn) -> Result { + pub fn get_checkpoint(&self, lsn: Lsn) -> Result { self.get(CHECKPOINT_KEY, lsn) } @@ -369,29 +364,29 @@ pub trait DatadirTimeline: Timeline { /// /// Only relation blocks are counted currently. That excludes metadata, /// SLRUs, twophase files etc. 
- fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { + pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn)?; let dbdir = DbDirectory::des(&buf)?; - let mut total_size: usize = 0; + let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self.list_rels(*spcnode, *dbnode, lsn)? { let relsize_key = rel_size_to_key(rel); let mut buf = self.get(relsize_key, lsn)?; let relsize = buf.get_u32_le(); - total_size += relsize as usize; + total_size += relsize as u64; } } - Ok(total_size * BLCKSZ as usize) + Ok(total_size * BLCKSZ as u64) } /// /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). - fn collect_keyspace(&self, lsn: Lsn) -> Result { + pub fn collect_keyspace(&self, lsn: Lsn) -> Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -465,27 +460,54 @@ pub trait DatadirTimeline: Timeline { } /// Get cached size of relation if it not updated after specified LSN - fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option; + pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { + let rel_size_cache = self.rel_size_cache.read().unwrap(); + if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { + if lsn >= *cached_lsn { + return Some(*nblocks); + } + } + None + } /// Update cached relation size if there is no more recent update - fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber); + pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { + let mut rel_size_cache = self.rel_size_cache.write().unwrap(); + match rel_size_cache.entry(tag) { + hash_map::Entry::Occupied(mut entry) => { + let cached_lsn = entry.get_mut(); + if lsn >= cached_lsn.0 
{ + *cached_lsn = (lsn, nblocks); + } + } + hash_map::Entry::Vacant(entry) => { + entry.insert((lsn, nblocks)); + } + } + } /// Store cached relation size - fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber); + pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { + let mut rel_size_cache = self.rel_size_cache.write().unwrap(); + rel_size_cache.insert(tag, (lsn, nblocks)); + } /// Remove cached relation size - fn remove_cached_rel_size(&self, tag: &RelTag); + pub fn remove_cached_rel_size(&self, tag: &RelTag) { + let mut rel_size_cache = self.rel_size_cache.write().unwrap(); + rel_size_cache.remove(tag); + } } /// DatadirModification represents an operation to ingest an atomic set of /// updates to the repository. It is created by the 'begin_record' /// function. It is called for each WAL record, so that all the modifications /// by a one WAL record appear atomic. -pub struct DatadirModification<'a, T: DatadirTimeline> { +pub struct DatadirModification<'a> { /// The timeline this modification applies to. You can access this to /// read the state, but note that any pending updates are *not* reflected /// in the state in 'tline' yet. - pub tline: &'a T, + pub tline: &'a Timeline, /// Lsn assigned by begin_modification pub lsn: Lsn, @@ -495,10 +517,10 @@ pub struct DatadirModification<'a, T: DatadirTimeline> { // underlying key-value store by the 'finish' function. pending_updates: HashMap, pending_deletions: Vec>, - pending_nblocks: isize, + pending_nblocks: i64, } -impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { +impl<'a> DatadirModification<'a> { /// Initialize a completely new repository. /// /// This inserts the directory metadata entries that are assumed to @@ -654,7 +676,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { } // Update logical database size. 
- self.pending_nblocks -= total_blocks as isize; + self.pending_nblocks -= total_blocks as i64; // Delete all relations and metadata files for the spcnode/dnode self.delete(dbdir_key_range(spcnode, dbnode)); @@ -697,7 +719,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { let buf = nblocks.to_le_bytes(); self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); - self.pending_nblocks += nblocks as isize; + self.pending_nblocks += nblocks as i64; // Update relation size cache self.tline.set_cached_rel_size(rel, self.lsn, nblocks); @@ -727,7 +749,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { self.tline.set_cached_rel_size(rel, self.lsn, nblocks); // Update logical database size. - self.pending_nblocks -= old_size as isize - nblocks as isize; + self.pending_nblocks -= old_size as i64 - nblocks as i64; } Ok(()) } @@ -749,7 +771,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { // Update relation size cache self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - self.pending_nblocks += nblocks as isize - old_size as isize; + self.pending_nblocks += nblocks as i64 - old_size as i64; } Ok(()) } @@ -772,7 +794,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { // update logical size let size_key = rel_size_to_key(rel); let old_size = self.get(size_key)?.get_u32_le(); - self.pending_nblocks -= old_size as isize; + self.pending_nblocks -= old_size as i64; // Remove enty from relation size cache self.tline.remove_cached_rel_size(&rel); @@ -914,7 +936,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { result?; if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize); + writer.update_current_logical_size(pending_nblocks * BLCKSZ as i64); self.pending_nblocks = 0; } @@ -942,7 +964,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { writer.finish_write(lsn); if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize); + 
writer.update_current_logical_size(pending_nblocks * BLCKSZ as i64); } Ok(()) @@ -1368,10 +1390,10 @@ fn is_slru_block_key(key: Key) -> bool { // #[cfg(test)] -pub fn create_test_timeline( - repo: R, +pub fn create_test_timeline( + repo: &crate::layered_repository::Repository, timeline_id: utils::zid::ZTimelineId, -) -> Result> { +) -> Result> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index d09b01437c..e46a39436d 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,19 +1,13 @@ use crate::layered_repository::metadata::TimelineMetadata; -use crate::storage_sync::index::RemoteIndex; use crate::walrecord::ZenithWalRecord; -use crate::CheckpointConfig; use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::fmt; use std::ops::{AddAssign, Range}; -use std::sync::{Arc, RwLockReadGuard}; +use std::sync::Arc; use std::time::Duration; -use utils::{ - lsn::{Lsn, RecordLsn}, - zid::ZTimelineId, -}; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] /// Key used in the Repository kv-store. @@ -181,78 +175,6 @@ impl Value { } } -/// -/// A repository corresponds to one .neon directory. One repository holds multiple -/// timelines, forked off from the same initial call to 'initdb'. -pub trait Repository: Send + Sync { - type Timeline: crate::DatadirTimeline; - - /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. - /// See [`crate::remote_storage`] for more details about the synchronization. - fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; - - /// Get Timeline handle for given zenith timeline ID. - /// This function is idempotent. It doesn't change internal state in any way. 
- fn get_timeline(&self, timelineid: ZTimelineId) -> Option>; - - /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. - fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result>; - - /// Lists timelines the repository contains. - /// Up to repository's implementation to omit certain timelines that ar not considered ready for use. - fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline)>; - - /// Create a new, empty timeline. The caller is responsible for loading data into it - /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. - fn create_empty_timeline( - &self, - timeline_id: ZTimelineId, - initdb_lsn: Lsn, - ) -> Result>; - - /// Branch a timeline - fn branch_timeline( - &self, - src: ZTimelineId, - dst: ZTimelineId, - start_lsn: Option, - ) -> Result<()>; - - /// Flush all data to disk. - /// - /// this is used at graceful shutdown. - fn checkpoint(&self) -> Result<()>; - - /// perform one garbage collection iteration, removing old data files from disk. - /// this function is periodically called by gc thread. - /// also it can be explicitly requested through page server api 'do_gc' command. - /// - /// 'timelineid' specifies the timeline to GC, or None for all. - /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). - /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC - /// to make tests more deterministic. - /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? - fn gc_iteration( - &self, - timelineid: Option, - horizon: u64, - pitr: Duration, - checkpoint_before_gc: bool, - ) -> Result; - - /// Perform one compaction iteration. - /// This function is periodically called by compactor thread. - /// Also it can be explicitly requested per timeline through page server - /// api's 'compact' command. 
- fn compaction_iteration(&self) -> Result<()>; - - /// removes timeline-related in-memory data - fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()>; - - /// Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn. - fn get_remote_index(&self) -> &RemoteIndex; -} - /// A timeline, that belongs to the current repository. pub enum RepositoryTimeline { /// Timeline, with its files present locally in pageserver's working directory. @@ -304,621 +226,3 @@ impl AddAssign for GcResult { self.elapsed += other.elapsed; } } - -pub trait Timeline: Send + Sync { - //------------------------------------------------------------------------------ - // Public GET functions - //------------------------------------------------------------------------------ - - /// - /// Wait until WAL has been received and processed up to this LSN. - /// - /// You should call this before any of the other get_* or list_* functions. Calling - /// those functions with an LSN that has been processed yet is an error. - /// - fn wait_lsn(&self, lsn: Lsn) -> Result<()>; - - /// Lock and get timeline's GC cuttof - fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard; - - /// Look up given page version. - /// - /// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction - /// above this needs to store suitable metadata to track what data exists with - /// what keys, in separate metadata entries. If a non-existent key is requested, - /// the Repository implementation may incorrectly return a value from an ancestor - /// branch, for example, or waste a lot of cycles chasing the non-existing key. 
- /// - fn get(&self, key: Key, lsn: Lsn) -> Result; - - /// Get the ancestor's timeline id - fn get_ancestor_timeline_id(&self) -> Option; - - /// Get the LSN where this branch was created - fn get_ancestor_lsn(&self) -> Lsn; - - //------------------------------------------------------------------------------ - // Public PUT functions, to update the repository with new page versions. - // - // These are called by the WAL receiver to digest WAL records. - //------------------------------------------------------------------------------ - /// Atomically get both last and prev. - fn get_last_record_rlsn(&self) -> RecordLsn; - - /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. - fn get_last_record_lsn(&self) -> Lsn; - - fn get_prev_record_lsn(&self) -> Lsn; - - fn get_disk_consistent_lsn(&self) -> Lsn; - - /// Mutate the timeline with a [`TimelineWriter`]. - /// - /// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter - /// is a generic type in this trait. But that doesn't currently work in - /// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html - fn writer<'a>(&'a self) -> Box; - - /// - /// Flush to disk all data that was written with the put_* functions - /// - /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't - /// know anything about them here in the repository. - fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>; - - /// - /// Check that it is valid to request operations with that lsn. - fn check_lsn_is_in_scope( - &self, - lsn: Lsn, - latest_gc_cutoff_lsn: &RwLockReadGuard, - ) -> Result<()>; - - /// Get the physical size of the timeline at the latest LSN - fn get_physical_size(&self) -> u64; - /// Get the physical size of the timeline at the latest LSN non incrementally - fn get_physical_size_non_incremental(&self) -> Result; -} - -/// Various functions to mutate the timeline. 
-// TODO Currently, Deref is used to allow easy access to read methods from this trait. -// This is probably considered a bad practice in Rust and should be fixed eventually, -// but will cause large code changes. -pub trait TimelineWriter<'a> { - /// Put a new page version that can be constructed from a WAL record - /// - /// This will implicitly extend the relation, if the page is beyond the - /// current end-of-file. - fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()>; - - fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()>; - - /// Track the end of the latest digested WAL record. - /// - /// Call this after you have finished writing all the WAL up to 'lsn'. - /// - /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for - /// the 'lsn' or anything older. The previous last record LSN is stored alongside - /// the latest and can be read. - fn finish_write(&self, lsn: Lsn); - - fn update_current_logical_size(&self, delta: isize); -} - -#[cfg(test)] -pub mod repo_harness { - use bytes::BytesMut; - use once_cell::sync::Lazy; - use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; - use std::{fs, path::PathBuf}; - - use crate::{ - config::PageServerConf, - layered_repository::LayeredRepository, - walredo::{WalRedoError, WalRedoManager}, - }; - - use super::*; - use crate::tenant_config::{TenantConf, TenantConfOpt}; - use hex_literal::hex; - use utils::zid::ZTenantId; - - pub const TIMELINE_ID: ZTimelineId = - ZTimelineId::from_array(hex!("11223344556677881122334455667788")); - pub const NEW_TIMELINE_ID: ZTimelineId = - ZTimelineId::from_array(hex!("AA223344556677881122334455667788")); - - /// Convenience function to create a page image with given string as the only content - #[allow(non_snake_case)] - pub fn TEST_IMG(s: &str) -> Bytes { - let mut buf = BytesMut::new(); - buf.extend_from_slice(s.as_bytes()); - buf.resize(64, 0); - - buf.freeze() - } - - static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); - - impl From 
for TenantConfOpt { - fn from(tenant_conf: TenantConf) -> Self { - Self { - checkpoint_distance: Some(tenant_conf.checkpoint_distance), - checkpoint_timeout: Some(tenant_conf.checkpoint_timeout), - compaction_target_size: Some(tenant_conf.compaction_target_size), - compaction_period: Some(tenant_conf.compaction_period), - compaction_threshold: Some(tenant_conf.compaction_threshold), - gc_horizon: Some(tenant_conf.gc_horizon), - gc_period: Some(tenant_conf.gc_period), - image_creation_threshold: Some(tenant_conf.image_creation_threshold), - pitr_interval: Some(tenant_conf.pitr_interval), - walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), - lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), - max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), - } - } - } - - pub struct RepoHarness<'a> { - pub conf: &'static PageServerConf, - pub tenant_conf: TenantConf, - pub tenant_id: ZTenantId, - - pub lock_guard: ( - Option>, - Option>, - ), - } - - impl<'a> RepoHarness<'a> { - pub fn create(test_name: &'static str) -> Result { - Self::create_internal(test_name, false) - } - pub fn create_exclusive(test_name: &'static str) -> Result { - Self::create_internal(test_name, true) - } - fn create_internal(test_name: &'static str, exclusive: bool) -> Result { - let lock_guard = if exclusive { - (None, Some(LOCK.write().unwrap())) - } else { - (Some(LOCK.read().unwrap()), None) - }; - - let repo_dir = PageServerConf::test_repo_dir(test_name); - let _ = fs::remove_dir_all(&repo_dir); - fs::create_dir_all(&repo_dir)?; - - let conf = PageServerConf::dummy_conf(repo_dir); - // Make a static copy of the config. This can never be free'd, but that's - // OK in a test. 
- let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - - let tenant_conf = TenantConf::dummy_conf(); - - let tenant_id = ZTenantId::generate(); - fs::create_dir_all(conf.tenant_path(&tenant_id))?; - fs::create_dir_all(conf.timelines_path(&tenant_id))?; - - Ok(Self { - conf, - tenant_conf, - tenant_id, - lock_guard, - }) - } - - pub fn load(&self) -> LayeredRepository { - self.try_load().expect("failed to load test repo") - } - - pub fn try_load(&self) -> Result { - let walredo_mgr = Arc::new(TestRedoManager); - - let repo = LayeredRepository::new( - self.conf, - TenantConfOpt::from(self.tenant_conf), - walredo_mgr, - self.tenant_id, - RemoteIndex::default(), - false, - ); - // populate repo with locally available timelines - for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) - .expect("should be able to read timelines dir") - { - let timeline_dir_entry = timeline_dir_entry.unwrap(); - let timeline_id: ZTimelineId = timeline_dir_entry - .path() - .file_name() - .unwrap() - .to_string_lossy() - .parse() - .unwrap(); - - repo.attach_timeline(timeline_id)?; - } - - Ok(repo) - } - - pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { - self.conf.timeline_path(timeline_id, &self.tenant_id) - } - } - - // Mock WAL redo manager that doesn't do much - pub struct TestRedoManager; - - impl WalRedoManager for TestRedoManager { - fn request_redo( - &self, - key: Key, - lsn: Lsn, - base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, - ) -> Result { - let s = format!( - "redo for {} to get to {}, with {} and {} records", - key, - lsn, - if base_img.is_some() { - "base image" - } else { - "no base image" - }, - records.len() - ); - println!("{}", s); - - Ok(TEST_IMG(&s)) - } - } -} - -/// -/// Tests that should work the same with any Repository/Timeline implementation. 
-/// -#[allow(clippy::bool_assert_comparison)] -#[cfg(test)] -mod tests { - use super::repo_harness::*; - use super::*; - //use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT}; - //use std::sync::Arc; - use bytes::BytesMut; - use hex_literal::hex; - use once_cell::sync::Lazy; - - static TEST_KEY: Lazy = - Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001"))); - - #[test] - fn test_basic() -> Result<()> { - let repo = RepoHarness::create("test_basic")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - let writer = tline.writer(); - writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; - writer.finish_write(Lsn(0x10)); - drop(writer); - - let writer = tline.writer(); - writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; - writer.finish_write(Lsn(0x20)); - drop(writer); - - assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); - - Ok(()) - } - - #[test] - fn no_duplicate_timelines() -> Result<()> { - let repo = RepoHarness::create("no_duplicate_timelines")?.load(); - let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) { - Ok(_) => panic!("duplicate timeline creation should fail"), - Err(e) => assert_eq!(e.to_string(), "Timeline already exists"), - } - - Ok(()) - } - - /// Convenience function to create a page image with given string as the only content - pub fn test_value(s: &str) -> Value { - let mut buf = BytesMut::new(); - buf.extend_from_slice(s.as_bytes()); - Value::Image(buf.freeze()) - } - - /// - /// Test branch creation - /// - #[test] - fn test_branch() -> Result<()> { - let repo = RepoHarness::create("test_branch")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - use 
std::str::from_utf8; - - #[allow(non_snake_case)] - let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); - #[allow(non_snake_case)] - let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap(); - - // Insert a value on the timeline - writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?; - writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?; - writer.finish_write(Lsn(0x20)); - - writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?; - writer.finish_write(Lsn(0x30)); - writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?; - writer.finish_write(Lsn(0x40)); - - //assert_current_logical_size(&tline, Lsn(0x40)); - - // Branch the history, modify relation differently on the new timeline - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; - let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) - .expect("Should have a local timeline"); - let new_writer = newtline.writer(); - new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?; - new_writer.finish_write(Lsn(0x40)); - - // Check page contents on both branches - assert_eq!( - from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?, - "foo at 0x40" - ); - assert_eq!( - from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?, - "bar at 0x40" - ); - assert_eq!( - from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?, - "foobar at 0x20" - ); - - //assert_current_logical_size(&tline, Lsn(0x40)); - - Ok(()) - } - - fn make_some_layers(tline: &T, start_lsn: Lsn) -> Result<()> { - let mut lsn = start_lsn; - #[allow(non_snake_case)] - { - let writer = tline.writer(); - // Create a relation on the timeline - writer.put( - *TEST_KEY, - lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), - )?; - writer.finish_write(lsn); - lsn += 0x10; - writer.put( - *TEST_KEY, - lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), - )?; - writer.finish_write(lsn); - lsn += 0x10; - } - 
tline.checkpoint(CheckpointConfig::Forced)?; - { - let writer = tline.writer(); - writer.put( - *TEST_KEY, - lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), - )?; - writer.finish_write(lsn); - lsn += 0x10; - writer.put( - *TEST_KEY, - lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), - )?; - writer.finish_write(lsn); - } - tline.checkpoint(CheckpointConfig::Forced) - } - - #[test] - fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { - let repo = - RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(tline.as_ref(), Lsn(0x20))?; - - // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - // FIXME: this doesn't actually remove any layer currently, given how the checkpointing - // and compaction works. But it does set the 'cutoff' point so that the cross check - // below should fail. 
- repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; - - // try to branch at lsn 25, should fail because we already garbage collected the data - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { - Ok(_) => panic!("branching should have failed"), - Err(err) => { - assert!(err.to_string().contains("invalid branch start lsn")); - assert!(err - .source() - .unwrap() - .to_string() - .contains("we might've already garbage collected needed data")) - } - } - - Ok(()) - } - - #[test] - fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> { - let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - - repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; - // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { - Ok(_) => panic!("branching should have failed"), - Err(err) => { - assert!(&err.to_string().contains("invalid branch start lsn")); - assert!(&err - .source() - .unwrap() - .to_string() - .contains("is earlier than latest GC horizon")); - } - } - - Ok(()) - } - - /* - // FIXME: This currently fails to error out. Calling GC doesn't currently - // remove the old value, we'd need to work a little harder - #[test] - fn test_prohibit_get_for_garbage_collected_data() -> Result<()> { - let repo = - RepoHarness::create("test_prohibit_get_for_garbage_collected_data")? 
- .load(); - - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(tline.as_ref(), Lsn(0x20))?; - - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; - let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); - assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); - match tline.get(*TEST_KEY, Lsn(0x25)) { - Ok(_) => panic!("request for page should have failed"), - Err(err) => assert!(err.to_string().contains("not found at")), - } - Ok(()) - } - */ - - #[test] - fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { - let repo = - RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(tline.as_ref(), Lsn(0x20))?; - - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) - .expect("Should have a local timeline"); - // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; - assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); - - Ok(()) - } - #[test] - fn test_parent_keeps_data_forever_after_branching() -> Result<()> { - let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(tline.as_ref(), Lsn(0x20))?; - - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) - .expect("Should have a local timeline"); - - make_some_layers(newtline.as_ref(), Lsn(0x60))?; - - // run gc on parent - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; - - // Check that the data is still accessible on the branch. 
- assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50))?, - TEST_IMG(&format!("foo at {}", Lsn(0x40))) - ); - - Ok(()) - } - - #[test] - fn timeline_load() -> Result<()> { - const TEST_NAME: &str = "timeline_load"; - let harness = RepoHarness::create(TEST_NAME)?; - { - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; - make_some_layers(tline.as_ref(), Lsn(0x8000))?; - tline.checkpoint(CheckpointConfig::Forced)?; - } - - let repo = harness.load(); - let tline = repo - .get_timeline(TIMELINE_ID) - .expect("cannot load timeline"); - assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); - - assert!(repo.get_timeline_load(TIMELINE_ID).is_ok()); - - let tline = repo - .get_timeline(TIMELINE_ID) - .expect("cannot load timeline"); - assert!(matches!(tline, RepositoryTimeline::Loaded(_))); - - Ok(()) - } - - #[test] - fn timeline_load_with_ancestor() -> Result<()> { - const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = RepoHarness::create(TEST_NAME)?; - // create two timelines - { - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - make_some_layers(tline.as_ref(), Lsn(0x20))?; - tline.checkpoint(CheckpointConfig::Forced)?; - - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; - - let newtline = repo - .get_timeline_load(NEW_TIMELINE_ID) - .expect("Should have a local timeline"); - - make_some_layers(newtline.as_ref(), Lsn(0x60))?; - tline.checkpoint(CheckpointConfig::Forced)?; - } - - // check that both of them are initially unloaded - let repo = harness.load(); - { - let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Unloaded { .. })); - - let tline = repo - .get_timeline(NEW_TIMELINE_ID) - .expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Unloaded { .. 
})); - } - // load only child timeline - let _ = repo - .get_timeline_load(NEW_TIMELINE_ID) - .expect("cannot load timeline"); - - // check that both, child and ancestor are loaded - let tline = repo - .get_timeline(NEW_TIMELINE_ID) - .expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Loaded(_))); - - let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline"); - assert!(matches!(tline, RepositoryTimeline::Loaded(_))); - - Ok(()) - } -} diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 15f24d7e24..a52cde7286 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -156,7 +156,7 @@ use std::{ use anyhow::{anyhow, bail, Context}; use futures::stream::{FuturesUnordered, StreamExt}; use once_cell::sync::{Lazy, OnceCell}; -use remote_storage::{GenericRemoteStorage, RemoteStorage}; +use remote_storage::GenericRemoteStorage; use tokio::{ fs, runtime::Runtime, @@ -253,36 +253,20 @@ pub struct SyncStartupData { /// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states. pub fn start_local_timeline_sync( config: &'static PageServerConf, + storage: Option>, ) -> anyhow::Result { let local_timeline_files = local_tenant_timeline_files(config) .context("Failed to collect local tenant timeline files")?; - match config.remote_storage_config.as_ref() { - Some(storage_config) => { - match GenericRemoteStorage::new(config.workdir.clone(), storage_config) - .context("Failed to init the generic remote storage")? 
- { - GenericRemoteStorage::Local(local_fs_storage) => { - storage_sync::spawn_storage_sync_thread( - config, - local_timeline_files, - local_fs_storage, - storage_config.max_concurrent_syncs, - storage_config.max_sync_errors, - ) - } - GenericRemoteStorage::S3(s3_bucket_storage) => { - storage_sync::spawn_storage_sync_thread( - config, - local_timeline_files, - s3_bucket_storage, - storage_config.max_concurrent_syncs, - storage_config.max_sync_errors, - ) - } - } - .context("Failed to spawn the storage sync thread") - } + match storage.zip(config.remote_storage_config.as_ref()) { + Some((storage, storage_config)) => storage_sync::spawn_storage_sync_thread( + config, + local_timeline_files, + storage, + storage_config.max_concurrent_syncs, + storage_config.max_sync_errors, + ) + .context("Failed to spawn the storage sync thread"), None => { info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new(); @@ -810,17 +794,13 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { /// Launch a thread to perform remote storage sync tasks. /// See module docs for loop step description. 
-pub(super) fn spawn_storage_sync_thread( +pub(super) fn spawn_storage_sync_thread( conf: &'static PageServerConf, local_timeline_files: HashMap)>, - storage: S, + storage: Arc, max_concurrent_timelines_sync: NonZeroUsize, max_sync_errors: NonZeroU32, -) -> anyhow::Result -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> anyhow::Result { let sync_queue = SyncQueue::new(max_concurrent_timelines_sync); SYNC_QUEUE .set(sync_queue) @@ -860,7 +840,7 @@ where storage_sync_loop( runtime, conf, - (Arc::new(storage), remote_index_clone, sync_queue), + (storage, remote_index_clone, sync_queue), max_sync_errors, ); Ok(()) @@ -873,15 +853,12 @@ where }) } -fn storage_sync_loop( +fn storage_sync_loop( runtime: Runtime, conf: &'static PageServerConf, - (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, -) where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) { info!("Starting remote storage sync loop"); loop { let loop_storage = Arc::clone(&storage); @@ -983,18 +960,14 @@ enum UploadStatus { Nothing, } -async fn process_batches( +async fn process_batches( conf: &'static PageServerConf, max_sync_errors: NonZeroU32, - storage: Arc, + storage: Arc, index: &RemoteIndex, batched_tasks: HashMap, sync_queue: &SyncQueue, -) -> HashSet -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> HashSet { let mut sync_results = batched_tasks .into_iter() .map(|(sync_id, batch)| { @@ -1030,17 +1003,13 @@ where downloaded_timelines } -async fn process_sync_task_batch( +async fn process_sync_task_batch( conf: &'static PageServerConf, - (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (Arc, RemoteIndex, &SyncQueue), max_sync_errors: NonZeroU32, sync_id: ZTenantTimelineId, batch: SyncTaskBatch, -) -> DownloadStatus -where - P: 
Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> DownloadStatus { let sync_start = Instant::now(); let current_remote_timeline = { index.read().await.timeline_entry(&sync_id).cloned() }; @@ -1175,19 +1144,15 @@ where download_status } -async fn download_timeline_data( +async fn download_timeline_data( conf: &'static PageServerConf, - (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), current_remote_timeline: Option<&RemoteTimeline>, sync_id: ZTenantTimelineId, new_download_data: SyncData, sync_start: Instant, task_name: &str, -) -> DownloadStatus -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> DownloadStatus { match download_timeline_layers( conf, storage, @@ -1298,17 +1263,14 @@ async fn update_local_metadata( Ok(()) } -async fn delete_timeline_data( +async fn delete_timeline_data( conf: &'static PageServerConf, - (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), sync_id: ZTenantTimelineId, mut new_delete_data: SyncData, sync_start: Instant, task_name: &str, -) where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) { let timeline_delete = &mut new_delete_data.data; if !timeline_delete.deletion_registered { @@ -1343,19 +1305,15 @@ async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result( +async fn upload_timeline_data( conf: &'static PageServerConf, - (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue), + (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), current_remote_timeline: Option<&RemoteTimeline>, sync_id: ZTenantTimelineId, new_upload_data: SyncData, sync_start: Instant, task_name: &str, -) -> UploadStatus -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> 
UploadStatus { let mut uploaded_data = match upload_timeline_layers( storage, sync_queue, @@ -1406,17 +1364,13 @@ enum RemoteDataUpdate<'a> { Delete(&'a HashSet), } -async fn update_remote_data( +async fn update_remote_data( conf: &'static PageServerConf, - storage: &S, + storage: &GenericRemoteStorage, index: &RemoteIndex, sync_id: ZTenantTimelineId, update: RemoteDataUpdate<'_>, -) -> anyhow::Result<()> -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> anyhow::Result<()> { let updated_remote_timeline = { let mut index_accessor = index.write().await; @@ -1642,7 +1596,7 @@ fn register_sync_status( mod test_utils { use utils::lsn::Lsn; - use crate::repository::repo_harness::RepoHarness; + use crate::layered_repository::repo_harness::RepoHarness; use super::*; @@ -1687,7 +1641,7 @@ mod test_utils { #[cfg(test)] mod tests { use super::test_utils::dummy_metadata; - use crate::repository::repo_harness::TIMELINE_ID; + use crate::layered_repository::repo_harness::TIMELINE_ID; use hex_literal::hex; use utils::lsn::Lsn; diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index a1b26ee9a2..d80a082d0c 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -1,27 +1,25 @@ //! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage. +use std::path::Path; + use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; use tracing::{debug, error, info}; use crate::storage_sync::{SyncQueue, SyncTask}; -use remote_storage::RemoteStorage; +use remote_storage::{GenericRemoteStorage, RemoteStorage}; use utils::zid::ZTenantTimelineId; use super::{LayersDeletion, SyncData}; /// Attempts to remove the timleline layers from the remote storage. /// If the task had not adjusted the metadata before, the deletion will fail. 
-pub(super) async fn delete_timeline_layers<'a, P, S>( - storage: &'a S, +pub(super) async fn delete_timeline_layers<'a>( + storage: &'a GenericRemoteStorage, sync_queue: &SyncQueue, sync_id: ZTenantTimelineId, mut delete_data: SyncData, -) -> bool -where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> bool { if !delete_data.data.deletion_registered { error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing"); delete_data.retries += 1; @@ -45,25 +43,14 @@ where let mut delete_tasks = layers_to_delete .into_iter() .map(|local_layer_path| async { - let storage_path = - match storage - .remote_object_id(&local_layer_path) - .with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - local_layer_path.display() - ) - }) { - Ok(path) => path, - Err(e) => return Err((e, local_layer_path)), - }; - - match storage.delete(&storage_path).await.with_context(|| { - format!( - "Failed to delete remote layer from storage at '{:?}'", - storage_path - ) - }) { + match match storage { + GenericRemoteStorage::Local(storage) => { + remove_storage_object(storage, &local_layer_path).await + } + GenericRemoteStorage::S3(storage) => { + remove_storage_object(storage, &local_layer_path).await + } + } { Ok(()) => Ok(local_layer_path), Err(e) => Err((e, local_layer_path)), } @@ -101,6 +88,28 @@ where errored } +async fn remove_storage_object(storage: &S, local_layer_path: &Path) -> anyhow::Result<()> +where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let storage_path = storage + .remote_object_id(local_layer_path) + .with_context(|| { + format!( + "Failed to get the layer storage path for local path '{}'", + local_layer_path.display() + ) + })?; + + storage.delete(&storage_path).await.with_context(|| { + format!( + "Failed to delete remote layer from storage at '{:?}'", + storage_path + ) + }) +} + #[cfg(test)] mod 
tests { use std::{collections::HashSet, num::NonZeroUsize}; @@ -111,10 +120,10 @@ mod tests { use utils::lsn::Lsn; use crate::{ - repository::repo_harness::{RepoHarness, TIMELINE_ID}, + layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::test_utils::{create_local_timeline, dummy_metadata}, }; - use remote_storage::LocalFs; + use remote_storage::{LocalFs, RemoteStorage}; use super::*; @@ -123,10 +132,10 @@ mod tests { let harness = RepoHarness::create("delete_timeline_negative")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new( + let storage = GenericRemoteStorage::Local(LocalFs::new( tempdir()?.path().to_path_buf(), harness.conf.workdir.clone(), - )?; + )?); let deleted = delete_timeline_layers( &storage, @@ -158,17 +167,20 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "c", "d"]; - let storage = LocalFs::new( + let storage = GenericRemoteStorage::Local(LocalFs::new( tempdir()?.path().to_path_buf(), harness.conf.workdir.clone(), - )?; + )?); + + let local_storage = storage.as_local().unwrap(); + let current_retries = 3; let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); let timeline_upload = create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; for local_path in timeline_upload.layers_to_upload { - let remote_path = storage.remote_object_id(&local_path)?; + let remote_path = local_storage.remote_object_id(&local_path)?; let remote_parent_dir = remote_path.parent().unwrap(); if !remote_parent_dir.exists() { fs::create_dir_all(&remote_parent_dir).await?; @@ -176,11 +188,11 @@ mod tests { fs::copy(&local_path, &remote_path).await?; } assert_eq!( - storage + local_storage .list() .await? 
.into_iter() - .map(|remote_path| storage.local_path(&remote_path).unwrap()) + .map(|remote_path| local_storage.local_path(&remote_path).unwrap()) .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) .sorted() .collect::>(), @@ -213,11 +225,11 @@ mod tests { assert!(deleted, "Should be able to delete timeline files"); assert_eq!( - storage + local_storage .list() .await? .into_iter() - .map(|remote_path| storage.local_path(&remote_path).unwrap()) + .map(|remote_path| local_storage.local_path(&remote_path).unwrap()) .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) .sorted() .collect::>(), diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index f714888d9a..8e6aa47c88 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -9,7 +9,9 @@ use std::{ use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use remote_storage::{path_with_suffix_extension, DownloadError, RemoteObjectName, RemoteStorage}; +use remote_storage::{ + path_with_suffix_extension, Download, DownloadError, GenericRemoteStorage, RemoteStorage, +}; use tokio::{ fs, io::{self, AsyncWriteExt}, @@ -62,15 +64,11 @@ impl Default for TenantIndexParts { } } -pub async fn download_index_parts( +pub async fn download_index_parts( conf: &'static PageServerConf, - storage: &S, + storage: &GenericRemoteStorage, keys: HashSet, -) -> HashMap -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> HashMap { let mut index_parts: HashMap = HashMap::new(); let mut part_downloads = keys @@ -114,60 +112,17 @@ where /// Note: The function is rather expensive from s3 access point of view, it will execute ceil(N/1000) + N requests. /// At least one request to obtain a list of tenant timelines (more requests is there are more than 1000 timelines). 
/// And then will attempt to download all index files that belong to these timelines. -pub async fn gather_tenant_timelines_index_parts( +pub async fn gather_tenant_timelines_index_parts( conf: &'static PageServerConf, - storage: &S, + storage: &GenericRemoteStorage, tenant_id: ZTenantId, -) -> anyhow::Result> -where - P: RemoteObjectName + Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> anyhow::Result> { let tenant_path = conf.timelines_path(&tenant_id); - let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| { - format!( - "Failed to get tenant storage path for local path '{}'", - tenant_path.display() - ) - })?; - - let timelines = storage - .list_prefixes(Some(tenant_storage_path)) + let timeline_sync_ids = get_timeline_sync_ids(storage, &tenant_path, tenant_id) .await - .with_context(|| { - format!( - "Failed to list tenant storage path to get remote timelines to download: {}", - tenant_id - ) - })?; + .with_context(|| format!("Failed to list timeline sync ids for tenat {tenant_id}"))?; - if timelines.is_empty() { - anyhow::bail!( - "no timelines found on the remote storage for tenant {}", - tenant_id - ) - } - - let mut sync_ids = HashSet::new(); - - for timeline_remote_storage_key in timelines { - let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { - anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") - })?; - - let timeline_id: ZTimelineId = object_name - .parse() - .with_context(|| { - format!("failed to parse object name into timeline id for tenant {tenant_id} '{object_name}'") - })?; - - sync_ids.insert(ZTenantTimelineId { - tenant_id, - timeline_id, - }); - } - - match download_index_parts(conf, storage, sync_ids) + match download_index_parts(conf, storage, timeline_sync_ids) .await .remove(&tenant_id) .ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug."))? 
@@ -180,29 +135,15 @@ where } /// Retrieves index data from the remote storage for a given timeline. -async fn download_index_part( +async fn download_index_part( conf: &'static PageServerConf, - storage: &S, + storage: &GenericRemoteStorage, sync_id: ZTenantTimelineId, -) -> Result -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> Result { let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let part_storage_path = storage - .remote_object_id(&index_part_path) - .with_context(|| { - format!( - "Failed to get the index part storage path for local path '{}'", - index_part_path.display() - ) - }) - .map_err(DownloadError::BadInput)?; - - let mut index_part_download = storage.download(&part_storage_path).await?; + let mut index_part_download = download_storage_object(storage, &index_part_path).await?; let mut index_part_bytes = Vec::new(); io::copy( @@ -211,14 +152,18 @@ where ) .await .with_context(|| { - format!("Failed to download an index part from storage path {part_storage_path:?}") + format!( + "Failed to download an index part into file '{}'", + index_part_path.display() + ) }) .map_err(DownloadError::Other)?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) .with_context(|| { format!( - "Failed to deserialize index part file from storage path '{part_storage_path:?}'" + "Failed to deserialize index part file into file '{}'", + index_part_path.display() ) }) .map_err(DownloadError::Other)?; @@ -249,18 +194,14 @@ pub(super) enum DownloadedTimeline { /// updated in the end, if the remote one contains a newer disk_consistent_lsn. /// /// On an error, bumps the retries count and updates the files to skip with successful downloads, rescheduling the task. 
-pub(super) async fn download_timeline_layers<'a, P, S>( +pub(super) async fn download_timeline_layers<'a>( conf: &'static PageServerConf, - storage: &'a S, + storage: &'a GenericRemoteStorage, sync_queue: &'a SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, sync_id: ZTenantTimelineId, mut download_data: SyncData, -) -> DownloadedTimeline -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> DownloadedTimeline { let remote_timeline = match remote_timeline { Some(remote_timeline) => { if !remote_timeline.awaits_download { @@ -300,15 +241,6 @@ where layer_desination_path.display() ); } else { - let layer_storage_path = storage - .remote_object_id(&layer_desination_path) - .with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - layer_desination_path.display() - ) - })?; - // Perform a rename inspired by durable_rename from file_utils.c. // The sequence: // write(tmp) @@ -329,19 +261,23 @@ where temp_file_path.display() ) })?; - let mut download = storage - .download(&layer_storage_path) + + let mut layer_download = download_storage_object(storage, &layer_desination_path) .await .with_context(|| { format!( - "Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'" + "Failed to initiate the download the layer for {sync_id} into file '{}'", + temp_file_path.display() + ) + })?; + io::copy(&mut layer_download.download_stream, &mut destination_file) + .await + .with_context(|| { + format!( + "Failed to download the layer for {sync_id} into file '{}'", + temp_file_path.display() ) })?; - io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { - format!( - "Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display() - ) - })?; // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: // A file will not be closed immediately when it goes 
out of scope if there are any IO operations @@ -429,6 +365,121 @@ where } } +async fn download_storage_object( + storage: &GenericRemoteStorage, + to_path: &Path, +) -> Result { + async fn do_download_storage_object( + storage: &S, + to_path: &Path, + ) -> Result + where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, + { + let remote_object_path = storage + .remote_object_id(to_path) + .with_context(|| { + format!( + "Failed to get the storage path for target local path '{}'", + to_path.display() + ) + }) + .map_err(DownloadError::BadInput)?; + + storage.download(&remote_object_path).await + } + + match storage { + GenericRemoteStorage::Local(storage) => do_download_storage_object(storage, to_path).await, + GenericRemoteStorage::S3(storage) => do_download_storage_object(storage, to_path).await, + } +} + +async fn get_timeline_sync_ids( + storage: &GenericRemoteStorage, + tenant_path: &Path, + tenant_id: ZTenantId, +) -> anyhow::Result> { + let timeline_ids: Vec = match storage { + GenericRemoteStorage::Local(storage) => list_prefixes(storage, tenant_path) + .await? + .into_iter() + .map(|timeline_directory_path| { + timeline_directory_path + .file_stem() + .with_context(|| { + format!( + "Failed to get timeline id string from file '{}'", + timeline_directory_path.display() + ) + })? + .to_string_lossy() + .as_ref() + .parse() + .with_context(|| { + format!( + "failed to parse directory name '{}' as timeline id", + timeline_directory_path.display() + ) + }) + }) + .collect::>(), + GenericRemoteStorage::S3(storage) => list_prefixes(storage, tenant_path) + .await? + .into_iter() + .map(|s3_path| { + s3_path + .object_name() + .with_context(|| { + format!("Failed to get object name out of S3 path {s3_path:?}") + })? 
+ .parse() + .with_context(|| { + format!("failed to parse object name '{s3_path:?}' as timeline id") + }) + }) + .collect::>(), + } + .with_context(|| { + format!("Tenant {tenant_id} has at least one incorrect timeline subdirectory") + })?; + + if timeline_ids.is_empty() { + anyhow::bail!("no timelines found on the remote storage for tenant {tenant_id}") + } + + Ok(timeline_ids + .into_iter() + .map(|timeline_id| ZTenantTimelineId { + tenant_id, + timeline_id, + }) + .collect()) +} + +async fn list_prefixes(storage: &S, tenant_path: &Path) -> anyhow::Result> +where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let tenant_storage_path = storage.remote_object_id(tenant_path).with_context(|| { + format!( + "Failed to get tenant storage path for local path '{}'", + tenant_path.display() + ) + })?; + + storage + .list_prefixes(Some(&tenant_storage_path)) + .await + .with_context(|| { + format!( + "Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download" + ) + }) +} + async fn fsync_path(path: impl AsRef) -> Result<(), io::Error> { fs::File::open(path).await?.sync_all().await } @@ -445,7 +496,7 @@ mod tests { use utils::lsn::Lsn; use crate::{ - repository::repo_harness::{RepoHarness, TIMELINE_ID}, + layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::{ index::RelativePath, test_utils::{create_local_timeline, dummy_metadata}, @@ -461,10 +512,11 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; - let storage = LocalFs::new( - tempdir()?.path().to_path_buf(), + let storage = GenericRemoteStorage::Local(LocalFs::new( + tempdir()?.path().to_owned(), harness.conf.workdir.clone(), - )?; + )?); + let local_storage = storage.as_local().unwrap(); let current_retries = 3; let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = 
harness.timeline_path(&TIMELINE_ID); @@ -472,7 +524,7 @@ mod tests { create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; for local_path in timeline_upload.layers_to_upload { - let remote_path = storage.remote_object_id(&local_path)?; + let remote_path = local_storage.remote_object_id(&local_path)?; let remote_parent_dir = remote_path.parent().unwrap(); if !remote_parent_dir.exists() { fs::create_dir_all(&remote_parent_dir).await?; @@ -558,7 +610,10 @@ mod tests { let harness = RepoHarness::create("download_timeline_negatives")?; let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; + let storage = GenericRemoteStorage::Local(LocalFs::new( + tempdir()?.path().to_owned(), + harness.conf.workdir.clone(), + )?); let empty_remote_timeline_download = download_timeline_layers( harness.conf, @@ -614,10 +669,11 @@ mod tests { let harness = RepoHarness::create("test_download_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new( - tempdir()?.path().to_path_buf(), + let storage = GenericRemoteStorage::Local(LocalFs::new( + tempdir()?.path().to_owned(), harness.conf.workdir.clone(), - )?; + )?); + let local_storage = storage.as_local().unwrap(); let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); @@ -638,7 +694,7 @@ mod tests { metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let storage_path = storage.remote_object_id(&local_index_part_path)?; + let storage_path = local_storage.remote_object_id(&local_index_part_path)?; fs::create_dir_all(storage_path.parent().unwrap()).await?; fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?; diff --git 
a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 134ae893bc..7e644da412 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -210,7 +210,7 @@ impl RemoteTimelineIndex { } /// Restored index part data about the timeline, stored in the remote index. -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, Clone)] pub struct RemoteTimeline { timeline_layers: HashSet, missing_layers: HashSet, @@ -341,7 +341,7 @@ mod tests { use std::collections::BTreeSet; use super::*; - use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID}; + use crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}; #[test] fn index_part_conversion() { diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 2c41f58721..a8c768e0ae 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -1,11 +1,14 @@ //! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints. -use std::{fmt::Debug, path::PathBuf}; +use std::{ + fmt::Debug, + path::{Path, PathBuf}, +}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; use once_cell::sync::Lazy; -use remote_storage::RemoteStorage; +use remote_storage::{GenericRemoteStorage, RemoteStorage}; use tokio::fs; use tracing::{debug, error, info, warn}; @@ -30,16 +33,12 @@ static NO_LAYERS_UPLOAD: Lazy = Lazy::new(|| { }); /// Serializes and uploads the given index part data to the remote storage. 
-pub(super) async fn upload_index_part( +pub(super) async fn upload_index_part( conf: &'static PageServerConf, - storage: &S, + storage: &GenericRemoteStorage, sync_id: ZTenantTimelineId, index_part: IndexPart, -) -> anyhow::Result<()> -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> anyhow::Result<()> { let index_part_bytes = serde_json::to_vec(&index_part) .context("Failed to serialize index part file into bytes")?; let index_part_size = index_part_bytes.len(); @@ -48,27 +47,9 @@ where let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME) .with_extension(IndexPart::FILE_EXTENSION); - let index_part_storage_path = - storage - .remote_object_id(&index_part_path) - .with_context(|| { - format!( - "Failed to get the index part storage path for local path '{}'", - index_part_path.display() - ) - })?; - - storage - .upload( - index_part_bytes, - index_part_size, - &index_part_storage_path, - None, - ) + upload_storage_object(storage, index_part_bytes, index_part_size, &index_part_path) .await - .with_context(|| { - format!("Failed to upload index part to the storage path '{index_part_storage_path:?}'") - }) + .with_context(|| format!("Failed to upload index part for '{sync_id}'")) } /// Timeline upload result, with extra data, needed for uploading. @@ -84,17 +65,13 @@ pub(super) enum UploadedTimeline { /// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload. /// /// On an error, bumps the retries count and reschedules the entire task. 
-pub(super) async fn upload_timeline_layers<'a, P, S>( - storage: &'a S, +pub(super) async fn upload_timeline_layers<'a>( + storage: &'a GenericRemoteStorage, sync_queue: &SyncQueue, remote_timeline: Option<&'a RemoteTimeline>, sync_id: ZTenantTimelineId, mut upload_data: SyncData, -) -> UploadedTimeline -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ +) -> UploadedTimeline { let upload = &mut upload_data.data; let new_upload_lsn = upload .metadata @@ -132,16 +109,6 @@ where let mut upload_tasks = layers_to_upload .into_iter() .map(|source_path| async move { - let storage_path = storage - .remote_object_id(&source_path) - .with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - source_path.display() - ) - }) - .map_err(UploadError::Other)?; - let source_file = match fs::File::open(&source_path).await.with_context(|| { format!( "Failed to upen a source file for layer '{}'", @@ -164,15 +131,10 @@ where .map_err(UploadError::Other)? 
.len() as usize; - match storage - .upload(source_file, source_size, &storage_path, None) + match upload_storage_object(storage, source_file, source_size, &source_path) .await - .with_context(|| { - format!( - "Failed to upload a layer from local path '{}'", - source_path.display() - ) - }) { + .with_context(|| format!("Failed to upload layer file for {sync_id}")) + { Ok(()) => Ok(source_path), Err(e) => Err(UploadError::MissingLocalFile(source_path, e)), } @@ -231,6 +193,51 @@ where } } +async fn upload_storage_object( + storage: &GenericRemoteStorage, + from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, + from_size_bytes: usize, + from_path: &Path, +) -> anyhow::Result<()> { + async fn do_upload_storage_object( + storage: &S, + from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, + from_size_bytes: usize, + from_path: &Path, + ) -> anyhow::Result<()> + where + P: std::fmt::Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, + { + let target_storage_path = storage.remote_object_id(from_path).with_context(|| { + format!( + "Failed to get the storage path for source local path '{}'", + from_path.display() + ) + })?; + + storage + .upload(from, from_size_bytes, &target_storage_path, None) + .await + .with_context(|| { + format!( + "Failed to upload from '{}' to storage path '{:?}'", + from_path.display(), + target_storage_path + ) + }) + } + + match storage { + GenericRemoteStorage::Local(storage) => { + do_upload_storage_object(storage, from, from_size_bytes, from_path).await + } + GenericRemoteStorage::S3(storage) => { + do_upload_storage_object(storage, from, from_size_bytes, from_path).await + } + } +} + enum UploadError { MissingLocalFile(PathBuf, anyhow::Error), Other(anyhow::Error), @@ -243,12 +250,12 @@ mod tests { num::NonZeroUsize, }; - use remote_storage::LocalFs; + use remote_storage::{LocalFs, RemoteStorage}; use tempfile::tempdir; use utils::lsn::Lsn; use crate::{ - repository::repo_harness::{RepoHarness, 
TIMELINE_ID}, + layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}, storage_sync::{ index::RelativePath, test_utils::{create_local_timeline, dummy_metadata}, @@ -264,10 +271,11 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a", "b"]; - let storage = LocalFs::new( - tempdir()?.path().to_path_buf(), + let storage = GenericRemoteStorage::Local(LocalFs::new( + tempdir()?.path().to_owned(), harness.conf.workdir.clone(), - )?; + )?); + let local_storage = storage.as_local().unwrap(); let current_retries = 3; let metadata = dummy_metadata(Lsn(0x30)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); @@ -276,7 +284,7 @@ mod tests { timeline_upload.metadata = None; assert!( - storage.list().await?.is_empty(), + local_storage.list().await?.is_empty(), "Storage should be empty before any uploads are made" ); @@ -322,7 +330,7 @@ mod tests { "Successful upload without metadata should not have it returned either" ); - let storage_files = storage.list().await?; + let storage_files = local_storage.list().await?; assert_eq!( storage_files.len(), layer_files.len(), @@ -331,7 +339,7 @@ mod tests { assert_eq!( storage_files .into_iter() - .map(|storage_path| storage.local_path(&storage_path)) + .map(|storage_path| local_storage.local_path(&storage_path)) .collect::>>()?, layer_files .into_iter() @@ -351,7 +359,11 @@ mod tests { let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); let layer_files = ["a1", "b1"]; - let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; + let storage = GenericRemoteStorage::Local(LocalFs::new( + tempdir()?.path().to_owned(), + harness.conf.workdir.clone(), + )?); + let local_storage = storage.as_local().unwrap(); let current_retries = 5; let metadata = dummy_metadata(Lsn(0x40)); @@ -365,7 +377,7 @@ mod tests { create_local_timeline(&harness, TIMELINE_ID, &layers_to_upload, metadata.clone()) .await?; assert!( - 
storage.list().await?.is_empty(), + local_storage.list().await?.is_empty(), "Storage should be empty before any uploads are made" ); @@ -414,7 +426,7 @@ mod tests { "Successful upload should not change its metadata" ); - let storage_files = storage.list().await?; + let storage_files = local_storage.list().await?; assert_eq!( storage_files.len(), layer_files.len(), @@ -423,7 +435,7 @@ mod tests { assert_eq!( storage_files .into_iter() - .map(|storage_path| storage.local_path(&storage_path)) + .map(|storage_path| local_storage.local_path(&storage_path)) .collect::>>()?, layer_files .into_iter() @@ -440,7 +452,11 @@ mod tests { let harness = RepoHarness::create("test_upload_index_part")?; let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?; + let storage = GenericRemoteStorage::Local(LocalFs::new( + tempdir()?.path().to_owned(), + harness.conf.workdir.clone(), + )?); + let local_storage = storage.as_local().unwrap(); let metadata = dummy_metadata(Lsn(0x40)); let local_timeline_path = harness.timeline_path(&TIMELINE_ID); @@ -458,12 +474,12 @@ mod tests { ); assert!( - storage.list().await?.is_empty(), + local_storage.list().await?.is_empty(), "Storage should be empty before any uploads are made" ); upload_index_part(harness.conf, &storage, sync_id, index_part.clone()).await?; - let storage_files = storage.list().await?; + let storage_files = local_storage.list().await?; assert_eq!( storage_files.len(), 1, diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 64f1caa542..4a907ac0e1 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,8 +3,8 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; -use crate::layered_repository::{load_metadata, LayeredRepository, LayeredTimeline}; -use crate::repository::Repository; +use crate::layered_repository::{load_metadata, Repository, Timeline}; +use 
crate::repository::RepositoryTimeline; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; @@ -12,6 +12,7 @@ use crate::thread_mgr::ThreadKind; use crate::walredo::PostgresRedoManager; use crate::{thread_mgr, timelines, walreceiver}; use anyhow::Context; +use remote_storage::GenericRemoteStorage; use serde::{Deserialize, Serialize}; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; @@ -21,6 +22,7 @@ use tokio::sync::mpsc; use tracing::*; use utils::lsn::Lsn; +pub use tenants_state::try_send_timeline_update; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; mod tenants_state { @@ -68,7 +70,7 @@ mod tenants_state { Ok(()) } - pub(super) fn try_send_timeline_update(update: LocalTimelineUpdate) { + pub fn try_send_timeline_update(update: LocalTimelineUpdate) { match TIMELINE_UPDATE_SENDER .read() .expect("Failed to read() timeline_update_sender lock, it got poisoned") @@ -94,13 +96,7 @@ mod tenants_state { struct Tenant { state: TenantState, /// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk. - repo: Arc, - /// Timelines, located locally in the pageserver's datadir. - /// Timelines can entirely be removed entirely by the `detach` operation only. - /// - /// Local timelines have more metadata that's loaded into memory, - /// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`]. - local_timelines: HashMap>, + repo: Arc, } #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -136,7 +132,10 @@ impl fmt::Display for TenantState { /// Initialize repositories with locally available timelines. /// Timelines that are only partially available locally (remote storage has more data than this pageserver) /// are scheduled for download and added to the repository once download is completed. 
-pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result { +pub fn init_tenant_mgr( + conf: &'static PageServerConf, + remote_storage: Option>, +) -> anyhow::Result { let (timeline_updates_sender, timeline_updates_receiver) = mpsc::unbounded_channel::(); tenants_state::set_timeline_update_sender(timeline_updates_sender)?; @@ -145,7 +144,7 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result, + timeline: Arc, }, } impl std::fmt::Debug for LocalTimelineUpdate { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::Detach { id, .. } => f.debug_tuple("Remove").field(id).finish(), - Self::Attach { id, .. } => f.debug_tuple("Add").field(id).finish(), + Self::Detach { id, .. } => f.debug_tuple("Detach").field(id).finish(), + Self::Attach { id, .. } => f.debug_tuple("Attach").field(id).finish(), } } } @@ -289,7 +288,6 @@ pub fn create_tenant_repository( v.insert(Tenant { state: TenantState::Idle, repo, - local_timelines: HashMap::new(), }); Ok(Some(tenant_id)) } @@ -365,7 +363,7 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow: Ok(()) } -pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { +pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { let m = tenants_state::read_tenants(); let tenant = m .get(&tenant_id) @@ -379,21 +377,15 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result> { - let mut m = tenants_state::write_tenants(); - let tenant = m - .get_mut(&tenant_id) - .with_context(|| format!("Tenant {tenant_id} not found"))?; - - if let Some(page_tline) = tenant.local_timelines.get(&timeline_id) { - Ok(Arc::clone(page_tline)) - } else { - let page_tline = load_local_timeline(&tenant.repo, timeline_id) - .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?; - tenant - .local_timelines - .insert(timeline_id, Arc::clone(&page_tline)); - Ok(page_tline) +) -> 
anyhow::Result> { + let repository = get_repository_for_tenant(tenant_id)?; + match repository.get_timeline(timeline_id) { + Some(RepositoryTimeline::Loaded(loaded_timeline)) => { + loaded_timeline.init_logical_size()?; + Ok(loaded_timeline) + } + _ => load_local_timeline(&repository, timeline_id) + .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}")), } } @@ -420,10 +412,7 @@ pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow thread_mgr::shutdown_threads(None, None, Some(timeline_id)); debug!("thread shutdown completed"); match tenants_state::write_tenants().get_mut(&tenant_id) { - Some(tenant) => { - tenant.repo.delete_timeline(timeline_id)?; - tenant.local_timelines.remove(&timeline_id); - } + Some(tenant) => tenant.repo.delete_timeline(timeline_id)?, None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"), } @@ -435,23 +424,21 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any // shutdown the tenant and timeline threads: gc, compaction, page service threads) thread_mgr::shutdown_threads(None, Some(tenant_id), None); - // FIXME should we protect somehow from starting new threads/walreceivers when tenant is in stopping state? 
- // send stop signal to wal receiver and collect join handles while holding the lock - let walreceiver_join_handles = { - let tenants = tenants_state::write_tenants(); - let tenant = tenants.get(&tenant_id).context("tenant not found")?; - let mut walreceiver_join_handles = Vec::with_capacity(tenant.local_timelines.len()); - for timeline_id in tenant.local_timelines.keys() { + let mut walreceiver_join_handles = Vec::new(); + let removed_tenant = { + let mut tenants_accessor = tenants_state::write_tenants(); + tenants_accessor.remove(&tenant_id) + }; + if let Some(tenant) = removed_tenant { + for (timeline_id, _) in tenant.repo.list_timelines() { let (sender, receiver) = std::sync::mpsc::channel::<()>(); tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { - id: ZTenantTimelineId::new(tenant_id, *timeline_id), + id: ZTenantTimelineId::new(tenant_id, timeline_id), join_confirmation_sender: sender, }); - walreceiver_join_handles.push((*timeline_id, receiver)); + walreceiver_join_handles.push((timeline_id, receiver)); } - // drop the tenants lock - walreceiver_join_handles - }; + } // wait for wal receivers to stop without holding the lock, because walreceiver // will attempt to change tenant state which is protected by the same global tenants lock. 
@@ -484,19 +471,13 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any } fn load_local_timeline( - repo: &LayeredRepository, + repo: &Repository, timeline_id: ZTimelineId, -) -> anyhow::Result> { +) -> anyhow::Result> { let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| { format!("Inmem timeline {timeline_id} not found in tenant's repository") })?; inmem_timeline.init_logical_size()?; - - tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach { - id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id), - datadir: Arc::clone(&inmem_timeline), - }); - Ok(inmem_timeline) } @@ -588,37 +569,24 @@ fn init_local_repository( } fn attach_downloaded_tenant( - repo: &LayeredRepository, + repo: &Repository, downloaded_timelines: HashSet, ) -> anyhow::Result<()> { - let mut registration_queue = Vec::with_capacity(downloaded_timelines.len()); - - // first need to register the in-mem representations, to avoid missing ancestors during the local disk data registration - for timeline_id in downloaded_timelines { + // first, register timeline metadata to ensure ancestors will be found later during layer load + for &timeline_id in &downloaded_timelines { repo.attach_timeline(timeline_id).with_context(|| { format!("Failed to load timeline {timeline_id} into in-memory repository") })?; - registration_queue.push(timeline_id); } - for timeline_id in registration_queue { - let tenant_id = repo.tenant_id(); - match tenants_state::write_tenants().get_mut(&tenant_id) { - Some(tenant) => match tenant.local_timelines.entry(timeline_id) { - Entry::Occupied(_) => { - anyhow::bail!("Local timeline {timeline_id} already registered") - } - Entry::Vacant(v) => { - v.insert(load_local_timeline(repo, timeline_id).with_context(|| { - format!("Failed to register add local timeline for tenant {tenant_id}") - })?); - } - }, - None => anyhow::bail!( - "Tenant {} not found in local tenant state", - repo.tenant_id() - ), - } + // and then 
load its layers in memory + for timeline_id in downloaded_timelines { + let _ = load_local_timeline(repo, timeline_id).with_context(|| { + format!( + "Failed to register add local timeline for tenant {}", + repo.tenant_id(), + ) + })?; } Ok(()) @@ -630,14 +598,14 @@ fn load_local_repo( conf: &'static PageServerConf, tenant_id: ZTenantId, remote_index: &RemoteIndex, -) -> anyhow::Result> { +) -> anyhow::Result> { let mut m = tenants_state::write_tenants(); let tenant = m.entry(tenant_id).or_insert_with(|| { // Set up a WAL redo manager, for applying WAL records. let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); // Set up an object repository, for actual data storage. - let repo: Arc = Arc::new(LayeredRepository::new( + let repo: Arc = Arc::new(Repository::new( conf, TenantConfOpt::default(), Arc::new(walredo_mgr), @@ -648,12 +616,11 @@ fn load_local_repo( Tenant { state: TenantState::Idle, repo, - local_timelines: HashMap::new(), } }); // Restore tenant config - let tenant_conf = LayeredRepository::load_tenant_config(conf, tenant_id)?; + let tenant_conf = Repository::load_tenant_config(conf, tenant_id)?; tenant.repo.update_tenant_config(tenant_conf)?; Ok(Arc::clone(&tenant.repo)) diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index e51744d3cc..ca239ae254 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -5,7 +5,6 @@ use std::collections::HashMap; use std::ops::ControlFlow; use std::time::Duration; -use crate::repository::Repository; use crate::tenant_mgr::TenantState; use crate::thread_mgr::ThreadKind; use crate::{tenant_mgr, thread_mgr}; diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 0d35195691..4f760751db 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -20,15 +20,14 @@ use utils::{ use crate::import_datadir; use crate::tenant_mgr; +use crate::CheckpointConfig; use crate::{ - config::PageServerConf, repository::Repository, 
storage_sync::index::RemoteIndex, - tenant_config::TenantConfOpt, + config::PageServerConf, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, }; use crate::{ - layered_repository::{LayeredRepository, LayeredTimeline}, + layered_repository::{Repository, Timeline}, walredo::WalRedoManager, }; -use crate::{repository::Timeline, CheckpointConfig}; #[derive(Debug, Clone, Copy)] pub struct PointInTime { @@ -42,7 +41,7 @@ pub fn create_repo( tenant_id: ZTenantId, wal_redo_manager: Arc, remote_index: RemoteIndex, -) -> Result> { +) -> Result> { let repo_dir = conf.tenant_path(&tenant_id); ensure!( !repo_dir.exists(), @@ -57,9 +56,9 @@ pub fn create_repo( info!("created directory structure in {}", repo_dir.display()); // Save tenant's config - LayeredRepository::persist_tenant_config(conf, tenant_id, tenant_conf)?; + Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?; - Ok(Arc::new(LayeredRepository::new( + Ok(Arc::new(Repository::new( conf, tenant_conf, wal_redo_manager, @@ -104,11 +103,11 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // - run initdb to init temporary instance and get bootstrap data // - after initialization complete, remove the temp dir. 
// -fn bootstrap_timeline( +fn bootstrap_timeline( conf: &'static PageServerConf, tenantid: ZTenantId, tli: ZTimelineId, - repo: &R, + repo: &Repository, ) -> Result<()> { let initdb_path = conf .tenant_path(&tenantid) @@ -160,7 +159,7 @@ pub(crate) fn create_timeline( new_timeline_id: Option, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, -) -> Result)>> { +) -> Result)>> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 05afe4ba3e..c0965e7a22 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -30,6 +30,7 @@ use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; +use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::walrecord::*; @@ -43,15 +44,15 @@ use utils::lsn::Lsn; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); -pub struct WalIngest<'a, T: DatadirTimeline> { - timeline: &'a T, +pub struct WalIngest<'a> { + timeline: &'a Timeline, checkpoint: CheckPoint, checkpoint_modified: bool, } -impl<'a, T: DatadirTimeline> WalIngest<'a, T> { - pub fn new(timeline: &T, startpoint: Lsn) -> Result> { +impl<'a> WalIngest<'a> { + pub fn new(timeline: &Timeline, startpoint: Lsn) -> Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
let checkpoint_bytes = timeline.get_checkpoint(startpoint)?; @@ -77,7 +78,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { &mut self, recdata: Bytes, lsn: Lsn, - modification: &mut DatadirModification, + modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, ) -> Result<()> { modification.lsn = lsn; @@ -266,7 +267,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_decoded_block( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, @@ -326,7 +327,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_heapam_record( &mut self, buf: &mut Bytes, - modification: &mut DatadirModification, + modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, ) -> Result<()> { // Handle VM bit updates that are implicitly part of heap records. @@ -470,7 +471,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. fn ingest_xlog_dbase_create( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlCreateDatabase, ) -> Result<()> { let db_id = rec.db_id; @@ -537,7 +538,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_xlog_smgr_create( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlSmgrCreate, ) -> Result<()> { let rel = RelTag { @@ -555,7 +556,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { /// This is the same logic as in PostgreSQL's smgr_redo() function. 
fn ingest_xlog_smgr_truncate( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rec: &XlSmgrTruncate, ) -> Result<()> { let spcnode = rec.rnode.spcnode; @@ -620,7 +621,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { /// fn ingest_xact_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, parsed: &XlXactParsedRecord, is_commit: bool, ) -> Result<()> { @@ -689,7 +690,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_clog_truncate_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlClogTruncate, ) -> Result<()> { info!( @@ -747,7 +748,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_multixact_create_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlMultiXactCreate, ) -> Result<()> { // Create WAL record for updating the multixact-offsets page @@ -826,7 +827,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_multixact_truncate_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlMultiXactTruncate, ) -> Result<()> { self.checkpoint.oldestMulti = xlrec.end_trunc_off; @@ -860,7 +861,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn ingest_relmap_page( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, xlrec: &XlRelmapUpdate, decoded: &DecodedWALRecord, ) -> Result<()> { @@ -876,7 +877,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn put_rel_creation( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, ) -> Result<()> { modification.put_rel_creation(rel, 0)?; @@ -885,7 +886,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn put_rel_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: 
RelTag, blknum: BlockNumber, img: Bytes, @@ -897,7 +898,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn put_rel_wal_record( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, rec: ZenithWalRecord, @@ -909,7 +910,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn put_rel_truncation( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, nblocks: BlockNumber, ) -> Result<()> { @@ -917,11 +918,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { Ok(()) } - fn put_rel_drop( - &mut self, - modification: &mut DatadirModification, - rel: RelTag, - ) -> Result<()> { + fn put_rel_drop(&mut self, modification: &mut DatadirModification, rel: RelTag) -> Result<()> { modification.put_rel_drop(rel)?; Ok(()) } @@ -937,7 +934,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn handle_rel_extend( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, ) -> Result<()> { @@ -968,7 +965,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn put_slru_page_image( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, kind: SlruKind, segno: u32, blknum: BlockNumber, @@ -981,7 +978,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { fn handle_slru_extend( &mut self, - modification: &mut DatadirModification, + modification: &mut DatadirModification, kind: SlruKind, segno: u32, blknum: BlockNumber, @@ -1032,9 +1029,9 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { #[cfg(test)] mod tests { use super::*; + use crate::layered_repository::repo_harness::*; + use crate::layered_repository::Timeline; use crate::pgdatadir_mapping::create_test_timeline; - use crate::repository::repo_harness::*; - use crate::repository::Timeline; use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; @@ -1046,13 +1043,13 @@ 
mod tests { forknum: 0, }; - fn assert_current_logical_size(_timeline: &T, _lsn: Lsn) { + fn assert_current_logical_size(_timeline: &Timeline, _lsn: Lsn) { // TODO } static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - fn init_walingest_test(tline: &T) -> Result> { + fn init_walingest_test(tline: &Timeline) -> Result { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file @@ -1065,7 +1062,7 @@ mod tests { #[test] fn test_relsize() -> Result<()> { let repo = RepoHarness::create("test_relsize")?.load(); - let tline = create_test_timeline(repo, TIMELINE_ID)?; + let tline = create_test_timeline(&repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1193,7 +1190,7 @@ mod tests { #[test] fn test_drop_extend() -> Result<()> { let repo = RepoHarness::create("test_drop_extend")?.load(); - let tline = create_test_timeline(repo, TIMELINE_ID)?; + let tline = create_test_timeline(&repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1233,7 +1230,7 @@ mod tests { #[test] fn test_truncate_extend() -> Result<()> { let repo = RepoHarness::create("test_truncate_extend")?.load(); - let tline = create_test_timeline(repo, TIMELINE_ID)?; + let tline = create_test_timeline(&repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; // Create a 20 MB relation (the size is arbitrary) @@ -1321,7 +1318,7 @@ mod tests { #[test] fn test_large_rel() -> Result<()> { let repo = RepoHarness::create("test_large_rel")?.load(); - let tline = create_test_timeline(repo, TIMELINE_ID)?; + let tline = create_test_timeline(&repo, TIMELINE_ID)?; let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 8a466a8a67..d6420e1d18 
100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -269,7 +269,7 @@ async fn wal_receiver_main_thread_loop_step<'a>( } } // Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly. - LocalTimelineUpdate::Attach { id, datadir } => { + LocalTimelineUpdate::Attach { id, timeline } => { let timeline_connection_managers = local_timeline_wal_receivers .entry(id.tenant_id) .or_default(); @@ -305,7 +305,7 @@ async fn wal_receiver_main_thread_loop_step<'a>( id, broker_prefix.to_owned(), etcd_client.clone(), - datadir, + timeline, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index e8e0a7c52b..0261203049 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -16,7 +16,7 @@ use std::{ time::Duration, }; -use crate::{layered_repository::LayeredTimeline, repository::Timeline}; +use crate::layered_repository::Timeline; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -39,7 +39,7 @@ pub(super) fn spawn_connection_manager_task( id: ZTenantTimelineId, broker_loop_prefix: String, mut client: Client, - local_timeline: Arc, + local_timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, @@ -242,7 +242,7 @@ const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; struct WalreceiverState { id: ZTenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. - local_timeline: Arc, + local_timeline: Arc, /// The timeout on the connection to safekeeper for WAL streaming. wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. 
@@ -300,7 +300,7 @@ struct EtcdSkTimeline { impl WalreceiverState { fn new( id: ZTenantTimelineId, - local_timeline: Arc, + local_timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, @@ -735,12 +735,8 @@ fn wal_stream_connection_string( #[cfg(test)] mod tests { - use crate::repository::{ - repo_harness::{RepoHarness, TIMELINE_ID}, - Repository, - }; - use super::*; + use crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID}; #[test] fn no_connection_no_candidate() -> anyhow::Result<()> { diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 025bfeb506..f816198eda 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -20,11 +20,7 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument}; use super::TaskEvent; use crate::{ - layered_repository::WalReceiverInfo, - pgdatadir_mapping::DatadirTimeline, - repository::{Repository, Timeline}, - tenant_mgr, - walingest::WalIngest, + layered_repository::WalReceiverInfo, tenant_mgr, walingest::WalIngest, walrecord::DecodedWALRecord, }; use postgres_ffi::v14::waldecoder::WalStreamDecoder; @@ -67,7 +63,7 @@ pub async fn handle_walreceiver_connection( ) .await .context("Timed out while waiting for walreceiver connection to open")? - .context("Failed to open walreceiver conection")?; + .context("Failed to open walreceiver connection")?; info!("connected!"); let mut connection_status = WalConnectionStatus { diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 9cf347573a..bf48bd1759 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -89,15 +89,52 @@ pub trait WalRedoManager: Send + Sync { // for access to the postgres process ('wait') since there is only one for // each tenant. 
+/// Time buckets are small because we want to be able to measure the +/// smallest redo processing times. These buckets allow us to measure down +/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec. +/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec. +macro_rules! redo_histogram_time_buckets { + () => { + vec![ + 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000, + 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, + ] + }; +} + +/// While we're at it, also measure the amount of records replayed in each +/// operation. We have a global 'total replayed' counter, but that's not +/// as useful as 'what is the skew for how many records we replay in one +/// operation'. +macro_rules! redo_histogram_count_buckets { + () => { + vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0] + }; +} + static WAL_REDO_TIME: Lazy = Lazy::new(|| { - register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo") - .expect("failed to define a metric") + register_histogram!( + "pageserver_wal_redo_seconds", + "Time spent on WAL redo", + redo_histogram_time_buckets!() + ) + .expect("failed to define a metric") }); static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_wait_seconds", - "Time spent waiting for access to the WAL redo process" + "Time spent waiting for access to the WAL redo process", + redo_histogram_time_buckets!(), + ) + .expect("failed to define a metric") +}); + +static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_records_histogram", + "Histogram of number of records replayed per redo", + redo_histogram_count_buckets!(), ) .expect("failed to define a metric") }); @@ -262,7 +299,10 @@ impl PostgresRedoManager { let end_time = Instant::now(); let duration = end_time.duration_since(lock_time); + WAL_REDO_TIME.observe(duration.as_secs_f64()); + 
WAL_REDO_RECORDS_HISTOGRAM.observe(records.len() as f64); + debug!( "postgres applied {} WAL records in {} us to reconstruct page image at LSN {}", records.len(), diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile new file mode 100644 index 0000000000..a6ce611974 --- /dev/null +++ b/pgxn/neon/Makefile @@ -0,0 +1,26 @@ +# pgxs/neon/Makefile + + +MODULE_big = neon +OBJS = \ + $(WIN32RES) \ + inmem_smgr.o \ + libpagestore.o \ + libpqwalproposer.o \ + pagestore_smgr.o \ + relsize_cache.o \ + neon.o \ + walproposer.o \ + walproposer_utils.o + +PG_CPPFLAGS = -I$(libpq_srcdir) +SHLIB_LINK_INTERNAL = $(libpq) + +EXTENSION = neon +DATA = neon--1.0.sql +PGFILEDESC = "neon - cloud storage for PostgreSQL" + + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/pgxn/neon/inmem_smgr.c b/pgxn/neon/inmem_smgr.c new file mode 100644 index 0000000000..7840292b08 --- /dev/null +++ b/pgxn/neon/inmem_smgr.c @@ -0,0 +1,286 @@ +/*------------------------------------------------------------------------- + * + * inmem_smgr.c + * + * This is an implementation of the SMGR interface, used in the WAL redo + * process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent + * storage, the pages that are written out are kept in a small number of + * in-memory buffers. + * + * Normally, replaying a WAL record only needs to access a handful of + * buffers, which fit in the normal buffer cache, so this is just for + * "overflow" storage when the buffer cache is not large enough. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * contrib/neon/inmem_smgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "pagestore_client.h" +#include "storage/block.h" +#include "storage/buf_internals.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" + +/* Size of the in-memory smgr */ +#define MAX_PAGES 64 + +/* If more than WARN_PAGES are used, print a warning in the log */ +#define WARN_PAGES 32 + +static BufferTag page_tag[MAX_PAGES]; +static char page_body[MAX_PAGES][BLCKSZ]; +static int used_pages; + +static int +locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno) +{ + /* We only hold a small number of pages, so linear search */ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum + && blkno == page_tag[i].blockNum) + { + return i; + } + } + return -1; +} + +/* + * inmem_init() -- Initialize private state + */ +void +inmem_init(void) +{ + used_pages = 0; +} + +/* + * inmem_exists() -- Does the physical file exist? + */ +bool +inmem_exists(SMgrRelation reln, ForkNumber forknum) +{ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum) + { + return true; + } + } + return false; +} + +/* + * inmem_create() -- Create a new relation on zenithd storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_unlink() -- Unlink a relation. + */ +void +inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_extend() -- Add a block to the specified relation. 
+ * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer, bool skipFsync) +{ + /* same as smgwrite() for us */ + inmem_write(reln, forknum, blkno, buffer, skipFsync); +} + +/* + * inmem_open() -- Initialize newly-opened relation. + */ +void +inmem_open(SMgrRelation reln) +{ +} + +/* + * inmem_close() -- Close the specified relation, if it isn't closed already. + */ +void +inmem_close(SMgrRelation reln, ForkNumber forknum) +{ +} + +/* + * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + return true; +} + +/* + * inmem_writeback() -- Tell the kernel to write pages back to storage. + */ +void +inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ +} + +/* + * inmem_read() -- Read the specified block from a relation. + */ +void +inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer) +{ + int pg; + + pg = locate_page(reln, forknum, blkno); + if (pg < 0) + memset(buffer, 0, BLCKSZ); + else + memcpy(buffer, page_body[pg], BLCKSZ); +} + +/* + * inmem_write() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). 
+ */ +void +inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) +{ + int pg; + + pg = locate_page(reln, forknum, blocknum); + if (pg < 0) + { + /* + * We assume the buffer cache is large enough to hold all the buffers + * needed for most operations. Overflowing to this "in-mem smgr" in rare + * cases is OK. But if we find that we're using more than WARN_PAGES, + * print a warning so that we get alerted and get to investigate why + * we're accessing so many buffers. + */ + elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, + "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); + if (used_pages == MAX_PAGES) + elog(ERROR, "Inmem storage overflow"); + + pg = used_pages; + used_pages++; + INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); + } else { + elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); + } + memcpy(page_body[pg], buffer, BLCKSZ); +} + +/* + * inmem_nblocks() -- Get the number of blocks stored in a relation. + */ +BlockNumber +inmem_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + /* + * It's not clear why a WAL redo function would call smgrnblocks(). + * During recovery, at least before reaching consistency, the size of a + * relation could be arbitrarily small, if it was truncated after the + * record being replayed, or arbitrarily large if it was extended + * afterwards. But one place where it's called is in + * XLogReadBufferExtended(): it extends the relation, if it's smaller than + * the requested page. That's a waste of time in the WAL redo + * process. Pretend that all relations are maximally sized to avoid it. 
+ */ + return MaxBlockNumber; +} + +/* + * inmem_truncate() -- Truncate relation to specified number of blocks. + */ +void +inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ +} + +/* + * inmem_immedsync() -- Immediately sync a relation to stable storage. + */ +void +inmem_immedsync(SMgrRelation reln, ForkNumber forknum) +{ +} + +static const struct f_smgr inmem_smgr = +{ + .smgr_init = inmem_init, + .smgr_shutdown = NULL, + .smgr_open = inmem_open, + .smgr_close = inmem_close, + .smgr_create = inmem_create, + .smgr_exists = inmem_exists, + .smgr_unlink = inmem_unlink, + .smgr_extend = inmem_extend, + .smgr_prefetch = inmem_prefetch, + .smgr_read = inmem_read, + .smgr_write = inmem_write, + .smgr_writeback = inmem_writeback, + .smgr_nblocks = inmem_nblocks, + .smgr_truncate = inmem_truncate, + .smgr_immedsync = inmem_immedsync, +}; + +const f_smgr * +smgr_inmem(BackendId backend, RelFileNode rnode) +{ + Assert(InRecovery); + if (backend != InvalidBackendId) + return smgr_standard(backend, rnode); + else + return &inmem_smgr; +} + +void +smgr_init_inmem() +{ + inmem_init(); +} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c new file mode 100644 index 0000000000..649fc1037e --- /dev/null +++ b/pgxn/neon/libpagestore.c @@ -0,0 +1,432 @@ +/*------------------------------------------------------------------------- + * + * libpagestore.c + * Handles network communications with the remote pagestore. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/libpqpagestore.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pagestore_client.h" +#include "fmgr.h" +#include "access/xlog.h" + +#include "libpq-fe.h" +#include "libpq/pqformat.h" +#include "libpq/libpq.h" + +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/guc.h" + +#include "neon.h" +#include "walproposer.h" +#include "walproposer_utils.h" + + +#define PageStoreTrace DEBUG5 + +#define NEON_TAG "[NEON_SMGR] " +#define neon_log(tag, fmt, ...) ereport(tag, \ + (errmsg(NEON_TAG fmt, ## __VA_ARGS__), \ + errhidestmt(true), errhidecontext(true))) + +bool connected = false; +PGconn *pageserver_conn = NULL; + +char *page_server_connstring_raw; + +static ZenithResponse *pageserver_call(ZenithRequest *request); +page_server_api api = { + .request = pageserver_call +}; + +static void +pageserver_connect() +{ + char *query; + int ret; + + Assert(!connected); + + pageserver_conn = PQconnectdb(page_server_connstring); + + if (PQstatus(pageserver_conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + ereport(ERROR, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "could not establish connection to pageserver"), + errdetail_internal("%s", msg))); + } + + query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); + ret = PQsendQuery(pageserver_conn, query); + if (ret != 1) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + neon_log(ERROR, "could not send pagestream command to pageserver"); + } + + while (PQisBusy(pageserver_conn)) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | 
+ WL_EXIT_ON_PM_DEATH, + PQsocket(pageserver_conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(pageserver_conn)) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + + neon_log(ERROR, "could not complete handshake with pageserver: %s", + msg); + } + } + } + + neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw); + + connected = true; +} + +/* + * A wrapper around PQgetCopyData that checks for interrupts while sleeping. + */ +static int +call_PQgetCopyData(PGconn *conn, char **buffer) +{ + int ret; + +retry: + ret = PQgetCopyData(conn, buffer, 1 /* async */ ); + + if (ret == 0) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_EXIT_ON_PM_DEATH, + PQsocket(conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(conn)) + neon_log(ERROR, "could not get response from pageserver: %s", + PQerrorMessage(conn)); + } + + goto retry; + } + + return ret; +} + + +static ZenithResponse * +pageserver_call(ZenithRequest *request) +{ + StringInfoData req_buff; + StringInfoData resp_buff; + ZenithResponse *resp; + + PG_TRY(); + { + /* If the connection was lost for some reason, reconnect */ + if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + + if (!connected) + pageserver_connect(); + + req_buff = zm_pack_request(request); + + /* + * Send request. + * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. 
In + * practice, our requests are small enough to always fit in the output + * and TCP buffer. + */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) + { + neon_log(ERROR, "failed to send page request: %s", + PQerrorMessage(pageserver_conn)); + } + pfree(req_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) request); + + neon_log(PageStoreTrace, "sent request: %s", msg); + pfree(msg); + } + + /* read response */ + resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data); + resp_buff.cursor = 0; + + if (resp_buff.len == -1) + neon_log(ERROR, "end of COPY"); + else if (resp_buff.len == -2) + neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + + resp = zm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) resp); + + neon_log(PageStoreTrace, "got response: %s", msg); + pfree(msg); + } + } + PG_CATCH(); + { + /* + * If anything goes wrong while we were sending a request, it's not + * clear what state the connection is in. For example, if we sent the + * request but didn't receive a response yet, we might receive the + * response some time later after we have already sent a new unrelated + * request. Close the connection to avoid getting confused. 
+ */ + if (connected) + { + neon_log(LOG, "dropping connection to page server due to error"); + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + PG_RE_THROW(); + } + PG_END_TRY(); + + return (ZenithResponse *) resp; +} + + +static bool +check_zenith_id(char **newval, void **extra, GucSource source) +{ + uint8 zid[16]; + + return **newval == '\0' || HexDecodeString(zid, *newval, 16); +} + +static char * +substitute_pageserver_password(const char *page_server_connstring_raw) +{ + char *host = NULL; + char *port = NULL; + char *user = NULL; + char *auth_token = NULL; + char *err = NULL; + char *page_server_connstring = NULL; + PQconninfoOption *conn_options; + PQconninfoOption *conn_option; + MemoryContext oldcontext; + + /* + * Here we substitute password in connection string with an environment + * variable. To simplify things we construct a connection string back with + * only known options. In particular: host port user and password. We do + * not currently use other options and constructing full connstring in an + * URI shape is quite messy. + */ + + if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0') + return NULL; + + /* extract the auth token from the connection string */ + conn_options = PQconninfoParse(page_server_connstring_raw, &err); + if (conn_options == NULL) + { + /* The error string is malloc'd, so we must free it explicitly */ + char *errcopy = err ? pstrdup(err) : "out of memory"; + + PQfreemem(err); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid connection string syntax: %s", errcopy))); + } + + /* + * Trying to populate pageserver connection string with auth token from + * environment. We are looking for password in with placeholder value like + * $ENV_VAR_NAME, so if password field is present and starts with $ we try + * to fetch environment variable value and fail loudly if it is not set. 
+ */ + for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) + { + if (strcmp(conn_option->keyword, "host") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + host = conn_option->val; + } + else if (strcmp(conn_option->keyword, "port") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + port = conn_option->val; + } + else if (strcmp(conn_option->keyword, "user") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + user = conn_option->val; + } + else if (strcmp(conn_option->keyword, "password") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + { + /* ensure that this is a template */ + if (strncmp(conn_option->val, "$", 1) != 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); + + neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]); + auth_token = getenv(&conn_option->val[1]); + if (!auth_token) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); + } + else + { + neon_log(LOG, "using auth token from environment passed via env"); + } + } + } + } + + /* + * allocate connection string in TopMemoryContext to make sure it is not + * freed + */ + oldcontext = CurrentMemoryContext; + MemoryContextSwitchTo(TopMemoryContext); + page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? 
auth_token : "", host, port); + MemoryContextSwitchTo(oldcontext); + + PQconninfoFree(conn_options); + return page_server_connstring; +} + +/* + * Module initialization function + */ +void +pg_init_libpagestore(void) +{ + DefineCustomStringVariable("neon.pageserver_connstring", + "connection string to the page server", + NULL, + &page_server_connstring_raw, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + NULL, NULL, NULL); + + DefineCustomStringVariable("neon.timeline_id", + "Zenith timelineid the server is running on", + NULL, + &zenith_timeline, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_zenith_id, NULL, NULL); + + DefineCustomStringVariable("neon.tenant_id", + "Neon tenantid the server is running on", + NULL, + &zenith_tenant, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_zenith_id, NULL, NULL); + + DefineCustomBoolVariable("neon.wal_redo", + "start in wal-redo mode", + NULL, + &wal_redo, + false, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.max_cluster_size", + "cluster size limit", + NULL, + &max_cluster_size, + -1, -1, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MB, + NULL, NULL, NULL); + + relsize_hash_init(); + + if (page_server != NULL) + neon_log(ERROR, "libpagestore already loaded"); + + neon_log(PageStoreTrace, "libpagestore already loaded"); + page_server = &api; + + /* substitute password in pageserver_connstring */ + page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); + + /* Is there more correct way to pass CustomGUC to postgres code? 
*/ + zenith_timeline_walproposer = zenith_timeline; + zenith_tenant_walproposer = zenith_tenant; + + if (wal_redo) + { + neon_log(PageStoreTrace, "set inmem_smgr hook"); + smgr_hook = smgr_inmem; + smgr_init_hook = smgr_init_inmem; + } + else if (page_server_connstring && page_server_connstring[0]) + { + neon_log(PageStoreTrace, "set neon_smgr hook"); + smgr_hook = smgr_zenith; + smgr_init_hook = smgr_init_zenith; + dbsize_hook = zenith_dbsize; + } +} diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c new file mode 100644 index 0000000000..2b2b7a1a6a --- /dev/null +++ b/pgxn/neon/libpqwalproposer.c @@ -0,0 +1,413 @@ +#include "postgres.h" + +#include "libpq-fe.h" +#include "neon.h" +#include "walproposer.h" + +/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ +struct WalProposerConn +{ + PGconn* pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received data from libpqprop_async_read */ +}; + +/* Prototypes for exported functions */ +static char* libpqprop_error_message(WalProposerConn* conn); +static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn); +static WalProposerConn* libpqprop_connect_start(char* conninfo); +static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); +static bool libpqprop_send_query(WalProposerConn* conn, char* query); +static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); +static pgsocket libpqprop_socket(WalProposerConn* conn); +static int libpqprop_flush(WalProposerConn* conn); +static void libpqprop_finish(WalProposerConn* conn); +static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); +static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); +static bool libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size); + +static WalProposerFunctionsType 
PQWalProposerFunctions = { + libpqprop_error_message, + libpqprop_status, + libpqprop_connect_start, + libpqprop_connect_poll, + libpqprop_send_query, + libpqprop_get_query_result, + libpqprop_socket, + libpqprop_flush, + libpqprop_finish, + libpqprop_async_read, + libpqprop_async_write, + libpqprop_blocking_write, +}; + +/* Module initialization */ +void +pg_init_libpqwalproposer(void) +{ + if (WalProposerFunctions != NULL) + elog(ERROR, "libpqwalproposer already loaded"); + WalProposerFunctions = &PQWalProposerFunctions; +} + +/* Helper function */ +static bool +ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) +{ + /* If we're already correctly blocking or nonblocking, all good */ + if (is_nonblocking == conn->is_nonblocking) + return true; + + /* Otherwise, set it appropriately */ + if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1) + return false; + + conn->is_nonblocking = is_nonblocking; + return true; +} + +/* Exported function definitions */ +static char* +libpqprop_error_message(WalProposerConn* conn) +{ + return PQerrorMessage(conn->pg_conn); +} + +static WalProposerConnStatusType +libpqprop_status(WalProposerConn* conn) +{ + switch (PQstatus(conn->pg_conn)) + { + case CONNECTION_OK: + return WP_CONNECTION_OK; + case CONNECTION_BAD: + return WP_CONNECTION_BAD; + default: + return WP_CONNECTION_IN_PROGRESS; + } +} + +static WalProposerConn* +libpqprop_connect_start(char* conninfo) +{ + WalProposerConn* conn; + PGconn* pg_conn; + + pg_conn = PQconnectStart(conninfo); + /* + * Allocation of a PQconn can fail, and will return NULL. We want to fully replicate the + * behavior of PQconnectStart here. + */ + if (!pg_conn) + return NULL; + + /* + * And in theory this allocation can fail as well, but it's incredibly unlikely if we just + * successfully allocated a PGconn. + * + * palloc will exit on failure though, so there's not much we could do if it *did* fail. 
+ */ + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking mode */ + conn->recvbuf = NULL; + return conn; +} + +static WalProposerConnectPollStatusType +libpqprop_connect_poll(WalProposerConn* conn) +{ + WalProposerConnectPollStatusType return_val; + + switch (PQconnectPoll(conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + return_val = WP_CONN_POLLING_FAILED; + break; + case PGRES_POLLING_READING: + return_val = WP_CONN_POLLING_READING; + break; + case PGRES_POLLING_WRITING: + return_val = WP_CONN_POLLING_WRITING; + break; + case PGRES_POLLING_OK: + return_val = WP_CONN_POLLING_OK; + break; + + /* There's a comment at its source about this constant being unused. We'll expect it's never + * returned. */ + case PGRES_POLLING_ACTIVE: + elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + /* This return is never actually reached, but it's here to make the compiler happy */ + return WP_CONN_POLLING_FAILED; + + default: + Assert(false); + return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ + } + + return return_val; +} + +static bool +libpqprop_send_query(WalProposerConn* conn, char* query) +{ + /* We need to be in blocking mode for sending the query to run without + * requiring a call to PQflush */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + /* PQsendQuery returns 1 on success, 0 on failure */ + if (!PQsendQuery(conn->pg_conn, query)) + return false; + + return true; +} + +static WalProposerExecStatusType +libpqprop_get_query_result(WalProposerConn* conn) +{ + PGresult* result; + WalProposerExecStatusType return_val; + + /* Marker variable if we need to log an unexpected success result */ + char* unexpected_success = NULL; + + /* Consume any input that we might be missing */ + if (!PQconsumeInput(conn->pg_conn)) + return WP_EXEC_FAILED; + + if (PQisBusy(conn->pg_conn)) + return WP_EXEC_NEEDS_INPUT; + + + result = 
PQgetResult(conn->pg_conn); + /* PQgetResult returns NULL only if getting the result was successful & there's no more of the + * result to get. */ + if (!result) + { + elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + return WP_EXEC_UNEXPECTED_SUCCESS; + } + + /* Helper macro to reduce boilerplate */ + #define UNEXPECTED_SUCCESS(msg) \ + return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ + unexpected_success = msg; \ + break; + + + switch (PQresultStatus(result)) + { + /* "true" success case */ + case PGRES_COPY_BOTH: + return_val = WP_EXEC_SUCCESS_COPYBOTH; + break; + + /* Unexpected success case */ + case PGRES_EMPTY_QUERY: + UNEXPECTED_SUCCESS("empty query return"); + case PGRES_COMMAND_OK: + UNEXPECTED_SUCCESS("data-less command end"); + case PGRES_TUPLES_OK: + UNEXPECTED_SUCCESS("tuples return"); + case PGRES_COPY_OUT: + UNEXPECTED_SUCCESS("'Copy Out' response"); + case PGRES_COPY_IN: + UNEXPECTED_SUCCESS("'Copy In' response"); + case PGRES_SINGLE_TUPLE: + UNEXPECTED_SUCCESS("single tuple return"); + case PGRES_PIPELINE_SYNC: + UNEXPECTED_SUCCESS("pipeline sync point"); + + /* Failure cases */ + case PGRES_BAD_RESPONSE: + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_PIPELINE_ABORTED: + return_val = WP_EXEC_FAILED; + break; + + default: + Assert(false); + return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ + } + + if (unexpected_success) + elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + + /* + * The PGresult returned by PQgetResult is owned by the caller; only its + * status was needed, so release it here instead of leaking one per call. + */ + PQclear(result); + + return return_val; +} + +static pgsocket +libpqprop_socket(WalProposerConn* conn) +{ + return PQsocket(conn->pg_conn); +} + +static int +libpqprop_flush(WalProposerConn* conn) +{ + return (PQflush(conn->pg_conn)); +} + +static void +libpqprop_finish(WalProposerConn* conn) +{ + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); + PQfinish(conn->pg_conn); + pfree(conn); +} + +/* + * Receive a message from the safekeeper. + * + * On success, the data is placed in *buf.
It is valid until the next call + * to this function. + */ +static PGAsyncReadResult +libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) +{ + int result; + + if (conn->recvbuf != NULL) + { + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; + } + + /* Call PQconsumeInput so that we have the data we need */ + if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + + /* The docs for PQgetCopyData list the return values as: + * 0 if the copy is still in progress, but no "complete row" is + * available + * -1 if the copy is done + * -2 if an error occurred + * (> 0) if it was successful; that value is the amount transferred. + * + * The protocol we use between walproposer and safekeeper means that we + * *usually* wouldn't expect to see that the copy is done, but this can + * sometimes be triggered by the server returning an ErrorResponse (which + * also happens to have the effect that the copy is done). + */ + switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) + { + case 0: + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_TRY_AGAIN; + case -1: + { + /* + * If we get -1, it's probably because of a server error; the + * safekeeper won't normally send a CopyDone message.
+ * + * We can check PQgetResult to make sure that the server failed; + * it'll always result in PGRES_FATAL_ERROR + */ + PGresult *res = PQgetResult(conn->pg_conn); + ExecStatusType status = PQresultStatus(res); + + /* Only the status is needed; free the result so it does not leak (PQclear accepts NULL) */ + PQclear(res); + + if (status != PGRES_FATAL_ERROR) + elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + + /* If there was actually an error, it'll be properly reported by + * calls to PQerrorMessage -- we don't have to do anything else */ + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + case -2: + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + default: + /* Positive values indicate the size of the returned result */ + *amount = result; + *buf = conn->recvbuf; + return PG_ASYNC_READ_SUCCESS; + } +} + +static PGAsyncWriteResult +libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) +{ + int result; + + /* If we aren't in non-blocking mode, switch to it. */ + if (!ensure_nonblocking_status(conn, true)) + return PG_ASYNC_WRITE_FAIL; + + /* The docs for PQputCopyData list the return values as: + * 1 if the data was queued, + * 0 if it was not queued because of full buffers, or + * -1 if an error occurred + */ + result = PQputCopyData(conn->pg_conn, buf, size); + + /* We won't get a result of zero because walproposer always empties the + * connection's buffers before sending more */ + Assert(result != 0); + + switch (result) + { + case 1: + /* good -- continue */ + break; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQputCopyData", result); + } + + /* After queueing the data, we still need to flush to get it to send. + * This might take multiple tries, but we don't want to wait around + * until it's done.
+ * + * PQflush has the following returns (directly quoting the docs): + * 0 if successful, + * 1 if it was unable to send all the data in the send queue yet + * -1 if it failed for some reason + */ + switch (result = PQflush(conn->pg_conn)) { + case 0: + return PG_ASYNC_WRITE_SUCCESS; + case 1: + return PG_ASYNC_WRITE_TRY_FLUSH; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQflush", result); + } +} + +static bool +libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size) +{ + int result; + + /* If we are in non-blocking mode, switch out of it. */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + /* This function is very similar to libpqprop_async_write. For more + * information, refer to the comments there */ + if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) + return false; + + Assert(result == 1); + + /* Because the connection is blocking, flushing returns 0 or -1 */ + + if ((result = PQflush(conn->pg_conn)) == -1) + return false; + + Assert(result == 0); + return true; +} diff --git a/pgxn/neon/neon--1.0.sql b/pgxn/neon/neon--1.0.sql new file mode 100644 index 0000000000..34f1ba78d4 --- /dev/null +++ b/pgxn/neon/neon--1.0.sql @@ -0,0 +1,17 @@ +\echo Use "CREATE EXTENSION neon" to load this file.
\quit + +CREATE FUNCTION pg_cluster_size() +RETURNS bigint +AS 'MODULE_PATHNAME', 'pg_cluster_size' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION backpressure_lsns( + OUT received_lsn pg_lsn, + OUT disk_consistent_lsn pg_lsn, + OUT remote_consistent_lsn pg_lsn +) +RETURNS record +AS 'MODULE_PATHNAME', 'backpressure_lsns' +LANGUAGE C STRICT +PARALLEL UNSAFE; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c new file mode 100644 index 0000000000..595a126f04 --- /dev/null +++ b/pgxn/neon/neon.c @@ -0,0 +1,82 @@ +/*------------------------------------------------------------------------- + * + * neon.c + * Utility functions to expose neon specific information to user + * + * IDENTIFICATION + * contrib/neon/neon.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "fmgr.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "catalog/pg_type.h" +#include "replication/walsender.h" +#include "funcapi.h" +#include "access/htup_details.h" +#include "utils/pg_lsn.h" +#include "utils/guc.h" + +#include "neon.h" +#include "walproposer.h" + +PG_MODULE_MAGIC; +void _PG_init(void); + + +void _PG_init(void) +{ + pg_init_libpagestore(); + pg_init_libpqwalproposer(); + pg_init_walproposer(); + + EmitWarningsOnPlaceholders("neon"); +} + +PG_FUNCTION_INFO_V1(pg_cluster_size); +PG_FUNCTION_INFO_V1(backpressure_lsns); + +Datum +pg_cluster_size(PG_FUNCTION_ARGS) +{ + int64 size; + + size = GetZenithCurrentClusterSize(); + + if (size == 0) + PG_RETURN_NULL(); + + PG_RETURN_INT64(size); +} + + +Datum +backpressure_lsns(PG_FUNCTION_ARGS) +{ + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + Datum values[3]; + bool nulls[3]; + TupleDesc tupdesc; + + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); + + tupdesc = CreateTemplateTupleDesc(3); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", 
PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "disk_consistent_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "remote_consistent_lsn", PG_LSNOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + MemSet(nulls, 0, sizeof(nulls)); + values[0] = LSNGetDatum(writePtr); + values[1] = LSNGetDatum(flushPtr); + values[2] = LSNGetDatum(applyPtr); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control new file mode 100644 index 0000000000..84f79881c1 --- /dev/null +++ b/pgxn/neon/neon.control @@ -0,0 +1,4 @@ +# neon extension +comment = 'cloud storage for PostgreSQL' +default_version = '1.0' +module_pathname = '$libdir/neon' diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h new file mode 100644 index 0000000000..2c66bc7bf0 --- /dev/null +++ b/pgxn/neon/neon.h @@ -0,0 +1,19 @@ +/*------------------------------------------------------------------------- + * + * neon.h + * Functions used in the initialization of this extension. 
+ * + * IDENTIFICATION + * contrib/neon/neon.h + * + *------------------------------------------------------------------------- + */ + +#ifndef NEON_H +#define NEON_H + +extern void pg_init_libpagestore(void); +extern void pg_init_libpqwalproposer(void); +extern void pg_init_walproposer(void); + +#endif /* NEON_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h new file mode 100644 index 0000000000..f79a3c9142 --- /dev/null +++ b/pgxn/neon/pagestore_client.h @@ -0,0 +1,221 @@ +/*------------------------------------------------------------------------- + * + * pagestore_client.h + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * contrib/neon/pagestore_client.h + * + *------------------------------------------------------------------------- + */ +#ifndef pageserver_h +#define pageserver_h + +#include "postgres.h" + +#include "access/xlogdefs.h" +#include "storage/relfilenode.h" +#include "storage/block.h" +#include "storage/smgr.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "utils/memutils.h" + +#include "pg_config.h" + +typedef enum +{ + /* pagestore_client -> pagestore */ + T_ZenithExistsRequest = 0, + T_ZenithNblocksRequest, + T_ZenithGetPageRequest, + T_ZenithDbSizeRequest, + + /* pagestore -> pagestore_client */ + T_ZenithExistsResponse = 100, + T_ZenithNblocksResponse, + T_ZenithGetPageResponse, + T_ZenithErrorResponse, + T_ZenithDbSizeResponse, +} ZenithMessageTag; + + + +/* base struct for c-style inheritance */ +typedef struct +{ + ZenithMessageTag tag; +} ZenithMessage; + +#define messageTag(m) (((const ZenithMessage *)(m))->tag) + +/* + * supertype of all the Zenith*Request structs below + * + * If 'latest' is true, we are requesting the latest page version, and 'lsn' + * is just a hint to the server that we know there are no versions of the page + * (or relation size, for exists/nblocks 
requests) later than the 'lsn'. + */ +typedef struct +{ + ZenithMessageTag tag; + bool latest; /* if true, request latest page version */ + XLogRecPtr lsn; /* request page version @ this LSN */ +} ZenithRequest; + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithExistsRequest; + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithNblocksRequest; + + +typedef struct +{ + ZenithRequest req; + Oid dbNode; +} ZenithDbSizeRequest; + + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; +} ZenithGetPageRequest; + +/* supertype of all the Zenith*Response structs below */ +typedef struct +{ + ZenithMessageTag tag; +} ZenithResponse; + +typedef struct +{ + ZenithMessageTag tag; + bool exists; +} ZenithExistsResponse; + +typedef struct +{ + ZenithMessageTag tag; + uint32 n_blocks; +} ZenithNblocksResponse; + +typedef struct +{ + ZenithMessageTag tag; + char page[FLEXIBLE_ARRAY_MEMBER]; +} ZenithGetPageResponse; + +typedef struct +{ + ZenithMessageTag tag; + int64 db_size; +} ZenithDbSizeResponse; + +typedef struct +{ + ZenithMessageTag tag; + char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */ +} ZenithErrorResponse; + +extern StringInfoData zm_pack_request(ZenithRequest *msg); +extern ZenithResponse *zm_unpack_response(StringInfo s); +extern char *zm_to_string(ZenithMessage *msg); + +/* + * API + */ + +typedef struct +{ + ZenithResponse *(*request) (ZenithRequest *request); +} page_server_api; + +extern page_server_api *page_server; + +extern char *page_server_connstring; +extern char *zenith_timeline; +extern char *zenith_tenant; +extern bool wal_redo; +extern int32 max_cluster_size; + +extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); +extern void smgr_init_zenith(void); + +extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); +extern void smgr_init_inmem(void); +extern void 
smgr_shutdown_inmem(void); + +/* zenith storage manager functionality */ + +extern void zenith_init(void); +extern void zenith_open(SMgrRelation reln); +extern void zenith_close(SMgrRelation reln, ForkNumber forknum); +extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum); +extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); + +extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + +extern void zenith_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); +extern const int64 zenith_dbsize(Oid dbNode); +extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); + +/* zenith wal-redo storage manager functionality */ + +extern void inmem_init(void); +extern void inmem_open(SMgrRelation reln); +extern void inmem_close(SMgrRelation reln, ForkNumber forknum); +extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum); +extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void inmem_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern 
bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); +extern void inmem_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); + + +/* utils for zenith relsize cache */ +extern void relsize_hash_init(void); +extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); +extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); + +#endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c new file mode 100644 index 0000000000..3e1b74dba7 --- /dev/null +++ b/pgxn/neon/pagestore_smgr.c @@ -0,0 +1,1696 @@ +/*------------------------------------------------------------------------- + * + * pagestore_smgr.c + * + * + * + * Temporary and unlogged rels + * --------------------------- + * + * Temporary and unlogged tables are stored locally, by md.c. The functions + * here just pass the calls through to corresponding md.c functions. + * + * Index build operations that use the buffer cache are also handled locally, + * just like unlogged tables. Such operations must be marked by calling + * smgr_start_unlogged_build() and friends. 
+ * + * In order to know what relations are permanent and which ones are not, we + * have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set + * by smgropen() callers, when they have the relcache entry at hand. However, + * sometimes we need to open an SmgrRelation for a relation without the + * relcache. That is needed when we evict a buffer; we might not have the + * SmgrRelation for that relation open yet. To deal with that, the + * 'relpersistence' can be left to zero, meaning we don't know if it's + * permanent or not. Most operations are not allowed with relpersistence==0, + * but smgrwrite() does work, which is what we need for buffer eviction. and + * smgrunlink() so that a backend doesn't need to have the relcache entry at + * transaction commit, where relations that were dropped in the transaction + * are unlinked. + * + * If smgrwrite() is called and smgr_relpersistence == 0, we check if the + * relation file exists locally or not. If it does exist, we assume it's an + * unlogged relation and write the page there. Otherwise it must be a + * permanent relation, WAL-logged and stored on the page server, and we ignore + * the write like we do for permanent relations. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/pagestore_smgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlog_internal.h" +#include "catalog/pg_class.h" +#include "pagestore_client.h" +#include "pagestore_client.h" +#include "storage/smgr.h" +#include "access/xlogdefs.h" +#include "postmaster/interrupt.h" +#include "replication/walsender.h" +#include "storage/bufmgr.h" +#include "storage/md.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "catalog/pg_tablespace_d.h" +#include "postmaster/autovacuum.h" + +/* + * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API + * calls to md.c, and *also* do the calls to the Page Server. On every + * read, compare the versions we read from local disk and Page Server, + * and Assert that they are identical. 
+ */ +/* #define DEBUG_COMPARE_LOCAL */ + +#ifdef DEBUG_COMPARE_LOCAL +#include "access/nbtree.h" +#include "storage/bufpage.h" +#include "access/xlog_internal.h" + +static char *hexdump_page(char *page); +#endif + +#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) + +const int SmgrTrace = DEBUG5; + +page_server_api *page_server; + +/* GUCs */ +char *page_server_connstring; // with substituted password +char *zenith_timeline; +char *zenith_tenant; +bool wal_redo = false; +int32 max_cluster_size; + +/* unlogged relation build states */ +typedef enum +{ + UNLOGGED_BUILD_NOT_IN_PROGRESS = 0, + UNLOGGED_BUILD_PHASE_1, + UNLOGGED_BUILD_PHASE_2, + UNLOGGED_BUILD_NOT_PERMANENT +} UnloggedBuildPhase; + +static SMgrRelation unlogged_build_rel = NULL; +static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + +StringInfoData +zm_pack_request(ZenithRequest *msg) +{ + StringInfoData s; + + initStringInfo(&s); + pq_sendbyte(&s, msg->tag); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_ZenithNblocksRequest: + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, 
msg_req->req.lsn); + pq_sendint32(&s, msg_req->dbNode); + + break; + } + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + pq_sendint32(&s, msg_req->blkno); + + break; + } + + /* pagestore -> pagestore_client. We never need to create these. */ + case T_ZenithExistsResponse: + case T_ZenithNblocksResponse: + case T_ZenithGetPageResponse: + case T_ZenithErrorResponse: + case T_ZenithDbSizeResponse: + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); + break; + } + return s; +} + +ZenithResponse * +zm_unpack_response(StringInfo s) +{ + ZenithMessageTag tag = pq_getmsgbyte(s); + ZenithResponse *resp = NULL; + + switch (tag) + { + /* pagestore -> pagestore_client */ + case T_ZenithExistsResponse: + { + ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); + + msg_resp->tag = tag; + msg_resp->exists = pq_getmsgbyte(s); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithNblocksResponse: + { + ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); + + msg_resp->tag = tag; + msg_resp->n_blocks = pq_getmsgint(s, 4); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithGetPageResponse: + { + ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); + + msg_resp->tag = tag; + /* XXX: should be varlena */ + memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithDbSizeResponse: + { + ZenithDbSizeResponse *msg_resp = palloc0(sizeof(ZenithDbSizeResponse)); + + msg_resp->tag = tag; + msg_resp->db_size = pq_getmsgint64(s); + 
pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithErrorResponse: + { + ZenithErrorResponse *msg_resp; + size_t msglen; + const char *msgtext; + + msgtext = pq_getmsgrawstring(s); + msglen = strlen(msgtext); + + msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); + msg_resp->tag = tag; + memcpy(msg_resp->message, msgtext, msglen + 1); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + /* + * pagestore_client -> pagestore + * + * We create these ourselves, and don't need to decode them. + */ + case T_ZenithExistsRequest: + case T_ZenithNblocksRequest: + case T_ZenithGetPageRequest: + case T_ZenithDbSizeRequest: + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", tag); + break; + } + + return resp; +} + +/* dump to json for debugging / error reporting purposes */ +char * +zm_to_string(ZenithMessage *msg) +{ + StringInfoData s; + + initStringInfo(&s); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + case T_ZenithNblocksRequest: + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", 
LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeRequest\""); + appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + + /* pagestore -> pagestore_client */ + case T_ZenithExistsResponse: + { + ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); + appendStringInfo(&s, ", \"exists\": %d}", + msg_resp->exists + ); + appendStringInfoChar(&s, '}'); + + break; + } + case T_ZenithNblocksResponse: + { + ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks + ); + appendStringInfoChar(&s, '}'); + + break; + } + case T_ZenithGetPageResponse: + { +#if 0 + ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; +#endif + + appendStringInfoString(&s, "{\"type\": 
\"ZenithGetPageResponse\""); + appendStringInfo(&s, ", \"page\": \"XXX\"}"); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithErrorResponse: + { + ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; + + /* FIXME: escape double-quotes in the message */ + appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); + appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithDbSizeResponse: + { + ZenithDbSizeResponse *msg_resp = (ZenithDbSizeResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeResponse\""); + appendStringInfo(&s, ", \"db_size\": %ld}", + msg_resp->db_size + ); + appendStringInfoChar(&s, '}'); + + break; + } + + default: + appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); + } + return s.data; +} + +/* + * Wrapper around log_newpage() that makes a temporary copy of the block and + * WAL-logs that. This makes it safe to use while holding only a shared lock + * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint + * directly because it skips the logging if the LSN is new enough. + */ +static XLogRecPtr +log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, + Page page, bool page_std) +{ + PGAlignedBlock copied_buffer; + + memcpy(copied_buffer.data, page, BLCKSZ); + return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); +} + +/* + * Is 'buffer' identical to a freshly initialized empty heap page? + */ +static bool +PageIsEmptyHeapPage(char *buffer) +{ + PGAlignedBlock empty_page; + + PageInit((Page) empty_page.data, BLCKSZ, 0); + + return memcmp(buffer, empty_page.data, BLCKSZ) == 0; +} + +static void +zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) +{ + XLogRecPtr lsn = PageGetLSN(buffer); + + if (ShutdownRequestPending) + return; + + /* + * Whenever a VM or FSM page is evicted, WAL-log it. 
FSM and (some) VM + * changes are not WAL-logged when the changes are made, so this is our + * last chance to log them, otherwise they're lost. That's OK for + * correctness, the non-logged updates are not critical. But we want to + * have a reasonably up-to-date VM and FSM in the page server. + */ + if (forknum == FSM_FORKNUM && !RecoveryInProgress()) + { + /* FSM is never WAL-logged and we don't care. */ + XLogRecPtr recptr; + + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + XLogFlush(recptr); + lsn = recptr; + ereport(SmgrTrace, + (errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); + } + else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) + { + /* + * Always WAL-log vm. We should never miss clearing visibility map + * bits. + * + * TODO Is it too bad for performance? Hopefully we do not evict + * actively used vm too often. + */ + XLogRecPtr recptr; + + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + XLogFlush(recptr); + lsn = recptr; + + ereport(SmgrTrace, + (errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); + } + else if (lsn == InvalidXLogRecPtr) + { + /* + * When PostgreSQL extends a relation, it calls smgrextend() with an all-zeros pages, + * and we can just ignore that in Zenith. We do need to remember the new size, + * though, so that smgrnblocks() returns the right answer after the rel has + * been extended. We rely on the relsize cache for that. + * + * A completely empty heap page doesn't need to be WAL-logged, either. The + * heapam can leave such a page behind, if e.g. 
an insert errors out after + * initializing the page, but before it has inserted the tuple and WAL-logged + * the change. When we read the page from the page server, it will come back + * as all-zeros. That's OK, the heapam will initialize an all-zeros page on + * first use. + * + * In other scenarios, evicting a dirty page with no LSN is a bad sign: it implies + * that the page was not WAL-logged, and its contents will be lost when it's + * evicted. + */ + if (PageIsNew(buffer)) + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is all-zeros", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + else if (PageIsEmptyHeapPage(buffer)) + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + else + { + ereport(PANIC, + (errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + } + else + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); + } + + /* + * Remember the LSN on this page. When we read the page again, we must + * read the same or newer version of it. + */ + SetLastWrittenPageLSN(lsn); +} + + +/* + * zenith_init() -- Initialize private state + */ +void +zenith_init(void) +{ + /* noop */ +#ifdef DEBUG_COMPARE_LOCAL + mdinit(); +#endif +} + +/* + * GetXLogInsertRecPtr uses XLogBytePosToRecPtr to convert logical insert (reserved) position + * to physical position in WAL. 
It always adds SizeOfXLogShortPHD: + * seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + * so even if there are no records on the page, offset will be SizeOfXLogShortPHD. + * It may cause problems with XLogFlush. So return pointer backward to the origin of the page. + */ +static XLogRecPtr +zm_adjust_lsn(XLogRecPtr lsn) +{ + /* + * If lsn points to the beging of first record on page or segment, then + * "return" it back to the page origin + */ + if ((lsn & (XLOG_BLCKSZ - 1)) == SizeOfXLogShortPHD) + { + lsn -= SizeOfXLogShortPHD; + } + else if ((lsn & (wal_segment_size - 1)) == SizeOfXLogLongPHD) + { + lsn -= SizeOfXLogLongPHD; + } + return lsn; +} + +/* + * Return LSN for requesting pages and number of blocks from page server + */ +static XLogRecPtr +zenith_get_request_lsn(bool *latest) +{ + XLogRecPtr lsn; + + if (RecoveryInProgress()) + { + *latest = false; + lsn = GetXLogReplayRecPtr(NULL); + elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + } + else if (am_walsender) + { + *latest = true; + lsn = InvalidXLogRecPtr; + elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); + } + else + { + XLogRecPtr flushlsn; + + /* + * Use the latest LSN that was evicted from the buffer cache. Any + * pages modified by later WAL records must still in the buffer cache, + * so our request cannot concern those. + */ + *latest = true; + lsn = GetLastWrittenPageLSN(); + Assert(lsn != InvalidXLogRecPtr); + elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + + lsn = zm_adjust_lsn(lsn); + + /* + * Is it possible that the last-written LSN is ahead of last flush + * LSN? Generally not, we shouldn't evict a page from the buffer cache + * before all its modifications have been safely flushed. That's the + * "WAL before data" rule. 
However, such case does exist at index building, + * _bt_blwritepage logs the full page without flushing WAL before + * smgrextend (files are fsynced before build ends). + */ + flushlsn = GetFlushRecPtr(); + if (lsn > flushlsn) + { + elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + (uint32) (lsn >> 32), (uint32) lsn, + (uint32) (flushlsn >> 32), (uint32) flushlsn); + XLogFlush(lsn); + } + } + + return lsn; +} + + +/* + * zenith_exists() -- Does the physical file exist? + */ +bool +zenith_exists(SMgrRelation reln, ForkNumber forkNum) +{ + bool exists; + ZenithResponse *resp; + BlockNumber n_blocks; + bool latest; + XLogRecPtr request_lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + /* + * We don't know if it's an unlogged rel stored locally, or permanent + * rel stored in the page server. First check if it exists locally. + * If it does, great. Otherwise check if it exists in the page server. + */ + if (mdexists(reln, forkNum)) + return true; + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdexists(reln, forkNum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) + { + return true; + } + + /* + * \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server + * will error out if you check that, because the whole dbdir for tablespace + * 0, db 0 doesn't exists. We possibly should change the page server to + * accept that and return 'false', to be consistent with mdexists(). But + * we probably also should fix pg_table_size() to not call smgrexists() + * with bogus relfilenode. + * + * For now, handle that special case here. 
+ */ + if (reln->smgr_rnode.node.spcNode == 0 && + reln->smgr_rnode.node.dbNode == 0 && + reln->smgr_rnode.node.relNode == 0) + { + return false; + } + + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithExistsRequest request = { + .req.tag = T_ZenithExistsRequest, + .req.latest = latest, + .req.lsn = request_lsn, + .rnode = reln->smgr_rnode.node, + .forknum = forkNum + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) + { + case T_ZenithExistsResponse: + exists = ((ZenithExistsResponse *) resp)->exists; + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + pfree(resp); + return exists; +} + +/* + * zenith_create() -- Create a new relation on zenithd storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) +{ + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdcreate(reln, forkNum, isRedo); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + elog(SmgrTrace, "Create relation %u/%u/%u.%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum); + + /* + * Newly created relation is empty, remember that in the relsize cache. 
+ * + * FIXME: This is currently not just an optimization, but required for + * correctness. Postgres can call smgrnblocks() on the newly-created + * relation. Currently, we don't call SetLastWrittenPageLSN() when a new + * relation created, so if we didn't remember the size in the relsize + * cache, we might call smgrnblocks() on the newly-created relation before + * the creation WAL record hass been received by the page server. + */ + set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdcreate(reln, forkNum, isRedo); +#endif +} + +/* + * zenith_unlink() -- Unlink a relation. + * + * Note that we're passed a RelFileNodeBackend --- by the time this is called, + * there won't be an SMgrRelation hashtable entry anymore. + * + * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber + * to delete all forks. + * + * + * If isRedo is true, it's unsurprising for the relation to be already gone. + * Also, we should remove the file immediately instead of queuing a request + * for later, since during redo there's no possibility of creating a + * conflicting relation. + * + * Note: any failure should be reported as WARNING not ERROR, because + * we are usually not in a transaction anymore when this is called. + */ +void +zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) +{ + /* + * Might or might not exist locally, depending on whether it's + * an unlogged or permanent relation (or if DEBUG_COMPARE_LOCAL is + * set). Try to unlink, it won't do any harm if the file doesn't + * exist. + */ + mdunlink(rnode, forkNum, isRedo); + if (!RelFileNodeBackendIsTemp(rnode)) { + forget_cached_relsize(rnode.node, forkNum); + } +} + +/* + * zenith_extend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. 
However, this is to be used for the case of
+ *		extending a relation (i.e., blocknum is at or beyond the current
+ *		EOF).  Note that we assume writing a block beyond current EOF
+ *		causes intervening file space to become filled with zeroes.
+ */
+void
+zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
+			  char *buffer, bool skipFsync)
+{
+	const char	persistence = reln->smgr_relpersistence;
+	RelFileNode rnode = reln->smgr_rnode.node;
+	XLogRecPtr	page_lsn;
+
+	/* Temporary and unlogged relations are handed straight to md.c. */
+	if (persistence == RELPERSISTENCE_TEMP ||
+		persistence == RELPERSISTENCE_UNLOGGED)
+	{
+		mdextend(reln, forkNum, blkno, buffer, skipFsync);
+		return;
+	}
+
+	/* From here on only a permanent relation is acceptable. */
+	if (persistence == 0)
+		elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+	if (persistence != RELPERSISTENCE_PERMANENT)
+		elog(ERROR, "unknown relpersistence '%c'", persistence);
+
+	/*
+	 * Enforce the cluster size limit, when one is configured
+	 * (max_cluster_size > 0).
+	 *
+	 * Temporary and unlogged relations are not included in the cluster size
+	 * measured by the page server, so they never get here. Autovacuum
+	 * processes are exempt as well.
+	 */
+	if (max_cluster_size > 0 &&
+		persistence == RELPERSISTENCE_PERMANENT &&
+		!IsAutoVacuumWorkerProcess())
+	{
+		uint64		current_size = GetZenithCurrentClusterSize();
+		uint64		limit_bytes = ((uint64) max_cluster_size) * 1024 * 1024;
+
+		if (current_size >= limit_bytes)
+			ereport(ERROR,
+					(errcode(ERRCODE_DISK_FULL),
+					 errmsg("could not extend file because cluster size limit (%d MB) has been exceeded",
+							max_cluster_size),
+					 errhint("This limit is defined by neon.max_cluster_size GUC")));
+	}
+
+	/* Run the WAL-logging check, then record that the rel now has blkno + 1 blocks. */
+	zenith_wallog_page(reln, forkNum, blkno, buffer);
+	set_cached_relsize(rnode, forkNum, blkno + 1);
+
+	page_lsn = PageGetLSN(buffer);
+	elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
+		 rnode.spcNode,
+		 rnode.dbNode,
+		 rnode.relNode,
+		 forkNum, blkno,
+		 LSN_FORMAT_ARGS(page_lsn));
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdextend(reln, forkNum, blkno, buffer, skipFsync);
+#endif
+}
+
+/*
+ *	
zenith_open() -- Initialize newly-opened relation.
+ */
+void
+zenith_open(SMgrRelation reln)
+{
+	/*
+	 * Nothing zenith-specific happens on open. md.c only needs to initialize
+	 * itself for temporary or unlogged relations, but the call is cheap, so
+	 * it is made unconditionally; that way the md fields are always
+	 * initialized, which helps debugging if nothing else.
+	 */
+	mdopen(reln);
+
+	/* no work */
+	elog(SmgrTrace, "[ZENITH_SMGR] open noop");
+}
+
+/*
+ *	zenith_close() -- Close the specified relation, if it isn't closed already.
+ */
+void
+zenith_close(SMgrRelation reln, ForkNumber forknum)
+{
+	/*
+	 * Delegate to md.c, which closes the fork if it had it open. This is
+	 * harmless for permanent relations that have no local storage.
+	 */
+	mdclose(reln, forknum);
+}
+
+/*
+ *	zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation
+ */
+bool
+zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+	const char	persistence = reln->smgr_relpersistence;
+
+	/* Relations with local storage use md.c's prefetch. */
+	if (persistence == RELPERSISTENCE_TEMP ||
+		persistence == RELPERSISTENCE_UNLOGGED)
+		return mdprefetch(reln, forknum, blocknum);
+
+	/*
+	 * Unknown persistence (0) probably shouldn't happen, but it is ignored
+	 * and treated like a permanent relation. Any other value is an error.
+	 */
+	if (persistence != 0 && persistence != RELPERSISTENCE_PERMANENT)
+		elog(ERROR, "unknown relpersistence '%c'", persistence);
+
+	/* not implemented */
+	elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop");
+	return true;
+}
+
+/*
+ * zenith_writeback() -- Tell the kernel to write pages back to storage.
+ *
+ * This accepts a range of blocks because flushing several pages at once is
+ * considerably more efficient than doing so individually.
+ */ +void +zenith_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + switch (reln->smgr_relpersistence) + { + case 0: + /* mdwriteback() does nothing if the file doesn't exist */ + mdwriteback(reln, forknum, blocknum, nblocks); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdwriteback(reln, forknum, blocknum, nblocks); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + /* not implemented */ + elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdwriteback(reln, forknum, blocknum, nblocks); +#endif +} + +/* + * While function is defined in the zenith extension it's used within neon_test_utils directly. + * To avoid breaking tests in the runtime please keep function signature in sync. + */ +void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer) +{ + ZenithResponse *resp; + + { + ZenithGetPageRequest request = { + .req.tag = T_ZenithGetPageRequest, + .req.latest = request_latest, + .req.lsn = request_lsn, + .rnode = rnode, + .forknum = forkNum, + .blkno = blkno + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) + { + case T_ZenithGetPageResponse: + memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ); + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + blkno, + rnode.spcNode, + rnode.dbNode, + rnode.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + + 
pfree(resp); +} + +/* + * zenith_read() -- Read the specified block from a relation. + */ +void +zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer) +{ + bool latest; + XLogRecPtr request_lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrread() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdread(reln, forkNum, blkno, buffer); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + request_lsn = zenith_get_request_lsn(&latest); + zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); + +#ifdef DEBUG_COMPARE_LOCAL + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + { + char pageserver_masked[BLCKSZ]; + char mdbuf[BLCKSZ]; + char mdbuf_masked[BLCKSZ]; + + mdread(reln, forkNum, blkno, mdbuf); + + memcpy(pageserver_masked, buffer, BLCKSZ); + memcpy(mdbuf_masked, mdbuf, BLCKSZ); + + if (PageIsNew(mdbuf)) + { + if (!PageIsNew(pageserver_masked)) + { + elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(buffer)); + } + } + else if (PageIsNew(buffer)) + { + elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf)); + } + else if (PageGetSpecialSize(mdbuf) == 0) + { + /* assume heap */ + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, 
pageserver_masked, BLCKSZ) != 0) + { + elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + { + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + { + /* assume btree */ + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + } + } +#endif +} + +#ifdef DEBUG_COMPARE_LOCAL +static char * +hexdump_page(char *page) +{ + StringInfoData result; + + initStringInfo(&result); + + for (int i = 0; i < BLCKSZ; i++) + { + if (i % 8 == 0) + appendStringInfo(&result, " "); + if (i % 40 == 0) + appendStringInfo(&result, "\n"); + appendStringInfo(&result, "%02x", (unsigned char) (page[i])); + } + + return result.data; +} +#endif + +/* + * zenith_write() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). 
+ */
+void
+zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			 char *buffer, bool skipFsync)
+{
+	const char	persistence = reln->smgr_relpersistence;
+	bool		stored_locally = false;
+	XLogRecPtr	page_lsn;
+
+	if (persistence == RELPERSISTENCE_TEMP ||
+		persistence == RELPERSISTENCE_UNLOGGED)
+	{
+		stored_locally = true;
+	}
+	else if (persistence == 0)
+	{
+		/*
+		 * This is a bit tricky: the persistence is unknown, so check whether
+		 * the relation exists locally. If it does, guess that it's unlogged
+		 * and write it through md.c.
+		 *
+		 * We could set relpersistence now that we have determined that it's
+		 * local. But we don't dare to do it, because that would immediately
+		 * allow reads as well, which shouldn't happen. We could cache it
+		 * with a different 'relpersistence' value, but this isn't
+		 * performance critical.
+		 */
+		stored_locally = mdexists(reln, forknum);
+	}
+	else if (persistence != RELPERSISTENCE_PERMANENT)
+		elog(ERROR, "unknown relpersistence '%c'", persistence);
+
+	if (stored_locally)
+	{
+		mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+		return;
+	}
+
+	/* Not stored locally: pass the page to zenith_wallog_page(). */
+	zenith_wallog_page(reln, forknum, blocknum, buffer);
+
+	page_lsn = PageGetLSN(buffer);
+	elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
+		 forknum, blocknum,
+		 LSN_FORMAT_ARGS(page_lsn));
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+#endif
+}
+
+/*
+ *	zenith_nblocks() -- Get the number of blocks stored in a relation.
+ */
+BlockNumber
+zenith_nblocks(SMgrRelation reln, ForkNumber forknum)
+{
+	ZenithResponse *resp;
+	BlockNumber n_blocks;
+	bool		latest;
+	XLogRecPtr	request_lsn;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			/* stored locally, ask md.c */
+			return mdnblocks(reln, forknum);
+
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	/* Answer from the relsize cache when possible, skipping the round trip. */
+	if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks))
+	{
+		elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
+			 reln->smgr_rnode.node.spcNode,
+			 reln->smgr_rnode.node.dbNode,
+			 reln->smgr_rnode.node.relNode,
+			 forknum, n_blocks);
+		return n_blocks;
+	}
+
+	/* Cache miss: ask the page server for the size as of the request LSN. */
+	request_lsn = zenith_get_request_lsn(&latest);
+	{
+		ZenithNblocksRequest request = {
+			.req.tag = T_ZenithNblocksRequest,
+			.req.latest = latest,
+			.req.lsn = request_lsn,
+			.rnode = reln->smgr_rnode.node,
+			.forknum = forknum,
+		};
+
+		resp = page_server->request((ZenithRequest *) &request);
+	}
+
+	switch (resp->tag)
+	{
+		case T_ZenithNblocksResponse:
+			n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks;
+			break;
+
+		case T_ZenithErrorResponse:
+			ereport(ERROR,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
+							forknum,
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+					 errdetail("page server returned error: %s",
+							   ((ZenithErrorResponse *) resp)->message)));
+			break;
+
+		default:
+			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+	}
+
+	/* Store the answer in the relsize cache. */
+	update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks);
+
+	elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
+		 forknum,
+		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+		 n_blocks);
+
+	pfree(resp);
+	return n_blocks;
+}
+
+/*
+ *	zenith_dbsize() -- Get the size of the database in bytes.
+ *
+ * Sends a ZenithDbSizeRequest for 'dbNode' to the page server at the LSN
+ * chosen by zenith_get_request_lsn() and returns the db_size field of the
+ * response. An error response, or a response with an unexpected tag, is
+ * reported with ERROR.
+ */
+const int64
+zenith_dbsize(Oid dbNode)
+{
+	ZenithResponse *resp;
+	int64		db_size;
+	XLogRecPtr	request_lsn;
+	bool		latest;
+
+	request_lsn = zenith_get_request_lsn(&latest);
+	{
+		ZenithDbSizeRequest request = {
+			.req.tag = T_ZenithDbSizeRequest,
+			.req.latest = latest,
+			.req.lsn = request_lsn,
+			.dbNode = dbNode,
+		};
+
+		resp = page_server->request((ZenithRequest *) &request);
+	}
+
+	switch (resp->tag)
+	{
+		case T_ZenithDbSizeResponse:
+			db_size = ((ZenithDbSizeResponse *) resp)->db_size;
+			break;
+
+		case T_ZenithErrorResponse:
+			ereport(ERROR,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg("could not read db size of db %u from page server at lsn %X/%08X",
+							dbNode,
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+					 errdetail("page server returned error: %s",
+							   ((ZenithErrorResponse *) resp)->message)));
+			break;
+
+		default:
+			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
+	}
+
+	/*
+	 * db_size is an int64; "%ld" only matches it where long is 64 bits wide.
+	 * Cast to long long and print with "%lld" so the format is correct on
+	 * every platform.
+	 */
+	elog(SmgrTrace, "zenith_dbsize: db %u (request LSN %X/%08X): %lld bytes",
+		 dbNode,
+		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+		 (long long) db_size);
+
+	pfree(resp);
+	return db_size;
+}
+
+/*
+ *	zenith_truncate() -- Truncate relation to specified number of blocks.
+ */
+void
+zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
+{
+	const char	persistence = reln->smgr_relpersistence;
+	XLogRecPtr	truncate_lsn;
+
+	/* Temporary and unlogged relations are truncated by md.c. */
+	if (persistence == RELPERSISTENCE_TEMP ||
+		persistence == RELPERSISTENCE_UNLOGGED)
+	{
+		mdtruncate(reln, forknum, nblocks);
+		return;
+	}
+
+	if (persistence == 0)
+		elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
+	if (persistence != RELPERSISTENCE_PERMANENT)
+		elog(ERROR, "unknown relpersistence '%c'", persistence);
+
+	/* The relsize cache must report the new, smaller size from now on. */
+	set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks);
+
+	/*
+	 * Truncating a relation drops all its buffers from the buffer cache
+	 * without calling smgrwrite() on them. The last-written-LSN tracking
+	 * still has to account for it: any future smgrnblocks() request must
+	 * return the new size after the truncation. The LSN of the truncation
+	 * record itself isn't known here, so be conservative and take the most
+	 * recently inserted WAL record's LSN, adjusted by zm_adjust_lsn().
+	 */
+	truncate_lsn = zm_adjust_lsn(GetXLogInsertRecPtr());
+
+	/*
+	 * Flush up to that LSN as well. Nothing here depends on the flush, but
+	 * it upholds the invariant that last-written LSN <= flush LSN.
+	 */
+	XLogFlush(truncate_lsn);
+	SetLastWrittenPageLSN(truncate_lsn);
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdtruncate(reln, forknum, nblocks);
+#endif
+}
+
+/*
+ *	zenith_immedsync() -- Immediately sync a relation to stable storage.
+ *
+ * Note that only writes already issued are synced; this routine knows
+ * nothing of dirty buffers that may exist inside the buffer manager.  We
+ * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
+ * Consider a relation skipping WAL.  Suppose a checkpoint syncs blocks of
+ * some segment, then mdtruncate() renders that segment inactive.  If we
+ * crash before the next checkpoint syncs the newly-inactive segment, that
+ * segment may survive recovery, reintroducing unwanted data into the table.
+ */
+void
+zenith_immedsync(SMgrRelation reln, ForkNumber forknum)
+{
+	const char	persistence = reln->smgr_relpersistence;
+
+	/* Only relations with local storage have anything to sync. */
+	if (persistence == RELPERSISTENCE_TEMP ||
+		persistence == RELPERSISTENCE_UNLOGGED)
+	{
+		mdimmedsync(reln, forknum);
+		return;
+	}
+
+	if (persistence == 0)
+		elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
+	if (persistence != RELPERSISTENCE_PERMANENT)
+		elog(ERROR, "unknown relpersistence '%c'", persistence);
+
+	elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop");
+
+#ifdef DEBUG_COMPARE_LOCAL
+	if (IS_LOCAL_REL(reln))
+		mdimmedsync(reln, forknum);
+#endif
+}
+
+/*
+ * zenith_start_unlogged_build() -- Starting build operation on a rel.
+ *
+ * Some indexes are built in two phases, by first populating the table with
+ * regular inserts, using the shared buffer cache but skipping WAL-logging,
+ * and WAL-logging the whole relation after it's done. Zenith relies on the
+ * WAL to reconstruct pages, so we cannot use the page server in the
+ * first phase when the changes are not logged.
+ */
+static void
+zenith_start_unlogged_build(SMgrRelation reln)
+{
+	const char	persistence = reln->smgr_relpersistence;
+
+	/*
+	 * Only a single unlogged relation build may be in progress at any one
+	 * time. That is sufficient for the current usage.
+	 */
+	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
+		elog(ERROR, "unlogged relation build is already in progress");
+	Assert(unlogged_build_rel == NULL);
+
+	ereport(SmgrTrace,
+			(errmsg("starting unlogged build of relation %u/%u/%u",
+					reln->smgr_rnode.node.spcNode,
+					reln->smgr_rnode.node.dbNode,
+					reln->smgr_rnode.node.relNode)));
+
+	/*
+	 * Temporary and unlogged relations are only recorded as the rel being
+	 * built, in the NOT_PERMANENT phase.
+	 */
+	if (persistence == RELPERSISTENCE_TEMP ||
+		persistence == RELPERSISTENCE_UNLOGGED)
+	{
+		unlogged_build_rel = reln;
+		unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
+		return;
+	}
+
+	if (persistence == 0)
+		elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
+	if (persistence != RELPERSISTENCE_PERMANENT)
+		elog(ERROR, "unknown relpersistence '%c'", persistence);
+
+	/* A permanent relation must still be empty when the build starts. */
+	if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
+		elog(ERROR, "cannot perform unlogged index build, index is not empty ");
+
+	unlogged_build_rel = reln;
+	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
+
+	/* Make the relation look like it's unlogged */
+	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
+
+	/*
+	 * FIXME: should we pass isRedo true to create the tablespace dir if it
+	 * doesn't exist? Is it needed?
+	 */
+	mdcreate(reln, MAIN_FORKNUM, false);
+}
+
+/*
+ * zenith_finish_unlogged_build_phase_1()
+ *
+ * Call this after you have finished populating a relation in unlogged mode,
+ * before you start WAL-logging it.
+ */
+static void
+zenith_finish_unlogged_build_phase_1(SMgrRelation reln)
+{
+	Assert(unlogged_build_rel == reln);
+
+	ereport(SmgrTrace,
+			(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
+					reln->smgr_rnode.node.spcNode,
+					reln->smgr_rnode.node.dbNode,
+					reln->smgr_rnode.node.relNode)));
+
+	if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
+		return;					/* non-permanent rel: no phase tracking */
+
+	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
+	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
+
+	unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
+}
+
+/*
+ * zenith_end_unlogged_build() -- Finish an unlogged rel build.
+ *
+ * Call this after you have finished WAL-logging a relation that was
+ * first populated without WAL-logging.
+ *
+ * This removes the local copy of the rel, since it's now been fully
+ * WAL-logged and is present in the page server.
+ */
+static void
+zenith_end_unlogged_build(SMgrRelation reln)
+{
+	Assert(unlogged_build_rel == reln);
+
+	ereport(SmgrTrace,
+			(errmsg("ending unlogged build of relation %u/%u/%u",
+					reln->smgr_rnode.node.spcNode,
+					reln->smgr_rnode.node.dbNode,
+					reln->smgr_rnode.node.relNode)));
+
+	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
+	{
+		RelFileNodeBackend rnode;
+
+		Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
+		Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
+
+		/* Make the relation look permanent again */
+		reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
+
+		/* Remove local copy */
+		rnode = reln->smgr_rnode;
+		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+		{
+			elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
+				 rnode.node.spcNode,
+				 rnode.node.dbNode,
+				 rnode.node.relNode,
+				 forknum);
+
+			forget_cached_relsize(rnode.node, forknum);
+			mdclose(reln, forknum);
+			/* use isRedo == true, so that we drop it immediately */
+			mdunlink(rnode, forknum, true);
+		}
+	}
+
+	unlogged_build_rel = NULL;
+	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+}
+
+static void
+AtEOXact_zenith(XactEvent event, void *arg)
+{
+	switch (event)
+	{
+		case XACT_EVENT_ABORT:
+		case XACT_EVENT_PARALLEL_ABORT:
+
+			/*
+			 * Forget about any build we might have had in progress. The local
+			 * file will be unlinked by smgrDoPendingDeletes()
+			 */
+			unlogged_build_rel = NULL;
+			unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+			break;
+
+		case XACT_EVENT_COMMIT:
+		case XACT_EVENT_PARALLEL_COMMIT:
+		case XACT_EVENT_PREPARE:
+		case XACT_EVENT_PRE_COMMIT:
+		case XACT_EVENT_PARALLEL_PRE_COMMIT:
+		case XACT_EVENT_PRE_PREPARE:
+			if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
+			{
+				unlogged_build_rel = NULL;	/* reset state before raising the error */
+				unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 (errmsg("unlogged index build was not properly finished"))));
+			}
+			break;
+	}
+}
+
+static const struct f_smgr zenith_smgr =
+{
+	.smgr_init = zenith_init,
+	.smgr_shutdown = NULL,
+	.smgr_open = zenith_open,
+	.smgr_close = zenith_close,
+	.smgr_create = zenith_create,
+	.smgr_exists = zenith_exists,
+	.smgr_unlink = zenith_unlink,
+	.smgr_extend = zenith_extend,
+	.smgr_prefetch = zenith_prefetch,
+	.smgr_read = zenith_read,
+	.smgr_write = zenith_write,
+	.smgr_writeback = zenith_writeback,
+	.smgr_nblocks = zenith_nblocks,
+	.smgr_truncate = zenith_truncate,
+	.smgr_immedsync = zenith_immedsync,
+	/* hooks for the two-phase unlogged relation build */
+	.smgr_start_unlogged_build = zenith_start_unlogged_build,
+	.smgr_finish_unlogged_build_phase_1 = zenith_finish_unlogged_build_phase_1,
+	.smgr_end_unlogged_build = zenith_end_unlogged_build,
+};
+
+/* Pick the smgr implementation for a relation, based on its backend id. */
+const f_smgr *
+smgr_zenith(BackendId backend, RelFileNode rnode)
+{
+
+	/* Don't use page server for temp relations */
+	if (backend != InvalidBackendId)
+		return smgr_standard(backend, rnode);
+	else
+		return &zenith_smgr;
+}
+
+void
+smgr_init_zenith(void)
+{
+	RegisterXactCallback(AtEOXact_zenith, NULL);	/* resets unlogged-build state at xact end */
+
+	smgr_init_standard();
+	zenith_init();
+}
diff --git
a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c
new file mode 100644
index 0000000000..8dfcffe1d1
--- /dev/null
+++ b/pgxn/neon/relsize_cache.c
@@ -0,0 +1,167 @@
+/*-------------------------------------------------------------------------
+ *
+ * relsize_cache.c
+ *	  Relation size cache for better Zenith performance.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  pgxn/neon/relsize_cache.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "pagestore_client.h"
+#include "storage/relfilenode.h"
+#include "storage/smgr.h"
+#include "storage/lwlock.h"
+#include "storage/ipc.h"
+#include "storage/shmem.h"
+#include "catalog/pg_tablespace_d.h"
+#include "utils/dynahash.h"
+#include "utils/guc.h"
+
+/* Hash key of the cache: one fork of one relation. */
+typedef struct
+{
+	RelFileNode rnode;
+	ForkNumber	forknum;
+} RelTag;
+
+typedef struct
+{
+	RelTag		tag;			/* hash key */
+	BlockNumber size;			/* cached size of that fork */
+} RelSizeEntry;
+
+static HTAB *relsize_hash;
+static LWLockId relsize_lock;	/* guards every access to relsize_hash */
+static int	relsize_hash_size;	/* GUC neon.relsize_hash_size; 0 disables the cache */
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+
+/*
+ * Size of a cache entry is 20 bytes. So this default will take about 1.2 MB,
+ * which seems reasonable.
+ */
+#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024)
+
+static void
+zenith_smgr_shmem_startup(void)
+{
+	static HASHCTL info;
+
+	if (prev_shmem_startup_hook)	/* chain to the previously installed hook */
+		prev_shmem_startup_hook();
+
+	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+	relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize");
+	info.keysize = sizeof(RelTag);
+	info.entrysize = sizeof(RelSizeEntry);
+	relsize_hash = ShmemInitHash("neon_relsize",
+								 relsize_hash_size, relsize_hash_size,
+								 &info,
+								 HASH_ELEM | HASH_BLOBS);
+	LWLockRelease(AddinShmemInitLock);
+}
+
+bool
+get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size)
+{
+	bool		found = false;
+
+	if (relsize_hash_size > 0)	/* a size of 0 means the cache is not set up */
+	{
+		RelTag		tag;
+		RelSizeEntry *entry;
+
+		tag.rnode = rnode;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_SHARED);
+		entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
+		if (entry != NULL)
+		{
+			*size = entry->size;	/* *size is written only on a hit */
+			found = true;
+		}
+		LWLockRelease(relsize_lock);
+	}
+	return found;
+}
+
+void
+set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
+{
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+		RelSizeEntry *entry;
+
+		tag.rnode = rnode;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
+		entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL);
+		entry->size = size;		/* unconditionally overwrite any cached value */
+		LWLockRelease(relsize_lock);
+	}
+}
+
+void
+update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
+{
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+		RelSizeEntry *entry;
+		bool		found;
+
+		tag.rnode = rnode;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
+		entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
+		if (!found || entry->size < size)	/* never shrinks an existing entry */
+			entry->size = size;
+		LWLockRelease(relsize_lock);
+	}
+}
+
+void
+forget_cached_relsize(RelFileNode rnode, ForkNumber forknum)
+{
+	if (relsize_hash_size > 0)
+	{
+		RelTag		tag;
+
+		tag.rnode = rnode;
+		tag.forknum = forknum;
+		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
+		hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);	/* result ignored: a missing entry is fine */
+		LWLockRelease(relsize_lock);
+	}
+}
+
+void
+relsize_hash_init(void)
+{
+	DefineCustomIntVariable("neon.relsize_hash_size",
+							"Sets the maximum number of cached relation sizes for neon",
+							NULL,
+							&relsize_hash_size,
+							DEFAULT_RELSIZE_HASH_SIZE,
+							0,
+							INT_MAX,
+							PGC_POSTMASTER,
+							0,
+							NULL, NULL, NULL);
+
+	if (relsize_hash_size > 0)	/* with 0, no shmem is requested and no hook is installed */
+	{
+		RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry)));
+		RequestNamedLWLockTranche("neon_relsize", 1);
+
+		prev_shmem_startup_hook = shmem_startup_hook;
+		shmem_startup_hook = zenith_smgr_shmem_startup;
+	}
+}
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
new file mode 100644
index 0000000000..9625325c0a
--- /dev/null
+++ b/pgxn/neon/walproposer.c
@@ -0,0 +1,2403 @@
+/*-------------------------------------------------------------------------
+ *
+ * walproposer.c
+ *
+ * Proposer/leader part of the total order broadcast protocol between postgres
+ * and WAL safekeepers.
+ *
+ * We have two ways of launching WalProposer:
+ *
+ *   1. As a background worker which will run physical WalSender with
+ *      am_wal_proposer flag set to true. WalSender in turn would handle WAL
+ *      reading part and call WalProposer when ready to scatter WAL.
+ *
+ *   2. As a standalone utility by running `postgres --sync-safekeepers`. That
+ *      is needed to create LSN from which it is safe to start postgres. More
+ *      specifically it addresses following problems:
+ *
+ *      a) Chicken-or-the-egg problem: compute postgres needs data directory
+ *         with non-rel files that are downloaded from pageserver by calling
+ *         basebackup@LSN. This LSN is not arbitrary, it must include all
+ *         previously committed transactions and defined through consensus
+ *         voting, which happens... in walproposer, a part of compute node.
+ *
+ *      b) Just warranting such LSN is not enough, we must also actually commit
+ *         it and make sure there is a safekeeper who knows this LSN is
+ *         committed so WAL before it can be streamed to pageserver -- otherwise
+ *         basebackup will hang waiting for WAL. Advancing commit_lsn without
+ *         playing consensus game is impossible, so speculative 'let's just poll
+ *         safekeepers, learn start LSN of future epoch and run basebackup'
+ *         won't work.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include
+#include
+#include
+#include "access/xlogdefs.h"
+#include "access/xlogutils.h"
+#include "storage/latch.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "access/xlog.h"
+#include "libpq/pqformat.h"
+#include "replication/slot.h"
+#include "replication/walreceiver.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/interrupt.h"
+#include "postmaster/postmaster.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "tcop/tcopprot.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/timestamp.h"
+
+#include "neon.h"
+#include "walproposer.h"
+#include "walproposer_utils.h"
+#include "replication/walpropshim.h"
+
+
+char	   *wal_acceptors_list;	/* GUC neon.safekeepers */
+int			wal_acceptor_reconnect_timeout;	/* GUC neon.safekeeper_reconnect_timeout, in ms */
+int			wal_acceptor_connect_timeout;	/* GUC neon.safekeeper_connect_timeout, in ms */
+bool		am_wal_proposer;
+
+char	   *zenith_timeline_walproposer = NULL;
+char	   *zenith_tenant_walproposer = NULL;
+
+/* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */
+WalProposerFunctionsType *WalProposerFunctions = NULL;
+
+#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
+
+static int	n_safekeepers = 0;
+static int	quorum = 0;
+static Safekeeper safekeeper[MAX_SAFEKEEPERS];
+static XLogRecPtr availableLsn; /* WAL has been generated up to this point */
+static XLogRecPtr lastSentCommitLsn;	/* last commitLsn broadcast to safekeepers */
+static ProposerGreeting greetRequest;
+static VoteRequest voteRequest; /* Vote request for safekeeper */
+static WaitEventSet *waitEvents;
+static AppendResponse quorumFeedback;
+/*
+ * Minimal LSN which may be needed for recovery of some safekeeper,
+ * record-aligned (first record which might not yet received by someone).
+ */
+static XLogRecPtr truncateLsn;
+/*
+ * Term of the proposer. We want our term to be highest and unique,
+ * so we collect terms from safekeepers quorum, choose max and +1.
+ * After that our term is fixed and must not change. If we observe
+ * that some safekeeper has higher term, it means that we have another
+ * running compute, so we must stop immediately.
+ */
+static term_t propTerm;
+static TermHistory propTermHistory; /* term history of the proposer */
+static XLogRecPtr propEpochStartLsn;	/* epoch start lsn of the proposer */
+static term_t donorEpoch;		/* Most advanced acceptor epoch */
+static int	donor;				/* Most advanced acceptor */
+static XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */
+static int	n_votes = 0;
+static int	n_connected = 0;
+static TimestampTz last_reconnect_attempt;
+
+static WalproposerShmemState *walprop_shared;
+
+/* Prototypes for private functions */
+static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId);
+static void WalProposerStartImpl(void);
+static void WalProposerLoop(void);
+static void InitEventSet(void);
+static void UpdateEventSet(Safekeeper *sk, uint32 events);
+static void HackyRemoveWalProposerEvent(Safekeeper *to_remove);
+static void ShutdownConnection(Safekeeper *sk);
+static void ResetConnection(Safekeeper *sk);
+static long TimeToReconnect(TimestampTz now);
+static void ReconnectSafekeepers(void);
+static void AdvancePollState(Safekeeper *sk, uint32 events);
+static void HandleConnectionEvent(Safekeeper *sk);
+static void SendStartWALPush(Safekeeper *sk);
+static void RecvStartWALPushResult(Safekeeper *sk);
+static void SendProposerGreeting(Safekeeper *sk);
+static void RecvAcceptorGreeting(Safekeeper *sk);
+static void SendVoteRequest(Safekeeper *sk);
+static void RecvVoteResponse(Safekeeper *sk);
+static void HandleElectedProposer(void);
+static term_t GetHighestTerm(TermHistory *th);
+static term_t GetEpoch(Safekeeper *sk);
+static void DetermineEpochStartLsn(void);
+static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
+static void SendProposerElected(Safekeeper *sk);
+static void WalProposerStartStreaming(XLogRecPtr startpos);
+static void StartStreaming(Safekeeper *sk);
+static void SendMessageToNode(Safekeeper *sk);
+static void BroadcastAppendRequest(void);
+static void HandleActiveState(Safekeeper *sk, uint32 events);
+static bool SendAppendRequests(Safekeeper *sk);
+static bool RecvAppendResponses(Safekeeper *sk);
+static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs);
+static XLogRecPtr CalculateMinFlushLsn(void);
+static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void);
+static void HandleSafekeeperResponse(void);
+static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size);
+static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg);
+static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state);
+static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state);
+static bool AsyncFlush(Safekeeper *sk);
+
+
+static void nwp_shmem_startup_hook(void);
+static void nwp_register_gucs(void);
+static void nwp_prepare_shmem(void);
+static uint64 backpressure_lag_impl(void);
+
+/* Previously installed shmem startup hook, chained from nwp_shmem_startup_hook() */
+static shmem_startup_hook_type prev_shmem_startup_hook_type;
+
+
+/* Walproposer setup; does nothing unless shared_preload_libraries are being processed. */
+void pg_init_walproposer(void)
+{
+	if (!process_shared_preload_libraries_in_progress)
+		return;
+
+	nwp_register_gucs();
+
+	nwp_prepare_shmem();
+
+	delay_backend_us = &backpressure_lag_impl;
+
+	WalProposerRegister();
+
+	WalProposerInit = &WalProposerInitImpl;
+	WalProposerStart = &WalProposerStartImpl;
+}
+
+static void nwp_register_gucs(void)
+{
+	DefineCustomStringVariable(
+		"neon.safekeepers",
+		"List of Neon WAL acceptors (host:port)",
+		NULL,	/* long_desc */
+		&wal_acceptors_list,	/* valueAddr */
+		"",	/* bootValue */
+		PGC_POSTMASTER,
+		GUC_LIST_INPUT,	/* extensions can't use GUC_LIST_QUOTE */
+		NULL, NULL, NULL
+		);
+
+	DefineCustomIntVariable(
+		"neon.safekeeper_reconnect_timeout",
+		"Timeout for reconnecting to offline wal acceptor.",
+		NULL,
+		&wal_acceptor_reconnect_timeout,
+		1000, 0, INT_MAX,	/* default, min, max */
+		PGC_SIGHUP,	/* context */
+		GUC_UNIT_MS,	/* flags */
+		NULL, NULL, NULL
+		);
+
+	DefineCustomIntVariable(
+		"neon.safekeeper_connect_timeout",
+		"Timeout after which give up connection attempt to safekeeper.",
+		NULL,
+		&wal_acceptor_connect_timeout,
+		5000, 0, INT_MAX,
+		PGC_SIGHUP,
+		GUC_UNIT_MS,
+		NULL, NULL, NULL
+		);
+
+}
+
+/* shmem handling */
+
+static void nwp_prepare_shmem(void)
+{
+	RequestAddinShmemSpace(WalproposerShmemSize());
+
+	prev_shmem_startup_hook_type = shmem_startup_hook;
+	shmem_startup_hook = nwp_shmem_startup_hook;
+}
+
+static void nwp_shmem_startup_hook(void)
+{
+	if (prev_shmem_startup_hook_type)
+		prev_shmem_startup_hook_type();
+
+	WalproposerShmemInit();
+}
+
+/*
+ * WAL proposer bgworker entry point.
+ */
+void
+WalProposerMain(Datum main_arg)
+{
+	/* Establish signal handlers. */
+	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+	pqsignal(SIGHUP, SignalHandlerForConfigReload);
+	pqsignal(SIGTERM, die);
+
+	BackgroundWorkerUnblockSignals();
+
+	GetXLogReplayRecPtr(&ThisTimeLineID);
+
+	WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier());
+
+	last_reconnect_attempt = GetCurrentTimestamp();
+
+	application_name = (char *) "walproposer";	/* for
+												 * synchronous_standby_names */
+	am_wal_proposer = true;
+	am_walsender = true;
+	InitWalSender();
+	InitProcessPhase2();
+
+	/* Create replication slot for WAL proposer if not exists */
+	if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL)
+	{
+		ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false);
+		ReplicationSlotReserveWal();
+		/* Write this slot to disk */
+		ReplicationSlotMarkDirty();
+		ReplicationSlotSave();
+		ReplicationSlotRelease();
+	}
+
+	WalProposerStart();
+}
+
+/*
+ * Create new AppendRequest message and start sending it. This function is
+ * called from walsender every time the new WAL is available.
+ */
+void
+WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos)
+{
+	Assert(startpos == availableLsn && endpos >= availableLsn);
+	availableLsn = endpos;
+	BroadcastAppendRequest();
+}
+
+/*
+ * Advance the WAL proposer state machine, waiting each time for events to occur.
+ * Will exit only when latch is set, i.e. new WAL should be pushed from walsender
+ * to walproposer.
+ */
+void
+WalProposerPoll(void)
+{
+	while (true)
+	{
+		Safekeeper *sk;
+		int			rc;
+		WaitEvent	event;
+		TimestampTz now = GetCurrentTimestamp();
+
+		rc = WaitEventSetWait(waitEvents, TimeToReconnect(now),
+							  &event, 1, WAIT_EVENT_WAL_SENDER_MAIN);
+		sk = (rc != 0) ? (Safekeeper *) event.user_data : NULL;	/* event is not filled in on timeout */
+
+		/*
+		 * If the event contains something that one of our safekeeper states
+		 * was waiting for, we'll advance its state.
+		 */
+		if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)))
+			AdvancePollState(sk, event.events);
+
+		/*
+		 * If the timeout expired, attempt to reconnect to any safekeepers that
+		 * we dropped
+		 */
+		ReconnectSafekeepers();
+
+		/*
+		 * If wait is terminated by latch set (walsenders' latch is set on
+		 * each wal flush), then exit loop. (no need for pm death check due to
+		 * WL_EXIT_ON_PM_DEATH)
+		 */
+		if (rc != 0 && (event.events & WL_LATCH_SET))
+		{
+			ResetLatch(MyLatch);
+			break;
+		}
+		if (rc == 0)			/* timeout expired: poll state */
+		{
+			TimestampTz check_time;	/* sampled after the broadcast below */
+
+			/*
+			 * If no WAL was generated during timeout (and we have already
+			 * collected the quorum), then send poll message
+			 */
+			if (availableLsn != InvalidXLogRecPtr)
+			{
+				BroadcastAppendRequest();
+			}
+
+			/*
+			 * Abandon connection attempts which take too long.
+			 */
+			check_time = GetCurrentTimestamp();
+			for (int i = 0; i < n_safekeepers; i++)
+			{
+				Safekeeper *cand = &safekeeper[i];
+
+				if ((cand->state == SS_CONNECTING_WRITE ||
+					 cand->state == SS_CONNECTING_READ) &&
+					TimestampDifferenceExceeds(cand->startedConnAt, check_time,
+											   wal_acceptor_connect_timeout))
+				{
+					elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms",
+						 cand->host, cand->port, wal_acceptor_connect_timeout);
+					ShutdownConnection(cand);
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Register a background worker proposing WAL to wal acceptors.
+ */
+void
+WalProposerRegister(void)
+{
+	BackgroundWorker bgw;
+
+	if (*wal_acceptors_list == '\0')	/* no safekeepers configured: nothing to register */
+		return;
+
+	memset(&bgw, 0, sizeof(bgw));
+	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
+	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
+	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain");
+	snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer");
+	snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer");
+	bgw.bgw_restart_time = 5;
+	bgw.bgw_notify_pid = 0;
+	bgw.bgw_main_arg = (Datum) 0;
+
+	RegisterBackgroundWorker(&bgw);
+}
+
+static void
+WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
+{
+	char	   *host;
+	char	   *sep;
+	char	   *port;
+
+	/* Load the libpq-specific functions */
+	if (WalProposerFunctions == NULL)
+		elog(ERROR, "libpqwalproposer didn't initialize correctly");
+
+	load_file("libpqwalreceiver", false);
+	if (WalReceiverFunctions == NULL)
+		elog(ERROR, "libpqwalreceiver didn't initialize correctly");
+
+	for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep)
+	{
+		port = strchr(host, ':');
+		if (port == NULL)
+		{
+			elog(FATAL, "port is not specified");
+		}
+		*port++ = '\0';			/* splits wal_acceptors_list in place */
+		sep = strchr(port, ',');
+		if (sep != NULL)
+			*sep++ = '\0';
+		if (n_safekeepers + 1 >= MAX_SAFEKEEPERS)	/* NOTE(review): rejects the MAX_SAFEKEEPERS-th entry -- confirm intended */
+		{
+			elog(FATAL, "Too many safekeepers");
+		}
+		safekeeper[n_safekeepers].host = host;
+		safekeeper[n_safekeepers].port = port;
+		safekeeper[n_safekeepers].state = SS_OFFLINE;
+		safekeeper[n_safekeepers].conn = NULL;
+
+		/*
+		 * Set conninfo to empty. We'll fill it out once later, in
+		 * `ResetConnection` as needed
+		 */
+		safekeeper[n_safekeepers].conninfo[0] = '\0';
+		initStringInfo(&safekeeper[n_safekeepers].outbuf);
+		safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open, .segment_close = wal_segment_close), NULL);
+		if (safekeeper[n_safekeepers].xlogreader == NULL)
+			elog(FATAL, "Failed to allocate xlog reader");
+		safekeeper[n_safekeepers].flushWrite = false;
+		safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
+		safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr;
+		n_safekeepers += 1;
+	}
+	if (n_safekeepers < 1)
+	{
+		elog(FATAL, "Safekeepers addresses are not specified");
+	}
+	quorum = n_safekeepers / 2 + 1;	/* strict majority of the configured safekeepers */
+
+	/* Fill the greeting package */
+	greetRequest.tag = 'g';
+	greetRequest.protocolVersion = SK_PROTOCOL_VERSION;
+	greetRequest.pgVersion = PG_VERSION_NUM;
+	pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId));
+	greetRequest.systemId = systemId;
+	if (!zenith_timeline_walproposer)
+		elog(FATAL, "neon.timeline_id is not provided");
+	if (*zenith_timeline_walproposer != '\0' &&
+		!HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16))
+		elog(FATAL, "Could not parse neon.timeline_id, %s", zenith_timeline_walproposer);
+	if (!zenith_tenant_walproposer)
+		elog(FATAL, "neon.tenant_id is not provided");
+	if (*zenith_tenant_walproposer != '\0' &&
+		!HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16))
+		elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer);
+
+	greetRequest.timeline = ThisTimeLineID;
+	greetRequest.walSegSize = wal_segment_size;
+
+	InitEventSet();
+}
+
+static void
+WalProposerStartImpl(void)
+{
+
+	/* Initiate connections to all safekeeper nodes */
+	for (int i = 0; i < n_safekeepers; i++)
+	{
+		ResetConnection(&safekeeper[i]);
+	}
+
+	WalProposerLoop();
+}
+
+static void
+WalProposerLoop(void)
+{
+	while (true)
+		WalProposerPoll();
+}
+
+/* Initializes the internal event set, provided that it is currently null */
+static void
+InitEventSet(void)
+{
+	if (waitEvents)
+		elog(FATAL, "double-initialization of event set");
+
+	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers);	/* latch + pm death + one per safekeeper */
+	AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
+					  MyLatch, NULL);
+	AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+					  NULL, NULL);
+}
+
+/*
+ * Updates the events we're already waiting on for the safekeeper, setting it to
+ * the provided `events`
+ *
+ * This function is called any time the safekeeper's state switches to one where
+ * it has to wait to continue. This includes the full body of AdvancePollState
+ * and calls to IO helper functions.
+ */
+static void
+UpdateEventSet(Safekeeper *sk, uint32 events)
+{
+	/* eventPos = -1 when we don't have an event */
+	Assert(sk->eventPos != -1);
+
+	ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL);
+}
+
+/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set.
+ *
+ * Note: Internally, this completely reconstructs the event set. It should be avoided if possible.
+ */
+static void
+HackyRemoveWalProposerEvent(Safekeeper *to_remove)
+{
+	/* Remove the existing event set */
+	if (waitEvents)
+	{
+		FreeWaitEventSet(waitEvents);
+		waitEvents = NULL;
+	}
+	/* Re-initialize it without adding any safekeeper events */
+	InitEventSet();
+
+	/*
+	 * loop through the existing safekeepers. If they aren't the one we're
+	 * removing, and if they have a socket we can use, re-add the applicable
+	 * events.
+	 */
+	for (int i = 0; i < n_safekeepers; i++)
+	{
+		uint32		desired_events = WL_NO_EVENTS;
+		Safekeeper *sk = &safekeeper[i];
+
+		sk->eventPos = -1;
+
+		if (sk == to_remove)
+			continue;
+
+		/* If this safekeeper isn't offline, add an event for it! */
+		if (sk->conn != NULL)
+		{
+			desired_events = SafekeeperStateDesiredEvents(sk->state);
+			sk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(sk->conn), NULL, sk);
+		}
+	}
+}
+
+/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */
+static void
+ShutdownConnection(Safekeeper *sk)
+{
+	if (sk->conn)
+		walprop_finish(sk->conn);
+	sk->conn = NULL;
+	sk->state = SS_OFFLINE;
+	sk->flushWrite = false;
+	sk->streamingAt = InvalidXLogRecPtr;
+
+	if (sk->voteResponse.termHistory.entries)
+		pfree(sk->voteResponse.termHistory.entries);
+	sk->voteResponse.termHistory.entries = NULL;
+
+	HackyRemoveWalProposerEvent(sk);
+}
+
+/*
+ * This function is called to establish new connection or to reestablish
+ * connection in case of connection failure.
+ *
+ * On success, sets the state to SS_CONNECTING_WRITE.
+ */
+static void
+ResetConnection(Safekeeper *sk)
+{
+	pgsocket	sock;			/* socket of the new connection */
+
+	if (sk->state != SS_OFFLINE)
+	{
+		ShutdownConnection(sk);
+	}
+
+	/*
+	 * Try to establish new connection
+	 *
+	 * If the connection information hasn't been filled out, we need to do
+	 * that here.
+	 */
+	if (sk->conninfo[0] == '\0')
+	{
+		int			written = 0;
+		written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
+						   "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'",
+						   sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer);
+		/* currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, */
+		/* so be defensive: snprintf returns the untruncated length, so >= MAXCONNINFO means the string was cut */
+		if (written >= MAXCONNINFO || written < 0)
+			elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
+	}
+
+	sk->conn = walprop_connect_start((char *) &sk->conninfo);
+
+	/*
+	 * "If the result is null, then libpq has been unable to allocate a new
+	 * PGconn structure"
+	 */
+	if (!sk->conn)
+		elog(FATAL, "failed to allocate new PGconn object");
+
+	/*
+	 * PQconnectStart won't actually start connecting until we run
+	 * PQconnectPoll. Before we do that though, we need to check that it
+	 * didn't immediately fail.
+	 */
+	if (walprop_status(sk->conn) == WP_CONNECTION_BAD)
+	{
+		/*---
+		 * According to libpq docs:
+		 *   "If the result is CONNECTION_BAD, the connection attempt has already failed,
+		 *    typically because of invalid connection parameters."
+		 * We should report this failure.
+		 *
+		 * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
+		 */
+		elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s",
+			 sk->conninfo, walprop_error_message(sk->conn));
+
+		/*
+		 * Even though the connection failed, we still need to clean up the
+		 * object
+		 */
+		walprop_finish(sk->conn);
+		sk->conn = NULL;
+		return;
+	}
+
+	/*
+	 * The documentation for PQconnectStart states that we should call
+	 * PQconnectPoll in a loop until it returns PGRES_POLLING_OK or
+	 * PGRES_POLLING_FAILED. The other two possible returns indicate whether
+	 * we should wait for reading or writing on the socket. For the first
+	 * iteration of the loop, we're expected to wait until the socket becomes
+	 * writable.
+	 *
+	 * The wording of the documentation is a little ambiguous; thankfully
+	 * there's an example in the postgres source itself showing this behavior.
+	 * (see libpqrcv_connect, defined in
+	 * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c)
+	 */
+	elog(LOG, "connecting with node %s:%s", sk->host, sk->port);
+
+	sk->state = SS_CONNECTING_WRITE;
+	sk->startedConnAt = GetCurrentTimestamp();
+
+	sock = walprop_socket(sk->conn);
+	sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk);
+	return;
+}
+
+/*
+ * How many milliseconds are left until we should attempt reconnection to
+ * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect
+ * (do we actually need this?).
+ */
+static long
+TimeToReconnect(TimestampTz now)
+{
+	TimestampTz passed;
+	TimestampTz till_reconnect;
+
+	if (wal_acceptor_reconnect_timeout <= 0)
+		return -1;
+
+	passed = now - last_reconnect_attempt;
+	till_reconnect = (TimestampTz) wal_acceptor_reconnect_timeout * 1000 - passed;	/* widen first: int ms * 1000 can overflow */
+	if (till_reconnect <= 0)
+		return 0;
+	return (long) (till_reconnect / 1000);
+}
+
+/* If the timeout has expired, attempt to reconnect to all offline safekeepers */
+static void
+ReconnectSafekeepers(void)
+{
+	TimestampTz now = GetCurrentTimestamp();
+
+	if (TimeToReconnect(now) == 0)
+	{
+		last_reconnect_attempt = now;
+		for (int i = 0; i < n_safekeepers; i++)
+		{
+			if (safekeeper[i].state == SS_OFFLINE)
+				ResetConnection(&safekeeper[i]);
+		}
+	}
+}
+
+/*
+ * Performs the logic for advancing the state machine of the specified safekeeper,
+ * given that a certain set of events has occurred.
+ */
+static void
+AdvancePollState(Safekeeper *sk, uint32 events)
+{
+	/*
+	 * Sanity check. We assume further down that the operations don't
+	 * block because the socket is ready.
+	 */
+	AssertEventsOkForState(events, sk);
+
+	/* Execute the code corresponding to the current state */
+	switch (sk->state)
+	{
+			/*
+			 * safekeepers are only taken out of SS_OFFLINE by calls to
+			 * ResetConnection
+			 */
+		case SS_OFFLINE:
+			elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
+				 sk->host, sk->port);
+			break;				/* actually unreachable, but prevents
+								 * -Wimplicit-fallthrough */
+
+			/*
+			 * Both connecting states run the same logic. The only
+			 * difference is the events they're expecting
+			 */
+		case SS_CONNECTING_READ:
+		case SS_CONNECTING_WRITE:
+			HandleConnectionEvent(sk);
+			break;
+
+			/*
+			 * Waiting for a successful CopyBoth response.
+			 */
+		case SS_WAIT_EXEC_RESULT:
+			RecvStartWALPushResult(sk);
+			break;
+
+			/*
+			 * Finish handshake comms: receive information about the safekeeper.
+			 */
+		case SS_HANDSHAKE_RECV:
+			RecvAcceptorGreeting(sk);
+			break;
+
+			/*
+			 * Voting is an idle state - we don't expect any events to trigger.
+			 * Refer to the execution of SS_HANDSHAKE_RECV to see how nodes are
+			 * transferred from SS_VOTING to sending actual vote requests.
+			 */
+		case SS_VOTING:
+			elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
+				 sk->port, FormatSafekeeperState(sk->state));
+			ResetConnection(sk);
+			return;
+
+			/* Read the safekeeper response for our candidate */
+		case SS_WAIT_VERDICT:
+			RecvVoteResponse(sk);
+			break;
+
+			/* Flush proposer announcement message */
+		case SS_SEND_ELECTED_FLUSH:
+
+			/*
+			 * AsyncFlush ensures we only move on to SS_ACTIVE once the flush
+			 * completes. If we still have more to do, we'll wait until the next
+			 * poll comes along.
+			 */
+			if (!AsyncFlush(sk))
+				return;
+
+			/* flush is done, event set and state will be updated later */
+			StartStreaming(sk);
+			break;
+
+			/*
+			 * Idle state for waiting votes from quorum.
+			 */
+		case SS_IDLE:
+			elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
+				 sk->port, FormatSafekeeperState(sk->state));
+			ResetConnection(sk);
+			return;
+
+			/*
+			 * Active state is used for streaming WAL and receiving feedback.
+			 */
+		case SS_ACTIVE:
+			HandleActiveState(sk, events);
+			break;
+	}
+}
+
+static void
+HandleConnectionEvent(Safekeeper *sk)
+{
+	WalProposerConnectPollStatusType result = walprop_connect_poll(sk->conn);
+
+	/* The new set of events we'll wait on, after updating */
+	uint32		new_events = WL_NO_EVENTS;
+
+	switch (result)
+	{
+		case WP_CONN_POLLING_OK:
+			elog(LOG, "connected with node %s:%s", sk->host,
+				 sk->port);
+
+			/*
+			 * We have to pick some event to update event set.
+			 * We'll eventually need the socket to be readable,
+			 * so we go with that.
+			 */
+			new_events = WL_SOCKET_READABLE;
+			break;
+
+			/*
+			 * If we need to poll to finish connecting,
+			 * continue doing that
+			 */
+		case WP_CONN_POLLING_READING:
+			sk->state = SS_CONNECTING_READ;
+			new_events = WL_SOCKET_READABLE;
+			break;
+		case WP_CONN_POLLING_WRITING:
+			sk->state = SS_CONNECTING_WRITE;
+			new_events = WL_SOCKET_WRITEABLE;
+			break;
+
+		case WP_CONN_POLLING_FAILED:
+			elog(WARNING, "failed to connect to node '%s:%s': %s",
+				 sk->host, sk->port, walprop_error_message(sk->conn));
+
+			/*
+			 * If connecting failed, we don't want to restart
+			 * the connection because that might run us into a
+			 * loop. Instead, shut it down -- it'll naturally
+			 * restart at a slower interval on calls to
+			 * ReconnectSafekeepers.
+			 */
+			ShutdownConnection(sk);
+			return;
+	}
+
+	/*
+	 * Because PQconnectPoll can change the socket, we have to
+	 * un-register the old event and re-register an event on
+	 * the new socket.
+	 */
+	HackyRemoveWalProposerEvent(sk);
+	sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk);
+
+	/* If we successfully connected, send START_WAL_PUSH query */
+	if (result == WP_CONN_POLLING_OK)
+		SendStartWALPush(sk);
+}
+
+/*
+ * Send "START_WAL_PUSH" message as an empty query to the safekeeper. Performs
+ * a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. If something
+ * goes wrong, change state to SS_OFFLINE and shutdown the connection.
+ */
+static void
+SendStartWALPush(Safekeeper *sk)
+{
+	if (!walprop_send_query(sk->conn, "START_WAL_PUSH"))
+	{
+		elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
+			 sk->host, sk->port, walprop_error_message(sk->conn));
+		ShutdownConnection(sk);
+		return;
+	}
+	sk->state = SS_WAIT_EXEC_RESULT;
+	UpdateEventSet(sk, WL_SOCKET_READABLE);
+}
+
+static void
+RecvStartWALPushResult(Safekeeper *sk)
+{
+	switch (walprop_get_query_result(sk->conn))
+	{
+			/*
+			 * Successful result, move on to starting the
+			 * handshake
+			 */
+		case WP_EXEC_SUCCESS_COPYBOTH:
+
+			SendProposerGreeting(sk);
+			break;
+
+			/*
+			 * Needs repeated calls to finish. Wait until the
+			 * socket is readable
+			 */
+		case WP_EXEC_NEEDS_INPUT:
+
+			/*
+			 * SS_WAIT_EXEC_RESULT is always reached through an
+			 * event, so we don't need to update the event set
+			 */
+			break;
+
+		case WP_EXEC_FAILED:
+			elog(WARNING, "Failed to send query to safekeeper %s:%s: %s",
+				 sk->host, sk->port, walprop_error_message(sk->conn));
+			ShutdownConnection(sk);
+			return;
+
+			/*
+			 * Unexpected result -- fundamentally an error, but we
+			 * want to produce a custom message, rather than a
+			 * generic "something went wrong"
+			 */
+		case WP_EXEC_UNEXPECTED_SUCCESS:
+			elog(WARNING, "Received bad response from safekeeper %s:%s query execution",
+				 sk->host, sk->port);
+			ShutdownConnection(sk);
+			return;
+	}
+}
+
+/*
+ * Start handshake: first of all send information about the
+ * safekeeper.
After sending, we wait on SS_HANDSHAKE_RECV for + * a response to finish the handshake. + */ +static void +SendProposerGreeting(Safekeeper *sk) +{ + /* + * On failure, logging & resetting the connection is handled. + * We just need to handle the control flow. + */ + BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV); +} + +static void +RecvAcceptorGreeting(Safekeeper *sk) +{ + /* + * If our reading doesn't immediately succeed, any necessary + * error handling or state setting is taken care of. We can + * leave any other work until later. + */ + sk->greetResponse.apm.tag = 'g'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) + return; + + /* Protocol is all good, move to voting. */ + sk->state = SS_VOTING; + + ++n_connected; + if (n_connected <= quorum) + { + /* We're still collecting terms from the majority. */ + propTerm = Max(sk->greetResponse.term, propTerm); + + /* Quorum is acquried, prepare the vote request. */ + if (n_connected == quorum) + { + propTerm++; + elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm); + + voteRequest = (VoteRequest) + { + .tag = 'v', + .term = propTerm + }; + memcpy(voteRequest.proposerId.data, greetRequest.proposerId.data, UUID_LEN); + } + } + else if (sk->greetResponse.term > propTerm) + { + /* Another compute with higher term is running. */ + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->greetResponse.term, propTerm); + } + + /* + * Check if we have quorum. If there aren't enough safekeepers, + * wait and do nothing. We'll eventually get a task when the + * election starts. + * + * If we do have quorum, we can start an election. + */ + if (n_connected < quorum) + { + /* + * SS_VOTING is an idle state; read-ready indicates the + * connection closed. 
+ */ + UpdateEventSet(sk, WL_SOCKET_READABLE); + } + else + { + /* + * Now send voting request to the cohort and wait + * responses + */ + for (int j = 0; j < n_safekeepers; j++) + { + /* + * Remember: SS_VOTING indicates that the safekeeper is + * participating in voting, but hasn't sent anything + * yet. + */ + if (safekeeper[j].state == SS_VOTING) + SendVoteRequest(&safekeeper[j]); + } + } +} + +static void +SendVoteRequest(Safekeeper *sk) +{ + /* We have quorum for voting, send our vote request */ + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term); + /* On failure, logging & resetting is handled */ + if (!BlockingWrite(sk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + return; + + /* If successful, wait for read-ready with SS_WAIT_VERDICT */ +} + +static void +RecvVoteResponse(Safekeeper *sk) +{ + sk->voteResponse.apm.tag = 'v'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) + return; + + elog(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + + /* + * In case of acceptor rejecting our vote, bail out, but only + * if either it already lives in strictly higher term + * (concurrent compute spotted) or we are not elected yet and + * thus need the vote. 
+ */ + if ((!sk->voteResponse.voteGiven) && + (sk->voteResponse.term > propTerm || n_votes < quorum)) + { + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->voteResponse.term, propTerm); + } + Assert(sk->voteResponse.term == propTerm); + + /* Handshake completed, do we have quorum? */ + n_votes++; + if (n_votes < quorum) + { + sk->state = SS_IDLE; /* can't do much yet, no quorum */ + } + else if (n_votes > quorum) + { + /* recovery already performed, just start streaming */ + SendProposerElected(sk); + } + else + { + sk->state = SS_IDLE; + UpdateEventSet(sk, WL_SOCKET_READABLE); /* Idle states wait for + * read-ready */ + + HandleElectedProposer(); + } +} + +/* + * Called once a majority of acceptors have voted for us and current proposer + * has been elected. + * + * Sends ProposerElected message to all acceptors in SS_IDLE state and starts + * replication from walsender. + */ +static void +HandleElectedProposer(void) +{ + DetermineEpochStartLsn(); + + /* + * Check if not all safekeepers are up-to-date, we need to + * download WAL needed to synchronize them + */ + if (truncateLsn < propEpochStartLsn) + { + elog(LOG, + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); + /* Perform recovery */ + if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn)) + elog(FATAL, "Failed to recover state"); + } + else if (syncSafekeepers) + { + /* Sync is not needed: just exit */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].state == SS_IDLE) + SendProposerElected(&safekeeper[i]); + } + + /* + * The proposer has been elected, and there will be no quorum waiting + * after this point. 
There will be no safekeeper with state SS_IDLE + * also, because that state is used only for quorum waiting. + */ + + if (syncSafekeepers) + { + /* + * Send empty message to enforce receiving feedback + * even from nodes who are fully recovered; this is + * required to learn they switched epoch which finishes + * sync-safeekepers who doesn't generate any real new + * records. Will go away once we switch to async acks. + */ + BroadcastAppendRequest(); + + /* keep polling until all safekeepers are synced */ + return; + } + + WalProposerStartStreaming(propEpochStartLsn); + /* Should not return here */ +} + +/* latest term in TermHistory, or 0 is there is no entries */ +static term_t +GetHighestTerm(TermHistory *th) +{ + return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; +} + +/* safekeeper's epoch is the term of the highest entry in the log */ +static term_t +GetEpoch(Safekeeper *sk) +{ + return GetHighestTerm(&sk->voteResponse.termHistory); +} + +/* If LSN points to the page header, skip it */ +static XLogRecPtr +SkipXLogPageHeader(XLogRecPtr lsn) +{ + if (XLogSegmentOffset(lsn, wal_segment_size) == 0) + { + lsn += SizeOfXLogLongPHD; + } + else if (lsn % XLOG_BLCKSZ == 0) + { + lsn += SizeOfXLogShortPHD; + } + return lsn; +} + +/* + * Called after majority of acceptors gave votes, it calculates the most + * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since + * which we'll write WAL in our term. + * + * Sets truncateLsn along the way (though it is not of much use at this point -- + * only for skipping recovery). 
+ */ +static void +DetermineEpochStartLsn(void) +{ + TermHistory *dth; + + propEpochStartLsn = InvalidXLogRecPtr; + donorEpoch = 0; + truncateLsn = InvalidXLogRecPtr; + timelineStartLsn = InvalidXLogRecPtr; + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].state == SS_IDLE) + { + if (GetEpoch(&safekeeper[i]) > donorEpoch || + (GetEpoch(&safekeeper[i]) == donorEpoch && + safekeeper[i].voteResponse.flushLsn > propEpochStartLsn)) + { + donorEpoch = GetEpoch(&safekeeper[i]); + propEpochStartLsn = safekeeper[i].voteResponse.flushLsn; + donor = i; + } + truncateLsn = Max(safekeeper[i].voteResponse.truncateLsn, truncateLsn); + + if (safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) + { + /* timelineStartLsn should be the same everywhere or unknown */ + if (timelineStartLsn != InvalidXLogRecPtr && + timelineStartLsn != safekeeper[i].voteResponse.timelineStartLsn) + { + elog(WARNING, + "inconsistent timelineStartLsn: current %X/%X, received %X/%X", + LSN_FORMAT_ARGS(timelineStartLsn), + LSN_FORMAT_ARGS(safekeeper[i].voteResponse.timelineStartLsn)); + } + timelineStartLsn = safekeeper[i].voteResponse.timelineStartLsn; + } + } + } + + /* + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was + * committed yet. Start streaming then from the basebackup LSN. + */ + if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) + { + propEpochStartLsn = truncateLsn = GetRedoStartLsn(); + if (timelineStartLsn == InvalidXLogRecPtr) + { + timelineStartLsn = GetRedoStartLsn(); + } + elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); + } + + /* + * If propEpochStartLsn is not 0, at least one msg with WAL was sent to + * some connected safekeeper; it must have carried truncateLsn pointing to + * the first record. 
+ */ + Assert((truncateLsn != InvalidXLogRecPtr) || + (syncSafekeepers && truncateLsn == propEpochStartLsn)); + + /* + * We will be generating WAL since propEpochStartLsn, so we should set + * availableLsn to mark this LSN as the latest available position. + */ + availableLsn = propEpochStartLsn; + + /* + * Proposer's term history is the donor's + its own entry. + */ + dth = &safekeeper[donor].voteResponse.termHistory; + propTermHistory.n_entries = dth->n_entries + 1; + propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); + memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); + propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm; + propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn; + + elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", + quorum, + propTerm, + LSN_FORMAT_ARGS(propEpochStartLsn), + safekeeper[donor].host, safekeeper[donor].port, + LSN_FORMAT_ARGS(truncateLsn) + ); + + /* + * Ensure the basebackup we are running (at RedoStartLsn) matches LSN since + * which we are going to write according to the consensus. If not, we must + * bail out, as clog and other non rel data is inconsistent. + */ + if (!syncSafekeepers) + { + /* + * Basebackup LSN always points to the beginning of the record (not the + * page), as StartupXLOG most probably wants it this way. Safekeepers + * don't skip header as they need continious stream of data, so + * correct LSN for comparison. + */ + if (SkipXLogPageHeader(propEpochStartLsn) != GetRedoStartLsn()) + { + /* + * However, allow to proceed if previously elected leader was me; plain + * restart of walproposer not intervened by concurrent compute (who could + * generate WAL) is ok. 
+ */ + if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == + walprop_shared->mineLastElectedTerm))) + { + elog(PANIC, + "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", + LSN_FORMAT_ARGS(propEpochStartLsn), + LSN_FORMAT_ARGS(GetRedoStartLsn())); + } + } + walprop_shared->mineLastElectedTerm = propTerm; + } +} + +/* + * Receive WAL from most advanced safekeeper + */ +static bool +WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +{ + char conninfo[MAXCONNINFO]; + char *err; + WalReceiverConn *wrconn; + WalRcvStreamOptions options; + + sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + safekeeper[donor].host, safekeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); + wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); + if (!wrconn) + { + ereport(WARNING, + (errmsg("could not connect to WAL acceptor %s:%s: %s", + safekeeper[donor].host, safekeeper[donor].port, + err))); + return false; + } + elog(LOG, + "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + safekeeper[donor].host, safekeeper[donor].port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); + + options.logical = false; + options.startpoint = startpos; + options.slotname = NULL; + options.proto.physical.startpointTLI = timeline; + + if (walrcv_startstreaming(wrconn, &options)) + { + XLogRecPtr rec_start_lsn; + XLogRecPtr rec_end_lsn = 0; + int len; + char *buf; + pgsocket wait_fd = PGINVALID_SOCKET; + + while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) + { + if (len == 0) + { + (void) WaitLatchOrSocket( + MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, + -1, WAIT_EVENT_WAL_RECEIVER_MAIN); + } + else + { + Assert(buf[0] == 'w' || buf[0] == 'k'); + if (buf[0] == 'k') + continue; /* keepalive */ + memcpy(&rec_start_lsn, 
&buf[XLOG_HDR_START_POS], + sizeof rec_start_lsn); + rec_start_lsn = pg_ntoh64(rec_start_lsn); + rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; + + /* write WAL to disk */ + XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); + + ereport(DEBUG1, + (errmsg("Recover message %X/%X length %d", + LSN_FORMAT_ARGS(rec_start_lsn), len))); + if (rec_end_lsn >= endpos) + break; + } + } + ereport(LOG, + (errmsg("end of replication stream at %X/%X: %m", + LSN_FORMAT_ARGS(rec_end_lsn)))); + walrcv_disconnect(wrconn); + + /* failed to receive all WAL till endpos */ + if (rec_end_lsn < endpos) + return false; + } + else + { + ereport(LOG, + (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", + timeline, (uint32) (startpos >> 32), (uint32) startpos))); + return false; + } + + return true; +} + +/* + * Determine for sk the starting streaming point and send it message + * 1) Announcing we are elected proposer (which immediately advances epoch if + * safekeeper is synced, being important for sync-safekeepers) + * 2) Communicating starting streaming point -- safekeeper must truncate its WAL + * beyond it -- and history of term switching. + * + * Sets sk->startStreamingAt. + */ +static void +SendProposerElected(Safekeeper *sk) +{ + ProposerElected msg; + TermHistory *th; + term_t lastCommonTerm; + int i; + + /* + * Determine start LSN by comparing safekeeper's log term switch history and + * proposer's, searching for the divergence point. + * + * Note: there is a vanishingly small chance of no common point even if + * there is some WAL on safekeeper, if immediately after bootstrap compute + * wrote some WAL on single sk and died; we stream since the beginning then. + */ + th = &sk->voteResponse.termHistory; + /* + * If any WAL is present on the sk, it must be authorized by some term. + * OTOH, without any WAL there are no term swiches in the log. 
+ */ + Assert((th->n_entries == 0) == + (sk->voteResponse.flushLsn == InvalidXLogRecPtr)); + /* We must start somewhere. */ + Assert(propTermHistory.n_entries >= 1); + + for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++) + { + if (propTermHistory.entries[i].term != th->entries[i].term) + break; + /* term must begin everywhere at the same point */ + Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); + } + i--; /* step back to the last common term */ + if (i < 0) + { + /* safekeeper is empty or no common point, start from the beginning */ + sk->startStreamingAt = propTermHistory.entries[0].lsn; + + if (sk->startStreamingAt < truncateLsn) + { + /* + * There's a gap between the WAL starting point and a truncateLsn, + * which can't appear in a normal working cluster. That gap means + * that all safekeepers reported that they have persisted WAL up + * to the truncateLsn before, but now current safekeeper tells + * otherwise. + * + * Also we have a special condition here, which is empty safekeeper + * with no history. In combination with a gap, that can happen when + * we introduce a new safekeeper to the cluster. This is a rare case, + * which is triggered manually for now, and should be treated with + * care. + */ + + /* + * truncateLsn will not change without ack from current safekeeper, + * and it's aligned to the WAL record, so we can safely start + * streaming from this point. + */ + sk->startStreamingAt = truncateLsn; + + elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", + sk->host, sk->port, LSN_FORMAT_ARGS(propTermHistory.entries[0].lsn), + LSN_FORMAT_ARGS(sk->startStreamingAt)); + } + } + else + { + /* + * End of (common) term is the start of the next except it is the last + * one; there it is flush_lsn in case of safekeeper or, in case of + * proposer, LSN it is currently writing, but then we just pick + * safekeeper pos as it obviously can't be higher. 
+ */ + if (propTermHistory.entries[i].term == propTerm) + { + sk->startStreamingAt = sk->voteResponse.flushLsn; + } + else + { + XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : + sk->voteResponse.flushLsn); + sk->startStreamingAt = Min(propEndLsn, skEndLsn); + } + } + + Assert(sk->startStreamingAt >= truncateLsn && sk->startStreamingAt <= availableLsn); + + msg.tag = 'e'; + msg.term = propTerm; + msg.startStreamingAt = sk->startStreamingAt; + msg.termHistory = &propTermHistory; + msg.timelineStartLsn = timelineStartLsn; + + lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; + elog(LOG, + "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", + sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); + + resetStringInfo(&sk->outbuf); + pq_sendint64_le(&sk->outbuf, msg.tag); + pq_sendint64_le(&sk->outbuf, msg.term); + pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); + pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); + for (int i = 0; i < msg.termHistory->n_entries; i++) + { + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); + } + pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); + + if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) + return; + + StartStreaming(sk); +} + +/* + * Start walsender streaming replication + */ +static void +WalProposerStartStreaming(XLogRecPtr startpos) +{ + StartReplicationCmd cmd; + + elog(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); + cmd.slotname = WAL_PROPOSER_SLOT_NAME; + cmd.timeline = greetRequest.timeline; + cmd.startpoint = 
startpos; + StartProposerReplication(&cmd); +} + +/* + * Start streaming to safekeeper sk, always updates state to SS_ACTIVE and sets + * correct event set. + */ +static void +StartStreaming(Safekeeper *sk) +{ + /* + * This is the only entrypoint to state SS_ACTIVE. It's executed + * exactly once for a connection. + */ + sk->state = SS_ACTIVE; + sk->streamingAt = sk->startStreamingAt; + + /* event set will be updated inside SendMessageToNode */ + SendMessageToNode(sk); +} + +/* + * Try to send message to the particular node. Always updates event set. Will + * send at least one message, if socket is ready. + * + * Can be used only for safekeepers in SS_ACTIVE state. State can be changed + * in case of errors. + */ +static void +SendMessageToNode(Safekeeper *sk) +{ + Assert(sk->state == SS_ACTIVE); + + /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ + HandleActiveState(sk, WL_SOCKET_WRITEABLE); +} + +/* + * Broadcast new message to all caught-up safekeepers + */ +static void +BroadcastAppendRequest() +{ + for (int i = 0; i < n_safekeepers; i++) + if (safekeeper[i].state == SS_ACTIVE) + SendMessageToNode(&safekeeper[i]); +} + +static void +PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) +{ + Assert(endLsn >= beginLsn); + req->tag = 'a'; + req->term = propTerm; + req->epochStartLsn = propEpochStartLsn; + req->beginLsn = beginLsn; + req->endLsn = endLsn; + req->commitLsn = GetAcknowledgedByQuorumWALPosition(); + req->truncateLsn = truncateLsn; + req->proposerId = greetRequest.proposerId; +} + +/* + * Process all events happened in SS_ACTIVE state, update event set after that. 
+ */ +static void +HandleActiveState(Safekeeper *sk, uint32 events) +{ + uint32 newEvents = WL_SOCKET_READABLE; + + if (events & WL_SOCKET_WRITEABLE) + if (!SendAppendRequests(sk)) + return; + + if (events & WL_SOCKET_READABLE) + if (!RecvAppendResponses(sk)) + return; + + /* + * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data + * in the buffer. + * + * LSN comparison checks if we have pending unsent messages. This check isn't + * necessary now, because we always send append messages immediately after + * arrival. But it's good to have it here in case we change this behavior + * in the future. + */ + if (sk->streamingAt != availableLsn || sk->flushWrite) + newEvents |= WL_SOCKET_WRITEABLE; + + UpdateEventSet(sk, newEvents); +} + +/* + * Send WAL messages starting from sk->streamingAt until the end or non-writable + * socket, whichever comes first. Caller should take care of updating event set. + * Even if no unsent WAL is available, at least one empty message will be sent + * as a heartbeat, if socket is ready. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + */ +static bool +SendAppendRequests(Safekeeper *sk) +{ + XLogRecPtr endLsn; + AppendRequestHeader *req; + PGAsyncWriteResult writeResult; + WALReadError errinfo; + bool sentAnything = false; + + if (sk->flushWrite) + { + if (!AsyncFlush(sk)) + /* + * AsyncFlush failed, that could happen if the socket is closed or + * we have nothing to write and should wait for writeable socket. 
+ */ + return sk->state == SS_ACTIVE; + + /* Event set will be updated in the end of HandleActiveState */ + sk->flushWrite = false; + } + + while (sk->streamingAt != availableLsn || !sentAnything) + { + sentAnything = true; + + endLsn = sk->streamingAt; + endLsn += MAX_SEND_SIZE; + + /* if we went beyond available WAL, back off */ + if (endLsn > availableLsn) { + endLsn = availableLsn; + } + + req = &sk->appendRequest; + PrepareAppendRequest(&sk->appendRequest, sk->streamingAt, endLsn); + + ereport(DEBUG2, + (errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port))); + + resetStringInfo(&sk->outbuf); + + /* write AppendRequest header */ + appendBinaryStringInfo(&sk->outbuf, (char*) req, sizeof(AppendRequestHeader)); + + /* write the WAL itself */ + enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); + if (!WALRead(sk->xlogreader, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, + ThisTimeLineID, + &errinfo)) + { + WALReadRaiseError(&errinfo); + } + sk->outbuf.len += req->endLsn - req->beginLsn; + + writeResult = walprop_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len); + + /* Mark current message as sent, whatever the result is */ + sk->streamingAt = endLsn; + + switch (writeResult) + { + case PG_ASYNC_WRITE_SUCCESS: + /* Continue writing the next message */ + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + /* + * We still need to call PQflush some more to finish the job. + * Caller function will handle this by setting right event set. 
+ */ + sk->flushWrite = true; + return true; + + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } + } + + return true; +} + +/* + * Receive and process all available feedback. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + * + * NB: This function can call SendMessageToNode and produce new messages. + */ +static bool +RecvAppendResponses(Safekeeper *sk) +{ + XLogRecPtr minQuorumLsn; + bool readAnything = false; + + while (true) + { + /* + * If our reading doesn't immediately succeed, any + * necessary error handling or state setting is taken care + * of. We can leave any other work until later. + */ + sk->appendResponse.apm.tag = 'a'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) + break; + + ereport(DEBUG2, + (errmsg("received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", + sk->appendResponse.term, + LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), + LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), + sk->host, sk->port))); + + if (sk->appendResponse.term > propTerm) + { + /* Another compute with higher term is running. */ + elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + sk->host, sk->port, + sk->appendResponse.term, propTerm); + } + + readAnything = true; + } + + if (!readAnything) + return sk->state == SS_ACTIVE; + + HandleSafekeeperResponse(); + + /* + * Also send the new commit lsn to all the safekeepers. 
+ */ + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + if (minQuorumLsn > lastSentCommitLsn) + { + BroadcastAppendRequest(); + lastSentCommitLsn = minQuorumLsn; + } + + return sk->state == SS_ACTIVE; +} + +/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ +void +ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *rf) +{ + uint8 nkeys; + int i; + int32 len; + + /* get number of custom keys */ + nkeys = pq_getmsgbyte(reply_message); + + for (i = 0; i < nkeys; i++) + { + const char *key = pq_getmsgstring(reply_message); + if (strcmp(key, "current_timeline_size") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->currentClusterSize = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); + } + else if (strcmp(key, "ps_writelsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_writelsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_writelsn)); + } + else if (strcmp(key, "ps_flushlsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_flushlsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_flushlsn)); + } + else if (strcmp(key, "ps_applylsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_applylsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_applylsn)); + } + else if (strcmp(key, "ps_replytime") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_replytime = pq_getmsgint64(reply_message); + { + char *replyTimeStr; + + /* Copy because timestamptz_to_str returns a static buffer */ + replyTimeStr = 
pstrdup(timestamptz_to_str(rf->ps_replytime)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", + rf->ps_replytime, replyTimeStr); + + pfree(replyTimeStr); + } + } + else + { + len = pq_getmsgint(reply_message, sizeof(int32)); // read value length + // Skip unknown keys to support backward compatibile protocol changes + elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + pq_getmsgbytes(reply_message, len); + }; + } +} + +/* + * Combine hot standby feedbacks from all safekeepers. + */ +static void +CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) +{ + hs->ts = 0; + hs->xmin.value = ~0; /* largest unsigned value */ + hs->catalog_xmin.value = ~0; /* largest unsigned value */ + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.hs.ts != 0) + { + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin)) + { + hs->xmin = safekeeper[i].appendResponse.hs.xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; + } + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin)) + { + hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; + } + } + } +} + + +/* + * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the + * last WAL record that can be safely discarded. + */ +static XLogRecPtr +CalculateMinFlushLsn(void) +{ + XLogRecPtr lsn = n_safekeepers > 0 + ? 
safekeeper[0].appendResponse.flushLsn + : InvalidXLogRecPtr; + for (int i = 1; i < n_safekeepers; i++) + { + lsn = Min(lsn, safekeeper[i].appendResponse.flushLsn); + } + return lsn; +} + +/* + * Calculate WAL position acknowledged by quorum + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(void) +{ + XLogRecPtr responses[MAX_SAFEKEEPERS]; + + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < n_safekeepers; i++) + { + /* + * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to epochStartLsn. + */ + responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? + safekeeper[i].appendResponse.flushLsn : 0; + } + qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + + /* + * Get the smallest LSN committed by quorum + */ + return responses[n_safekeepers - quorum]; +} + +/* + * WalproposerShmemSize --- report amount of shared memory space needed + */ +Size +WalproposerShmemSize(void) +{ + return sizeof(WalproposerShmemState); +} + +bool +WalproposerShmemInit(void) +{ + bool found; + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + walprop_shared = ShmemInitStruct("Walproposer shared state", + sizeof(WalproposerShmemState), + &found); + + if (!found) + { + memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + } + LWLockRelease(AddinShmemInitLock); + + return found; +} + +void +replication_feedback_set(ReplicationFeedback *rf) +{ + SpinLockAcquire(&walprop_shared->mutex); + memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); + SpinLockRelease(&walprop_shared->mutex); +} + + +void +replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) +{ + SpinLockAcquire(&walprop_shared->mutex); + *writeLsn = walprop_shared->feedback.ps_writelsn; + *flushLsn = walprop_shared->feedback.ps_flushlsn; + *applyLsn = walprop_shared->feedback.ps_applylsn; + 
SpinLockRelease(&walprop_shared->mutex); +} + + +/* + * Get ReplicationFeedback fields from the most advanced safekeeper + */ +static void +GetLatestZentihFeedback(ReplicationFeedback *rf) +{ + int latest_safekeeper = 0; + XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) + { + latest_safekeeper = i; + ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn; + } + } + + rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; + rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn; + rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn; + rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; + rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; + + elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," + " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->ps_writelsn), + LSN_FORMAT_ARGS(rf->ps_flushlsn), + LSN_FORMAT_ARGS(rf->ps_applylsn), + rf->ps_replytime); + + replication_feedback_set(rf); +} + +static void +HandleSafekeeperResponse(void) +{ + HotStandbyFeedback hsFeedback; + XLogRecPtr minQuorumLsn; + XLogRecPtr diskConsistentLsn; + XLogRecPtr minFlushLsn; + + + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; + + if (!syncSafekeepers) + { + // Get ReplicationFeedback fields from the most advanced safekeeper + GetLatestZentihFeedback(&quorumFeedback.rf); + SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); + } + + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn) + { + + if (minQuorumLsn > quorumFeedback.flushLsn) + quorumFeedback.flushLsn = minQuorumLsn; + + /* advance the replication slot */ + if (!syncSafekeepers) + 
ProcessStandbyReply( + // write_lsn - This is what durably stored in WAL service. + quorumFeedback.flushLsn, + //flush_lsn - This is what durably stored in WAL service. + quorumFeedback.flushLsn, + //apply_lsn - This is what processed and durably saved at pageserver. + quorumFeedback.rf.ps_flushlsn, + GetCurrentTimestamp(), false); + } + + CombineHotStanbyFeedbacks(&hsFeedback); + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) + { + quorumFeedback.hs = hsFeedback; + if (!syncSafekeepers) + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + } + + /* + * Try to advance truncateLsn to minFlushLsn, which is the last record + * flushed to all safekeepers. We must always start streaming from the + * beginning of the record, which simplifies decoding on the far end. + * + * Advanced truncateLsn should be not further than nearest commitLsn. + * This prevents surprising violation of truncateLsn <= commitLsn + * invariant which might occur because 1) truncateLsn can be advanced + * immediately once chunk is broadcast to all safekeepers, and + * commitLsn generally can't be advanced based on feedback from + * safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft); 2) chunks we + * read from WAL and send are plain sheets of bytes, but safekeepers + * ack only on record boundaries. + */ + minFlushLsn = CalculateMinFlushLsn(); + if (minFlushLsn > truncateLsn) + { + truncateLsn = minFlushLsn; + + /* + * Advance the replication slot to free up old WAL files. Note + * that slot doesn't exist if we are in syncSafekeepers mode. 
+ */ + if (MyReplicationSlot) + PhysicalConfirmReceivedLocation(truncateLsn); + } + + /* + * Generally sync is done when majority switched the epoch so we committed + * epochStartLsn and made the majority aware of it, ensuring they are + * ready to give all WAL to pageserver. It would mean whichever majority + * is alive, there will be at least one safekeeper who is able to stream + * WAL to pageserver to make basebackup possible. However, since at the + * moment we don't have any good mechanism of defining the healthy and + * most advanced safekeeper who should push the wal into pageserver and + * basically the random one gets connected, to prevent hanging basebackup + * (due to pageserver connecting to not-synced-safekeeper) we currently + * wait for all seemingly alive safekeepers to get synced. + */ + if (syncSafekeepers) + { + int n_synced; + + n_synced = 0; + for (int i = 0; i < n_safekeepers; i++) + { + Safekeeper *sk = &safekeeper[i]; + bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; + + /* alive safekeeper which is not synced yet; wait for it */ + if (sk->state != SS_OFFLINE && !synced) + return; + if (synced) + n_synced++; + } + if (n_synced >= quorum) + { + /* All safekeepers synced! */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + } +} + +/* + * Try to read CopyData message from i'th safekeeper, resetting connection on + * failure. 
+ */ +static bool +AsyncRead(Safekeeper *sk, char **buf, int *buf_size) +{ + switch (walprop_async_read(sk->conn, buf, buf_size)) + { + case PG_ASYNC_READ_SUCCESS: + return true; + + case PG_ASYNC_READ_TRY_AGAIN: + /* WL_SOCKET_READABLE is always set during copyboth */ + return false; + + case PG_ASYNC_READ_FAIL: + elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, + sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + } + Assert(false); + return false; +} + +/* + * Read next message with known type into provided struct, by reading a CopyData + * block from the safekeeper's postgres connection, returning whether the read + * was successful. + * + * If the read needs more polling, we return 'false' and keep the state + * unmodified, waiting until it becomes read-ready to try again. If it fully + * failed, a warning is emitted and the connection is reset. + */ +static bool +AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) +{ + char *buf; + int buf_size; + uint64 tag; + StringInfoData s; + + if (!(AsyncRead(sk, &buf, &buf_size))) + return false; + + /* parse it */ + s.data = buf; + s.len = buf_size; + s.cursor = 0; + + tag = pq_getmsgint64_le(&s); + if (tag != anymsg->tag) + { + elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); + return false; + } + + switch (tag) + { + case 'g': + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + 
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + msg->timelineStartLsn = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) + ParseReplicationFeedbackMessage(&s, &msg->rf); + pq_getmsgend(&s); + return true; + } + + default: + { + Assert(false); + return false; + } + } +} + +/* + * Blocking equivalent to AsyncWrite. + * + * We use this everywhere messages are small enough that they should fit in a + * single packet. + */ +static bool +BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) +{ + uint32 events; + + if (!walprop_blocking_write(sk->conn, msg, msg_size)) + { + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + } + + sk->state = success_state; + + /* + * If the new state will be waiting for events to happen, update the event + * set to wait for those + */ + events = SafekeeperStateDesiredEvents(success_state); + if (events) + UpdateEventSet(sk, events); + + return true; +} + +/* + * Starts a write into the 'i'th safekeeper's postgres connection, moving to + * flush_state (adjusting eventset) if write still needs flushing. + * + * Returns false if sending is unfinished (requires flushing or conn failed). + * Upon failure, a warning is emitted and the connection is reset. 
+ */ +static bool +AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state) +{ + switch (walprop_async_write(sk->conn, msg, msg_size)) + { + case PG_ASYNC_WRITE_SUCCESS: + return true; + case PG_ASYNC_WRITE_TRY_FLUSH: + + /* + * We still need to call PQflush some more to finish the job; go + * to the appropriate state. Update the event set at the bottom of + * this function + */ + sk->state = flush_state; + UpdateEventSet(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); + return false; + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } +} + +/* + * Flushes a previous call to AsyncWrite. This only needs to be called when the + * socket becomes read or write ready *after* calling AsyncWrite. + * + * If flushing successfully completes returns true, otherwise false. Event set + * is updated only if connection fails, otherwise caller should manually unset + * WL_SOCKET_WRITEABLE. + */ +static bool +AsyncFlush(Safekeeper *sk) +{ + /*--- + * PQflush returns: + * 0 if successful [we're good to move on] + * 1 if unable to send everything yet [call PQflush again] + * -1 if it failed [emit an error] + */ + switch (walprop_flush(sk->conn)) + { + case 0: + /* flush is done */ + return true; + case 1: + /* Nothing to do; try again when the socket's ready */ + return false; + case -1: + elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ResetConnection(sk); + return false; + default: + Assert(false); + return false; + } +} + +// Check if we need to suspend inserts because of lagging replication. 
+static uint64 +backpressure_lag_impl(void) +{ + if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) + { + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + XLogRecPtr myFlushLsn = GetFlushRecPtr(); + + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); +#define MB ((XLogRecPtr)1024*1024) + + elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", + LSN_FORMAT_ARGS(myFlushLsn), + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr)); + + if ((writePtr != InvalidXLogRecPtr + && max_replication_write_lag > 0 + && myFlushLsn > writePtr + max_replication_write_lag*MB)) + { + return (myFlushLsn - writePtr - max_replication_write_lag*MB); + } + + if ((flushPtr != InvalidXLogRecPtr + && max_replication_flush_lag > 0 + && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) + { + return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); + } + + if ((applyPtr != InvalidXLogRecPtr + && max_replication_apply_lag > 0 + && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) + { + return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); + } + } + return 0; +} diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h new file mode 100644 index 0000000000..b684d5264f --- /dev/null +++ b/pgxn/neon/walproposer.h @@ -0,0 +1,540 @@ +#ifndef __NEON_WALPROPOSER_H__ +#define __NEON_WALPROPOSER_H__ + +#include "access/xlogdefs.h" +#include "postgres.h" +#include "port.h" +#include "access/xlog_internal.h" +#include "access/transam.h" +#include "nodes/replnodes.h" +#include "utils/uuid.h" +#include "replication/walreceiver.h" + +#define SK_MAGIC 0xCafeCeefu +#define SK_PROTOCOL_VERSION 2 + +#define MAX_SAFEKEEPERS 32 +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ +#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start 
position in wal sender message header */ +#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ + +/* + * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred, + * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 + */ +#define WL_NO_EVENTS 0 + +extern char* wal_acceptors_list; +extern int wal_acceptor_reconnect_timeout; +extern int wal_acceptor_connect_timeout; +extern bool am_wal_proposer; + +struct WalProposerConn; /* Defined in libpqwalproposer */ +typedef struct WalProposerConn WalProposerConn; + +struct WalMessage; +typedef struct WalMessage WalMessage; + +extern char *zenith_timeline_walproposer; +extern char *zenith_tenant_walproposer; + +/* Possible return values from ReadPGAsync */ +typedef enum +{ + /* The full read was successful. buf now points to the data */ + PG_ASYNC_READ_SUCCESS, + /* The read is ongoing. Wait until the connection is read-ready, then try + * again. */ + PG_ASYNC_READ_TRY_AGAIN, + /* Reading failed. Check PQerrorMessage(conn) */ + PG_ASYNC_READ_FAIL, +} PGAsyncReadResult; + +/* Possible return values from WritePGAsync */ +typedef enum +{ + /* The write fully completed */ + PG_ASYNC_WRITE_SUCCESS, + /* The write started, but you'll need to call PQflush some more times + * to finish it off. We just tried, so it's best to wait until the + * connection is read- or write-ready to try again. + * + * If it becomes read-ready, call PQconsumeInput and flush again. If it + * becomes write-ready, just call PQflush. + */ + PG_ASYNC_WRITE_TRY_FLUSH, + /* Writing failed. Check PQerrorMessage(conn) */ + PG_ASYNC_WRITE_FAIL, +} PGAsyncWriteResult; + +/* + * WAL safekeeper state, which is used to wait for some event. + * + * States are listed here in the order that they're executed. + * + * Most states, upon failure, will move back to SS_OFFLINE by calls to + * ResetConnection or ShutdownConnection. 
+ */ +typedef enum +{ + /* + * Does not have an active connection and will stay that way until + * further notice. + * + * Moves to SS_CONNECTING_WRITE by calls to ResetConnection. + */ + SS_OFFLINE, + + /* + * Connecting states. "_READ" waits for the socket to be available for + * reading, "_WRITE" waits for writing. There's no difference in the code + * they execute when polled, but we have this distinction in order to + * recreate the event set in HackyRemoveWalProposerEvent. + * + * After the connection is made, "START_WAL_PUSH" query is sent. + */ + SS_CONNECTING_WRITE, + SS_CONNECTING_READ, + + /* + * Waiting for the result of the "START_WAL_PUSH" command. + * + * After we get a successful result, sends handshake to safekeeper. + */ + SS_WAIT_EXEC_RESULT, + + /* + * Executing the receiving half of the handshake. After receiving, moves to + * SS_VOTING. + */ + SS_HANDSHAKE_RECV, + + /* + * Waiting to participate in voting, but a quorum hasn't yet been reached. + * This is an idle state - we do not expect AdvancePollState to be called. + * + * Moved externally by execution of SS_HANDSHAKE_RECV, when we received a + * quorum of handshakes. + */ + SS_VOTING, + + /* + * Already sent voting information, waiting to receive confirmation from the + * node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet. + */ + SS_WAIT_VERDICT, + + /* Need to flush ProposerElected message. */ + SS_SEND_ELECTED_FLUSH, + + /* + * Waiting for quorum to send WAL. Idle state. If the socket becomes + * read-ready, the connection has been closed. + * + * Moves to SS_ACTIVE only by call to StartStreaming. + */ + SS_IDLE, + + /* + * Active phase, when we acquired quorum and have WAL to send or feedback + * to read. + */ + SS_ACTIVE, +} SafekeeperState; + +/* Consensus logical timestamp. */ +typedef uint64 term_t; + +/* neon storage node id */ +typedef uint64 NNodeId; + +/* + * Proposer <-> Acceptor messaging. 
+ */ + +/* Initial Proposer -> Acceptor message */ +typedef struct ProposerGreeting +{ + uint64 tag; /* message tag */ + uint32 protocolVersion; /* proposer-safekeeper protocol version */ + uint32 pgVersion; + pg_uuid_t proposerId; + uint64 systemId; /* Postgres system identifier */ + uint8 ztimelineid[16]; /* Zenith timeline id */ + uint8 ztenantid[16]; + TimeLineID timeline; + uint32 walSegSize; +} ProposerGreeting; + +typedef struct AcceptorProposerMessage +{ + uint64 tag; +} AcceptorProposerMessage; + +/* + * Acceptor -> Proposer initial response: the highest term acceptor voted for. + */ +typedef struct AcceptorGreeting +{ + AcceptorProposerMessage apm; + term_t term; + NNodeId nodeId; +} AcceptorGreeting; + +/* + * Proposer -> Acceptor vote request. + */ +typedef struct VoteRequest +{ + uint64 tag; + term_t term; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} VoteRequest; + +/* Element of term switching chain. */ +typedef struct TermSwitchEntry +{ + term_t term; + XLogRecPtr lsn; +} TermSwitchEntry; + +typedef struct TermHistory +{ + uint32 n_entries; + TermSwitchEntry *entries; +} TermHistory; + +/* Vote itself, sent from safekeeper to proposer */ +typedef struct VoteResponse { + AcceptorProposerMessage apm; + term_t term; + uint64 voteGiven; + /* + * Safekeeper flush_lsn (end of WAL) + history of term switches allow + * proposer to choose the most advanced one. + */ + XLogRecPtr flushLsn; + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */ + TermHistory termHistory; + XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ +} VoteResponse; + +/* + * Proposer -> Acceptor message announcing proposer is elected and communicating + * epoch history to it. 
+ */ +typedef struct ProposerElected +{ + uint64 tag; + term_t term; + /* proposer will send since this point */ + XLogRecPtr startStreamingAt; + /* history of term switches up to this proposer */ + TermHistory *termHistory; + /* timeline globally starts at this LSN */ + XLogRecPtr timelineStartLsn; +} ProposerElected; + +/* + * Header of request with WAL message sent from proposer to safekeeper. + */ +typedef struct AppendRequestHeader +{ + uint64 tag; + term_t term; /* term of the proposer */ + /* + * LSN since which current proposer appends WAL (begin_lsn of its first + * record); determines epoch switch point. + */ + XLogRecPtr epochStartLsn; + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + /* + * minimal LSN which may be needed for recovery of some safekeeper (end lsn + * + 1 of last chunk streamed to everyone) + */ + XLogRecPtr truncateLsn; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} AppendRequestHeader; + +/* + * Hot standby feedback received from replica + */ +typedef struct HotStandbyFeedback +{ + TimestampTz ts; + FullTransactionId xmin; + FullTransactionId catalog_xmin; +} HotStandbyFeedback; + + +typedef struct ReplicationFeedback +{ + // current size of the timeline on pageserver + uint64 currentClusterSize; + // standby_status_update fields that safekeeper received from pageserver + XLogRecPtr ps_writelsn; + XLogRecPtr ps_flushlsn; + XLogRecPtr ps_applylsn; + TimestampTz ps_replytime; +} ReplicationFeedback; + + +typedef struct WalproposerShmemState +{ + slock_t mutex; + ReplicationFeedback feedback; + term_t mineLastElectedTerm; +} WalproposerShmemState; + +/* + * Report safekeeper state to proposer + */ +typedef struct AppendResponse +{ + AcceptorProposerMessage apm; + /* + * Current term of the safekeeper; if it is higher than proposer's, the + * compute is out of date. 
+ */ + term_t term; + // TODO: add comment + XLogRecPtr flushLsn; + // Safekeeper reports back its awareness about which WAL is committed, as + // this is a criterion for walproposer --sync mode exit + XLogRecPtr commitLsn; + HotStandbyFeedback hs; + // Feedback received from pageserver includes standby_status_update fields + // and custom zenith feedback. + // This part of the message is extensible. + ReplicationFeedback rf; +} AppendResponse; + +// ReplicationFeedback is the extensible part of the message and is parsed separately. +// The other fields form the fixed part. +#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) + + +/* + * Descriptor of safekeeper + */ +typedef struct Safekeeper +{ + char const* host; + char const* port; + char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */ + + /* + * postgres protocol connection to the WAL acceptor + * + * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we + * reach SS_ACTIVE; not before. + */ + WalProposerConn* conn; + /* + * Temporary buffer for the message being sent to the safekeeper. + */ + StringInfoData outbuf; + /* + * WAL reader, allocated for each safekeeper. + */ + XLogReaderState* xlogreader; + + /* + * Streaming will start here; must be record boundary. + */ + XLogRecPtr startStreamingAt; + + bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ + XLogRecPtr streamingAt; /* current streaming position */ + AppendRequestHeader appendRequest; /* request for sending to safekeeper */ + + int eventPos; /* position in wait event set. 
Equal to -1 if no event */ + SafekeeperState state; /* safekeeper state machine state */ + TimestampTz startedConnAt; /* when connection attempt started */ + AcceptorGreeting greetResponse; /* acceptor greeting */ + VoteResponse voteResponse; /* the vote */ + AppendResponse appendResponse; /* feedback for master */ +} Safekeeper; + + +extern PGDLLIMPORT void WalProposerMain(Datum main_arg); +void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); +void WalProposerPoll(void); +void WalProposerRegister(void); +void ParseReplicationFeedbackMessage(StringInfo reply_message, + ReplicationFeedback *rf); +extern void StartProposerReplication(StartReplicationCmd *cmd); + +Size WalproposerShmemSize(void); +bool WalproposerShmemInit(void); +void replication_feedback_set(ReplicationFeedback *rf); +void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); + +/* libpqwalproposer hooks & helper type */ + +/* Re-exported PostgresPollingStatusType */ +typedef enum +{ + WP_CONN_POLLING_FAILED = 0, + WP_CONN_POLLING_READING, + WP_CONN_POLLING_WRITING, + WP_CONN_POLLING_OK, + /* + * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused. + * We've removed it here to avoid clutter. + */ +} WalProposerConnectPollStatusType; + +/* Re-exported and modified ExecStatusType */ +typedef enum +{ + /* We received a single CopyBoth result */ + WP_EXEC_SUCCESS_COPYBOTH, + /* Any success result other than a single CopyBoth was received. The specifics of the result + * were already logged, but it may be useful to provide an error message indicating which + * safekeeper messed up. + * + * Do not expect PQerrorMessage to be appropriately set. */ + WP_EXEC_UNEXPECTED_SUCCESS, + /* No result available at this time. Wait until read-ready, then call again. Internally, this is + * returned when PQisBusy indicates that PQgetResult would block. */ + WP_EXEC_NEEDS_INPUT, + /* Catch-all failure. Check PQerrorMessage. 
*/ + WP_EXEC_FAILED, +} WalProposerExecStatusType; + +/* Re-exported ConnStatusType */ +typedef enum +{ + WP_CONNECTION_OK, + WP_CONNECTION_BAD, + + /* + * The original ConnStatusType has many more tags, but requests that + * they not be relied upon (except for displaying to the user). We + * don't need that extra functionality, so we collect them into a + * single tag here. + */ + WP_CONNECTION_IN_PROGRESS, +} WalProposerConnStatusType; + +/* Re-exported PQerrorMessage */ +typedef char* (*walprop_error_message_fn) (WalProposerConn* conn); + +/* Re-exported PQstatus */ +typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn); + +/* Re-exported PQconnectStart */ +typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo); + +/* Re-exported PQconnectPoll */ +typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn); + +/* Blocking wrapper around PQsendQuery */ +typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query); + +/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */ +typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn); + +/* Re-exported PQsocket */ +typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn); + +/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ +typedef int (*walprop_flush_fn) (WalProposerConn* conn); + +/* Re-exported PQfinish */ +typedef void (*walprop_finish_fn) (WalProposerConn* conn); + +/* + * Ergonomic wrapper around PQgetCopyData + * + * Reads a CopyData block from a safekeeper, setting *amount to the number + * of bytes returned. + * + * This function is allowed to assume certain properties specific to the + * protocol with the safekeepers, so it should not be used as-is for any + * other purpose. + * + * Note: If possible, using AsyncRead is generally preferred, because it + * performs a bit of extra checking work that's always required and is normally + * somewhat verbose. 
+ */ +typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, + char** buf, + int* amount); + +/* + * Ergonomic wrapper around PQputCopyData + PQflush + * + * Starts to write a CopyData block to a safekeeper. + * + * For information on the meaning of return codes, refer to PGAsyncWriteResult. + */ +typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn, + void const* buf, + size_t size); + +/* + * Blocking equivalent to walprop_async_write_fn + * + * Returns 'true' if successful, 'false' on failure. + */ +typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size); + +/* All libpqwalproposer exported functions collected together. */ +typedef struct WalProposerFunctionsType +{ + walprop_error_message_fn walprop_error_message; + walprop_status_fn walprop_status; + walprop_connect_start_fn walprop_connect_start; + walprop_connect_poll_fn walprop_connect_poll; + walprop_send_query_fn walprop_send_query; + walprop_get_query_result_fn walprop_get_query_result; + walprop_socket_fn walprop_socket; + walprop_flush_fn walprop_flush; + walprop_finish_fn walprop_finish; + walprop_async_read_fn walprop_async_read; + walprop_async_write_fn walprop_async_write; + walprop_blocking_write_fn walprop_blocking_write; +} WalProposerFunctionsType; + +/* Allow the above functions to be "called" with normal syntax */ +#define walprop_error_message(conn) \ + WalProposerFunctions->walprop_error_message(conn) +#define walprop_status(conn) \ + WalProposerFunctions->walprop_status(conn) +#define walprop_connect_start(conninfo) \ + WalProposerFunctions->walprop_connect_start(conninfo) +#define walprop_connect_poll(conn) \ + WalProposerFunctions->walprop_connect_poll(conn) +#define walprop_send_query(conn, query) \ + WalProposerFunctions->walprop_send_query(conn, query) +#define walprop_get_query_result(conn) \ + WalProposerFunctions->walprop_get_query_result(conn) +#define walprop_set_nonblocking(conn, arg) \ + 
WalProposerFunctions->walprop_set_nonblocking(conn, arg) +#define walprop_socket(conn) \ + WalProposerFunctions->walprop_socket(conn) +#define walprop_flush(conn) \ + WalProposerFunctions->walprop_flush(conn) +#define walprop_finish(conn) \ + WalProposerFunctions->walprop_finish(conn) +#define walprop_async_read(conn, buf, amount) \ + WalProposerFunctions->walprop_async_read(conn, buf, amount) +#define walprop_async_write(conn, buf, size) \ + WalProposerFunctions->walprop_async_write(conn, buf, size) +#define walprop_blocking_write(conn, buf, size) \ + WalProposerFunctions->walprop_blocking_write(conn, buf, size) + +/* + * The runtime location of the libpqwalproposer functions. + * + * This pointer is set by the initializer in libpqwalproposer, so that we + * can use it later. + */ +extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions; + +#endif /* __NEON_WALPROPOSER_H__ */ diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c new file mode 100644 index 0000000000..7b96fd580c --- /dev/null +++ b/pgxn/neon/walproposer_utils.c @@ -0,0 +1,1110 @@ +#include "postgres.h" + +#include "access/timeline.h" +#include "access/xlogutils.h" +#include "common/logging.h" +#include "common/ip.h" +#include "funcapi.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "postmaster/interrupt.h" +#include "replication/slot.h" +#include "walproposer_utils.h" +#include "replication/walsender_private.h" + +#include "storage/ipc.h" +#include "utils/builtins.h" +#include "utils/ps_status.h" + +#include "libpq-fe.h" +#include +#include + +/* + * These variables are used similarly to openLogFile/SegNo, + * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID + * corresponding to the filename of walpropFile. 
+ */
+static int walpropFile = -1;	/* fd of the open segment; negative when none is open */
+static TimeLineID walpropFileTLI = 0;
+static XLogSegNo walpropSegNo = 0;
+
+/* START cloned file-local variables and functions from walsender.c */
+
+/*
+ * xlogreader used for replication. Note that a WAL sender doing physical
+ * replication does not need xlogreader to read WAL, but it needs one to
+ * keep a state of its work.
+ */
+static XLogReaderState *xlogreader = NULL;
+
+/*
+ * These variables keep track of the state of the timeline we're currently
+ * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
+ * the timeline is not the latest timeline on this server, and the server's
+ * history forked off from that timeline at sendTimeLineValidUpto.
+ * sendTimeLineNextTLI is the timeline that follows sendTimeLine; it is filled
+ * in by tliSwitchPoint() whenever the switch point is looked up.
+ */
+static TimeLineID sendTimeLine = 0;
+static TimeLineID sendTimeLineNextTLI = 0;
+static bool sendTimeLineIsHistoric = false;
+static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr;
+
+/*
+ * Timestamp of last ProcessRepliesIfAny() that saw a reply from the
+ * standby. Set to 0 if wal_sender_timeout doesn't need to be active.
+ */
+static TimestampTz last_reply_timestamp = 0;
+
+/* Have we sent a heartbeat message asking for reply, since last reply? */
+static bool waiting_for_ping_response = false;
+
+static bool streamingDoneSending;	/* set once XLogSendPhysical() has queued CopyDone */
+static bool streamingDoneReceiving;
+
+/* Are we there yet? XLogSendPhysical() sets this when no unsent WAL remains. */
+static bool WalSndCaughtUp = false;
+
+/* Flags set by signal handlers for later service in main loop */
+static volatile sig_atomic_t got_STOPPING = false;
+
+/*
+ * How far have we sent WAL already? This is also advertised in
+ * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.)
+ */
+static XLogRecPtr sentPtr = InvalidXLogRecPtr;
+
+/*
+ * This is set while we are streaming. When not set
+ * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set,
+ * the main loop is responsible for checking got_STOPPING and terminating when
+ * it's set (after streaming any remaining WAL).
+ */ +static volatile sig_atomic_t replication_active = false; + +typedef void (*WalSndSendDataCallback) (void); +static void WalSndLoop(WalSndSendDataCallback send_data); +static void XLogSendPhysical(void); +static XLogRecPtr GetStandbyFlushRecPtr(void); + +static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, + TimeLineID *tli_p); + +/* END cloned file-level variables and functions from walsender.c */ + +int +CompareLsn(const void *a, const void *b) +{ + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + + if (lsn1 < lsn2) + return -1; + else if (lsn1 == lsn2) + return 0; + else + return 1; +} + +/* Returns a human-readable string corresonding to the SafekeeperState + * + * The string should not be freed. + * + * The strings are intended to be used as a prefix to "state", e.g.: + * + * elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); + * + * If this sort of phrasing doesn't fit the message, instead use something like: + * + * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); + */ +char* +FormatSafekeeperState(SafekeeperState state) +{ + char* return_val = NULL; + + switch (state) + { + case SS_OFFLINE: + return_val = "offline"; + break; + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: + return_val = "connecting"; + break; + case SS_WAIT_EXEC_RESULT: + return_val = "receiving query result"; + break; + case SS_HANDSHAKE_RECV: + return_val = "handshake (receiving)"; + break; + case SS_VOTING: + return_val = "voting"; + break; + case SS_WAIT_VERDICT: + return_val = "wait-for-verdict"; + break; + case SS_SEND_ELECTED_FLUSH: + return_val = "send-announcement-flush"; + break; + case SS_IDLE: + return_val = "idle"; + break; + case SS_ACTIVE: + return_val = "active"; + break; + } + + Assert(return_val != NULL); + + return return_val; +} + +/* Asserts that the provided events are expected for given safekeeper's state */ +void +AssertEventsOkForState(uint32 
events, Safekeeper* sk) +{ + uint32 expected = SafekeeperStateDesiredEvents(sk->state); + + /* The events are in-line with what we're expecting, under two conditions: + * (a) if we aren't expecting anything, `events` has no read- or + * write-ready component. + * (b) if we are expecting something, there's overlap + * (i.e. `events & expected != 0`) + */ + bool events_ok_for_state; /* long name so the `Assert` is more clear later */ + + if (expected == WL_NO_EVENTS) + events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0); + else + events_ok_for_state = ((events & expected) != 0); + + if (!events_ok_for_state) + { + /* To give a descriptive message in the case of failure, we use elog and + * then an assertion that's guaranteed to fail. */ + elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", + FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); + Assert(events_ok_for_state); + } +} + +/* Returns the set of events a safekeeper in this state should be waiting on + * + * This will return WL_NO_EVENTS (= 0) for some events. */ +uint32 +SafekeeperStateDesiredEvents(SafekeeperState state) +{ + uint32 result = WL_NO_EVENTS; + + /* If the state doesn't have a modifier, we can check the base state */ + switch (state) + { + /* Connecting states say what they want in the name */ + case SS_CONNECTING_READ: + result = WL_SOCKET_READABLE; + break; + case SS_CONNECTING_WRITE: + result = WL_SOCKET_WRITEABLE; + break; + + /* Reading states need the socket to be read-ready to continue */ + case SS_WAIT_EXEC_RESULT: + case SS_HANDSHAKE_RECV: + case SS_WAIT_VERDICT: + result = WL_SOCKET_READABLE; + break; + + /* Idle states use read-readiness as a sign that the connection has been + * disconnected. */ + case SS_VOTING: + case SS_IDLE: + result = WL_SOCKET_READABLE; + break; + + /* + * Flush states require write-ready for flushing. + * Active state does both reading and writing. 
+ * + * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should + * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. + */ + case SS_SEND_ELECTED_FLUSH: + case SS_ACTIVE: + result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + break; + + /* The offline state expects no events. */ + case SS_OFFLINE: + result = WL_NO_EVENTS; + break; + + default: + Assert(false); + break; + } + + return result; +} + +/* Returns a human-readable string corresponding to the event set + * + * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the + * returned string may be meaingless. + * + * The string should not be freed. It should also not be expected to remain the same between + * function calls. */ +char* +FormatEvents(uint32 events) +{ + static char return_str[8]; + + /* Helper variable to check if there's extra bits */ + uint32 all_flags = WL_LATCH_SET + | WL_SOCKET_READABLE + | WL_SOCKET_WRITEABLE + | WL_TIMEOUT + | WL_POSTMASTER_DEATH + | WL_EXIT_ON_PM_DEATH + | WL_SOCKET_CONNECTED; + + /* The formatting here isn't supposed to be *particularly* useful -- it's just to give an + * sense of what events have been triggered without needing to remember your powers of two. */ + + return_str[0] = (events & WL_LATCH_SET ) ? 'L' : '_'; + return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_'; + return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; + return_str[3] = (events & WL_TIMEOUT ) ? 'T' : '_'; + return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; + return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; + return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_'; + + if (events & (~all_flags)) + { + elog(WARNING, "Event formatting found unexpected component %d", + events & (~all_flags)); + return_str[6] = '*'; + return_str[7] = '\0'; + } + else + return_str[6] = '\0'; + + return (char *) &return_str; +} + +/* + * Convert a character which represents a hexadecimal digit to an integer. 
+ * + * Returns -1 if the character is not a hexadecimal digit. + */ +static int +HexDecodeChar(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + + return -1; +} + +/* + * Decode a hex string into a byte string, 2 hex chars per byte. + * + * Returns false if invalid characters are encountered; otherwise true. + */ +bool +HexDecodeString(uint8 *result, char *input, int nbytes) +{ + int i; + + for (i = 0; i < nbytes; ++i) + { + int n1 = HexDecodeChar(input[i * 2]); + int n2 = HexDecodeChar(input[i * 2 + 1]); + + if (n1 < 0 || n2 < 0) + return false; + result[i] = n1 * 16 + n2; + } + + return true; +} + +/* -------------------------------- + * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint32 +pq_getmsgint32_le(StringInfo msg) +{ + uint32 n32; + + pq_copymsgbytes(msg, (char *) &n32, sizeof(n32)); + + return n32; +} + +/* -------------------------------- + * pq_getmsgint64 - get a binary 8-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint64 +pq_getmsgint64_le(StringInfo msg) +{ + uint64 n64; + + pq_copymsgbytes(msg, (char *) &n64, sizeof(n64)); + + return n64; +} + +/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */ +void +pq_sendint32_le(StringInfo buf, uint32 i) +{ + enlargeStringInfo(buf, sizeof(uint32)); + memcpy(buf->data + buf->len, &i, sizeof(uint32)); + buf->len += sizeof(uint32); +} + +/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */ +void +pq_sendint64_le(StringInfo buf, uint64 i) +{ + enlargeStringInfo(buf, sizeof(uint64)); + memcpy(buf->data + buf->len, &i, sizeof(uint64)); + buf->len += sizeof(uint64); +} + +/* + * Write XLOG data to disk. 
+ */ +void +XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) +{ + int startoff; + int byteswritten; + + while (nbytes > 0) + { + int segbytes; + + /* Close the current segment if it's completed */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + XLogWalPropClose(recptr); + + if (walpropFile < 0) + { + bool use_existent = true; + + /* Create/use new log file */ + XLByteToSeg(recptr, walpropSegNo, wal_segment_size); + walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); + walpropFileTLI = ThisTimeLineID; + } + + /* Calculate the start offset of the received logs */ + startoff = XLogSegmentOffset(recptr, wal_segment_size); + + if (startoff + nbytes > wal_segment_size) + segbytes = wal_segment_size - startoff; + else + segbytes = nbytes; + + /* OK to write the logs */ + errno = 0; + + byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); + if (byteswritten <= 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + /* if write didn't set errno, assume no disk space */ + if (errno == 0) + errno = ENOSPC; + + save_errno = errno; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log segment %s " + "at offset %u, length %lu: %m", + xlogfname, startoff, (unsigned long) segbytes))); + } + + /* Update state for write */ + recptr += byteswritten; + + nbytes -= byteswritten; + buf += byteswritten; + } + + /* + * Close the current segment if it's fully written up in the last cycle of + * the loop. + */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + { + XLogWalPropClose(recptr); + } +} + +/* + * Close the current segment. 
+ *
+ * A segment must be open and recptr must lie outside it (both are asserted).
+ * A failing close() is reported with PANIC. On success walpropFile is reset
+ * to -1.
+ */
+void
+XLogWalPropClose(XLogRecPtr recptr)
+{
+	Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size));
+
+	if (close(walpropFile) != 0)
+	{
+		char		xlogfname[MAXFNAMELEN];
+		XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
+
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not close log segment %s: %m",
+						xlogfname)));
+	}
+
+	walpropFile = -1;
+}
+
+/* START of cloned functions from walsender.c */
+
+/*
+ * Handle START_REPLICATION command.
+ *
+ * At the moment, this never returns, but an ereport(ERROR) will take us back
+ * to the main loop.
+ *
+ * Streaming is driven by WalSndLoop(XLogSendPhysical), starting at
+ * cmd->startpoint.
+ */
+void
+StartProposerReplication(StartReplicationCmd *cmd)
+{
+	XLogRecPtr	FlushPtr;
+
+	if (ThisTimeLineID == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
+
+	/* create xlogreader for physical replication */
+	xlogreader =
+		XLogReaderAllocate(wal_segment_size, NULL,
+						   XL_ROUTINE(.segment_open = WalSndSegmentOpen,
+									  .segment_close = wal_segment_close),
+						   NULL);
+
+	if (!xlogreader)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
+	/*
+	 * We assume here that we're logging enough information in the WAL for
+	 * log-shipping, since this is checked in PostmasterMain().
+	 *
+	 * NOTE: wal_level can only change at shutdown, so in most cases it is
+	 * difficult for there to be WAL data that we can still see that was
+	 * written at wal_level='minimal'.
+	 */
+
+	if (cmd->slotname)
+	{
+		ReplicationSlotAcquire(cmd->slotname, true);
+		if (SlotIsLogical(MyReplicationSlot))
+			ereport(ERROR,
+					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+					 errmsg("cannot use a logical replication slot for physical replication")));
+
+		/*
+		 * We don't need to verify the slot's restart_lsn here; instead we
+		 * rely on the caller requesting the starting point to use.  If the
+		 * WAL segment doesn't exist, we'll fail later.
+		 */
+	}
+
+	/*
+	 * Select the timeline. If it was given explicitly by the client, use
+	 * that. Otherwise use the timeline of the last replayed record, which is
+	 * kept in ThisTimeLineID.
+	 *
+	 * Neon doesn't currently use PG Timelines, but it may in the future, so
+	 * we keep this code around to lighten the load for when we need it.
+	 */
+	if (am_cascading_walsender)
+	{
+		/* this also updates ThisTimeLineID */
+		FlushPtr = GetStandbyFlushRecPtr();
+	}
+	else
+		FlushPtr = GetFlushRecPtr();
+
+	if (cmd->timeline != 0)
+	{
+		XLogRecPtr	switchpoint;
+
+		sendTimeLine = cmd->timeline;
+		if (sendTimeLine == ThisTimeLineID)
+		{
+			sendTimeLineIsHistoric = false;
+			sendTimeLineValidUpto = InvalidXLogRecPtr;
+		}
+		else
+		{
+			List	   *timeLineHistory;
+
+			sendTimeLineIsHistoric = true;
+
+			/*
+			 * Check that the timeline the client requested exists, and the
+			 * requested start location is on that timeline.
+			 */
+			timeLineHistory = readTimeLineHistory(ThisTimeLineID);
+			switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory,
+										 &sendTimeLineNextTLI);
+			list_free_deep(timeLineHistory);
+
+			/*
+			 * Found the requested timeline in the history. Check that
+			 * requested startpoint is on that timeline in our history.
+			 *
+			 * This is quite loose on purpose. We only check that we didn't
+			 * fork off the requested timeline before the switchpoint. We
+			 * don't check that we switched *to* it before the requested
+			 * starting point. This is because the client can legitimately
+			 * request to start replication from the beginning of the WAL
+			 * segment that contains switchpoint, but on the new timeline, so
+			 * that it doesn't end up with a partial segment. If you ask for
+			 * too old a starting point, you'll get an error later when we
+			 * fail to find the requested WAL segment in pg_wal.
+			 *
+			 * XXX: we could be more strict here and only allow a startpoint
+			 * that's older than the switchpoint, if it's still in the same
+			 * WAL segment.
+			 */
+			if (!XLogRecPtrIsInvalid(switchpoint) &&
+				switchpoint < cmd->startpoint)
+			{
+				ereport(ERROR,
+						(errmsg("requested starting point %X/%X on timeline %u is not in this server's history",
+								LSN_FORMAT_ARGS(cmd->startpoint),
+								cmd->timeline),
+						 errdetail("This server's history forked from timeline %u at %X/%X.",
+								   cmd->timeline,
+								   LSN_FORMAT_ARGS(switchpoint))));
+			}
+			sendTimeLineValidUpto = switchpoint;
+		}
+	}
+	else
+	{
+		sendTimeLine = ThisTimeLineID;
+		sendTimeLineValidUpto = InvalidXLogRecPtr;
+		sendTimeLineIsHistoric = false;
+	}
+
+	streamingDoneSending = streamingDoneReceiving = false;
+
+	/* If there is nothing to stream, don't even enter COPY mode */
+	if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto)
+	{
+		/*
+		 * When we first start replication the standby will be behind the
+		 * primary. For some applications, for example synchronous
+		 * replication, it is important to have a clear state for this initial
+		 * catchup mode, so we can trigger actions when we change streaming
+		 * state later. We may stay in this state for a long time, which is
+		 * exactly why we want to be able to monitor whether or not we are
+		 * still here.
+		 */
+		WalSndSetState(WALSNDSTATE_CATCHUP);
+
+		/*
+		 * Don't allow a request to stream from a future point in WAL that
+		 * hasn't been flushed to disk in this server yet.
+		 */
+		if (FlushPtr < cmd->startpoint)
+		{
+			ereport(ERROR,
+					(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
+							LSN_FORMAT_ARGS(cmd->startpoint),
+							LSN_FORMAT_ARGS(FlushPtr))));
+		}
+
+		/* Start streaming from the requested point */
+		sentPtr = cmd->startpoint;
+
+		/* Initialize shared memory status, too */
+		SpinLockAcquire(&MyWalSnd->mutex);
+		MyWalSnd->sentPtr = sentPtr;
+		SpinLockRelease(&MyWalSnd->mutex);
+
+		SyncRepInitConfig();
+
+		/* Main loop of walsender */
+		replication_active = true;
+
+		WalSndLoop(XLogSendPhysical);
+
+		replication_active = false;
+		if (got_STOPPING)
+			proc_exit(0);
+		WalSndSetState(WALSNDSTATE_STARTUP);
+
+		Assert(streamingDoneSending && streamingDoneReceiving);
+	}
+
+	if (cmd->slotname)
+		ReplicationSlotRelease();
+
+	/*
+	 * Copy is finished now. Send a single-row result set indicating the next
+	 * timeline.
+	 */
+	if (sendTimeLineIsHistoric)
+	{
+		char		startpos_str[8 + 1 + 8 + 1];
+		DestReceiver *dest;
+		TupOutputState *tstate;
+		TupleDesc	tupdesc;
+		Datum		values[2];
+		bool		nulls[2];
+
+		snprintf(startpos_str, sizeof(startpos_str), "%X/%X",
+				 LSN_FORMAT_ARGS(sendTimeLineValidUpto));
+
+		dest = CreateDestReceiver(DestRemoteSimple);
+		MemSet(nulls, false, sizeof(nulls));
+
+		/*
+		 * Need a tuple descriptor representing two columns. int8 may seem
+		 * like a surprising data type for this, but in theory int4 would not
+		 * be wide enough for this, as TimeLineID is unsigned.
+		 */
+		tupdesc = CreateTemplateTupleDesc(2);
+		TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli",
+								  INT8OID, -1, 0);
+		TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos",
+								  TEXTOID, -1, 0);
+
+		/* prepare for projection of tuple */
+		tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
+
+		values[0] = Int64GetDatum((int64) sendTimeLineNextTLI);
+		values[1] = CStringGetTextDatum(startpos_str);
+
+		/* send it to dest */
+		do_tup_output(tstate, values, nulls);
+
+		end_tup_output(tstate);
+	}
+
+	/* Send CommandComplete message */
+	EndReplicationCommand("START_STREAMING");
+}
+
+/*
+ * Returns the latest point in WAL that has been safely flushed to disk, and
+ * can be sent to the standby. This should only be called when in recovery,
+ * ie. we're streaming to a cascaded standby.
+ *
+ * As a side-effect, ThisTimeLineID is updated to the TLI of the last
+ * replayed WAL record.
+ */
+static XLogRecPtr
+GetStandbyFlushRecPtr(void)
+{
+	XLogRecPtr	replayPtr;
+	TimeLineID	replayTLI;
+	XLogRecPtr	receivePtr;
+	TimeLineID	receiveTLI;
+	XLogRecPtr	result;
+
+	/*
+	 * We can safely send what's already been replayed. Also, if walreceiver
+	 * is streaming WAL from the same timeline, we can send anything that it
+	 * has streamed, but hasn't been replayed yet.
+	 */
+
+	receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
+	replayPtr = GetXLogReplayRecPtr(&replayTLI);
+
+	ThisTimeLineID = replayTLI;
+
+	result = replayPtr;
+	if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr)
+		result = receivePtr;
+
+	return result;
+}
+
+/* XLogReaderRoutine->segment_open callback */
+static void
+WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
+				  TimeLineID *tli_p)
+{
+	char		path[MAXPGPATH];
+
+	/*-------
+	 * When reading from a historic timeline, and there is a timeline switch
+	 * within this segment, read from the WAL segment belonging to the new
+	 * timeline.
+	 *
+	 * For example, imagine that this server is currently on timeline 5, and
+	 * we're streaming timeline 4. The switch from timeline 4 to 5 happened at
+	 * 0/13002088. In pg_wal, we have these files:
+	 *
+	 * ...
+	 * 000000040000000000000012
+	 * 000000040000000000000013
+	 * 000000050000000000000013
+	 * 000000050000000000000014
+	 * ...
+	 *
+	 * In this situation, when requested to send the WAL from segment 0x13, on
+	 * timeline 4, we read the WAL from file 000000050000000000000013. Archive
+	 * recovery prefers files from newer timelines, so if the segment was
+	 * restored from the archive on this server, the file belonging to the old
+	 * timeline, 000000040000000000000013, might not exist. Their contents are
+	 * equal up to the switchpoint, because at a timeline switch, the used
+	 * portion of the old segment is copied to the new file.  -------
+	 */
+	*tli_p = sendTimeLine;
+	if (sendTimeLineIsHistoric)
+	{
+		XLogSegNo	endSegNo;
+
+		XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize);
+		if (nextSegNo == endSegNo)
+			*tli_p = sendTimeLineNextTLI;
+	}
+
+	XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize);
+	state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
+	if (state->seg.ws_file >= 0)
+		return;
+
+	/*
+	 * If the file is not found, assume it's because the standby asked for a
+	 * too old WAL segment that has already been removed or recycled.
+	 */
+	if (errno == ENOENT)
+	{
+		char		xlogfname[MAXFNAMELEN];
+		int			save_errno = errno;
+
+		XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size);
+		errno = save_errno;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("requested WAL segment %s has already been removed",
+						xlogfname)));
+	}
+	else
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m",
+						path)));
+}
+
+
+/* Main loop of walsender process that streams the WAL over Copy messages.
*/
+static void
+WalSndLoop(WalSndSendDataCallback send_data)
+{
+	/*
+	 * Initialize the last reply timestamp. That enables timeout processing
+	 * from hereon.
+	 */
+	last_reply_timestamp = GetCurrentTimestamp();
+	waiting_for_ping_response = false;
+
+	/*
+	 * Loop until we reach the end of this timeline or the client requests to
+	 * stop streaming.
+	 *
+	 * NOTE: this cloned version of the loop contains no break or return, so
+	 * control does not leave it through a normal path.
+	 */
+	for (;;)
+	{
+		/* Clear any already-pending wakeups */
+		ResetLatch(MyLatch);
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* Process any requests or signals received recently */
+		if (ConfigReloadPending)
+		{
+			ConfigReloadPending = false;
+			ProcessConfigFile(PGC_SIGHUP);
+			SyncRepInitConfig();
+		}
+
+		/* always true */
+		if (am_wal_proposer)
+		{
+			send_data();
+			if (WalSndCaughtUp)
+			{
+				if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
+					WalSndSetState(WALSNDSTATE_STREAMING);
+				WalProposerPoll();
+				WalSndCaughtUp = false;
+			}
+			continue;
+		}
+	}
+}
+
+/*
+ * Send out the WAL in its normal physical/stored form.
+ *
+ * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
+ * but not yet sent to the client, and hand that range to
+ * WalProposerBroadcast().
+ *
+ * If there is no unsent WAL remaining, WalSndCaughtUp is set to true,
+ * otherwise WalSndCaughtUp is set to false.
+ */
+static void
+XLogSendPhysical(void)
+{
+	XLogRecPtr	SendRqstPtr;
+	XLogRecPtr	startptr;
+	XLogRecPtr	endptr;
+	Size		nbytes PG_USED_FOR_ASSERTS_ONLY;
+
+	/* If requested switch the WAL sender to the stopping state. */
+	if (got_STOPPING)
+		WalSndSetState(WALSNDSTATE_STOPPING);
+
+	if (streamingDoneSending)
+	{
+		WalSndCaughtUp = true;
+		return;
+	}
+
+	/* Figure out how far we can safely send the WAL. */
+	if (sendTimeLineIsHistoric)
+	{
+		/*
+		 * Streaming an old timeline that's in this server's history, but is
+		 * not the one we're currently inserting or replaying. It can be
+		 * streamed up to the point where we switched off that timeline.
+		 */
+		SendRqstPtr = sendTimeLineValidUpto;
+	}
+	else if (am_cascading_walsender)
+	{
+		/*
+		 * Streaming the latest timeline on a standby.
+		 *
+		 * Attempt to send all WAL that has already been replayed, so that we
+		 * know it's valid. If we're receiving WAL through streaming
+		 * replication, it's also OK to send any WAL that has been received
+		 * but not replayed.
+		 *
+		 * The timeline we're recovering from can change, or we can be
+		 * promoted. In either case, the current timeline becomes historic. We
+		 * need to detect that so that we don't try to stream past the point
+		 * where we switched to another timeline. We check for promotion or
+		 * timeline switch after calculating FlushPtr, to avoid a race
+		 * condition: if the timeline becomes historic just after we checked
+		 * that it was still current, it would still be OK to stream it up to
+		 * the FlushPtr that was calculated before it became historic.
+		 */
+		bool		becameHistoric = false;
+
+		SendRqstPtr = GetStandbyFlushRecPtr();
+
+		if (!RecoveryInProgress())
+		{
+			/*
+			 * We have been promoted. RecoveryInProgress() updated
+			 * ThisTimeLineID to the new current timeline.
+			 */
+			am_cascading_walsender = false;
+			becameHistoric = true;
+		}
+		else
+		{
+			/*
+			 * Still a cascading standby. But is the timeline we're sending
+			 * still the one recovery is recovering from? ThisTimeLineID was
+			 * updated by the GetStandbyFlushRecPtr() call above.
+			 */
+			if (sendTimeLine != ThisTimeLineID)
+				becameHistoric = true;
+		}
+
+		if (becameHistoric)
+		{
+			/*
+			 * The timeline we were sending has become historic. Read the
+			 * timeline history file of the new timeline to see where exactly
+			 * we forked off from the timeline we were sending.
+			 */
+			List	   *history;
+
+			history = readTimeLineHistory(ThisTimeLineID);
+			sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI);
+
+			Assert(sendTimeLine < sendTimeLineNextTLI);
+			list_free_deep(history);
+
+			sendTimeLineIsHistoric = true;
+
+			SendRqstPtr = sendTimeLineValidUpto;
+		}
+	}
+	else
+	{
+		/*
+		 * Streaming the current timeline on a primary.
+		 *
+		 * Attempt to send all data that's already been written out and
+		 * fsync'd to disk.  We cannot go further than what's been written out
+		 * given the current implementation of WALRead().  And in any case
+		 * it's unsafe to send WAL that is not securely down to disk on the
+		 * primary: if the primary subsequently crashes and restarts, standbys
+		 * must not have applied any WAL that got lost on the primary.
+		 */
+		SendRqstPtr = GetFlushRecPtr();
+	}
+
+	/*
+	 * Record the current system time as an approximation of the time at which
+	 * this WAL location was written for the purposes of lag tracking.
+	 *
+	 * In theory we could make XLogFlush() record a time in shmem whenever WAL
+	 * is flushed and we could get that time as well as the LSN when we call
+	 * GetFlushRecPtr() above (and likewise for the cascading standby
+	 * equivalent), but rather than putting any new code into the hot WAL path
+	 * it seems good enough to capture the time here.  We should reach this
+	 * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that
+	 * may take some time, we read the WAL flush pointer and take the time
+	 * very close to together here so that we'll get a later position if it is
+	 * still moving.
+	 *
+	 * Because LagTrackerWrite ignores samples when the LSN hasn't advanced,
+	 * this gives us a cheap approximation for the WAL flush time for this
+	 * LSN.
+	 *
+	 * Note that the LSN is not necessarily the LSN for the data contained in
+	 * the present message; it's the end of the WAL, which might be further
+	 * ahead.  All the lag tracking machinery cares about is finding out when
+	 * that arbitrary LSN is eventually reported as written, flushed and
+	 * applied, so that it can measure the elapsed time.
+	 */
+	LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp());
+
+	/*
+	 * If this is a historic timeline and we've reached the point where we
+	 * forked to the next timeline, stop streaming.
+	 *
+	 * Note: We might already have sent WAL > sendTimeLineValidUpto. The
+	 * startup process will normally replay all WAL that has been received
+	 * from the primary, before promoting, but if the WAL streaming is
+	 * terminated at a WAL page boundary, the valid portion of the timeline
+	 * might end in the middle of a WAL record. We might've already sent the
+	 * first half of that partial WAL record to the cascading standby, so that
+	 * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't
+	 * replay the partial WAL record either, so it can still follow our
+	 * timeline switch.
+	 */
+	if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr)
+	{
+		/* close the current file. */
+		if (xlogreader->seg.ws_file >= 0)
+			wal_segment_close(xlogreader);
+
+		/* Send CopyDone */
+		pq_putmessage_noblock('c', NULL, 0);
+		streamingDoneSending = true;
+
+		WalSndCaughtUp = true;
+
+		elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
+			 LSN_FORMAT_ARGS(sendTimeLineValidUpto),
+			 LSN_FORMAT_ARGS(sentPtr));
+		return;
+	}
+
+	/* Do we have any work to do? */
+	Assert(sentPtr <= SendRqstPtr);
+	if (SendRqstPtr <= sentPtr)
+	{
+		WalSndCaughtUp = true;
+		return;
+	}
+
+	/*
+	 * Figure out how much to send in one message. If there's no more than
+	 * MAX_SEND_SIZE bytes to send, send everything. Otherwise send
+	 * MAX_SEND_SIZE bytes, but round back to logfile or page boundary.
+	 *
+	 * The rounding is not only for performance reasons. Walreceiver relies on
+	 * the fact that we never split a WAL record across two messages. Since a
+	 * long WAL record is split at page boundary into continuation records,
+	 * page boundary is always a safe cut-off point. We also assume that
+	 * SendRqstPtr never points to the middle of a WAL record.
+	 */
+	startptr = sentPtr;
+	endptr = startptr;
+	endptr += MAX_SEND_SIZE;
+
+	/* if we went beyond SendRqstPtr, back off */
+	if (SendRqstPtr <= endptr)
+	{
+		endptr = SendRqstPtr;
+		if (sendTimeLineIsHistoric)
+			WalSndCaughtUp = false;
+		else
+			WalSndCaughtUp = true;
+	}
+	else
+	{
+		/* round down to page boundary. */
+		endptr -= (endptr % XLOG_BLCKSZ);
+		WalSndCaughtUp = false;
+	}
+
+	nbytes = endptr - startptr;
+	Assert(nbytes <= MAX_SEND_SIZE);
+
+	/* always true */
+	if (am_wal_proposer)
+	{
+		WalProposerBroadcast(startptr, endptr);
+	}
+	else
+	{
+		/* code removed for brevity */
+	}
+	sentPtr = endptr;
+
+	/* Update shared memory status */
+	{
+		WalSnd	   *walsnd = MyWalSnd;
+
+		SpinLockAcquire(&walsnd->mutex);
+		walsnd->sentPtr = sentPtr;
+		SpinLockRelease(&walsnd->mutex);
+	}
+
+	/* Report progress of XLOG streaming in PS display */
+	if (update_process_title)
+	{
+		char		activitymsg[50];
+
+		snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X",
+				 LSN_FORMAT_ARGS(sentPtr));
+		set_ps_display(activitymsg);
+	}
+}
+
diff --git a/pgxn/neon/walproposer_utils.h b/pgxn/neon/walproposer_utils.h
new file mode 100644
index 0000000000..4771d3ff82
--- /dev/null
+++ b/pgxn/neon/walproposer_utils.h
@@ -0,0 +1,19 @@
+#ifndef __NEON_WALPROPOSER_UTILS_H__
+#define __NEON_WALPROPOSER_UTILS_H__
+
+#include "walproposer.h"
+
+int CompareLsn(const void *a, const void *b);
+char* FormatSafekeeperState(SafekeeperState state);
+void AssertEventsOkForState(uint32 events, Safekeeper* sk);
+uint32 SafekeeperStateDesiredEvents(SafekeeperState state);
+char* FormatEvents(uint32 events);
+bool HexDecodeString(uint8 *result, char *input, int nbytes);
+uint32 pq_getmsgint32_le(StringInfo msg);
+uint64 pq_getmsgint64_le(StringInfo msg);
+void pq_sendint32_le(StringInfo buf, uint32 i);
+void pq_sendint64_le(StringInfo buf, uint64 i);
+void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
+void XLogWalPropClose(XLogRecPtr recptr);
+
+#endif							/* __NEON_WALPROPOSER_UTILS_H__ */
diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile
new file mode 100644
index 0000000000..9c774ec185
--- /dev/null
+++ b/pgxn/neon_test_utils/Makefile
@@ -0,0 +1,15 @@
+# pgxs/neon_test_utils/Makefile
+
+
+MODULE_big = neon_test_utils
+OBJS = \
+	$(WIN32RES) \
+	neontest.o
+
+EXTENSION = neon_test_utils
+DATA = neon_test_utils--1.0.sql
+PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"
+
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.0.sql
new file mode 100644
index 0000000000..402981a9a6
--- /dev/null
+++ b/pgxn/neon_test_utils/neon_test_utils--1.0.sql
@@ -0,0 +1,29 @@
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION neon_test_utils" to load this file.
\quit + +CREATE FUNCTION test_consume_xids(nxids int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_xids' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION clear_buffer_cache() +RETURNS VOID +AS 'MODULE_PATHNAME', 'clear_buffer_cache' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION neon_xlogflush(lsn pg_lsn) +RETURNS VOID +AS 'MODULE_PATHNAME', 'neon_xlogflush' +LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control new file mode 100644 index 0000000000..94e6720503 --- /dev/null +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -0,0 +1,5 @@ +# neon_test_utils extension +comment = 'helpers for neon testing and debugging' +default_version = '1.0' +module_pathname = '$libdir/neon_test_utils' +relocatable = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c new file mode 100644 index 0000000000..3e30065cd3 --- /dev/null +++ b/pgxn/neon_test_utils/neontest.c @@ -0,0 +1,304 @@ +/*------------------------------------------------------------------------- + * + * neontest.c + * Helpers for neon testing and debugging + * + * IDENTIFICATION + * contrib/neon_test_utils/neontest.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relation.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/namespace.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "utils/builtins.h" 
+#include "utils/pg_lsn.h" +#include "utils/rel.h" +#include "utils/varlena.h" +#include "../neon/pagestore_client.h" + +PG_MODULE_MAGIC; + +extern void _PG_init(void); + +PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(clear_buffer_cache); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); +PG_FUNCTION_INFO_V1(neon_xlogflush); + +/* + * Linkage to functions in zenith module. + * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c + */ +typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + +static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; + +/* + * Module initialize function: fetch function pointers for cross-module calls. + */ +void +_PG_init(void) +{ + /* Asserts verify that typedefs above match original declarations */ + AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type); + zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type) + load_external_function("$libdir/neon", "zenith_read_at_lsn", + true, NULL); +} + +#define zenith_read_at_lsn zenith_read_at_lsn_ptr + +/* + * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. + */ +Datum +test_consume_xids(PG_FUNCTION_ARGS) +{ + int32 nxids = PG_GETARG_INT32(0); + TransactionId topxid; + FullTransactionId fullxid; + TransactionId xid; + TransactionId targetxid; + + /* make sure we have a top-XID first */ + topxid = GetTopTransactionId(); + + xid = ReadNextTransactionId(); + + targetxid = xid + nxids; + while (targetxid < FirstNormalTransactionId) + targetxid++; + + while (TransactionIdPrecedes(xid, targetxid)) + { + fullxid = GetNewTransactionId(true); + xid = XidFromFullTransactionId(fullxid); + elog(DEBUG1, "topxid: %u xid: %u", topxid, xid); + } + + PG_RETURN_VOID(); +} + +/* + * Flush the buffer cache, evicting all pages that are not currently pinned. 
+ */ +Datum +clear_buffer_cache(PG_FUNCTION_ARGS) +{ + bool save_zenith_test_evict; + + /* + * Temporarily set the zenith_test_evict GUC, so that when we pin and + * unpin a buffer, the buffer is evicted. We use that hack to evict all + * buffers, as there is no explicit "evict this buffer" function in the + * buffer manager. + */ + save_zenith_test_evict = zenith_test_evict; + zenith_test_evict = true; + PG_TRY(); + { + /* Scan through all the buffers */ + for (int i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr; + uint32 buf_state; + Buffer bufferid; + bool isvalid; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blocknum; + + /* Peek into the buffer header to see what page it holds. */ + bufHdr = GetBufferDescriptor(i); + buf_state = LockBufHdr(bufHdr); + + if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) + isvalid = true; + else + isvalid = false; + bufferid = BufferDescriptorGetBuffer(bufHdr); + rnode = bufHdr->tag.rnode; + forknum = bufHdr->tag.forkNum; + blocknum = bufHdr->tag.blockNum; + + UnlockBufHdr(bufHdr, buf_state); + + /* + * Pin the buffer, and release it again. Because we have + * zenith_test_evict==true, this will evict the page from + * the buffer cache if no one else is holding a pin on it. + */ + if (isvalid) + { + if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid)) + ReleaseBuffer(bufferid); + } + } + } + PG_FINALLY(); + { + /* restore the GUC */ + zenith_test_evict = save_zenith_test_evict; + } + PG_END_TRY(); + + PG_RETURN_VOID(); +} + + +/* + * Reads the page from page server without buffer cache + * usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN + * NULL read lsn will result in reading the latest version. 
+ * + * Note: reading latest version will result in waiting for latest changes to reach the page server, + * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page + */ +Datum +get_raw_page_at_lsn(PG_FUNCTION_ARGS) +{ + bytea *raw_page; + ForkNumber forknum; + RangeVar *relrv; + Relation rel; + char *raw_page_data; + text *relname; + text *forkname; + uint32 blkno; + + bool request_latest = PG_ARGISNULL(3); + uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) + PG_RETURN_NULL(); + + relname = PG_GETARG_TEXT_PP(0); + forkname = PG_GETARG_TEXT_PP(1); + blkno = PG_GETARG_UINT32(2); + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to use raw page functions"))); + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); + + /* Check that this relation has storage */ + if (rel->rd_rel->relkind == RELKIND_VIEW) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from view \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from composite type \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from foreign table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned index 
\"%s\"", + RelationGetRelationName(rel)))); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + + forknum = forkname_to_number(text_to_cstring(forkname)); + + /* Initialize buffer to copy to */ + raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page_data = VARDATA(raw_page); + + zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); + + relation_close(rel, AccessShareLock); + + PG_RETURN_BYTEA_P(raw_page); +} + +/* + * Another option to read a relation page from page server without cache + * this version doesn't validate input and allows reading blocks of dropped relations + * + * Note: reading latest version will result in waiting for latest changes to reach the page server, + * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page + */ +Datum +get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) +{ + char *raw_page_data; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to use raw page functions"))); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || + PG_ARGISNULL(3) || PG_ARGISNULL(4)) + PG_RETURN_NULL(); + + { + RelFileNode rnode = { + .spcNode = PG_GETARG_OID(0), + .dbNode = PG_GETARG_OID(1), + .relNode = PG_GETARG_OID(2) + }; + + ForkNumber forknum = PG_GETARG_UINT32(3); + + uint32 blkno = PG_GETARG_UINT32(4); + bool request_latest = PG_ARGISNULL(5); + uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + + + /* Initialize buffer to copy to */ + bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page_data = VARDATA(raw_page); + + zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); + PG_RETURN_BYTEA_P(raw_page); + } +} + +/* + * Directly calls XLogFlush(lsn) to flush WAL buffers. + */ +Datum +neon_xlogflush(PG_FUNCTION_ARGS) +{ + XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogFlush(lsn); + PG_RETURN_VOID(); +} diff --git a/poetry.lock b/poetry.lock index 6ab6bb0e20..2af0d97511 100644 --- a/poetry.lock +++ b/poetry.lock @@ -13,6 +13,32 @@ psycopg2-binary = ">=2.8.4" [package.extras] sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"] +[[package]] +name = "allure-pytest" +version = "2.10.0" +description = "Allure pytest integration" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +allure-python-commons = "2.10.0" +pytest = ">=4.5.0" +six = ">=1.9.0" + +[[package]] +name = "allure-python-commons" +version = "2.10.0" +description = "Common module for integrate allure with python-based frameworks" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +attrs = ">=16.0.0" +pluggy = ">=0.4.0" +six = ">=1.9.0" + [[package]] name = "async-timeout" version = "4.0.2" @@ -30,9 +56,9 @@ optional = false python-versions = ">=3.6.0" [package.extras] -dev = ["Cython (>=0.29.24,<0.30.0)", "pytest (>=6.0)", "Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "pycodestyle (>=2.7.0,<2.8.0)", "flake8 (>=3.9.2,<3.10.0)", "uvloop (>=0.15.3)"] -docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)"] -test = ["pycodestyle (>=2.7.0,<2.8.0)", "flake8 (>=3.9.2,<3.10.0)", "uvloop (>=0.15.3)"] +dev = ["Cython (>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "flake8 
(>=3.9.2,<3.10.0)", "pycodestyle (>=2.7.0,<2.8.0)", "pytest (>=6.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "uvloop (>=0.15.3)"] +docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] +test = ["flake8 (>=3.9.2,<3.10.0)", "pycodestyle (>=2.7.0,<2.8.0)", "uvloop (>=0.15.3)"] [[package]] name = "atomicwrites" @@ -51,10 +77,10 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [package.extras] -dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"] -docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] -tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] -tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"] +dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] +docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] +tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] +tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "aws-sam-translator" @@ -69,7 +95,7 @@ boto3 = ">=1.19.5,<2.0.0" jsonschema = ">=3.2,<4.0" [package.extras] -dev = ["coverage (>=5.3,<6.0)", "flake8 (>=3.8.4,<3.9.0)", "tox (>=3.24,<4.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-xdist (>=2.5,<3.0)", "pytest-env (>=0.6.2,<0.7.0)", "pylint 
(>=2.9.0,<2.10.0)", "pyyaml (>=5.4,<6.0)", "pytest (>=6.2.5,<6.3.0)", "parameterized (>=0.7.4,<0.8.0)", "click (>=7.1,<8.0)", "dateparser (>=0.7,<1.0)", "boto3 (>=1.23,<2)", "tenacity (>=7.0.0,<7.1.0)", "requests (>=2.24.0,<2.25.0)", "docopt (>=0.6.2,<0.7.0)", "black (==20.8b1)"] +dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.9.0,<2.10.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml (>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)"] [[package]] name = "aws-xray-sdk" @@ -91,6 +117,28 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "black" +version = "22.6.0" +description = "The uncompromising code formatter." 
+category = "dev" +optional = false +python-versions = ">=3.6.2" + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""} +typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + [[package]] name = "boto3" version = "1.24.38" @@ -109,8 +157,8 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "boto3-stubs" -version = "1.24.46" -description = "Type annotations for boto3 1.24.46 generated with mypy-boto3-builder 7.11.3" +version = "1.24.58" +description = "Type annotations for boto3 1.24.58 generated with mypy-boto3-builder 7.11.7" category = "main" optional = false python-versions = ">=3.7" @@ -127,7 +175,7 @@ account = ["mypy-boto3-account (>=1.24.0,<1.25.0)"] acm = ["mypy-boto3-acm (>=1.24.0,<1.25.0)"] acm-pca = ["mypy-boto3-acm-pca (>=1.24.0,<1.25.0)"] alexaforbusiness = ["mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)"] -all = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)", "mypy-boto3-account (>=1.24.0,<1.25.0)", "mypy-boto3-acm (>=1.24.0,<1.25.0)", "mypy-boto3-acm-pca (>=1.24.0,<1.25.0)", "mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)", "mypy-boto3-amp (>=1.24.0,<1.25.0)", "mypy-boto3-amplify (>=1.24.0,<1.25.0)", "mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)", "mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)", "mypy-boto3-apigateway (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)", "mypy-boto3-appconfig (>=1.24.0,<1.25.0)", "mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)", "mypy-boto3-appflow (>=1.24.0,<1.25.0)", "mypy-boto3-appintegrations (>=1.24.0,<1.25.0)", "mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)", 
"mypy-boto3-application-insights (>=1.24.0,<1.25.0)", "mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-appmesh (>=1.24.0,<1.25.0)", "mypy-boto3-apprunner (>=1.24.0,<1.25.0)", "mypy-boto3-appstream (>=1.24.0,<1.25.0)", "mypy-boto3-appsync (>=1.24.0,<1.25.0)", "mypy-boto3-athena (>=1.24.0,<1.25.0)", "mypy-boto3-auditmanager (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)", "mypy-boto3-backup (>=1.24.0,<1.25.0)", "mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)", "mypy-boto3-batch (>=1.24.0,<1.25.0)", "mypy-boto3-billingconductor (>=1.24.0,<1.25.0)", "mypy-boto3-braket (>=1.24.0,<1.25.0)", "mypy-boto3-budgets (>=1.24.0,<1.25.0)", "mypy-boto3-ce (>=1.24.0,<1.25.0)", "mypy-boto3-chime (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)", "mypy-boto3-cloud9 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)", "mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-cloudfront (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)", "mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)", "mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)", "mypy-boto3-codeartifact (>=1.24.0,<1.25.0)", "mypy-boto3-codebuild (>=1.24.0,<1.25.0)", "mypy-boto3-codecommit (>=1.24.0,<1.25.0)", "mypy-boto3-codedeploy (>=1.24.0,<1.25.0)", "mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)", "mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-codepipeline (>=1.24.0,<1.25.0)", "mypy-boto3-codestar (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)", 
"mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)", "mypy-boto3-comprehend (>=1.24.0,<1.25.0)", "mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)", "mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)", "mypy-boto3-config (>=1.24.0,<1.25.0)", "mypy-boto3-connect (>=1.24.0,<1.25.0)", "mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)", "mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)", "mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)", "mypy-boto3-cur (>=1.24.0,<1.25.0)", "mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)", "mypy-boto3-databrew (>=1.24.0,<1.25.0)", "mypy-boto3-dataexchange (>=1.24.0,<1.25.0)", "mypy-boto3-datapipeline (>=1.24.0,<1.25.0)", "mypy-boto3-datasync (>=1.24.0,<1.25.0)", "mypy-boto3-dax (>=1.24.0,<1.25.0)", "mypy-boto3-detective (>=1.24.0,<1.25.0)", "mypy-boto3-devicefarm (>=1.24.0,<1.25.0)", "mypy-boto3-devops-guru (>=1.24.0,<1.25.0)", "mypy-boto3-directconnect (>=1.24.0,<1.25.0)", "mypy-boto3-discovery (>=1.24.0,<1.25.0)", "mypy-boto3-dlm (>=1.24.0,<1.25.0)", "mypy-boto3-dms (>=1.24.0,<1.25.0)", "mypy-boto3-docdb (>=1.24.0,<1.25.0)", "mypy-boto3-drs (>=1.24.0,<1.25.0)", "mypy-boto3-ds (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)", "mypy-boto3-ebs (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)", "mypy-boto3-ecr (>=1.24.0,<1.25.0)", "mypy-boto3-ecr-public (>=1.24.0,<1.25.0)", "mypy-boto3-ecs (>=1.24.0,<1.25.0)", "mypy-boto3-efs (>=1.24.0,<1.25.0)", "mypy-boto3-eks (>=1.24.0,<1.25.0)", "mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)", "mypy-boto3-elasticache (>=1.24.0,<1.25.0)", "mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)", "mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)", "mypy-boto3-elb (>=1.24.0,<1.25.0)", "mypy-boto3-elbv2 (>=1.24.0,<1.25.0)", "mypy-boto3-emr (>=1.24.0,<1.25.0)", "mypy-boto3-emr-containers 
(>=1.24.0,<1.25.0)", "mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-es (>=1.24.0,<1.25.0)", "mypy-boto3-events (>=1.24.0,<1.25.0)", "mypy-boto3-evidently (>=1.24.0,<1.25.0)", "mypy-boto3-finspace (>=1.24.0,<1.25.0)", "mypy-boto3-finspace-data (>=1.24.0,<1.25.0)", "mypy-boto3-firehose (>=1.24.0,<1.25.0)", "mypy-boto3-fis (>=1.24.0,<1.25.0)", "mypy-boto3-fms (>=1.24.0,<1.25.0)", "mypy-boto3-forecast (>=1.24.0,<1.25.0)", "mypy-boto3-forecastquery (>=1.24.0,<1.25.0)", "mypy-boto3-frauddetector (>=1.24.0,<1.25.0)", "mypy-boto3-fsx (>=1.24.0,<1.25.0)", "mypy-boto3-gamelift (>=1.24.0,<1.25.0)", "mypy-boto3-gamesparks (>=1.24.0,<1.25.0)", "mypy-boto3-glacier (>=1.24.0,<1.25.0)", "mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)", "mypy-boto3-glue (>=1.24.0,<1.25.0)", "mypy-boto3-grafana (>=1.24.0,<1.25.0)", "mypy-boto3-greengrass (>=1.24.0,<1.25.0)", "mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)", "mypy-boto3-groundstation (>=1.24.0,<1.25.0)", "mypy-boto3-guardduty (>=1.24.0,<1.25.0)", "mypy-boto3-health (>=1.24.0,<1.25.0)", "mypy-boto3-healthlake (>=1.24.0,<1.25.0)", "mypy-boto3-honeycode (>=1.24.0,<1.25.0)", "mypy-boto3-iam (>=1.24.0,<1.25.0)", "mypy-boto3-identitystore (>=1.24.0,<1.25.0)", "mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)", "mypy-boto3-importexport (>=1.24.0,<1.25.0)", "mypy-boto3-inspector (>=1.24.0,<1.25.0)", "mypy-boto3-inspector2 (>=1.24.0,<1.25.0)", "mypy-boto3-iot (>=1.24.0,<1.25.0)", "mypy-boto3-iot-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)", "mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)", "mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)", "mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)", "mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)", "mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)", 
"mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)", "mypy-boto3-iotwireless (>=1.24.0,<1.25.0)", "mypy-boto3-ivs (>=1.24.0,<1.25.0)", "mypy-boto3-ivschat (>=1.24.0,<1.25.0)", "mypy-boto3-kafka (>=1.24.0,<1.25.0)", "mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-kendra (>=1.24.0,<1.25.0)", "mypy-boto3-keyspaces (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)", "mypy-boto3-kms (>=1.24.0,<1.25.0)", "mypy-boto3-lakeformation (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-lex-models (>=1.24.0,<1.25.0)", "mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)", "mypy-boto3-lightsail (>=1.24.0,<1.25.0)", "mypy-boto3-location (>=1.24.0,<1.25.0)", "mypy-boto3-logs (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)", "mypy-boto3-m2 (>=1.24.0,<1.25.0)", "mypy-boto3-machinelearning (>=1.24.0,<1.25.0)", "mypy-boto3-macie (>=1.24.0,<1.25.0)", "mypy-boto3-macie2 (>=1.24.0,<1.25.0)", "mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)", "mypy-boto3-medialive (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage-vod 
(>=1.24.0,<1.25.0)", "mypy-boto3-mediastore (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)", "mypy-boto3-mediatailor (>=1.24.0,<1.25.0)", "mypy-boto3-memorydb (>=1.24.0,<1.25.0)", "mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)", "mypy-boto3-mgh (>=1.24.0,<1.25.0)", "mypy-boto3-mgn (>=1.24.0,<1.25.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)", "mypy-boto3-mobile (>=1.24.0,<1.25.0)", "mypy-boto3-mq (>=1.24.0,<1.25.0)", "mypy-boto3-mturk (>=1.24.0,<1.25.0)", "mypy-boto3-mwaa (>=1.24.0,<1.25.0)", "mypy-boto3-neptune (>=1.24.0,<1.25.0)", "mypy-boto3-network-firewall (>=1.24.0,<1.25.0)", "mypy-boto3-networkmanager (>=1.24.0,<1.25.0)", "mypy-boto3-nimble (>=1.24.0,<1.25.0)", "mypy-boto3-opensearch (>=1.24.0,<1.25.0)", "mypy-boto3-opsworks (>=1.24.0,<1.25.0)", "mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)", "mypy-boto3-organizations (>=1.24.0,<1.25.0)", "mypy-boto3-outposts (>=1.24.0,<1.25.0)", "mypy-boto3-panorama (>=1.24.0,<1.25.0)", "mypy-boto3-personalize (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-events (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-pi (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)", "mypy-boto3-polly (>=1.24.0,<1.25.0)", "mypy-boto3-pricing (>=1.24.0,<1.25.0)", "mypy-boto3-proton (>=1.24.0,<1.25.0)", "mypy-boto3-qldb (>=1.24.0,<1.25.0)", "mypy-boto3-qldb-session (>=1.24.0,<1.25.0)", "mypy-boto3-quicksight (>=1.24.0,<1.25.0)", "mypy-boto3-ram (>=1.24.0,<1.25.0)", "mypy-boto3-rbin (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-rds-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-serverless 
(>=1.24.0,<1.25.0)", "mypy-boto3-rekognition (>=1.24.0,<1.25.0)", "mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)", "mypy-boto3-resource-groups (>=1.24.0,<1.25.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)", "mypy-boto3-robomaker (>=1.24.0,<1.25.0)", "mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)", "mypy-boto3-route53 (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)", "mypy-boto3-route53domains (>=1.24.0,<1.25.0)", "mypy-boto3-route53resolver (>=1.24.0,<1.25.0)", "mypy-boto3-rum (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-s3control (>=1.24.0,<1.25.0)", "mypy-boto3-s3outposts (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-savingsplans (>=1.24.0,<1.25.0)", "mypy-boto3-schemas (>=1.24.0,<1.25.0)", "mypy-boto3-sdb (>=1.24.0,<1.25.0)", "mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)", "mypy-boto3-securityhub (>=1.24.0,<1.25.0)", "mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)", "mypy-boto3-service-quotas (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)", "mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)", "mypy-boto3-ses (>=1.24.0,<1.25.0)", "mypy-boto3-sesv2 (>=1.24.0,<1.25.0)", "mypy-boto3-shield (>=1.24.0,<1.25.0)", "mypy-boto3-signer (>=1.24.0,<1.25.0)", "mypy-boto3-sms (>=1.24.0,<1.25.0)", "mypy-boto3-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)", "mypy-boto3-snowball (>=1.24.0,<1.25.0)", "mypy-boto3-sns (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)", "mypy-boto3-ssm (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-contacts 
(>=1.24.0,<1.25.0)", "mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)", "mypy-boto3-sso (>=1.24.0,<1.25.0)", "mypy-boto3-sso-admin (>=1.24.0,<1.25.0)", "mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)", "mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)", "mypy-boto3-storagegateway (>=1.24.0,<1.25.0)", "mypy-boto3-sts (>=1.24.0,<1.25.0)", "mypy-boto3-support (>=1.24.0,<1.25.0)", "mypy-boto3-swf (>=1.24.0,<1.25.0)", "mypy-boto3-synthetics (>=1.24.0,<1.25.0)", "mypy-boto3-textract (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-query (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-write (>=1.24.0,<1.25.0)", "mypy-boto3-transcribe (>=1.24.0,<1.25.0)", "mypy-boto3-transfer (>=1.24.0,<1.25.0)", "mypy-boto3-translate (>=1.24.0,<1.25.0)", "mypy-boto3-voice-id (>=1.24.0,<1.25.0)", "mypy-boto3-waf (>=1.24.0,<1.25.0)", "mypy-boto3-waf-regional (>=1.24.0,<1.25.0)", "mypy-boto3-wafv2 (>=1.24.0,<1.25.0)", "mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)", "mypy-boto3-wisdom (>=1.24.0,<1.25.0)", "mypy-boto3-workdocs (>=1.24.0,<1.25.0)", "mypy-boto3-worklink (>=1.24.0,<1.25.0)", "mypy-boto3-workmail (>=1.24.0,<1.25.0)", "mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)", "mypy-boto3-xray (>=1.24.0,<1.25.0)"] +all = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)", "mypy-boto3-account (>=1.24.0,<1.25.0)", "mypy-boto3-acm (>=1.24.0,<1.25.0)", "mypy-boto3-acm-pca (>=1.24.0,<1.25.0)", "mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)", "mypy-boto3-amp (>=1.24.0,<1.25.0)", "mypy-boto3-amplify (>=1.24.0,<1.25.0)", "mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)", "mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)", "mypy-boto3-apigateway (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)", "mypy-boto3-appconfig (>=1.24.0,<1.25.0)", "mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)", "mypy-boto3-appflow (>=1.24.0,<1.25.0)", "mypy-boto3-appintegrations 
(>=1.24.0,<1.25.0)", "mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-application-insights (>=1.24.0,<1.25.0)", "mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-appmesh (>=1.24.0,<1.25.0)", "mypy-boto3-apprunner (>=1.24.0,<1.25.0)", "mypy-boto3-appstream (>=1.24.0,<1.25.0)", "mypy-boto3-appsync (>=1.24.0,<1.25.0)", "mypy-boto3-athena (>=1.24.0,<1.25.0)", "mypy-boto3-auditmanager (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)", "mypy-boto3-backup (>=1.24.0,<1.25.0)", "mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)", "mypy-boto3-backupstorage (>=1.24.0,<1.25.0)", "mypy-boto3-batch (>=1.24.0,<1.25.0)", "mypy-boto3-billingconductor (>=1.24.0,<1.25.0)", "mypy-boto3-braket (>=1.24.0,<1.25.0)", "mypy-boto3-budgets (>=1.24.0,<1.25.0)", "mypy-boto3-ce (>=1.24.0,<1.25.0)", "mypy-boto3-chime (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)", "mypy-boto3-cloud9 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)", "mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-cloudfront (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)", "mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)", "mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)", "mypy-boto3-codeartifact (>=1.24.0,<1.25.0)", "mypy-boto3-codebuild (>=1.24.0,<1.25.0)", "mypy-boto3-codecommit (>=1.24.0,<1.25.0)", "mypy-boto3-codedeploy (>=1.24.0,<1.25.0)", "mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)", "mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-codepipeline (>=1.24.0,<1.25.0)", "mypy-boto3-codestar (>=1.24.0,<1.25.0)", 
"mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)", "mypy-boto3-comprehend (>=1.24.0,<1.25.0)", "mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)", "mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)", "mypy-boto3-config (>=1.24.0,<1.25.0)", "mypy-boto3-connect (>=1.24.0,<1.25.0)", "mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)", "mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)", "mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)", "mypy-boto3-cur (>=1.24.0,<1.25.0)", "mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)", "mypy-boto3-databrew (>=1.24.0,<1.25.0)", "mypy-boto3-dataexchange (>=1.24.0,<1.25.0)", "mypy-boto3-datapipeline (>=1.24.0,<1.25.0)", "mypy-boto3-datasync (>=1.24.0,<1.25.0)", "mypy-boto3-dax (>=1.24.0,<1.25.0)", "mypy-boto3-detective (>=1.24.0,<1.25.0)", "mypy-boto3-devicefarm (>=1.24.0,<1.25.0)", "mypy-boto3-devops-guru (>=1.24.0,<1.25.0)", "mypy-boto3-directconnect (>=1.24.0,<1.25.0)", "mypy-boto3-discovery (>=1.24.0,<1.25.0)", "mypy-boto3-dlm (>=1.24.0,<1.25.0)", "mypy-boto3-dms (>=1.24.0,<1.25.0)", "mypy-boto3-docdb (>=1.24.0,<1.25.0)", "mypy-boto3-drs (>=1.24.0,<1.25.0)", "mypy-boto3-ds (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)", "mypy-boto3-ebs (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)", "mypy-boto3-ecr (>=1.24.0,<1.25.0)", "mypy-boto3-ecr-public (>=1.24.0,<1.25.0)", "mypy-boto3-ecs (>=1.24.0,<1.25.0)", "mypy-boto3-efs (>=1.24.0,<1.25.0)", "mypy-boto3-eks (>=1.24.0,<1.25.0)", "mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)", "mypy-boto3-elasticache (>=1.24.0,<1.25.0)", "mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)", "mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)", "mypy-boto3-elb (>=1.24.0,<1.25.0)", 
"mypy-boto3-elbv2 (>=1.24.0,<1.25.0)", "mypy-boto3-emr (>=1.24.0,<1.25.0)", "mypy-boto3-emr-containers (>=1.24.0,<1.25.0)", "mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-es (>=1.24.0,<1.25.0)", "mypy-boto3-events (>=1.24.0,<1.25.0)", "mypy-boto3-evidently (>=1.24.0,<1.25.0)", "mypy-boto3-finspace (>=1.24.0,<1.25.0)", "mypy-boto3-finspace-data (>=1.24.0,<1.25.0)", "mypy-boto3-firehose (>=1.24.0,<1.25.0)", "mypy-boto3-fis (>=1.24.0,<1.25.0)", "mypy-boto3-fms (>=1.24.0,<1.25.0)", "mypy-boto3-forecast (>=1.24.0,<1.25.0)", "mypy-boto3-forecastquery (>=1.24.0,<1.25.0)", "mypy-boto3-frauddetector (>=1.24.0,<1.25.0)", "mypy-boto3-fsx (>=1.24.0,<1.25.0)", "mypy-boto3-gamelift (>=1.24.0,<1.25.0)", "mypy-boto3-gamesparks (>=1.24.0,<1.25.0)", "mypy-boto3-glacier (>=1.24.0,<1.25.0)", "mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)", "mypy-boto3-glue (>=1.24.0,<1.25.0)", "mypy-boto3-grafana (>=1.24.0,<1.25.0)", "mypy-boto3-greengrass (>=1.24.0,<1.25.0)", "mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)", "mypy-boto3-groundstation (>=1.24.0,<1.25.0)", "mypy-boto3-guardduty (>=1.24.0,<1.25.0)", "mypy-boto3-health (>=1.24.0,<1.25.0)", "mypy-boto3-healthlake (>=1.24.0,<1.25.0)", "mypy-boto3-honeycode (>=1.24.0,<1.25.0)", "mypy-boto3-iam (>=1.24.0,<1.25.0)", "mypy-boto3-identitystore (>=1.24.0,<1.25.0)", "mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)", "mypy-boto3-importexport (>=1.24.0,<1.25.0)", "mypy-boto3-inspector (>=1.24.0,<1.25.0)", "mypy-boto3-inspector2 (>=1.24.0,<1.25.0)", "mypy-boto3-iot (>=1.24.0,<1.25.0)", "mypy-boto3-iot-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)", "mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)", "mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)", "mypy-boto3-iotsecuretunneling 
(>=1.24.0,<1.25.0)", "mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)", "mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)", "mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)", "mypy-boto3-iotwireless (>=1.24.0,<1.25.0)", "mypy-boto3-ivs (>=1.24.0,<1.25.0)", "mypy-boto3-ivschat (>=1.24.0,<1.25.0)", "mypy-boto3-kafka (>=1.24.0,<1.25.0)", "mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-kendra (>=1.24.0,<1.25.0)", "mypy-boto3-keyspaces (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)", "mypy-boto3-kms (>=1.24.0,<1.25.0)", "mypy-boto3-lakeformation (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-lex-models (>=1.24.0,<1.25.0)", "mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)", "mypy-boto3-lightsail (>=1.24.0,<1.25.0)", "mypy-boto3-location (>=1.24.0,<1.25.0)", "mypy-boto3-logs (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)", "mypy-boto3-m2 (>=1.24.0,<1.25.0)", "mypy-boto3-machinelearning (>=1.24.0,<1.25.0)", "mypy-boto3-macie (>=1.24.0,<1.25.0)", "mypy-boto3-macie2 (>=1.24.0,<1.25.0)", "mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)", 
"mypy-boto3-medialive (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)", "mypy-boto3-mediatailor (>=1.24.0,<1.25.0)", "mypy-boto3-memorydb (>=1.24.0,<1.25.0)", "mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)", "mypy-boto3-mgh (>=1.24.0,<1.25.0)", "mypy-boto3-mgn (>=1.24.0,<1.25.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)", "mypy-boto3-mobile (>=1.24.0,<1.25.0)", "mypy-boto3-mq (>=1.24.0,<1.25.0)", "mypy-boto3-mturk (>=1.24.0,<1.25.0)", "mypy-boto3-mwaa (>=1.24.0,<1.25.0)", "mypy-boto3-neptune (>=1.24.0,<1.25.0)", "mypy-boto3-network-firewall (>=1.24.0,<1.25.0)", "mypy-boto3-networkmanager (>=1.24.0,<1.25.0)", "mypy-boto3-nimble (>=1.24.0,<1.25.0)", "mypy-boto3-opensearch (>=1.24.0,<1.25.0)", "mypy-boto3-opsworks (>=1.24.0,<1.25.0)", "mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)", "mypy-boto3-organizations (>=1.24.0,<1.25.0)", "mypy-boto3-outposts (>=1.24.0,<1.25.0)", "mypy-boto3-panorama (>=1.24.0,<1.25.0)", "mypy-boto3-personalize (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-events (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-pi (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)", "mypy-boto3-polly (>=1.24.0,<1.25.0)", "mypy-boto3-pricing (>=1.24.0,<1.25.0)", "mypy-boto3-privatenetworks (>=1.24.0,<1.25.0)", "mypy-boto3-proton (>=1.24.0,<1.25.0)", "mypy-boto3-qldb (>=1.24.0,<1.25.0)", "mypy-boto3-qldb-session (>=1.24.0,<1.25.0)", "mypy-boto3-quicksight (>=1.24.0,<1.25.0)", "mypy-boto3-ram (>=1.24.0,<1.25.0)", "mypy-boto3-rbin (>=1.24.0,<1.25.0)", "mypy-boto3-rds 
(>=1.24.0,<1.25.0)", "mypy-boto3-rds-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-rekognition (>=1.24.0,<1.25.0)", "mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)", "mypy-boto3-resource-groups (>=1.24.0,<1.25.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)", "mypy-boto3-robomaker (>=1.24.0,<1.25.0)", "mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)", "mypy-boto3-route53 (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)", "mypy-boto3-route53domains (>=1.24.0,<1.25.0)", "mypy-boto3-route53resolver (>=1.24.0,<1.25.0)", "mypy-boto3-rum (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-s3control (>=1.24.0,<1.25.0)", "mypy-boto3-s3outposts (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-savingsplans (>=1.24.0,<1.25.0)", "mypy-boto3-schemas (>=1.24.0,<1.25.0)", "mypy-boto3-sdb (>=1.24.0,<1.25.0)", "mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)", "mypy-boto3-securityhub (>=1.24.0,<1.25.0)", "mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)", "mypy-boto3-service-quotas (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)", "mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)", "mypy-boto3-ses (>=1.24.0,<1.25.0)", "mypy-boto3-sesv2 (>=1.24.0,<1.25.0)", "mypy-boto3-shield (>=1.24.0,<1.25.0)", "mypy-boto3-signer (>=1.24.0,<1.25.0)", "mypy-boto3-sms (>=1.24.0,<1.25.0)", "mypy-boto3-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)", 
"mypy-boto3-snowball (>=1.24.0,<1.25.0)", "mypy-boto3-sns (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)", "mypy-boto3-ssm (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)", "mypy-boto3-sso (>=1.24.0,<1.25.0)", "mypy-boto3-sso-admin (>=1.24.0,<1.25.0)", "mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)", "mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)", "mypy-boto3-storagegateway (>=1.24.0,<1.25.0)", "mypy-boto3-sts (>=1.24.0,<1.25.0)", "mypy-boto3-support (>=1.24.0,<1.25.0)", "mypy-boto3-support-app (>=1.24.0,<1.25.0)", "mypy-boto3-swf (>=1.24.0,<1.25.0)", "mypy-boto3-synthetics (>=1.24.0,<1.25.0)", "mypy-boto3-textract (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-query (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-write (>=1.24.0,<1.25.0)", "mypy-boto3-transcribe (>=1.24.0,<1.25.0)", "mypy-boto3-transfer (>=1.24.0,<1.25.0)", "mypy-boto3-translate (>=1.24.0,<1.25.0)", "mypy-boto3-voice-id (>=1.24.0,<1.25.0)", "mypy-boto3-waf (>=1.24.0,<1.25.0)", "mypy-boto3-waf-regional (>=1.24.0,<1.25.0)", "mypy-boto3-wafv2 (>=1.24.0,<1.25.0)", "mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)", "mypy-boto3-wisdom (>=1.24.0,<1.25.0)", "mypy-boto3-workdocs (>=1.24.0,<1.25.0)", "mypy-boto3-worklink (>=1.24.0,<1.25.0)", "mypy-boto3-workmail (>=1.24.0,<1.25.0)", "mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)", "mypy-boto3-xray (>=1.24.0,<1.25.0)"] amp = ["mypy-boto3-amp (>=1.24.0,<1.25.0)"] amplify = ["mypy-boto3-amplify (>=1.24.0,<1.25.0)"] amplifybackend = ["mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)"] @@ -152,6 +200,7 @@ autoscaling = ["mypy-boto3-autoscaling (>=1.24.0,<1.25.0)"] autoscaling-plans = ["mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)"] backup = ["mypy-boto3-backup (>=1.24.0,<1.25.0)"] backup-gateway = ["mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)"] +backupstorage = ["mypy-boto3-backupstorage (>=1.24.0,<1.25.0)"] 
batch = ["mypy-boto3-batch (>=1.24.0,<1.25.0)"] billingconductor = ["mypy-boto3-billingconductor (>=1.24.0,<1.25.0)"] braket = ["mypy-boto3-braket (>=1.24.0,<1.25.0)"] @@ -352,6 +401,7 @@ pinpoint-sms-voice = ["mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)"] pinpoint-sms-voice-v2 = ["mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)"] polly = ["mypy-boto3-polly (>=1.24.0,<1.25.0)"] pricing = ["mypy-boto3-pricing (>=1.24.0,<1.25.0)"] +privatenetworks = ["mypy-boto3-privatenetworks (>=1.24.0,<1.25.0)"] proton = ["mypy-boto3-proton (>=1.24.0,<1.25.0)"] qldb = ["mypy-boto3-qldb (>=1.24.0,<1.25.0)"] qldb-session = ["mypy-boto3-qldb-session (>=1.24.0,<1.25.0)"] @@ -414,6 +464,7 @@ stepfunctions = ["mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)"] storagegateway = ["mypy-boto3-storagegateway (>=1.24.0,<1.25.0)"] sts = ["mypy-boto3-sts (>=1.24.0,<1.25.0)"] support = ["mypy-boto3-support (>=1.24.0,<1.25.0)"] +support-app = ["mypy-boto3-support-app (>=1.24.0,<1.25.0)"] swf = ["mypy-boto3-swf (>=1.24.0,<1.25.0)"] synthetics = ["mypy-boto3-synthetics (>=1.24.0,<1.25.0)"] textract = ["mypy-boto3-textract (>=1.24.0,<1.25.0)"] @@ -551,11 +602,11 @@ cffi = ">=1.12" [package.extras] docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] -docstest = ["pyenchant (>=1.6.11)", "twine (>=1.12.0)", "sphinxcontrib-spelling (>=4.0.1)"] +docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] sdist = ["setuptools_rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] +test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pytz"] [[package]] name = "docker" @@ -573,7 +624,7 @@ websocket-client = ">=0.32.0" [package.extras] 
ssh = ["paramiko (>=2.4.2)"] -tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=1.3.4)", "idna (>=2.0.0)"] +tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] [[package]] name = "ecdsa" @@ -603,16 +654,16 @@ testing = ["pre-commit"] [[package]] name = "flake8" -version = "3.9.2" +version = "5.0.4" description = "the modular source code checker: pep8 pyflakes and co" category = "dev" optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +python-versions = ">=3.6.1" [package.dependencies] -mccabe = ">=0.6.0,<0.7.0" -pycodestyle = ">=2.7.0,<2.8.0" -pyflakes = ">=2.3.0,<2.4.0" +mccabe = ">=0.7.0,<0.8.0" +pycodestyle = ">=2.9.0,<2.10.0" +pyflakes = ">=2.5.0,<2.6.0" [[package]] name = "flask" @@ -673,9 +724,9 @@ python-versions = ">=3.7" zipp = ">=0.5" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"] +docs = ["jaraco.packaging (>=9)", "rst.linker (>=1.9)", "sphinx"] perf = ["ipython"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] +testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] [[package]] name = "iniconfig" @@ -685,6 +736,20 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "isort" +version = "5.10.1" +description = "A Python utility / library to sort Python imports." 
+category = "dev" +optional = false +python-versions = ">=3.6.1,<4.0" + +[package.extras] +colors = ["colorama (>=0.4.3,<0.5.0)"] +pipfile_deprecated_finder = ["pipreqs", "requirementslib"] +plugins = ["setuptools"] +requirements_deprecated_finder = ["pip-api", "pipreqs"] + [[package]] name = "itsdangerous" version = "2.1.2" @@ -756,9 +821,9 @@ optional = false python-versions = ">=2.7" [package.extras] -testing = ["pytest-flake8 (>=1.1.1)", "jsonlib", "enum34", "pytest-flake8 (<1.1.0)", "sqlalchemy", "scikit-learn", "pymongo", "pandas", "numpy", "feedparser", "ecdsa", "pytest-cov", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest (>=3.5,!=3.7.3)"] -"testing.libs" = ["yajl", "ujson", "simplejson"] -docs = ["rst.linker (>=1.9)", "jaraco.packaging (>=3.2)", "sphinx"] +docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] +testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] +"testing.libs" = ["simplejson", "ujson", "yajl"] [[package]] name = "jsonpointer" @@ -783,7 +848,7 @@ six = ">=1.11.0" [package.extras] format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format_nongpl = ["idna", "jsonpointer (>1.13)", "webcolors", "rfc3986-validator (>0.1.0)", "rfc3339-validator"] +format_nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] [[package]] name = "junit-xml" @@ -806,15 +871,15 @@ python-versions = ">=3.7" [[package]] name = "mccabe" -version = "0.6.1" +version = "0.7.0" description = "McCabe checker, plugin for flake8" category = "dev" optional = false -python-versions = "*" +python-versions = ">=3.6" [[package]] name = "moto" -version = "3.1.17" +version = "3.1.18" description = "A library that allows your python tests to easily mock out the boto 
library" category = "main" optional = false @@ -828,7 +893,7 @@ cfn-lint = {version = ">=0.4.0", optional = true, markers = "extra == \"server\" cryptography = ">=3.3.1" docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} -flask = {version = "*", optional = true, markers = "extra == \"server\""} +flask = {version = "<2.2.0", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""} @@ -848,14 +913,14 @@ werkzeug = ">=0.5,<2.2.0" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.7)", "openapi-spec-validator (>=0.2.8)", "setuptools"] -apigateway = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)"] +all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] apigatewayv2 = ["PyYAML (>=5.1)"] appsync = ["graphql-core"] awslambda = ["docker (>=2.5.1)"] batch = ["docker (>=2.5.1)"] -cloudformation = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", 
"aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.7)", "openapi-spec-validator (>=0.2.8)", "setuptools"] -cognitoidp = ["python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] ds = ["sshpubkeys (>=3.1.0)"] dynamodb = ["docker (>=2.5.1)"] dynamodb2 = ["docker (>=2.5.1)"] @@ -867,7 +932,7 @@ glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] route53resolver = ["sshpubkeys (>=3.1.0)"] s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.7)", "openapi-spec-validator (>=0.2.8)", "setuptools", "flask", "flask-cors"] +server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (<2.2.0)", "flask-cors", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] ssm = ["PyYAML (>=5.1)", "dataclasses"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] @@ -917,11 +982,11 @@ optional = false python-versions = ">=3.8" [package.extras] -default = ["numpy (>=1.19)", "scipy (>=1.8)", "matplotlib (>=3.4)", "pandas (>=1.3)"] -developer = ["pre-commit (>=2.19)", "mypy (>=0.960)"] -doc = ["sphinx (>=5)", "pydata-sphinx-theme (>=0.9)", "sphinx-gallery (>=0.10)", 
"numpydoc (>=1.4)", "pillow (>=9.1)", "nb2plots (>=0.6)", "texext (>=0.6.6)"] -extra = ["lxml (>=4.6)", "pygraphviz (>=1.9)", "pydot (>=1.4.2)", "sympy (>=1.10)"] -test = ["pytest (>=7.1)", "pytest-cov (>=3.0)", "codecov (>=2.1)"] +default = ["matplotlib (>=3.4)", "numpy (>=1.19)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=0.960)", "pre-commit (>=2.19)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.4)", "pillow (>=9.1)", "pydata-sphinx-theme (>=0.9)", "sphinx (>=5)", "sphinx-gallery (>=0.10)", "texext (>=0.6.6)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.9)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] [[package]] name = "openapi-schema-validator" @@ -936,8 +1001,8 @@ jsonschema = ">=3.0.0,<5.0.0" [package.extras] isodate = ["isodate"] -strict-rfc3339 = ["strict-rfc3339"] rfc3339-validator = ["rfc3339-validator"] +strict-rfc3339 = ["strict-rfc3339"] [[package]] name = "openapi-spec-validator" @@ -966,6 +1031,14 @@ python-versions = ">=3.6" [package.dependencies] pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" +[[package]] +name = "pathspec" +version = "0.9.0" +description = "Utility library for gitignore style pattern matching of file paths." +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" + [[package]] name = "pbr" version = "5.9.0" @@ -974,6 +1047,18 @@ category = "main" optional = false python-versions = ">=2.6" +[[package]] +name = "platformdirs" +version = "2.5.2" +description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
+category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"] +test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"] + [[package]] name = "pluggy" version = "1.0.0" @@ -1023,11 +1108,11 @@ python-versions = "*" [[package]] name = "pycodestyle" -version = "2.7.0" +version = "2.9.1" description = "Python style guide checker" category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.6" [[package]] name = "pycparser" @@ -1039,11 +1124,11 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [[package]] name = "pyflakes" -version = "2.3.1" +version = "2.5.0" description = "passive checker of Python programs" category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.6" [[package]] name = "pyjwt" @@ -1058,9 +1143,9 @@ cryptography = {version = ">=3.3.1", optional = true, markers = "extra == \"cryp [package.extras] crypto = ["cryptography (>=3.3.1)"] -dev = ["sphinx", "sphinx-rtd-theme", "zope.interface", "cryptography (>=3.3.1)", "pytest (>=6.0.0,<7.0.0)", "coverage[toml] (==5.0.4)", "mypy", "pre-commit"] +dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.3.1)", "mypy", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx", "sphinx-rtd-theme", "zope.interface"] docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] -tests = ["pytest (>=6.0.0,<7.0.0)", "coverage[toml] (==5.0.4)"] +tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] [[package]] name = "pyparsing" @@ -1071,7 +1156,7 @@ optional = false python-versions = ">=3.6.8" [package.extras] -diagrams = ["railroad-diagrams", "jinja2"] +diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pypiwin32" @@ -1113,6 +1198,20 @@ toml = "*" [package.extras] testing = ["argcomplete", "hypothesis (>=3.56)", 
"mock", "nose", "requests", "xmlschema"] +[[package]] +name = "pytest-asyncio" +version = "0.19.0" +description = "Pytest support for asyncio" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +pytest = ">=6.1.0" + +[package.extras] +testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"] + [[package]] name = "pytest-forked" version = "1.4.0" @@ -1206,8 +1305,8 @@ rsa = "*" [package.extras] cryptography = ["cryptography (>=3.4.0)"] -pycrypto = ["pycrypto (>=2.6.0,<2.7.0)", "pyasn1"] -pycryptodome = ["pycryptodome (>=3.3.1,<4.0.0)", "pyasn1"] +pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] +pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] [[package]] name = "pytz" @@ -1264,7 +1363,7 @@ requests = ">=2.0,<3.0" urllib3 = ">=1.25.10" [package.extras] -tests = ["pytest (>=7.0.0)", "coverage (>=6.0.0)", "pytest-cov", "pytest-asyncio", "pytest-localserver", "flake8", "types-mock", "types-requests", "mypy"] +tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"] [[package]] name = "rsa" @@ -1394,8 +1493,8 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" [package.extras] -brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] -secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -1438,14 +1537,6 @@ category = "main" optional = false python-versions = ">=3.4" -[[package]] -name = "yapf" -version = "0.31.0" -description = "A formatter for Python code." 
-category = "dev" -optional = false -python-versions = "*" - [[package]] name = "zipp" version = "3.8.1" @@ -1455,19 +1546,27 @@ optional = false python-versions = ">=3.7" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] +docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"] +testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "e58b30774603aa0f31579899a6c78579329c580f2f4bbaec209b0f9d52079fc6" +content-hash = "ead1495454ee6d880bb240447025db93a25ebe263c2709de5f144cc2d85dc975" [metadata.files] aiopg = [ {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"}, {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"}, ] +allure-pytest = [ + {file = "allure-pytest-2.10.0.tar.gz", hash = "sha256:3b2ab67629f4cbd8617abd817d2b22292c6eb7efd5584f992d1af8143aea6ee7"}, + {file = "allure_pytest-2.10.0-py3-none-any.whl", hash = "sha256:08274096594758447db54c3b2c382526ee04f1fe12119cdaee92d2d93c84b530"}, +] +allure-python-commons = [ + {file = "allure-python-commons-2.10.0.tar.gz", hash = "sha256:d4d31344b0f0037a4a11e16b91b28cf0eeb23ffa0e50c27fcfc6aabe72212d3c"}, + {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"}, +] async-timeout = [ {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, {file = 
"async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, @@ -1507,13 +1606,38 @@ backoff = [ {file = "backoff-1.11.1-py2.py3-none-any.whl", hash = "sha256:61928f8fa48d52e4faa81875eecf308eccfb1016b018bb6bd21e05b5d90a96c5"}, {file = "backoff-1.11.1.tar.gz", hash = "sha256:ccb962a2378418c667b3c979b504fdeb7d9e0d29c0579e3b13b86467177728cb"}, ] +black = [ + {file = "black-22.6.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f586c26118bc6e714ec58c09df0157fe2d9ee195c764f630eb0d8e7ccce72e69"}, + {file = "black-22.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b270a168d69edb8b7ed32c193ef10fd27844e5c60852039599f9184460ce0807"}, + {file = "black-22.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6797f58943fceb1c461fb572edbe828d811e719c24e03375fd25170ada53825e"}, + {file = "black-22.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c85928b9d5f83b23cee7d0efcb310172412fbf7cb9d9ce963bd67fd141781def"}, + {file = "black-22.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:f6fe02afde060bbeef044af7996f335fbe90b039ccf3f5eb8f16df8b20f77666"}, + {file = "black-22.6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cfaf3895a9634e882bf9d2363fed5af8888802d670f58b279b0bece00e9a872d"}, + {file = "black-22.6.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94783f636bca89f11eb5d50437e8e17fbc6a929a628d82304c80fa9cd945f256"}, + {file = "black-22.6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:2ea29072e954a4d55a2ff58971b83365eba5d3d357352a07a7a4df0d95f51c78"}, + {file = "black-22.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e439798f819d49ba1c0bd9664427a05aab79bfba777a6db94fd4e56fae0cb849"}, + {file = "black-22.6.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:187d96c5e713f441a5829e77120c269b6514418f4513a390b0499b0987f2ff1c"}, + {file = "black-22.6.0-cp37-cp37m-win_amd64.whl", hash = 
"sha256:074458dc2f6e0d3dab7928d4417bb6957bb834434516f21514138437accdbe90"}, + {file = "black-22.6.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a218d7e5856f91d20f04e931b6f16d15356db1c846ee55f01bac297a705ca24f"}, + {file = "black-22.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:568ac3c465b1c8b34b61cd7a4e349e93f91abf0f9371eda1cf87194663ab684e"}, + {file = "black-22.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6c1734ab264b8f7929cef8ae5f900b85d579e6cbfde09d7387da8f04771b51c6"}, + {file = "black-22.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9a3ac16efe9ec7d7381ddebcc022119794872abce99475345c5a61aa18c45ad"}, + {file = "black-22.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:b9fd45787ba8aa3f5e0a0a98920c1012c884622c6c920dbe98dbd05bc7c70fbf"}, + {file = "black-22.6.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7ba9be198ecca5031cd78745780d65a3f75a34b2ff9be5837045dce55db83d1c"}, + {file = "black-22.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a3db5b6409b96d9bd543323b23ef32a1a2b06416d525d27e0f67e74f1446c8f2"}, + {file = "black-22.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:560558527e52ce8afba936fcce93a7411ab40c7d5fe8c2463e279e843c0328ee"}, + {file = "black-22.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b154e6bbde1e79ea3260c4b40c0b7b3109ffcdf7bc4ebf8859169a6af72cd70b"}, + {file = "black-22.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:4af5bc0e1f96be5ae9bd7aaec219c901a94d6caa2484c21983d043371c733fc4"}, + {file = "black-22.6.0-py3-none-any.whl", hash = "sha256:ac609cf8ef5e7115ddd07d85d988d074ed00e10fbc3445aee393e70164a2219c"}, + {file = "black-22.6.0.tar.gz", hash = "sha256:6c6d39e28aed379aec40da1c65434c77d75e65bb59a1e1c283de545fb4e7c6c9"}, +] boto3 = [ {file = "boto3-1.24.38-py3-none-any.whl", hash = "sha256:bcf97fd7c494f4e2bbbe2511625500654179c0a6b3bea977d46f97af764e85a4"}, {file = "boto3-1.24.38.tar.gz", hash = 
"sha256:f4c6b025f392c934338c7f01badfddbd0d3cf2397ff5df35c31409798dce33f5"}, ] boto3-stubs = [ - {file = "boto3-stubs-1.24.46.tar.gz", hash = "sha256:9482238ed9ea7794e6e66a41376bf75d5950f0328de09fac9d224906dcc624ef"}, - {file = "boto3_stubs-1.24.46-py3-none-any.whl", hash = "sha256:3aa84f2925b4b50b7f47ac41a11ac05302e744cdf460cb7bcf6488319393d8a4"}, + {file = "boto3-stubs-1.24.58.tar.gz", hash = "sha256:95ab521a9a931cc21d48c97c5bd7de0e37370d9b6a298e3905ec621db9243897"}, + {file = "boto3_stubs-1.24.58-py3-none-any.whl", hash = "sha256:a16940df2a347f7890075af8c0b202b06057bc18ff4c640ef94e09ce4176adb9"}, ] botocore = [ {file = "botocore-1.27.38-py3-none-any.whl", hash = "sha256:46a0264ff3335496bd9cb404f83ec0d8eb7bfdef8f74a830c13e6a6b9612adea"}, @@ -1650,8 +1774,8 @@ execnet = [ {file = "execnet-1.9.0.tar.gz", hash = "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5"}, ] flake8 = [ - {file = "flake8-3.9.2-py2.py3-none-any.whl", hash = "sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"}, - {file = "flake8-3.9.2.tar.gz", hash = "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b"}, + {file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"}, + {file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"}, ] flask = [ {file = "Flask-2.1.3-py3-none-any.whl", hash = "sha256:9013281a7402ad527f8fd56375164f3aa021ecfaff89bfe3825346c24f87e04c"}, @@ -1677,6 +1801,10 @@ iniconfig = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, ] +isort = [ + {file = "isort-5.10.1-py3-none-any.whl", hash = "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7"}, + {file = "isort-5.10.1.tar.gz", hash 
= "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951"}, +] itsdangerous = [ {file = "itsdangerous-2.1.2-py3-none-any.whl", hash = "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44"}, {file = "itsdangerous-2.1.2.tar.gz", hash = "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a"}, @@ -1759,12 +1887,12 @@ markupsafe = [ {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"}, ] mccabe = [ - {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, - {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] moto = [ - {file = "moto-3.1.17-py3-none-any.whl", hash = "sha256:84797321fad9a9e924c1c0385b302c80ec23429724c016b504f4bfca9d40d33a"}, - {file = "moto-3.1.17.tar.gz", hash = "sha256:f2e5b32e8910c51c0b0de5b73f902bc53e06fb1c1d077d2b848d27e0b0cbe65e"}, + {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"}, + {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, ] mypy = [ {file = "mypy-0.971-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f2899a3cbd394da157194f913a931edfd4be5f274a88041c9dc2d9cdcb1c315c"}, @@ -1815,10 +1943,18 @@ packaging = [ {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, ] +pathspec = [ + {file = 
"pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"}, + {file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"}, +] pbr = [ {file = "pbr-5.9.0-py2.py3-none-any.whl", hash = "sha256:e547125940bcc052856ded43be8e101f63828c2d94239ffbe2b327ba3d5ccf0a"}, {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"}, ] +platformdirs = [ + {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"}, + {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"}, +] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, @@ -1905,16 +2041,16 @@ pyasn1 = [ {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] pycodestyle = [ - {file = "pycodestyle-2.7.0-py2.py3-none-any.whl", hash = "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068"}, - {file = "pycodestyle-2.7.0.tar.gz", hash = "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"}, + {file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"}, + {file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"}, ] pycparser = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] pyflakes = [ - {file = 
"pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, - {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, + {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"}, + {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"}, ] pyjwt = [ {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, @@ -1955,6 +2091,10 @@ pytest = [ {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, ] +pytest-asyncio = [ + {file = "pytest-asyncio-0.19.0.tar.gz", hash = "sha256:ac4ebf3b6207259750bc32f4c1d8fcd7e79739edbc67ad0c58dd150b1d072fed"}, + {file = "pytest_asyncio-0.19.0-py3-none-any.whl", hash = "sha256:7a97e37cfe1ed296e2e84941384bdd37c376453912d397ed39293e0916f521fa"}, +] pytest-forked = [ {file = "pytest-forked-1.4.0.tar.gz", hash = "sha256:8b67587c8f98cbbadfdd804539ed5455b6ed03802203485dd2f53c1422d7440e"}, {file = "pytest_forked-1.4.0-py3-none-any.whl", hash = "sha256:bbbb6717efc886b9d64537b41fb1497cfaf3c9601276be8da2cccfea5a3c8ad8"}, @@ -2172,10 +2312,6 @@ xmltodict = [ {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, ] -yapf = [ - {file = "yapf-0.31.0-py2.py3-none-any.whl", hash = "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"}, - {file = "yapf-0.31.0.tar.gz", hash = "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d"}, -] 
zipp = [ {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, diff --git a/pre-commit.py b/pre-commit.py index ea6a22a7fe..560df6cd0c 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 -from typing import List +import argparse +import enum import subprocess import sys -import enum -import argparse -import os +from typing import List @enum.unique @@ -37,15 +36,24 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: return cmd -def yapf(fix_inplace: bool) -> str: - cmd = "poetry run yapf --recursive" - if fix_inplace: - cmd += " --in-place" - else: - cmd += " --diff" +def black(fix_inplace: bool) -> str: + cmd = "poetry run black" + if not fix_inplace: + cmd += " --diff --check" return cmd +def isort(fix_inplace: bool) -> str: + cmd = "poetry run isort" + if not fix_inplace: + cmd += " --diff --check" + return cmd + + +def flake8() -> str: + return "poetry run flake8" + + def mypy() -> str: return "poetry run mypy" @@ -71,11 +79,13 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color: else: print("Please inspect the output below and run make fmt to fix automatically.") if suffix == ".py": - print("If the output is empty, ensure that you've installed Python tooling by\n" - "running './scripts/pysync' in the current directory (no root needed)") + print( + "If the output is empty, ensure that you've installed Python tooling by\n" + "running './scripts/pysync' in the current directory (no root needed)" + ) print() print(res.stdout.decode()) - exit(1) + sys.exit(1) print(colorify("[OK]", Color.GREEN, no_color)) @@ -83,10 +93,12 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color: if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--fix-inplace", 
action="store_true", help="apply fixes inplace") - parser.add_argument("--no-color", - action="store_true", - help="disable colored output", - default=not sys.stdout.isatty()) + parser.add_argument( + "--no-color", + action="store_true", + help="disable colored output", + default=not sys.stdout.isatty(), + ) args = parser.parse_args() files = get_commit_files() @@ -101,9 +113,23 @@ if __name__ == "__main__": no_color=args.no_color, ) check( - name="yapf", + name="isort", suffix=".py", - cmd=yapf(fix_inplace=args.fix_inplace), + cmd=isort(fix_inplace=args.fix_inplace), + changed_files=files, + no_color=args.no_color, + ) + check( + name="black", + suffix=".py", + cmd=black(fix_inplace=args.fix_inplace), + changed_files=files, + no_color=args.no_color, + ) + check( + name="flake8", + suffix=".py", + cmd=flake8(), changed_files=files, no_color=args.no_color, ) diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 230fc8a253..5a450793f1 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -11,10 +11,11 @@ bstr = "0.2.17" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" futures = "0.3.13" -hashbrown = "0.11.2" +hashbrown = "0.12" hex = "0.4.3" hmac = "0.12.1" hyper = "0.14" +itertools = "0.10.3" once_cell = "1.13.0" md5 = "0.7.0" parking_lot = "0.12" @@ -23,7 +24,7 @@ rand = "0.8.3" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } routerify = "3" rustls = "0.20.0" -rustls-pemfile = "0.2.1" +rustls-pemfile = "1" scopeguard = "1.1.0" serde = "1" serde_json = "1" diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index bb7e7ef67b..9c43620ffb 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -127,7 +127,7 @@ impl BackendType> { } } -impl BackendType { +impl BackendType> { /// Authenticate the client via the requested backend, possibly using credentials. 
pub async fn authenticate( mut self, @@ -149,7 +149,7 @@ impl BackendType { // Finally we may finish the initialization of `creds`. // TODO: add missing type safety to ClientCredentials. - creds.project = Some(payload.project); + creds.project = Some(payload.project.into()); let mut config = match &self { Console(creds) => { diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index 87906679ea..e239320e9b 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -121,7 +121,7 @@ pub enum AuthInfo { #[must_use] pub(super) struct Api<'a> { endpoint: &'a ApiUrl, - creds: &'a ClientCredentials, + creds: &'a ClientCredentials<'a>, } impl<'a> Api<'a> { @@ -143,7 +143,7 @@ impl<'a> Api<'a> { url.path_segments_mut().push("proxy_get_role_secret"); url.query_pairs_mut() .append_pair("project", self.creds.project().expect("impossible")) - .append_pair("role", &self.creds.user); + .append_pair("role", self.creds.user); // TODO: use a proper logger println!("cplane request: {url}"); @@ -187,8 +187,8 @@ impl<'a> Api<'a> { config .host(host) .port(port) - .dbname(&self.creds.dbname) - .user(&self.creds.user); + .dbname(self.creds.dbname) + .user(self.creds.user); Ok(config) } diff --git a/proxy/src/auth/backend/legacy_console.rs b/proxy/src/auth/backend/legacy_console.rs index 17ba44e833..b99a004dcd 100644 --- a/proxy/src/auth/backend/legacy_console.rs +++ b/proxy/src/auth/backend/legacy_console.rs @@ -56,7 +56,7 @@ enum ProxyAuthResponse { NotReady { ready: bool }, // TODO: get rid of `ready` } -impl ClientCredentials { +impl ClientCredentials<'_> { fn is_existing_user(&self) -> bool { self.user.ends_with("@zenith") } @@ -64,15 +64,15 @@ impl ClientCredentials { async fn authenticate_proxy_client( auth_endpoint: &reqwest::Url, - creds: &ClientCredentials, + creds: &ClientCredentials<'_>, md5_response: &str, salt: &[u8; 4], psql_session_id: &str, ) -> Result { let mut url = auth_endpoint.clone(); url.query_pairs_mut() 
- .append_pair("login", &creds.user) - .append_pair("database", &creds.dbname) + .append_pair("login", creds.user) + .append_pair("database", creds.dbname) .append_pair("md5response", md5_response) .append_pair("salt", &hex::encode(salt)) .append_pair("psql_session_id", psql_session_id); @@ -103,7 +103,7 @@ async fn authenticate_proxy_client( async fn handle_existing_user( auth_endpoint: &reqwest::Url, client: &mut PqStream, - creds: &ClientCredentials, + creds: &ClientCredentials<'_>, ) -> auth::Result { let psql_session_id = super::link::new_psql_session_id(); let md5_salt = rand::random(); @@ -136,7 +136,7 @@ async fn handle_existing_user( pub async fn handle_user( auth_endpoint: &reqwest::Url, auth_link_uri: &reqwest::Url, - creds: &ClientCredentials, + creds: &ClientCredentials<'_>, client: &mut PqStream, ) -> auth::Result { if creds.is_existing_user() { diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs index 183fa52ec1..2055ee14c8 100644 --- a/proxy/src/auth/backend/postgres.rs +++ b/proxy/src/auth/backend/postgres.rs @@ -17,7 +17,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; #[must_use] pub(super) struct Api<'a> { endpoint: &'a ApiUrl, - creds: &'a ClientCredentials, + creds: &'a ClientCredentials<'a>, } // Helps eliminate graceless `.map_err` calls without introducing another ctor. @@ -87,8 +87,8 @@ impl<'a> Api<'a> { config .host(self.endpoint.host_str().unwrap_or("localhost")) .port(self.endpoint.port().unwrap_or(5432)) - .dbname(&self.creds.dbname) - .user(&self.creds.user); + .dbname(self.creds.dbname) + .user(self.creds.user); Ok(config) } diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 4c72da1c48..ea71eba010 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,6 +1,7 @@ //! User credentials used in authentication. 
use crate::error::UserFacingError; +use std::borrow::Cow; use thiserror::Error; use utils::pq_proto::StartupMessageParams; @@ -27,51 +28,59 @@ impl UserFacingError for ClientCredsParseError {} /// Various client credentials which we use for authentication. /// Note that we don't store any kind of client key or password here. #[derive(Debug, Clone, PartialEq, Eq)] -pub struct ClientCredentials { - pub user: String, - pub dbname: String, - pub project: Option, +pub struct ClientCredentials<'a> { + pub user: &'a str, + pub dbname: &'a str, + pub project: Option>, } -impl ClientCredentials { +impl ClientCredentials<'_> { pub fn project(&self) -> Option<&str> { self.project.as_deref() } } -impl ClientCredentials { +impl<'a> ClientCredentials<'a> { pub fn parse( - mut options: StartupMessageParams, + params: &'a StartupMessageParams, sni: Option<&str>, common_name: Option<&str>, ) -> Result { use ClientCredsParseError::*; - // Some parameters are absolutely necessary, others not so much. - let mut get_param = |key| options.remove(key).ok_or(MissingKey(key)); - // Some parameters are stored in the startup message. + let get_param = |key| params.get(key).ok_or(MissingKey(key)); let user = get_param("user")?; let dbname = get_param("database")?; - let project_a = get_param("project").ok(); + + // Project name might be passed via PG's command-line options. + let project_a = params.options_raw().and_then(|options| { + for opt in options { + if let Some(value) = opt.strip_prefix("project=") { + return Some(Cow::Borrowed(value)); + } + } + None + }); // Alternative project name is in fact a subdomain from SNI. // NOTE: we do not consider SNI if `common_name` is missing. let project_b = sni .zip(common_name) .map(|(sni, cn)| { - // TODO: what if SNI is present but just a common name? 
subdomain_from_sni(sni, cn) - .ok_or_else(|| InconsistentSni(sni.to_owned(), cn.to_owned())) + .ok_or_else(|| InconsistentSni(sni.into(), cn.into())) + .map(Cow::<'static, str>::Owned) }) .transpose()?; let project = match (project_a, project_b) { // Invariant: if we have both project name variants, they should match. - (Some(a), Some(b)) if a != b => Some(Err(InconsistentProjectNames(a, b))), - (a, b) => a.or(b).map(|name| { - // Invariant: project name may not contain certain characters. - check_project_name(name).map_err(MalformedProjectName) + (Some(a), Some(b)) if a != b => Some(Err(InconsistentProjectNames(a.into(), b.into()))), + // Invariant: project name may not contain certain characters. + (a, b) => a.or(b).map(|name| match project_name_valid(&name) { + false => Err(MalformedProjectName(name.into())), + true => Ok(name), }), } .transpose()?; @@ -84,12 +93,8 @@ impl ClientCredentials { } } -fn check_project_name(name: String) -> Result { - if name.chars().all(|c| c.is_alphanumeric() || c == '-') { - Ok(name) - } else { - Err(name) - } +fn project_name_valid(name: &str) -> bool { + name.chars().all(|c| c.is_alphanumeric() || c == '-') } fn subdomain_from_sni(sni: &str, common_name: &str) -> Option { @@ -102,18 +107,14 @@ fn subdomain_from_sni(sni: &str, common_name: &str) -> Option { mod tests { use super::*; - fn make_options<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> StartupMessageParams { - StartupMessageParams::from(pairs.map(|(k, v)| (k.to_owned(), v.to_owned()))) - } - #[test] #[ignore = "TODO: fix how database is handled"] fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. - let options = make_options([("user", "john_doe")]); + let options = StartupMessageParams::new([("user", "john_doe")]); // TODO: check that `creds.dbname` is None. 
- let creds = ClientCredentials::parse(options, None, None)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); Ok(()) @@ -121,9 +122,9 @@ mod tests { #[test] fn parse_missing_project() -> anyhow::Result<()> { - let options = make_options([("user", "john_doe"), ("database", "world")]); + let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); - let creds = ClientCredentials::parse(options, None, None)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project, None); @@ -133,12 +134,12 @@ mod tests { #[test] fn parse_project_from_sni() -> anyhow::Result<()> { - let options = make_options([("user", "john_doe"), ("database", "world")]); + let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); let sni = Some("foo.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(options, sni, common_name)?; + let creds = ClientCredentials::parse(&options, sni, common_name)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("foo")); @@ -148,13 +149,13 @@ mod tests { #[test] fn parse_project_from_options() -> anyhow::Result<()> { - let options = make_options([ + let options = StartupMessageParams::new([ ("user", "john_doe"), ("database", "world"), - ("project", "bar"), + ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let creds = ClientCredentials::parse(options, None, None)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("bar")); @@ -164,16 +165,16 @@ mod tests { #[test] fn parse_projects_identical() -> anyhow::Result<()> { - let options = make_options([ + let options = StartupMessageParams::new([ ("user", "john_doe"), ("database", 
"world"), - ("project", "baz"), + ("options", "project=baz"), ]); let sni = Some("baz.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(options, sni, common_name)?; + let creds = ClientCredentials::parse(&options, sni, common_name)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.dbname, "world"); assert_eq!(creds.project.as_deref(), Some("baz")); @@ -183,17 +184,17 @@ mod tests { #[test] fn parse_projects_different() { - let options = make_options([ + let options = StartupMessageParams::new([ ("user", "john_doe"), ("database", "world"), - ("project", "first"), + ("options", "project=first"), ]); let sni = Some("second.localhost"); let common_name = Some("localhost"); assert!(matches!( - ClientCredentials::parse(options, sni, common_name).expect_err("should fail"), + ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"), ClientCredsParseError::InconsistentProjectNames(_, _) )); } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index a801313635..b7412b6f5b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -95,7 +95,7 @@ impl<'a> Session<'a> { /// Store the cancel token for the given session. /// This enables query cancellation in [`crate::proxy::handshake`]. 
- pub fn enable_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData { + pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData { self.cancel_map .0 .lock() diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 3bad36661b..4ae44ded57 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,9 +1,11 @@ use crate::{cancellation::CancelClosure, error::UserFacingError}; use futures::TryFutureExt; +use itertools::Itertools; use std::{io, net::SocketAddr}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::NoTls; +use utils::pq_proto::StartupMessageParams; #[derive(Debug, Error)] pub enum ConnectionError { @@ -110,7 +112,42 @@ pub struct PostgresConnection { impl NodeInfo { /// Connect to a corresponding compute node. - pub async fn connect(&self) -> Result<(PostgresConnection, CancelClosure), ConnectionError> { + pub async fn connect( + mut self, + params: &StartupMessageParams, + ) -> Result<(PostgresConnection, CancelClosure), ConnectionError> { + if let Some(options) = params.options_raw() { + // We must drop all proxy-specific parameters. + #[allow(unstable_name_collisions)] + let options: String = options + .filter(|opt| !opt.starts_with("project=")) + .intersperse(" ") // TODO: use impl from std once it's stabilized + .collect(); + + self.config.options(&options); + } + + if let Some(app_name) = params.get("application_name") { + self.config.application_name(app_name); + } + + if let Some(replication) = params.get("replication") { + use tokio_postgres::config::ReplicationMode; + match replication { + "true" | "on" | "yes" | "1" => { + self.config.replication_mode(ReplicationMode::Physical); + } + "database" => { + self.config.replication_mode(ReplicationMode::Logical); + } + _other => {} + } + } + + // TODO: extend the list of the forwarded startup parameters. + // Currently, tokio-postgres doesn't allow us to pass + // arbitrary parameters, but the ones above are a good start. 
+ let (socket_addr, mut stream) = self .connect_raw() .await diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 29be79c886..72cb822910 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,6 +1,6 @@ use crate::auth; use crate::cancellation::{self, CancelMap}; -use crate::config::{ProxyConfig, TlsConfig}; +use crate::config::{AuthUrls, ProxyConfig, TlsConfig}; use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; use futures::TryFutureExt; @@ -93,20 +93,21 @@ async fn handle_client( None => return Ok(()), // it's a cancellation request }; + // Extract credentials which we're going to use for auth. let creds = { let sni = stream.get_ref().sni_hostname(); let common_name = tls.and_then(|tls| tls.common_name.as_deref()); let result = config .auth_backend - .map(|_| auth::ClientCredentials::parse(params, sni, common_name)) + .map(|_| auth::ClientCredentials::parse(&params, sni, common_name)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? }; - let client = Client::new(stream, creds); + let client = Client::new(stream, creds, &params); cancel_map - .with_session(|session| client.connect_to_db(config, session)) + .with_session(|session| client.connect_to_db(&config.auth_urls, session)) .await } @@ -174,38 +175,57 @@ async fn handshake( } /// Thin connection context. -struct Client { +struct Client<'a, S> { /// The underlying libpq protocol stream. stream: PqStream, /// Client credentials that we care about. - creds: auth::BackendType, + creds: auth::BackendType>, + /// KV-dictionary with PostgreSQL connection params. + params: &'a StartupMessageParams, } -impl Client { +impl<'a, S> Client<'a, S> { /// Construct a new connection context. 
- fn new(stream: PqStream, creds: auth::BackendType) -> Self { - Self { stream, creds } + fn new( + stream: PqStream, + creds: auth::BackendType>, + params: &'a StartupMessageParams, + ) -> Self { + Self { + stream, + creds, + params, + } } } -impl Client { +impl Client<'_, S> { /// Let the client authenticate and connect to the designated compute node. async fn connect_to_db( self, - config: &ProxyConfig, + urls: &AuthUrls, session: cancellation::Session<'_>, ) -> anyhow::Result<()> { - let Self { mut stream, creds } = self; + let Self { + mut stream, + creds, + params, + } = self; // Authenticate and connect to a compute node. - let auth = creds.authenticate(&config.auth_urls, &mut stream).await; + let auth = creds.authenticate(urls, &mut stream).await; let node = async { auth }.or_else(|e| stream.throw_error(e)).await?; + let reported_auth_ok = node.reported_auth_ok; - let (db, cancel_closure) = node.connect().or_else(|e| stream.throw_error(e)).await?; - let cancel_key_data = session.enable_cancellation(cancel_closure); + let (db, cancel_closure) = node + .connect(params) + .or_else(|e| stream.throw_error(e)) + .await?; + + let cancel_key_data = session.enable_query_cancellation(cancel_closure); // Report authentication success if we haven't done this already. - if !node.reported_auth_ok { + if !reported_auth_ok { stream .write_message_noflush(&Be::AuthenticationOk)? 
.write_message_noflush(&BeParameterStatusMessage::encoding())?; diff --git a/pyproject.toml b/pyproject.toml index 8a3d22f088..ec166ea7cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,12 +27,54 @@ prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" Werkzeug = "2.1.2" pytest-order = "^1.0.1" +allure-pytest = "^2.10.0" +pytest-asyncio = "^0.19.0" [tool.poetry.dev-dependencies] -yapf = "==0.31.0" -flake8 = "^3.9.2" +flake8 = "^5.0.4" mypy = "==0.971" +black = "^22.6.0" +isort = "^5.10.1" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" + +[tool.black] +line-length = 100 +extend-exclude = ''' +/( + vendor +)/ +''' + +[tool.isort] +profile = "black" +line_length = 100 +skip_gitignore = true +skip = [ + "vendor", +] + +[tool.mypy] +# mypy uses regex +exclude = "^vendor/" +# some tests don't typecheck when this flag is set +check_untyped_defs = false +# Help mypy find imports when running against list of individual files. +# Without this line it would behave differently when executed on the entire project. +mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner" + +disallow_incomplete_defs = false +disallow_untyped_calls = false +disallow_untyped_decorators = false +disallow_untyped_defs = false +strict = true + +[[tool.mypy.overrides]] +module = [ + "asyncpg.*", + "cached_property.*", + "pg8000.*", +] +ignore_missing_imports = true diff --git a/run_clippy.sh b/run_clippy.sh index 13af3fd2c5..9feb8de4ea 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -13,10 +13,10 @@ # avoid running regular linting script that checks every feature. 
if [[ "$OSTYPE" == "darwin"* ]]; then # no extra features to test currently, add more here when needed - cargo clippy --all --all-targets -- -A unknown_lints -D warnings + cargo clippy --locked --all --all-targets -- -A unknown_lints -D warnings else # * `-A unknown_lints` – do not warn about unknown lint suppressions # that people with newer toolchains might use # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) - cargo clippy --all --all-targets --all-features -- -A unknown_lints -D warnings + cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings fi diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index c90c2a0446..3e301259ed 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -11,7 +11,6 @@ use anyhow::{bail, Context, Result}; use postgres_ffi::PG_TLI; use regex::Regex; -use std::str::FromStr; use std::sync::Arc; use tracing::info; use utils::{ @@ -67,18 +66,22 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { // ztenant id and ztimeline id are passed in connection string params fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> { if let FeStartupPacket::StartupMessage { params, .. } = sm { - self.ztenantid = match params.get("ztenantid") { - Some(z) => Some(ZTenantId::from_str(z)?), // just curious, can I do that from .map? 
- _ => None, - }; - - self.ztimelineid = match params.get("ztimelineid") { - Some(z) => Some(ZTimelineId::from_str(z)?), - _ => None, - }; + if let Some(options) = params.options_raw() { + for opt in options { + match opt.split_once('=') { + Some(("ztenantid", value)) => { + self.ztenantid = Some(value.parse()?); + } + Some(("ztimelineid", value)) => { + self.ztimelineid = Some(value.parse()?); + } + _ => continue, + } + } + } if let Some(app_name) = params.get("application_name") { - self.appname = Some(app_name.clone()); + self.appname = Some(app_name.to_owned()); } Ok(()) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 22f8ca2de4..ed34669dde 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -332,7 +332,7 @@ pub struct AppendRequestHeader { } /// Report safekeeper state to proposer -#[derive(Debug, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Serialize, Deserialize)] pub struct AppendResponse { // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. 
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 97ec945c3e..38523f9f82 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -36,7 +36,7 @@ const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; type FullTransactionId = u64; /// Hot standby feedback received from replica -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct HotStandbyFeedback { pub ts: TimestampTz, pub xmin: FullTransactionId, @@ -54,7 +54,7 @@ impl HotStandbyFeedback { } /// Standby status update -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Deserialize)] pub struct StandbyReply { pub write_lsn: Lsn, // last lsn received by pageserver pub flush_lsn: Lsn, // pageserver's disk consistent lSN diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 3a10c5d59e..f482dbb3aa 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -529,7 +529,7 @@ impl Timeline { // release the lock before removing } let _enter = - info_span!("", timeline = %self.zttid.tenant_id, tenant = %self.zttid.timeline_id) + info_span!("", tenant = %self.zttid.tenant_id, timeline = %self.zttid.timeline_id) .entered(); remover(horizon_segno - 1)?; self.mutex.lock().unwrap().last_removed_segno = horizon_segno; @@ -626,7 +626,7 @@ impl GlobalTimelines { zttid: ZTenantTimelineId, create: bool, ) -> Result> { - let _enter = info_span!("", timeline = %zttid.tenant_id).entered(); + let _enter = info_span!("", timeline = %zttid.timeline_id).entered(); let mut state = TIMELINES_STATE.lock().unwrap(); diff --git a/scripts/coverage b/scripts/coverage index f2c46d9ae9..af0d067419 100755 --- a/scripts/coverage +++ b/scripts/coverage @@ -9,13 +9,6 @@ # * https://github.com/taiki-e/cargo-llvm-cov # * https://github.com/llvm/llvm-project/tree/main/llvm/test/tools/llvm-cov -from abc import ABC, abstractmethod -from dataclasses import dataclass -from pathlib import 
Path -from tempfile import TemporaryDirectory -from textwrap import dedent -from typing import Any, Dict, Iterator, Iterable, List, Optional - import argparse import hashlib import json @@ -24,6 +17,12 @@ import shutil import socket import subprocess import sys +from abc import ABC, abstractmethod +from dataclasses import dataclass +from pathlib import Path +from tempfile import TemporaryDirectory +from textwrap import dedent +from typing import Any, Dict, Iterable, Iterator, List, Optional def file_mtime_or_zero(path: Path) -> int: diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 96f1d36ddb..af847be49e 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -20,20 +20,21 @@ # For more context on how to use this, see: # https://github.com/neondatabase/cloud/wiki/Storage-format-migration -import os -from os import path -import shutil -from pathlib import Path -import tempfile -from contextlib import closing -import psycopg2 -import subprocess import argparse +import os +import shutil +import subprocess +import tempfile import time -import requests import uuid +from contextlib import closing +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import psycopg2 +import requests from psycopg2.extensions import connection as PgConnection -from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple +from psycopg2.extensions import parse_dsn ############################################### ### client-side utils copied from test fixtures @@ -45,7 +46,7 @@ _global_counter = 0 def global_counter() -> int: - """ A really dumb global counter. + """A really dumb global counter. This is useful for giving output files a unique number, so if we run the same command multiple times we can keep their output separate. 
""" @@ -55,7 +56,7 @@ def global_counter() -> int: def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: - """ Run a process and capture its output + """Run a process and capture its output Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" where "cmd" is the name of the program and NNN is an incrementing counter. @@ -63,13 +64,13 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: Returns basepath for files with captured output. """ assert type(cmd) is list - base = os.path.basename(cmd[0]) + '_{}'.format(global_counter()) + base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) basepath = os.path.join(capture_dir, base) - stdout_filename = basepath + '.stdout' - stderr_filename = basepath + '.stderr' + stdout_filename = basepath + ".stdout" + stderr_filename = basepath + ".stderr" - with open(stdout_filename, 'w') as stdout_f: - with open(stderr_filename, 'w') as stderr_f: + with open(stdout_filename, "w") as stdout_f: + with open(stderr_filename, "w") as stderr_f: print('(capturing output to "{}.stdout")'.format(base)) subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) @@ -77,15 +78,16 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: class PgBin: - """ A helper class for executing postgres binaries """ + """A helper class for executing postgres binaries""" + def __init__(self, log_dir: Path, pg_distrib_dir): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), 'bin') + self.pg_bin_path = os.path.join(str(pg_distrib_dir), "bin") self.env = os.environ.copy() - self.env['LD_LIBRARY_PATH'] = os.path.join(str(pg_distrib_dir), 'lib') + self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), "lib") def _fixpath(self, command: List[str]): - if '/' not in command[0]: + if "/" not in command[0]: command[0] = os.path.join(self.pg_bin_path, command[0]) def _build_env(self, env_add: Optional[Env]) -> Env: @@ 
-106,15 +108,17 @@ class PgBin: """ self._fixpath(command) - print('Running command "{}"'.format(' '.join(command))) + print('Running command "{}"'.format(" ".join(command))) env = self._build_env(env) subprocess.run(command, env=env, cwd=cwd, check=True) - def run_capture(self, - command: List[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, - **kwargs: Any) -> str: + def run_capture( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[str] = None, + **kwargs: Any, + ) -> str: """ Run one of the postgres binaries, with stderr and stdout redirected to a file. This is just like `run`, but for chatty programs. Returns basepath for files @@ -122,35 +126,31 @@ class PgBin: """ self._fixpath(command) - print('Running command "{}"'.format(' '.join(command))) + print('Running command "{}"'.format(" ".join(command))) env = self._build_env(env) - return subprocess_capture(str(self.log_dir), - command, - env=env, - cwd=cwd, - check=True, - **kwargs) + return subprocess_capture( + str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs + ) class PgProtocol: - """ Reusable connection logic """ + """Reusable connection logic""" + def __init__(self, **kwargs): self.default_options = kwargs def conn_options(self, **kwargs): conn_options = self.default_options.copy() - if 'dsn' in kwargs: - conn_options.update(parse_dsn(kwargs['dsn'])) + if "dsn" in kwargs: + conn_options.update(parse_dsn(kwargs["dsn"])) conn_options.update(kwargs) # Individual statement timeout in seconds. 2 minutes should be # enough for our tests, but if you need a longer, you can # change it by calling "SET statement_timeout" after # connecting. 
- if 'options' in conn_options: - conn_options['options'] = f"-cstatement_timeout=120s " + conn_options['options'] - else: - conn_options['options'] = "-cstatement_timeout=120s" + conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}" + return conn_options # autocommit=True here by default because that's what we need most of the time @@ -194,18 +194,18 @@ class PgProtocol: class VanillaPostgres(PgProtocol): def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): - super().__init__(host='localhost', port=port, dbname='postgres') + super().__init__(host="localhost", port=port, dbname="postgres") self.pgdatadir = pgdatadir self.pg_bin = pg_bin self.running = False if init: - self.pg_bin.run_capture(['initdb', '-D', str(pgdatadir)]) + self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) self.configure([f"port = {port}\n"]) def configure(self, options: List[str]): """Append lines into postgresql.conf file.""" assert not self.running - with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file: + with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: conf_file.write("\n".join(options)) def start(self, log_path: Optional[str] = None): @@ -216,12 +216,13 @@ class VanillaPostgres(PgProtocol): log_path = os.path.join(self.pgdatadir, "pg.log") self.pg_bin.run_capture( - ['pg_ctl', '-w', '-D', str(self.pgdatadir), '-l', log_path, 'start']) + ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] + ) def stop(self): assert self.running self.running = False - self.pg_bin.run_capture(['pg_ctl', '-w', '-D', str(self.pgdatadir), 'stop']) + self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) def __enter__(self): return self @@ -246,9 +247,9 @@ class NeonPageserverHttpClient(requests.Session): res.raise_for_status() except requests.RequestException as e: try: - msg = res.json()['msg'] - except: - msg = '' + msg = res.json()["msg"] + except: # noqa: 
E722 + msg = "" raise NeonPageserverApiException(msg) from e def check_status(self): @@ -265,17 +266,17 @@ class NeonPageserverHttpClient(requests.Session): res = self.post( f"http://{self.host}:{self.port}/v1/tenant", json={ - 'new_tenant_id': new_tenant_id.hex, + "new_tenant_id": new_tenant_id.hex, }, ) if res.status_code == 409: if ok_if_exists: - print(f'could not create tenant: already exists for id {new_tenant_id}') + print(f"could not create tenant: already exists for id {new_tenant_id}") else: res.raise_for_status() elif res.status_code == 201: - print(f'created tenant {new_tenant_id}') + print(f"created tenant {new_tenant_id}") else: self.verbose_error(res) @@ -299,47 +300,55 @@ class NeonPageserverHttpClient(requests.Session): def lsn_to_hex(num: int) -> str: - """ Convert lsn from int to standard hex notation. """ - return "{:X}/{:X}".format(num >> 32, num & 0xffffffff) + """Convert lsn from int to standard hex notation.""" + return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF) def lsn_from_hex(lsn_hex: str) -> int: - """ Convert lsn from hex notation to int. """ - l, r = lsn_hex.split('/') + """Convert lsn from hex notation to int.""" + l, r = lsn_hex.split("/") return (int(l, 16) << 32) + int(r, 16) -def remote_consistent_lsn(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID) -> int: +def remote_consistent_lsn( + pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID +) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) - if detail['remote'] is None: + if detail["remote"] is None: # No remote information at all. This happens right after creating # a timeline, before any part of it has been uploaded to remote # storage yet. 
return 0 else: - lsn_str = detail['remote']['remote_consistent_lsn'] + lsn_str = detail["remote"]["remote_consistent_lsn"] assert isinstance(lsn_str, str) return lsn_from_hex(lsn_str) -def wait_for_upload(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int): +def wait_for_upload( + pageserver_http_client: NeonPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID, + lsn: int, +): """waits for local timeline upload up to specified lsn""" for i in range(10): current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) if current_lsn >= lsn: return - print("waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1)) + print( + "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 + ) + ) time.sleep(1) - raise Exception("timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn))) + raise Exception( + "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn) + ) + ) ############## @@ -399,7 +408,7 @@ def reconstruct_paths(log_dir, pg_bin, base_tar): # Add all template0copy paths to template0 prefix = f"base/{oid}/" if filepath.startswith(prefix): - suffix = filepath[len(prefix):] + suffix = filepath[len(prefix) :] yield f"base/{template0_oid}/{suffix}" elif filepath.startswith("global"): print(f"skipping {database} global file {filepath}") @@ -451,55 +460,53 @@ def get_rlsn(pageserver_connstr, tenant_id, timeline_id): return last_lsn, prev_lsn -def import_timeline(args, - psql_path, - pageserver_connstr, - pageserver_http, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename): +def import_timeline( + args, + psql_path, + pageserver_connstr, + pageserver_http, + tenant_id, + timeline_id, + last_lsn, + prev_lsn, 
+ tar_filename, +): # Import timelines to new pageserver import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn}" full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ - stderr_filename2 = path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") - stdout_filename = path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout") + stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") + stdout_filename = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout") print(f"Running: {full_cmd}") - with open(stdout_filename, 'w') as stdout_f: - with open(stderr_filename2, 'w') as stderr_f: + with open(stdout_filename, "w") as stdout_f: + with open(stderr_filename2, "w") as stderr_f: print(f"(capturing output to {stdout_filename})") pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) - subprocess.run(full_cmd, - stdout=stdout_f, - stderr=stderr_f, - env=pg_bin._build_env(None), - shell=True, - check=True) + subprocess.run( + full_cmd, + stdout=stdout_f, + stderr=stderr_f, + env=pg_bin._build_env(None), + shell=True, + check=True, + ) - print(f"Done import") + print("Done import") # Wait until pageserver persists the files - wait_for_upload(pageserver_http, - uuid.UUID(tenant_id), - uuid.UUID(timeline_id), - lsn_from_hex(last_lsn)) + wait_for_upload( + pageserver_http, uuid.UUID(tenant_id), uuid.UUID(timeline_id), lsn_from_hex(last_lsn) + ) -def export_timeline(args, - psql_path, - pageserver_connstr, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename): +def export_timeline( + args, psql_path, pageserver_connstr, tenant_id, timeline_id, last_lsn, prev_lsn, tar_filename +): # Choose filenames incomplete_filename = tar_filename + ".incomplete" - stderr_filename = path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") + stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") # Construct export command 
query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}" @@ -507,15 +514,13 @@ def export_timeline(args, # Run export command print(f"Running: {cmd}") - with open(incomplete_filename, 'w') as stdout_f: - with open(stderr_filename, 'w') as stderr_f: + with open(incomplete_filename, "w") as stdout_f: + with open(stderr_filename, "w") as stderr_f: print(f"(capturing output to {incomplete_filename})") pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) - subprocess.run(cmd, - stdout=stdout_f, - stderr=stderr_f, - env=pg_bin._build_env(None), - check=True) + subprocess.run( + cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True + ) # Add missing rels pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) @@ -551,27 +556,28 @@ def main(args: argparse.Namespace): for timeline in timelines: # Skip timelines we don't need to export - if args.timelines and timeline['timeline_id'] not in args.timelines: + if args.timelines and timeline["timeline_id"] not in args.timelines: print(f"Skipping timeline {timeline['timeline_id']}") continue # Choose filenames - tar_filename = path.join(args.work_dir, - f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar") + tar_filename = os.path.join( + args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" + ) # Export timeline from old pageserver if args.only_import is False: last_lsn, prev_lsn = get_rlsn( old_pageserver_connstr, - timeline['tenant_id'], - timeline['timeline_id'], + timeline["tenant_id"], + timeline["timeline_id"], ) export_timeline( args, psql_path, old_pageserver_connstr, - timeline['tenant_id'], - timeline['timeline_id'], + timeline["tenant_id"], + timeline["timeline_id"], last_lsn, prev_lsn, tar_filename, @@ -583,8 +589,8 @@ def main(args: argparse.Namespace): psql_path, new_pageserver_connstr, new_http_client, - timeline['tenant_id'], - timeline['timeline_id'], + timeline["tenant_id"], + timeline["timeline_id"], last_lsn, prev_lsn, tar_filename, @@ -592,117 +598,118 @@ 
def main(args: argparse.Namespace): # Re-export and compare re_export_filename = tar_filename + ".reexport" - export_timeline(args, - psql_path, - new_pageserver_connstr, - timeline['tenant_id'], - timeline['timeline_id'], - last_lsn, - prev_lsn, - re_export_filename) + export_timeline( + args, + psql_path, + new_pageserver_connstr, + timeline["tenant_id"], + timeline["timeline_id"], + last_lsn, + prev_lsn, + re_export_filename, + ) # Check the size is the same - old_size = os.path.getsize(tar_filename), - new_size = os.path.getsize(re_export_filename), + old_size = (os.path.getsize(tar_filename),) + new_size = (os.path.getsize(re_export_filename),) if old_size != new_size: raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}") -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - '--tenant-id', - dest='tenants', + "--tenant-id", + dest="tenants", required=True, - nargs='+', - help='Id of the tenant to migrate. You can pass multiple arguments', + nargs="+", + help="Id of the tenant to migrate. You can pass multiple arguments", ) parser.add_argument( - '--timeline-id', - dest='timelines', + "--timeline-id", + dest="timelines", required=False, - nargs='+', - help='Id of the timeline to migrate. You can pass multiple arguments', + nargs="+", + help="Id of the timeline to migrate. You can pass multiple arguments", ) parser.add_argument( - '--from-host', - dest='old_pageserver_host', + "--from-host", + dest="old_pageserver_host", required=True, - help='Host of the pageserver to migrate data from', + help="Host of the pageserver to migrate data from", ) parser.add_argument( - '--from-http-port', - dest='old_pageserver_http_port', + "--from-http-port", + dest="old_pageserver_http_port", required=False, type=int, default=9898, - help='HTTP port of the pageserver to migrate data from. Default: 9898', + help="HTTP port of the pageserver to migrate data from. 
Default: 9898", ) parser.add_argument( - '--from-pg-port', - dest='old_pageserver_pg_port', + "--from-pg-port", + dest="old_pageserver_pg_port", required=False, type=int, default=6400, - help='pg port of the pageserver to migrate data from. Default: 6400', + help="pg port of the pageserver to migrate data from. Default: 6400", ) parser.add_argument( - '--to-host', - dest='new_pageserver_host', + "--to-host", + dest="new_pageserver_host", required=True, - help='Host of the pageserver to migrate data to', + help="Host of the pageserver to migrate data to", ) parser.add_argument( - '--to-http-port', - dest='new_pageserver_http_port', + "--to-http-port", + dest="new_pageserver_http_port", required=False, default=9898, type=int, - help='HTTP port of the pageserver to migrate data to. Default: 9898', + help="HTTP port of the pageserver to migrate data to. Default: 9898", ) parser.add_argument( - '--to-pg-port', - dest='new_pageserver_pg_port', + "--to-pg-port", + dest="new_pageserver_pg_port", required=False, default=6400, type=int, - help='pg port of the pageserver to migrate data to. Default: 6400', + help="pg port of the pageserver to migrate data to. Default: 6400", ) parser.add_argument( - '--ignore-tenant-exists', - dest='ok_if_exists', + "--ignore-tenant-exists", + dest="ok_if_exists", required=False, - help= - 'Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.', + help="Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.", ) parser.add_argument( - '--pg-distrib-dir', - dest='pg_distrib_dir', + "--pg-distrib-dir", + dest="pg_distrib_dir", required=False, - default='/usr/local/', - help='Path where postgres binaries are installed. Default: /usr/local/', + default="/usr/local/", + help="Path where postgres binaries are installed. 
Default: /usr/local/", ) parser.add_argument( - '--psql-path', - dest='psql_path', + "--psql-path", + dest="psql_path", required=False, - default='/usr/local/bin/psql', - help='Path to the psql binary. Default: /usr/local/bin/psql', + default="/usr/local/bin/psql", + help="Path to the psql binary. Default: /usr/local/bin/psql", ) parser.add_argument( - '--only-import', - dest='only_import', + "--only-import", + dest="only_import", required=False, default=False, - action='store_true', - help='Skip export and tenant creation part', + action="store_true", + help="Skip export and tenant creation part", ) parser.add_argument( - '--work-dir', - dest='work_dir', + "--work-dir", + dest="work_dir", required=True, default=False, - help='directory where temporary tar files are stored', + help="directory where temporary tar files are stored", ) args = parser.parse_args() main(args) diff --git a/scripts/generate_perf_report_page.py b/scripts/generate_perf_report_page.py index 23fa4b76a3..b5b49bb600 100755 --- a/scripts/generate_perf_report_page.py +++ b/scripts/generate_perf_report_page.py @@ -1,31 +1,36 @@ #!/usr/bin/env python3 import argparse +import json from dataclasses import dataclass from pathlib import Path -import json from typing import Any, Dict, List, Optional, Tuple, cast + from jinja2 import Template # skip 'input' columns. 
They are included in the header and just blow the table -EXCLUDE_COLUMNS = frozenset({ - 'scale', - 'duration', - 'number_of_clients', - 'number_of_threads', - 'init_start_timestamp', - 'init_end_timestamp', - 'run_start_timestamp', - 'run_end_timestamp', -}) +EXCLUDE_COLUMNS = frozenset( + { + "scale", + "duration", + "number_of_clients", + "number_of_threads", + "init_start_timestamp", + "init_end_timestamp", + "run_start_timestamp", + "run_end_timestamp", + } +) -KEY_EXCLUDE_FIELDS = frozenset({ - 'init_start_timestamp', - 'init_end_timestamp', - 'run_start_timestamp', - 'run_end_timestamp', -}) -NEGATIVE_COLOR = 'negative' -POSITIVE_COLOR = 'positive' +KEY_EXCLUDE_FIELDS = frozenset( + { + "init_start_timestamp", + "init_end_timestamp", + "run_start_timestamp", + "run_end_timestamp", + } +) +NEGATIVE_COLOR = "negative" +POSITIVE_COLOR = "positive" EPS = 1e-6 @@ -55,75 +60,76 @@ def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], Li value_columns = [] common_columns = [] for item in values: - if item['name'] in KEY_EXCLUDE_FIELDS: + if item["name"] in KEY_EXCLUDE_FIELDS: continue - if item['report'] != 'test_param': - value_columns.append(cast(str, item['name'])) + if item["report"] != "test_param": + value_columns.append(cast(str, item["name"])) else: - common_columns.append((cast(str, item['name']), cast(str, item['value']))) + common_columns.append((cast(str, item["name"]), cast(str, item["value"]))) value_columns.sort() common_columns.sort(key=lambda x: x[0]) # sort by name return common_columns, value_columns def format_ratio(ratio: float, report: str) -> Tuple[str, str]: - color = '' - sign = '+' if ratio > 0 else '' + color = "" + sign = "+" if ratio > 0 else "" if abs(ratio) < 0.05: - return f' ({sign}{ratio:.2f})', color + return f" ({sign}{ratio:.2f})", color - if report not in {'test_param', 'higher_is_better', 'lower_is_better'}: - raise ValueError(f'Unknown report type: {report}') + if report not in {"test_param", 
"higher_is_better", "lower_is_better"}: + raise ValueError(f"Unknown report type: {report}") - if report == 'test_param': - return f'{ratio:.2f}', color + if report == "test_param": + return f"{ratio:.2f}", color if ratio > 0: - if report == 'higher_is_better': + if report == "higher_is_better": color = POSITIVE_COLOR - elif report == 'lower_is_better': + elif report == "lower_is_better": color = NEGATIVE_COLOR elif ratio < 0: - if report == 'higher_is_better': + if report == "higher_is_better": color = NEGATIVE_COLOR - elif report == 'lower_is_better': + elif report == "lower_is_better": color = POSITIVE_COLOR - return f' ({sign}{ratio:.2f})', color + return f" ({sign}{ratio:.2f})", color def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]: - for item in suit_run.values['data']: - if item['name'] == name: + for item in suit_run.values["data"]: + if item["name"] == name: return cast(Dict[str, Any], item) return None -def get_row_values(columns: List[str], run_result: SuitRun, - prev_result: Optional[SuitRun]) -> List[RowValue]: +def get_row_values( + columns: List[str], run_result: SuitRun, prev_result: Optional[SuitRun] +) -> List[RowValue]: row_values = [] for column in columns: current_value = extract_value(column, run_result) if current_value is None: # should never happen - raise ValueError(f'{column} not found in {run_result.values}') + raise ValueError(f"{column} not found in {run_result.values}") value = current_value["value"] if isinstance(value, float): - value = f'{value:.2f}' + value = f"{value:.2f}" if prev_result is None: - row_values.append(RowValue(value, '', '')) + row_values.append(RowValue(value, "", "")) continue prev_value = extract_value(column, prev_result) if prev_value is None: # this might happen when new metric is added and there is no value for it in previous run # let this be here, TODO add proper handling when this actually happens - raise ValueError(f'{column} not found in previous result') + raise 
ValueError(f"{column} not found in previous result") # adding `EPS` to each term to avoid ZeroDivisionError when the denominator is zero - ratio = (float(value) + EPS) / (float(prev_value['value']) + EPS) - 1 - ratio_display, color = format_ratio(ratio, current_value['report']) + ratio = (float(value) + EPS) / (float(prev_value["value"]) + EPS) - 1 + ratio_display, color = format_ratio(ratio, current_value["report"]) row_values.append(RowValue(value, color, ratio_display)) return row_values @@ -139,8 +145,10 @@ def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> Lis prev_run = None for run in runs: rows.append( - SuiteRunTableRow(revision=run.revision, - values=get_row_values(value_columns, run, prev_run))) + SuiteRunTableRow( + revision=run.revision, values=get_row_values(value_columns, run, prev_run) + ) + ) prev_run = run return rows @@ -152,27 +160,29 @@ def main(args: argparse.Namespace) -> None: # we have files in form: _.json # fill them in the hashmap so we have grouped items for the # same run configuration (scale, duration etc.) ordered by counter. 
- for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split('_')[0])): + for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split("_")[0])): run_data = json.loads(item.read_text()) - revision = run_data['revision'] + revision = run_data["revision"] - for suit_result in run_data['result']: - key = "{}{}".format(run_data['platform'], suit_result['suit']) + for suit_result in run_data["result"]: + key = "{}{}".format(run_data["platform"], suit_result["suit"]) # pack total duration as a synthetic value - total_duration = suit_result['total_duration'] - suit_result['data'].append({ - 'name': 'total_duration', - 'value': total_duration, - 'unit': 's', - 'report': 'lower_is_better', - }) - common_columns, value_columns = get_columns(suit_result['data']) + total_duration = suit_result["total_duration"] + suit_result["data"].append( + { + "name": "total_duration", + "value": total_duration, + "unit": "s", + "report": "lower_is_better", + } + ) + common_columns, value_columns = get_columns(suit_result["data"]) grouped_runs.setdefault( key, SuitRuns( - platform=run_data['platform'], - suit=suit_result['suit'], + platform=run_data["platform"], + suit=suit_result["suit"], common_columns=common_columns, value_columns=value_columns, runs=[], @@ -184,26 +194,26 @@ def main(args: argparse.Namespace) -> None: for result in grouped_runs.values(): suit = result.suit context[suit] = { - 'common_columns': result.common_columns, - 'value_columns': result.value_columns, - 'platform': result.platform, + "common_columns": result.common_columns, + "value_columns": result.value_columns, + "platform": result.platform, # reverse the order so newest results are on top of the table - 'rows': reversed(prepare_rows_from_runs(result.value_columns, result.runs)), + "rows": reversed(prepare_rows_from_runs(result.value_columns, result.runs)), } - template = Template((Path(__file__).parent / 'perf_report_template.html').read_text()) + template = Template((Path(__file__).parent 
/ "perf_report_template.html").read_text()) Path(args.out).write_text(template.render(context=context)) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - '--input-dir', - dest='input_dir', + "--input-dir", + dest="input_dir", required=True, - help='Directory with jsons generated by the test suite', + help="Directory with jsons generated by the test suite", ) - parser.add_argument('--out', required=True, help='Output html file path') + parser.add_argument("--out", required=True, help="Output html file path") args = parser.parse_args() main(args) diff --git a/scripts/git-upload b/scripts/git-upload index a53987894a..d56c0f8e94 100755 --- a/scripts/git-upload +++ b/scripts/git-upload @@ -1,17 +1,16 @@ #!/usr/bin/env python3 -from contextlib import contextmanager -import shlex -from tempfile import TemporaryDirectory -from distutils.dir_util import copy_tree -from pathlib import Path - import argparse import os +import shlex import shutil import subprocess import sys import textwrap +from contextlib import contextmanager +from distutils.dir_util import copy_tree +from pathlib import Path +from tempfile import TemporaryDirectory from typing import Optional diff --git a/scripts/ingest_perf_test_result.py b/scripts/ingest_perf_test_result.py index 89463c986a..71f7ad3262 100644 --- a/scripts/ingest_perf_test_result.py +++ b/scripts/ingest_perf_test_result.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 import argparse -from contextlib import contextmanager import json import os +from contextlib import contextmanager +from datetime import datetime +from pathlib import Path + import psycopg2 import psycopg2.extras -from pathlib import Path -from datetime import datetime CREATE_TABLE = """ CREATE TABLE IF NOT EXISTS perf_test_results ( @@ -24,15 +25,15 @@ CREATE TABLE IF NOT EXISTS perf_test_results ( def err(msg): - print(f'error: {msg}') + print(f"error: {msg}") exit(1) @contextmanager def get_connection_cursor(): 
- connstr = os.getenv('DATABASE_URL') + connstr = os.getenv("DATABASE_URL") if not connstr: - err('DATABASE_URL environment variable is not set') + err("DATABASE_URL environment variable is not set") with psycopg2.connect(connstr) as conn: with conn.cursor() as cur: yield cur @@ -44,33 +45,35 @@ def create_table(cur): def ingest_perf_test_result(cursor, data_dile: Path, recorded_at_timestamp: int) -> int: run_data = json.loads(data_dile.read_text()) - revision = run_data['revision'] - platform = run_data['platform'] + revision = run_data["revision"] + platform = run_data["platform"] - run_result = run_data['result'] + run_result = run_data["result"] args_list = [] for suit_result in run_result: - suit = suit_result['suit'] - total_duration = suit_result['total_duration'] + suit = suit_result["suit"] + total_duration = suit_result["total_duration"] - suit_result['data'].append({ - 'name': 'total_duration', - 'value': total_duration, - 'unit': 's', - 'report': 'lower_is_better', - }) + suit_result["data"].append( + { + "name": "total_duration", + "value": total_duration, + "unit": "s", + "report": "lower_is_better", + } + ) - for metric in suit_result['data']: + for metric in suit_result["data"]: values = { - 'suit': suit, - 'revision': revision, - 'platform': platform, - 'metric_name': metric['name'], - 'metric_value': metric['value'], - 'metric_unit': metric['unit'], - 'metric_report_type': metric['report'], - 'recorded_at_timestamp': datetime.utcfromtimestamp(recorded_at_timestamp), + "suit": suit, + "revision": revision, + "platform": platform, + "metric_name": metric["name"], + "metric_value": metric["value"], + "metric_unit": metric["unit"], + "metric_report_type": metric["report"], + "recorded_at_timestamp": datetime.utcfromtimestamp(recorded_at_timestamp), } args_list.append(values) @@ -104,13 +107,16 @@ def ingest_perf_test_result(cursor, data_dile: Path, recorded_at_timestamp: int) def main(): - parser = argparse.ArgumentParser(description='Perf test result 
uploader. \ - Database connection string should be provided via DATABASE_URL environment variable', ) + parser = argparse.ArgumentParser( + description="Perf test result uploader. \ + Database connection string should be provided via DATABASE_URL environment variable", + ) parser.add_argument( - '--ingest', + "--ingest", type=Path, - help='Path to perf test result file, or directory with perf test result files') - parser.add_argument('--initdb', action='store_true', help='Initialuze database') + help="Path to perf test result file, or directory with perf test result files", + ) + parser.add_argument("--initdb", action="store_true", help="Initialuze database") args = parser.parse_args() with get_connection_cursor() as cur: @@ -118,19 +124,19 @@ def main(): create_table(cur) if not args.ingest.exists(): - err(f'ingest path {args.ingest} does not exist') + err(f"ingest path {args.ingest} does not exist") if args.ingest: if args.ingest.is_dir(): - for item in sorted(args.ingest.iterdir(), key=lambda x: int(x.name.split('_')[0])): - recorded_at_timestamp = int(item.name.split('_')[0]) + for item in sorted(args.ingest.iterdir(), key=lambda x: int(x.name.split("_")[0])): + recorded_at_timestamp = int(item.name.split("_")[0]) ingested = ingest_perf_test_result(cur, item, recorded_at_timestamp) - print(f'Ingested {ingested} metric values from {item}') + print(f"Ingested {ingested} metric values from {item}") else: - recorded_at_timestamp = int(args.ingest.name.split('_')[0]) + recorded_at_timestamp = int(args.ingest.name.split("_")[0]) ingested = ingest_perf_test_result(cur, args.ingest, recorded_at_timestamp) - print(f'Ingested {ingested} metric values from {args.ingest}') + print(f"Ingested {ingested} metric values from {args.ingest}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/setup.cfg b/setup.cfg index 7f8c45c8c3..a067ee731d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,43 +1,8 @@ -# Just trying to gather linter settings in one file. 
-# I wonder if there's a way to de-duplicate them... - [flake8] -max-line-length = 100 - -[pycodestyle] -max-line-length = 100 - -[yapf] -based_on_style = pep8 -column_limit = 100 -split_all_top_level_comma_separated_values = true - -[mypy] -# mypy uses regex -exclude = ^vendor/ -# some tests don't typecheck when this flag is set -check_untyped_defs = false - -# Help mypy find imports when running against list of individual files. -# Without this line it would behave differently when executed on the entire project. -mypy_path = $MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner - -disallow_incomplete_defs = false -disallow_untyped_calls = false -disallow_untyped_decorators = false -disallow_untyped_defs = false -strict = true - -[mypy-asyncpg.*] -# There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577 -ignore_missing_imports = true - -[mypy-pg8000.*] -# Used only in testing clients -ignore_missing_imports = true - -[mypy-cached_property.*] -ignore_missing_imports = true - -[mypy-pytest.*] -ignore_missing_imports = true +# Move config to pyproject.toml as soon as flake8 supports it +# https://github.com/PyCQA/flake8/issues/234 +extend-ignore = + E203, # Whitespace before ':' -- conflicts with black + E266, # Too many leading '#' for block comment -- we use it for formatting sometimes + E501 # Line too long -- black sorts it out +extend-exclude = vendor/ diff --git a/test_runner/README.md b/test_runner/README.md index 4b54c45175..c7ec361d65 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -15,12 +15,22 @@ Prerequisites: ### Test Organization -The tests are divided into a few batches, such that each batch takes roughly -the same amount of time. The batches can be run in parallel, to minimize total -runtime. Currently, there are only two batches: +Regression tests are in the 'regress' directory. They can be run in +parallel to minimize total runtime. 
Most regression test sets up their +environment with its own pageservers and safekeepers (but see +`TEST_SHARED_FIXTURES`). -- test_batch_pg_regress: Runs PostgreSQL regression tests -- test_others: All other tests +'pg_clients' contains tests for connecting with various client +libraries. Each client test uses a Dockerfile that pulls an image that +contains the client, and connects to PostgreSQL with it. The client +tests can be run against an existing PostgreSQL or Neon installation. + +'performance' contains performance regression tests. Each test +exercises a particular scenario or workload, and outputs +measurements. They should be run serially, to avoid the tests +interfering with the performance of each other. Some performance tests +set up their own Neon environment, while others can be run against an +existing PostgreSQL or Neon environment. ### Running the tests diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py deleted file mode 100644 index cdb577f480..0000000000 --- a/test_runner/batch_others/test_clog_truncate.py +++ /dev/null @@ -1,72 +0,0 @@ -import time -import os - -from contextlib import closing - -from fixtures.neon_fixtures import NeonEnv -from fixtures.log_helper import log -from fixtures.utils import query_scalar - - -# -# Test compute node start after clog truncation -# -def test_clog_truncate(neon_simple_env: NeonEnv): - env = neon_simple_env - env.neon_cli.create_branch('test_clog_truncate', 'empty') - - # set aggressive autovacuum to make sure that truncation will happen - config = [ - 'autovacuum_max_workers=10', - 'autovacuum_vacuum_threshold=0', - 'autovacuum_vacuum_insert_threshold=0', - 'autovacuum_vacuum_cost_delay=0', - 'autovacuum_vacuum_cost_limit=10000', - 'autovacuum_naptime =1s', - 'autovacuum_freeze_max_age=100000' - ] - - pg = env.postgres.create_start('test_clog_truncate', config_lines=config) - log.info('postgres is running on test_clog_truncate branch') - - # 
Install extension containing function needed for test - pg.safe_psql('CREATE EXTENSION neon_test_utils') - - # Consume many xids to advance clog - with pg.cursor() as cur: - cur.execute('select test_consume_xids(1000*1000*10);') - log.info('xids consumed') - - # call a checkpoint to trigger TruncateSubtrans - cur.execute('CHECKPOINT;') - - # ensure WAL flush - cur.execute('select txid_current()') - log.info(cur.fetchone()) - - # wait for autovacuum to truncate the pg_xact - # XXX Is it worth to add a timeout here? - pg_xact_0000_path = os.path.join(pg.pg_xact_dir_path(), '0000') - log.info(f"pg_xact_0000_path = {pg_xact_0000_path}") - - while os.path.isfile(pg_xact_0000_path): - log.info(f"file exists. wait for truncation. " "pg_xact_0000_path = {pg_xact_0000_path}") - time.sleep(5) - - # checkpoint to advance latest lsn - with pg.cursor() as cur: - cur.execute('CHECKPOINT;') - lsn_after_truncation = query_scalar(cur, 'select pg_current_wal_insert_lsn()') - - # create new branch after clog truncation and start a compute node on it - log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}') - env.neon_cli.create_branch('test_clog_truncate_new', - 'test_clog_truncate', - ancestor_start_lsn=lsn_after_truncation) - pg2 = env.postgres.create_start('test_clog_truncate_new') - log.info('postgres is running on test_clog_truncate_new branch') - - # check that new node doesn't contain truncated segment - pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), '0000') - log.info(f"pg_xact_0000_path_new = {pg_xact_0000_path_new}") - assert os.path.isfile(pg_xact_0000_path_new) is False diff --git a/test_runner/batch_others/test_crafted_wal_end.py b/test_runner/batch_others/test_crafted_wal_end.py deleted file mode 100644 index d1c46fc73a..0000000000 --- a/test_runner/batch_others/test_crafted_wal_end.py +++ /dev/null @@ -1,63 +0,0 @@ -from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft -from fixtures.log_helper import log -import pytest - -# 
Restart nodes with WAL end having specially crafted shape, like last record -# crossing segment boundary, to test decoding issues. - - -@pytest.mark.parametrize('wal_type', - [ - 'simple', - 'last_wal_record_xlog_switch', - 'last_wal_record_xlog_switch_ends_on_page_boundary', - 'last_wal_record_crossing_segment', - 'wal_record_crossing_segment_followed_by_small_one', - ]) -def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): - neon_env_builder.num_safekeepers = 1 - env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_crafted_wal_end') - - pg = env.postgres.create('test_crafted_wal_end') - wal_craft = WalCraft(env) - pg.config(wal_craft.postgres_config()) - pg.start() - res = pg.safe_psql_many(queries=[ - 'CREATE TABLE keys(key int primary key)', - 'INSERT INTO keys SELECT generate_series(1, 100)', - 'SELECT SUM(key) FROM keys' - ]) - assert res[-1][0] == (5050, ) - - wal_craft.in_existing(wal_type, pg.connstr()) - - log.info("Restarting all safekeepers and pageservers") - env.pageserver.stop() - env.safekeepers[0].stop() - env.safekeepers[0].start() - env.pageserver.start() - - log.info("Trying more queries") - res = pg.safe_psql_many(queries=[ - 'SELECT SUM(key) FROM keys', - 'INSERT INTO keys SELECT generate_series(101, 200)', - 'SELECT SUM(key) FROM keys', - ]) - assert res[0][0] == (5050, ) - assert res[-1][0] == (20100, ) - - log.info("Restarting all safekeepers and pageservers (again)") - env.pageserver.stop() - env.safekeepers[0].stop() - env.safekeepers[0].start() - env.pageserver.start() - - log.info("Trying more queries (again)") - res = pg.safe_psql_many(queries=[ - 'SELECT SUM(key) FROM keys', - 'INSERT INTO keys SELECT generate_series(201, 300)', - 'SELECT SUM(key) FROM keys', - ]) - assert res[0][0] == (20100, ) - assert res[-1][0] == (45150, ) diff --git a/test_runner/batch_others/test_createuser.py b/test_runner/batch_others/test_createuser.py deleted file mode 100644 index d48db05395..0000000000 --- 
a/test_runner/batch_others/test_createuser.py +++ /dev/null @@ -1,28 +0,0 @@ -from fixtures.neon_fixtures import NeonEnv -from fixtures.log_helper import log -from fixtures.utils import query_scalar - - -# -# Test CREATE USER to check shared catalog restore -# -def test_createuser(neon_simple_env: NeonEnv): - env = neon_simple_env - env.neon_cli.create_branch('test_createuser', 'empty') - pg = env.postgres.create_start('test_createuser') - log.info("postgres is running on 'test_createuser' branch") - - with pg.cursor() as cur: - # Cause a 'relmapper' change in the original branch - cur.execute('CREATE USER testuser with password %s', ('testpwd', )) - - cur.execute('CHECKPOINT') - - lsn = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') - - # Create a branch - env.neon_cli.create_branch('test_createuser2', 'test_createuser', ancestor_start_lsn=lsn) - pg2 = env.postgres.create_start('test_createuser2') - - # Test that you can connect to new branch as a new user - assert pg2.safe_psql('select current_user', user='testuser') == [('testuser', )] diff --git a/test_runner/batch_others/test_fsm_truncate.py b/test_runner/batch_others/test_fsm_truncate.py deleted file mode 100644 index 0f85942598..0000000000 --- a/test_runner/batch_others/test_fsm_truncate.py +++ /dev/null @@ -1,11 +0,0 @@ -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient -import pytest - - -def test_fsm_truncate(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_fsm_truncate") - pg = env.postgres.create_start('test_fsm_truncate') - pg.safe_psql( - 'CREATE TABLE t1(key int); CREATE TABLE t2(key int); TRUNCATE TABLE t1; TRUNCATE TABLE t2;') diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py deleted file mode 100644 index 710b220ae8..0000000000 --- a/test_runner/batch_others/test_pageserver_api.py +++ /dev/null @@ 
-1,166 +0,0 @@ -from typing import Optional -from uuid import uuid4, UUID -import pytest -import pathlib -import os -import subprocess -from fixtures.utils import lsn_from_hex -from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - DEFAULT_BRANCH_NAME, - NeonEnv, - NeonEnvBuilder, - NeonPageserverHttpClient, - NeonPageserverApiException, - wait_until, - neon_binpath, - pg_distrib_dir, -) - - -# test that we cannot override node id after init -def test_pageserver_init_node_id(neon_simple_env: NeonEnv): - repo_dir = neon_simple_env.repo_dir - pageserver_config = repo_dir / 'pageserver.toml' - pageserver_bin = pathlib.Path(neon_binpath) / 'pageserver' - run_pageserver = lambda args: subprocess.run([str(pageserver_bin), '-D', str(repo_dir), *args], - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - - # remove initial config - pageserver_config.unlink() - - bad_init = run_pageserver(['--init', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) - assert bad_init.returncode == 1, 'pageserver should not be able to init new config without the node id' - assert "missing id" in bad_init.stderr - assert not pageserver_config.exists(), 'config file should not be created after init error' - - completed_init = run_pageserver( - ['--init', '-c', 'id = 12345', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) - assert completed_init.returncode == 0, 'pageserver should be able to create a new config with the node id given' - assert pageserver_config.exists(), 'config file should be created successfully' - - bad_reinit = run_pageserver( - ['--init', '-c', 'id = 12345', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) - assert bad_reinit.returncode == 1, 'pageserver should not be able to init new config without the node id' - assert "already exists, cannot init it" in bad_reinit.stderr - - bad_update = run_pageserver(['--update-config', '-c', 'id = 3']) - assert bad_update.returncode == 1, 'pageserver should not allow updating node 
id' - assert "has node id already, it cannot be overridden" in bad_update.stderr - - -def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): - client.check_status() - - # check initial tenant is there - assert initial_tenant.hex in {t['id'] for t in client.tenant_list()} - - # create new tenant and check it is also there - tenant_id = uuid4() - client.tenant_create(tenant_id) - assert tenant_id.hex in {t['id'] for t in client.tenant_list()} - - timelines = client.timeline_list(tenant_id) - assert len(timelines) == 0, "initial tenant should not have any timelines" - - # create timeline - timeline_id = uuid4() - client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id) - - timelines = client.timeline_list(tenant_id) - assert len(timelines) > 0 - - # check it is there - assert timeline_id.hex in {b['timeline_id'] for b in client.timeline_list(tenant_id)} - for timeline in timelines: - timeline_id_str = str(timeline['timeline_id']) - timeline_details = client.timeline_detail(tenant_id=tenant_id, - timeline_id=UUID(timeline_id_str), - include_non_incremental_logical_size=True) - - assert timeline_details['tenant_id'] == tenant_id.hex - assert timeline_details['timeline_id'] == timeline_id_str - - local_timeline_details = timeline_details.get('local') - assert local_timeline_details is not None - assert local_timeline_details['timeline_state'] == 'Loaded' - - -def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): - env = neon_simple_env - with env.pageserver.http_client() as client: - tenant_id, timeline_id = env.neon_cli.create_tenant() - - timeline_details = client.timeline_detail(tenant_id=tenant_id, - timeline_id=timeline_id, - include_non_incremental_logical_size=True) - - assert timeline_details.get('wal_source_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running' - assert timeline_details.get('last_received_msg_lsn') is None, 'Should not be able to connect to 
WAL streaming without PG compute node running' - assert timeline_details.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running' - - -def expect_updated_msg_lsn(client: NeonPageserverHttpClient, - tenant_id: UUID, - timeline_id: UUID, - prev_msg_lsn: Optional[int]) -> int: - timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) - - # a successful `timeline_details` response must contain the below fields - local_timeline_details = timeline_details['local'] - assert "wal_source_connstr" in local_timeline_details.keys() - assert "last_received_msg_lsn" in local_timeline_details.keys() - assert "last_received_msg_ts" in local_timeline_details.keys() - - assert local_timeline_details["last_received_msg_lsn"] is not None, "the last received message's LSN is empty" - - last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"]) - assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \ - f"the last received message's LSN {last_msg_lsn} hasn't been updated \ - compared to the previous message's LSN {prev_msg_lsn}" - - return last_msg_lsn - - -# Test the WAL-receiver related fields in the response to `timeline_details` API call -# -# These fields used to be returned by a separate API call, but they're part of -# `timeline_details` now. -def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): - env = neon_simple_env - with env.pageserver.http_client() as client: - tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) - - # Wait to make sure that we get a latest WAL receiver data. - # We need to wait here because it's possible that we don't have access to - # the latest WAL yet, when the `timeline_detail` API is first called. - # See: https://github.com/neondatabase/neon/issues/1768. 
- lsn = wait_until(number_of_iterations=5, - interval=1, - func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None)) - - # Make a DB modification then expect getting a new WAL receiver's data. - pg.safe_psql("CREATE TABLE t(key int primary key, value text)") - wait_until(number_of_iterations=5, - interval=1, - func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn)) - - -def test_pageserver_http_api_client(neon_simple_env: NeonEnv): - env = neon_simple_env - with env.pageserver.http_client() as client: - check_client(client, env.initial_tenant) - - -def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder): - neon_env_builder.auth_enabled = True - env = neon_env_builder.init_start() - - management_token = env.auth_keys.generate_management_token() - - with env.pageserver.http_client(auth_token=management_token) as client: - check_client(client, env.initial_tenant) diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py deleted file mode 100644 index 2d9957fc38..0000000000 --- a/test_runner/batch_others/test_proxy.py +++ /dev/null @@ -1,35 +0,0 @@ -import pytest -import psycopg2 - - -def test_proxy_select_1(static_proxy): - static_proxy.safe_psql('select 1', options='project=generic-project-name') - - -def test_password_hack(static_proxy): - user = 'borat' - password = 'password' - static_proxy.safe_psql(f"create role {user} with login password '{password}'", - options='project=irrelevant') - - # Note the format of `magic`! - magic = f"project=irrelevant;{password}" - static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic) - - # Must also check that invalid magic won't be accepted. - with pytest.raises(psycopg2.errors.OperationalError): - magic = "broken" - static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic) - - -# Pass extra options to the server. -# -# Currently, proxy eats the extra connection options, so this fails. 
-# See https://github.com/neondatabase/neon/issues/1287 -@pytest.mark.xfail -def test_proxy_options(static_proxy): - with static_proxy.connect(options='-cproxytest.option=value') as conn: - with conn.cursor() as cur: - cur.execute('SHOW proxytest.option') - value = cur.fetchall()[0][0] - assert value == 'value' diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py deleted file mode 100644 index 809e942415..0000000000 --- a/test_runner/batch_others/test_wal_restore.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -from pathlib import Path - -from fixtures.neon_fixtures import (NeonEnvBuilder, - VanillaPostgres, - PortDistributor, - PgBin, - base_dir, - pg_distrib_dir) - - -def test_wal_restore(neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - test_output_dir: Path, - port_distributor: PortDistributor): - env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_wal_restore") - pg = env.postgres.create_start('test_wal_restore') - pg.safe_psql("create table t as select generate_series(1,300000)") - tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] - env.neon_cli.pageserver_stop() - port = port_distributor.get_port() - data_dir = test_output_dir / 'pgsql.restored' - with VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored: - pg_bin.run_capture([ - os.path.join(base_dir, 'libs/utils/scripts/restore_from_wal.sh'), - os.path.join(pg_distrib_dir, 'bin'), - str(test_output_dir / 'repo' / 'safekeepers' / 'sk1' / str(tenant_id) / '*'), - str(data_dir), - str(port) - ]) - restored.start() - assert restored.safe_psql('select count(*) from t', user='cloud_admin') == [(300000, )] diff --git a/test_runner/batch_pg_regress/test_isolation.py b/test_runner/batch_pg_regress/test_isolation.py deleted file mode 100644 index 0124459440..0000000000 --- a/test_runner/batch_pg_regress/test_isolation.py +++ /dev/null @@ -1,49 +0,0 @@ -import os -from pathlib import Path -import pytest -from 
fixtures.neon_fixtures import NeonEnv, base_dir, pg_distrib_dir - - -# The isolation tests run for a long time, especially in debug mode, -# so use a larger-than-default timeout. -@pytest.mark.timeout(1800) -def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): - env = neon_simple_env - - env.neon_cli.create_branch("test_isolation", "empty") - # Connect to postgres and create a database called "regression". - # isolation tests use prepared transactions, so enable them - pg = env.postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100']) - pg.safe_psql('CREATE DATABASE isolation_regression') - - # Create some local directories for pg_isolation_regress to run in. - runpath = test_output_dir / 'regress' - (runpath / 'testtablespace').mkdir(parents=True) - - # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, 'build/src/test/isolation') - src_path = os.path.join(base_dir, 'vendor/postgres/src/test/isolation') - bindir = os.path.join(pg_distrib_dir, 'bin') - schedule = os.path.join(src_path, 'isolation_schedule') - pg_isolation_regress = os.path.join(build_path, 'pg_isolation_regress') - - pg_isolation_regress_command = [ - pg_isolation_regress, - '--use-existing', - '--bindir={}'.format(bindir), - '--dlpath={}'.format(build_path), - '--inputdir={}'.format(src_path), - '--schedule={}'.format(schedule), - ] - - env_vars = { - 'PGPORT': str(pg.default_options['port']), - 'PGUSER': pg.default_options['user'], - 'PGHOST': pg.default_options['host'], - } - - # Run the command. - # We don't capture the output. It's not too chatty, and it always - # logs the exact same data to `regression.out` anyway. 
- with capsys.disabled(): - pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) diff --git a/test_runner/batch_pg_regress/test_neon_regress.py b/test_runner/batch_pg_regress/test_neon_regress.py deleted file mode 100644 index 66ea67d9f1..0000000000 --- a/test_runner/batch_pg_regress/test_neon_regress.py +++ /dev/null @@ -1,58 +0,0 @@ -import os -from pathlib import Path - -from fixtures.neon_fixtures import (NeonEnv, - check_restored_datadir_content, - base_dir, - pg_distrib_dir) -from fixtures.log_helper import log - - -def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): - env = neon_simple_env - - env.neon_cli.create_branch("test_neon_regress", "empty") - # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start('test_neon_regress') - pg.safe_psql('CREATE DATABASE regression') - - # Create some local directories for pg_regress to run in. - runpath = test_output_dir / 'regress' - (runpath / 'testtablespace').mkdir(parents=True) - - # Compute all the file locations that pg_regress will need. - # This test runs neon specific tests - build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress') - src_path = os.path.join(base_dir, 'test_runner/neon_regress') - bindir = os.path.join(pg_distrib_dir, 'bin') - schedule = os.path.join(src_path, 'parallel_schedule') - pg_regress = os.path.join(build_path, 'pg_regress') - - pg_regress_command = [ - pg_regress, - '--use-existing', - '--bindir={}'.format(bindir), - '--dlpath={}'.format(build_path), - '--schedule={}'.format(schedule), - '--inputdir={}'.format(src_path), - ] - - log.info(pg_regress_command) - env_vars = { - 'PGPORT': str(pg.default_options['port']), - 'PGUSER': pg.default_options['user'], - 'PGHOST': pg.default_options['host'], - } - - # Run the command. - # We don't capture the output. It's not too chatty, and it always - # logs the exact same data to `regression.out` anyway. 
- with capsys.disabled(): - pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - - # checkpoint one more time to ensure that the lsn we get is the latest one - pg.safe_psql('CHECKPOINT') - lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0] - - # Check that we restore the content of the datadir correctly - check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py deleted file mode 100644 index 28066d7a32..0000000000 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -import pathlib -import pytest -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content, base_dir, pg_distrib_dir - - -# The pg_regress tests run for a long time, especially in debug mode, -# so use a larger-than-default timeout. -@pytest.mark.timeout(1800) -def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: pathlib.Path, pg_bin, capsys): - env = neon_simple_env - - env.neon_cli.create_branch("test_pg_regress", "empty") - # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start('test_pg_regress') - pg.safe_psql('CREATE DATABASE regression') - - # Create some local directories for pg_regress to run in. - runpath = test_output_dir / 'regress' - (runpath / 'testtablespace').mkdir(parents=True) - - # Compute all the file locations that pg_regress will need. 
- build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress') - src_path = os.path.join(base_dir, 'vendor/postgres/src/test/regress') - bindir = os.path.join(pg_distrib_dir, 'bin') - schedule = os.path.join(src_path, 'parallel_schedule') - pg_regress = os.path.join(build_path, 'pg_regress') - - pg_regress_command = [ - pg_regress, - '--bindir=""', - '--use-existing', - '--bindir={}'.format(bindir), - '--dlpath={}'.format(build_path), - '--schedule={}'.format(schedule), - '--inputdir={}'.format(src_path), - ] - - env_vars = { - 'PGPORT': str(pg.default_options['port']), - 'PGUSER': pg.default_options['user'], - 'PGHOST': pg.default_options['host'], - } - - # Run the command. - # We don't capture the output. It's not too chatty, and it always - # logs the exact same data to `regression.out` anyway. - with capsys.disabled(): - pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - - # checkpoint one more time to ensure that the lsn we get is the latest one - pg.safe_psql('CHECKPOINT') - - # Check that we restore the content of the datadir correctly - check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 51545d0217..8b7f6a2eea 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,5 +1,7 @@ -pytest_plugins = ("fixtures.neon_fixtures", - "fixtures.benchmark_fixture", - "fixtures.pg_stats", - "fixtures.compare_fixtures", - "fixtures.slow") +pytest_plugins = ( + "fixtures.neon_fixtures", + "fixtures.benchmark_fixture", + "fixtures.pg_stats", + "fixtures.compare_fixtures", + "fixtures.slow", +) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index cca4f7ce17..655ffed90d 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -10,12 +10,14 @@ import warnings from contextlib import contextmanager from datetime import datetime from pathlib import Path + # Type-related stuff 
from typing import Iterator, Optional import pytest from _pytest.config import Config from _pytest.terminal import TerminalReporter + """ This file contains fixtures for micro-benchmarks. @@ -58,6 +60,7 @@ class PgBenchRunResult: run_duration: float run_start_timestamp: int run_end_timestamp: int + scale: int # TODO progress @@ -112,8 +115,10 @@ class PgBenchRunResult: # pgbench v14: # initial connection time = 3.858 ms # tps = 309.281539 (without initial connection time) - if (line.startswith("tps = ") and ("(excluding connections establishing)" in line - or "(without initial connection time)")): + if line.startswith("tps = ") and ( + "(excluding connections establishing)" in line + or "(without initial connection time)" + ): tps = float(line.split()[2]) return cls( @@ -126,6 +131,7 @@ class PgBenchRunResult: run_duration=run_duration, run_start_timestamp=run_start_timestamp, run_end_timestamp=run_end_timestamp, + scale=scale, ) @@ -154,17 +160,21 @@ class PgBenchInitResult: last_line = stderr.splitlines()[-1] - regex = re.compile(r"done in (\d+\.\d+) s " - r"\(" - r"(?:drop tables (\d+\.\d+) s)?(?:, )?" - r"(?:create tables (\d+\.\d+) s)?(?:, )?" - r"(?:client-side generate (\d+\.\d+) s)?(?:, )?" - r"(?:vacuum (\d+\.\d+) s)?(?:, )?" - r"(?:primary keys (\d+\.\d+) s)?(?:, )?" - r"\)\.") + regex = re.compile( + r"done in (\d+\.\d+) s " + r"\(" + r"(?:drop tables (\d+\.\d+) s)?(?:, )?" + r"(?:create tables (\d+\.\d+) s)?(?:, )?" + r"(?:client-side generate (\d+\.\d+) s)?(?:, )?" + r"(?:vacuum (\d+\.\d+) s)?(?:, )?" + r"(?:primary keys (\d+\.\d+) s)?(?:, )?" + r"\)\." 
+ ) if (m := regex.match(last_line)) is not None: - total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [float(v) for v in m.groups() if v is not None] + total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [ + float(v) for v in m.groups() if v is not None + ] else: raise RuntimeError(f"can't parse pgbench initialize results from `{last_line}`") @@ -185,11 +195,11 @@ class PgBenchInitResult: class MetricReport(str, enum.Enum): # str is a hack to make it json serializable # this means that this is a constant test parameter # like number of transactions, or number of clients - TEST_PARAM = 'test_param' + TEST_PARAM = "test_param" # reporter can use it to mark test runs with higher values as improvements - HIGHER_IS_BETTER = 'higher_is_better' + HIGHER_IS_BETTER = "higher_is_better" # the same but for lower values - LOWER_IS_BETTER = 'lower_is_better' + LOWER_IS_BETTER = "lower_is_better" class NeonBenchmarker: @@ -197,6 +207,7 @@ class NeonBenchmarker: An object for recording benchmark results. 
This is created for each test function by the zenbenchmark fixture """ + def __init__(self, property_recorder): # property recorder here is a pytest fixture provided by junitxml module # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property @@ -244,43 +255,63 @@ class NeonBenchmarker: ) def record_pg_bench_result(self, prefix: str, pg_bench_result: PgBenchRunResult): - self.record(f"{prefix}.number_of_clients", - pg_bench_result.number_of_clients, - '', - MetricReport.TEST_PARAM) - self.record(f"{prefix}.number_of_threads", - pg_bench_result.number_of_threads, - '', - MetricReport.TEST_PARAM) + self.record( + f"{prefix}.number_of_clients", + pg_bench_result.number_of_clients, + "", + MetricReport.TEST_PARAM, + ) + self.record( + f"{prefix}.number_of_threads", + pg_bench_result.number_of_threads, + "", + MetricReport.TEST_PARAM, + ) self.record( f"{prefix}.number_of_transactions_actually_processed", pg_bench_result.number_of_transactions_actually_processed, - '', + "", # that's because this is predefined by test matrix and doesn't change across runs report=MetricReport.TEST_PARAM, ) - self.record(f"{prefix}.latency_average", - pg_bench_result.latency_average, - unit="ms", - report=MetricReport.LOWER_IS_BETTER) + self.record( + f"{prefix}.latency_average", + pg_bench_result.latency_average, + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) if pg_bench_result.latency_stddev is not None: - self.record(f"{prefix}.latency_stddev", - pg_bench_result.latency_stddev, - unit="ms", - report=MetricReport.LOWER_IS_BETTER) - self.record(f"{prefix}.tps", pg_bench_result.tps, '', report=MetricReport.HIGHER_IS_BETTER) - self.record(f"{prefix}.run_duration", - pg_bench_result.run_duration, - unit="s", - report=MetricReport.LOWER_IS_BETTER) - self.record(f"{prefix}.run_start_timestamp", - pg_bench_result.run_start_timestamp, - '', - MetricReport.TEST_PARAM) - self.record(f"{prefix}.run_end_timestamp", - pg_bench_result.run_end_timestamp, - '', - 
MetricReport.TEST_PARAM) + self.record( + f"{prefix}.latency_stddev", + pg_bench_result.latency_stddev, + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + self.record(f"{prefix}.tps", pg_bench_result.tps, "", report=MetricReport.HIGHER_IS_BETTER) + self.record( + f"{prefix}.run_duration", + pg_bench_result.run_duration, + unit="s", + report=MetricReport.LOWER_IS_BETTER, + ) + self.record( + f"{prefix}.run_start_timestamp", + pg_bench_result.run_start_timestamp, + "", + MetricReport.TEST_PARAM, + ) + self.record( + f"{prefix}.run_end_timestamp", + pg_bench_result.run_end_timestamp, + "", + MetricReport.TEST_PARAM, + ) + self.record( + f"{prefix}.scale", + pg_bench_result.scale, + "", + MetricReport.TEST_PARAM, + ) def record_pg_bench_init_result(self, prefix: str, result: PgBenchInitResult): test_params = [ @@ -288,10 +319,9 @@ class NeonBenchmarker: "end_timestamp", ] for test_param in test_params: - self.record(f"{prefix}.{test_param}", - getattr(result, test_param), - '', - MetricReport.TEST_PARAM) + self.record( + f"{prefix}.{test_param}", getattr(result, test_param), "", MetricReport.TEST_PARAM + ) metrics = [ "duration", @@ -303,10 +333,9 @@ class NeonBenchmarker: ] for metric in metrics: if (value := getattr(result, metric)) is not None: - self.record(f"{prefix}.{metric}", - value, - unit="s", - report=MetricReport.LOWER_IS_BETTER) + self.record( + f"{prefix}.{metric}", value, unit="s", report=MetricReport.LOWER_IS_BETTER + ) def get_io_writes(self, pageserver) -> int: """ @@ -319,7 +348,7 @@ class NeonBenchmarker: """ Fetch the "maxrss" metric from the pageserver """ - metric_name = r'libmetrics_maxrss_kb' + metric_name = r"libmetrics_maxrss_kb" return self.get_int_counter_value(pageserver, metric_name) def get_int_counter_value(self, pageserver, metric_name) -> int: @@ -332,7 +361,7 @@ class NeonBenchmarker: # all prometheus metrics are floats. So to be pedantic, read it as a float # and round to integer. 
all_metrics = pageserver.http_client().get_metrics() - matches = re.search(fr'^{metric_name} (\S+)$', all_metrics, re.MULTILINE) + matches = re.search(rf"^{metric_name} (\S+)$", all_metrics, re.MULTILINE) assert matches return int(round(float(matches.group(1)))) @@ -358,10 +387,12 @@ class NeonBenchmarker: yield after = self.get_io_writes(pageserver) - self.record(metric_name, - round((after - before) / (1024 * 1024)), - "MB", - report=MetricReport.LOWER_IS_BETTER) + self.record( + metric_name, + round((after - before) / (1024 * 1024)), + "MB", + report=MetricReport.LOWER_IS_BETTER, + ) @pytest.fixture(scope="function") @@ -410,8 +441,9 @@ def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, result_entry = [] for _, recorded_property in test_report.user_properties: - terminalreporter.write("{}.{}: ".format(test_report.head_line, - recorded_property["name"])) + terminalreporter.write( + "{}.{}: ".format(test_report.head_line, recorded_property["name"]) + ) unit = recorded_property["unit"] value = recorded_property["value"] if unit == "MB": @@ -426,11 +458,13 @@ def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, result_entry.append(recorded_property) - result.append({ - "suit": test_report.nodeid, - "total_duration": test_report.duration, - "data": result_entry, - }) + result.append( + { + "suit": test_report.nodeid, + "total_duration": test_report.duration, + "data": result_entry, + } + ) out_dir = config.getoption("out_dir") if out_dir is None: @@ -442,6 +476,5 @@ def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, return get_out_path(Path(out_dir), revision=revision).write_text( - json.dumps({ - "revision": revision, "platform": platform, "result": result - }, indent=4)) + json.dumps({"revision": revision, "platform": platform, "result": result}, indent=4) + ) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 
e6c3a79697..6bca5be335 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -1,14 +1,14 @@ -import pytest -from contextlib import contextmanager from abc import ABC, abstractmethod -from fixtures.pg_stats import PgStatTable - -from fixtures.neon_fixtures import PgBin, PgProtocol, VanillaPostgres, RemotePostgres, NeonEnv -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from contextlib import contextmanager # Type-related stuff from typing import Dict, List +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.neon_fixtures import NeonEnv, PgBin, PgProtocol, RemotePostgres, VanillaPostgres +from fixtures.pg_stats import PgStatTable + class PgCompare(ABC): """Common interface of all postgres implementations, useful for benchmarks. @@ -16,6 +16,7 @@ class PgCompare(ABC): This class is a helper class for the neon_with_baseline fixture. See its documentation for more details. 
""" + @property @abstractmethod def pg(self) -> PgProtocol: @@ -61,7 +62,7 @@ class PgCompare(ABC): data = self._retrieve_pg_stats(pg_stats) for k in set(init_data) & set(data): - self.zenbenchmark.record(k, data[k] - init_data[k], '', MetricReport.HIGHER_IS_BETTER) + self.zenbenchmark.record(k, data[k] - init_data[k], "", MetricReport.HIGHER_IS_BETTER) def _retrieve_pg_stats(self, pg_stats: List[PgStatTable]) -> Dict[str, int]: results: Dict[str, int] = {} @@ -81,17 +82,16 @@ class PgCompare(ABC): class NeonCompare(PgCompare): """PgCompare interface for the neon stack.""" - def __init__(self, - zenbenchmark: NeonBenchmarker, - neon_simple_env: NeonEnv, - pg_bin: PgBin, - branch_name): + + def __init__( + self, zenbenchmark: NeonBenchmarker, neon_simple_env: NeonEnv, pg_bin: PgBin, branch_name + ): self.env = neon_simple_env self._zenbenchmark = zenbenchmark self._pg_bin = pg_bin # We only use one branch and one timeline - self.env.neon_cli.create_branch(branch_name, 'empty') + self.env.neon_cli.create_branch(branch_name, "empty") self._pg = self.env.postgres.create_start(branch_name) self.timeline = self.pg.safe_psql("SHOW neon.timeline_id")[0][0] @@ -118,32 +118,33 @@ class NeonCompare(PgCompare): self.pscur.execute(f"compact {self.env.initial_tenant.hex} {self.timeline}") def report_peak_memory_use(self) -> None: - self.zenbenchmark.record("peak_mem", - self.zenbenchmark.get_peak_mem(self.env.pageserver) / 1024, - 'MB', - report=MetricReport.LOWER_IS_BETTER) + self.zenbenchmark.record( + "peak_mem", + self.zenbenchmark.get_peak_mem(self.env.pageserver) / 1024, + "MB", + report=MetricReport.LOWER_IS_BETTER, + ) def report_size(self) -> None: - timeline_size = self.zenbenchmark.get_timeline_size(self.env.repo_dir, - self.env.initial_tenant, - self.timeline) - self.zenbenchmark.record('size', - timeline_size / (1024 * 1024), - 'MB', - report=MetricReport.LOWER_IS_BETTER) + timeline_size = self.zenbenchmark.get_timeline_size( + self.env.repo_dir, 
self.env.initial_tenant, self.timeline + ) + self.zenbenchmark.record( + "size", timeline_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER + ) total_files = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_created_persistent_files_total") + self.env.pageserver, "pageserver_created_persistent_files_total" + ) total_bytes = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_written_persistent_bytes_total") - self.zenbenchmark.record("data_uploaded", - total_bytes / (1024 * 1024), - "MB", - report=MetricReport.LOWER_IS_BETTER) - self.zenbenchmark.record("num_files_uploaded", - total_files, - "", - report=MetricReport.LOWER_IS_BETTER) + self.env.pageserver, "pageserver_written_persistent_bytes_total" + ) + self.zenbenchmark.record( + "data_uploaded", total_bytes / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER + ) + self.zenbenchmark.record( + "num_files_uploaded", total_files, "", report=MetricReport.LOWER_IS_BETTER + ) def record_pageserver_writes(self, out_name): return self.zenbenchmark.record_pageserver_writes(self.env.pageserver, out_name) @@ -154,13 +155,16 @@ class NeonCompare(PgCompare): class VanillaCompare(PgCompare): """PgCompare interface for vanilla postgres.""" + def __init__(self, zenbenchmark, vanilla_pg: VanillaPostgres): self._pg = vanilla_pg self._zenbenchmark = zenbenchmark - vanilla_pg.configure([ - 'shared_buffers=1MB', - 'synchronous_commit=off', - ]) + vanilla_pg.configure( + [ + "shared_buffers=1MB", + "synchronous_commit=off", + ] + ) vanilla_pg.start() # Long-lived cursor, useful for flushing @@ -186,16 +190,14 @@ class VanillaCompare(PgCompare): pass # TODO find something def report_size(self) -> None: - data_size = self.pg.get_subdir_size('base') - self.zenbenchmark.record('data_size', - data_size / (1024 * 1024), - 'MB', - report=MetricReport.LOWER_IS_BETTER) - wal_size = self.pg.get_subdir_size('pg_wal') - self.zenbenchmark.record('wal_size', - wal_size / (1024 
* 1024), - 'MB', - report=MetricReport.LOWER_IS_BETTER) + data_size = self.pg.get_subdir_size("base") + self.zenbenchmark.record( + "data_size", data_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER + ) + wal_size = self.pg.get_subdir_size("pg_wal") + self.zenbenchmark.record( + "wal_size", wal_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER + ) @contextmanager def record_pageserver_writes(self, out_name): @@ -207,6 +209,7 @@ class VanillaCompare(PgCompare): class RemoteCompare(PgCompare): """PgCompare interface for a remote postgres instance.""" + def __init__(self, zenbenchmark, remote_pg: RemotePostgres): self._pg = remote_pg self._zenbenchmark = zenbenchmark @@ -247,18 +250,18 @@ class RemoteCompare(PgCompare): return self.zenbenchmark.record_duration(out_name) -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def neon_compare(request, zenbenchmark, pg_bin, neon_simple_env) -> NeonCompare: branch_name = request.node.name return NeonCompare(zenbenchmark, neon_simple_env, pg_bin, branch_name) -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def vanilla_compare(zenbenchmark, vanilla_pg) -> VanillaCompare: return VanillaCompare(zenbenchmark, vanilla_pg) -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def remote_compare(zenbenchmark, remote_pg) -> RemoteCompare: return RemoteCompare(zenbenchmark, remote_pg) diff --git a/test_runner/fixtures/log_helper.py b/test_runner/fixtures/log_helper.py index 7c2d83d4e3..17f2402391 100644 --- a/test_runner/fixtures/log_helper.py +++ b/test_runner/fixtures/log_helper.py @@ -1,5 +1,6 @@ import logging import logging.config + """ This file configures logging to use in python tests. 
Logs are automatically captured and shown in their @@ -22,20 +23,16 @@ https://docs.pytest.org/en/6.2.x/logging.html LOGGING = { "version": 1, "loggers": { - "root": { - "level": "INFO" - }, - "root.safekeeper_async": { - "level": "INFO" # a lot of logs on DEBUG level - } - } + "root": {"level": "INFO"}, + "root.safekeeper_async": {"level": "INFO"}, # a lot of logs on DEBUG level + }, } -def getLogger(name='root') -> logging.Logger: +def getLogger(name="root") -> logging.Logger: """Method to get logger for tests. - Should be used to get correctly initialized logger. """ + Should be used to get correctly initialized logger.""" return logging.getLogger(name) diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 6fc62c6ea9..b51c7250e0 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -1,10 +1,8 @@ -from dataclasses import dataclass +from collections import defaultdict +from typing import Dict, List + from prometheus_client.parser import text_string_to_metric_families from prometheus_client.samples import Sample -from typing import Dict, List -from collections import defaultdict - -from fixtures.log_helper import log class Metrics: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fe0a3193c1..bbc35736bc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1,45 +1,49 @@ from __future__ import annotations -from dataclasses import field -from contextlib import contextmanager -from enum import Flag, auto -import enum -import textwrap -from cached_property import cached_property import abc -import asyncpg -import os -import boto3 -import pathlib -import uuid -import warnings -import jwt +import asyncio +import enum +import filecmp import json -import psycopg2 -import pytest +import os import re import shutil import socket import subprocess -import time -import filecmp import tempfile - -from contextlib import closing 
+import textwrap +import time +import uuid +from contextlib import closing, contextmanager +from dataclasses import dataclass, field +from enum import Flag, auto from pathlib import Path -from dataclasses import dataclass +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, TypeVar, Union, cast + +import asyncpg +import backoff # type: ignore +import boto3 +import jwt +import psycopg2 +import pytest +import requests +from cached_property import cached_property +from fixtures.log_helper import log # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import make_dsn, parse_dsn -from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple from typing_extensions import Literal -import requests -import backoff # type: ignore +from .utils import ( + allure_attach_from_dir, + etcd_path, + get_self_dir, + lsn_from_hex, + lsn_to_hex, + subprocess_capture, +) -from .utils import (etcd_path, get_self_dir, subprocess_capture, lsn_from_hex, lsn_to_hex) -from fixtures.log_helper import log """ This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. @@ -58,25 +62,16 @@ put directly-importable functions into utils.py or another separate file. 
""" Env = Dict[str, str] -Fn = TypeVar('Fn', bound=Callable[..., Any]) +Fn = TypeVar("Fn", bound=Callable[..., Any]) -DEFAULT_OUTPUT_DIR = 'test_output' -DEFAULT_POSTGRES_DIR = 'tmp_install' -DEFAULT_BRANCH_NAME = 'main' +DEFAULT_OUTPUT_DIR = "test_output" +DEFAULT_POSTGRES_DIR = "tmp_install" +DEFAULT_BRANCH_NAME = "main" BASE_PORT = 15000 WORKER_PORT_NUM = 1000 -def pytest_addoption(parser): - parser.addoption( - "--skip-interfering-proc-check", - dest="skip_interfering_proc_check", - action="store_true", - help="skip check for interfering processes", - ) - - # These are set in pytest_configure() base_dir = "" neon_binpath = "" @@ -84,63 +79,46 @@ pg_distrib_dir = "" top_output_dir = "" -def check_interferring_processes(config): - if config.getoption("skip_interfering_proc_check"): - warnings.warn("interfering process check is skipped") - return - - # does not use -c as it is not supported on macOS - cmd = ['pgrep', 'pageserver|postgres|safekeeper'] - result = subprocess.run(cmd, stdout=subprocess.DEVNULL) - if result.returncode == 0: - # returncode of 0 means it found something. - # This is bad; we don't want any of those processes polluting the - # result of the test. - # NOTE this shows as an internal pytest error, there might be a better way - raise Exception( - 'Found interfering processes running. Stop all Neon pageservers, nodes, safekeepers, as well as stand-alone Postgres.' - ) - - def pytest_configure(config): """ Ensure that no unwanted daemons are running before we start testing. Check that we do not overflow available ports range. """ - check_interferring_processes(config) - numprocesses = config.getoption('numprocesses') - if numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768: # do not use ephemeral ports - raise Exception('Too many workers configured. 
Cannot distribute ports for services.') + numprocesses = config.getoption("numprocesses") + if ( + numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768 + ): # do not use ephemeral ports + raise Exception("Too many workers configured. Cannot distribute ports for services.") # find the base directory (currently this is the git root) global base_dir - base_dir = os.path.normpath(os.path.join(get_self_dir(), '../..')) - log.info(f'base_dir is {base_dir}') + base_dir = os.path.normpath(os.path.join(get_self_dir(), "../..")) + log.info(f"base_dir is {base_dir}") # Compute the top-level directory for all tests. global top_output_dir - env_test_output = os.environ.get('TEST_OUTPUT') + env_test_output = os.environ.get("TEST_OUTPUT") if env_test_output is not None: top_output_dir = env_test_output else: top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR) - pathlib.Path(top_output_dir).mkdir(exist_ok=True) + Path(top_output_dir).mkdir(exist_ok=True) # Find the postgres installation. global pg_distrib_dir - env_postgres_bin = os.environ.get('POSTGRES_DISTRIB_DIR') + env_postgres_bin = os.environ.get("POSTGRES_DISTRIB_DIR") if env_postgres_bin: pg_distrib_dir = env_postgres_bin else: pg_distrib_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR)) - log.info(f'pg_distrib_dir is {pg_distrib_dir}') + log.info(f"pg_distrib_dir is {pg_distrib_dir}") if os.getenv("REMOTE_ENV"): # When testing against a remote server, we only need the client binary. 
- if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/psql')): + if not os.path.exists(os.path.join(pg_distrib_dir, "bin/psql")): raise Exception('psql not found at "{}"'.format(pg_distrib_dir)) else: - if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/postgres')): + if not os.path.exists(os.path.join(pg_distrib_dir, "bin/postgres")): raise Exception('postgres not found at "{}"'.format(pg_distrib_dir)) if os.getenv("REMOTE_ENV"): @@ -149,25 +127,26 @@ def pytest_configure(config): return # Find the neon binaries. global neon_binpath - env_neon_bin = os.environ.get('NEON_BIN') + env_neon_bin = os.environ.get("NEON_BIN") if env_neon_bin: neon_binpath = env_neon_bin else: - neon_binpath = os.path.join(base_dir, 'target/debug') - log.info(f'neon_binpath is {neon_binpath}') - if not os.path.exists(os.path.join(neon_binpath, 'pageserver')): + neon_binpath = os.path.join(base_dir, "target/debug") + log.info(f"neon_binpath is {neon_binpath}") + if not os.path.exists(os.path.join(neon_binpath, "pageserver")): raise Exception('neon binaries not found at "{}"'.format(neon_binpath)) def profiling_supported(): - """Return True if the pageserver was compiled with the 'profiling' feature - """ - bin_pageserver = os.path.join(str(neon_binpath), 'pageserver') - res = subprocess.run([bin_pageserver, '--version'], - check=True, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + """Return True if the pageserver was compiled with the 'profiling' feature""" + bin_pageserver = os.path.join(str(neon_binpath), "pageserver") + res = subprocess.run( + [bin_pageserver, "--version"], + check=True, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) return "profiling:true" in res.stdout @@ -179,21 +158,21 @@ def shareable_scope(fixture_name, config) -> Literal["session", "function"]: def myfixture(...) ... 
""" - return 'function' if os.environ.get('TEST_SHARED_FIXTURES') is None else 'session' + return "function" if os.environ.get("TEST_SHARED_FIXTURES") is None else "session" -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def worker_seq_no(worker_id: str): # worker_id is a pytest-xdist fixture # it can be master or gw # parse it to always get a number - if worker_id == 'master': + if worker_id == "master": return 0 - assert worker_id.startswith('gw') + assert worker_id.startswith("gw") return int(worker_id[2:]) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def worker_base_port(worker_seq_no: int): # so we divide ports in ranges of 100 ports # so workers have disjoint set of ports for services @@ -245,32 +224,34 @@ class PortDistributor: return port else: raise RuntimeError( - 'port range configured for test is exhausted, consider enlarging the range') + "port range configured for test is exhausted, consider enlarging the range" + ) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def port_distributor(worker_base_port): return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def default_broker(request: Any, port_distributor: PortDistributor): client_port = port_distributor.get_port() # multiple pytest sessions could get launched in parallel, get them different datadirs etcd_datadir = os.path.join(get_test_output_dir(request), f"etcd_datadir_{client_port}") - pathlib.Path(etcd_datadir).mkdir(exist_ok=True, parents=True) + Path(etcd_datadir).mkdir(exist_ok=True, parents=True) broker = Etcd(datadir=etcd_datadir, port=client_port, peer_port=port_distributor.get_port()) yield broker broker.stop() + allure_attach_from_dir(Path(etcd_datadir)) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def run_id(): yield uuid.uuid4() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def 
mock_s3_server(port_distributor: PortDistributor): mock_s3_server = MockS3Server(port_distributor.get_port()) yield mock_s3_server @@ -278,7 +259,8 @@ def mock_s3_server(port_distributor: PortDistributor): class PgProtocol: - """ Reusable connection logic """ + """Reusable connection logic""" + def __init__(self, **kwargs): self.default_options = kwargs @@ -290,18 +272,18 @@ class PgProtocol: def conn_options(self, **kwargs): result = self.default_options.copy() - if 'dsn' in kwargs: - result.update(parse_dsn(kwargs['dsn'])) + if "dsn" in kwargs: + result.update(parse_dsn(kwargs["dsn"])) result.update(kwargs) # Individual statement timeout in seconds. 2 minutes should be # enough for our tests, but if you need a longer, you can # change it by calling "SET statement_timeout" after # connecting. - options = result.get('options', '') + options = result.get("options", "") if "statement_timeout" not in options: - options = f'-cstatement_timeout=120s {options}' - result['options'] = options + options = f"-cstatement_timeout=120s {options}" + result["options"] = options return result # autocommit=True here by default because that's what we need most of the time @@ -337,19 +319,19 @@ class PgProtocol: # The psycopg2 option 'dbname' is called 'database' is asyncpg conn_options = self.conn_options(**kwargs) - if 'dbname' in conn_options: - conn_options['database'] = conn_options.pop('dbname') + if "dbname" in conn_options: + conn_options["database"] = conn_options.pop("dbname") # Convert options='-c=' to server_settings - if 'options' in conn_options: - options = conn_options.pop('options') - for match in re.finditer(r'-c(\w*)=(\w*)', options): + if "options" in conn_options: + options = conn_options.pop("options") + for match in re.finditer(r"-c(\w*)=(\w*)", options): key = match.group(1) val = match.group(2) - if 'server_options' in conn_options: - conn_options['server_settings'].update({key: val}) + if "server_options" in conn_options: + 
conn_options["server_settings"].update({key: val}) else: - conn_options['server_settings'] = {key: val} + conn_options["server_settings"] = {key: val} return await asyncpg.connect(**conn_options) def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: @@ -395,11 +377,9 @@ class AuthKeys: return token def generate_tenant_token(self, tenant_id): - token = jwt.encode({ - "scope": "tenant", "tenant_id": tenant_id - }, - self.priv, - algorithm="RS256") + token = jwt.encode( + {"scope": "tenant", "tenant_id": tenant_id}, self.priv, algorithm="RS256" + ) if isinstance(token, bytes): token = token.decode() @@ -414,6 +394,7 @@ class MockS3Server: Also provides a set of methods to derive the connection properties from and the method to kill the underlying server. """ + def __init__( self, port: int, @@ -423,7 +404,7 @@ class MockS3Server: # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux # if a process is started from the shell process. 
- self.subprocess = subprocess.Popen(['poetry', 'run', 'moto_server', 's3', f'-p{port}']) + self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"]) error = None try: return_code = self.subprocess.poll() @@ -440,13 +421,13 @@ class MockS3Server: return f"http://127.0.0.1:{self.port}" def region(self) -> str: - return 'us-east-1' + return "us-east-1" def access_key(self) -> str: - return 'test' + return "test" def secret_key(self) -> str: - return 'test' + return "test" def kill(self): self.subprocess.kill() @@ -485,8 +466,8 @@ class S3Storage: def access_env_vars(self) -> Dict[str, str]: return { - 'AWS_ACCESS_KEY_ID': self.access_key, - 'AWS_SECRET_ACCESS_KEY': self.secret_key, + "AWS_ACCESS_KEY_ID": self.access_key, + "AWS_SECRET_ACCESS_KEY": self.secret_key, } @@ -526,6 +507,7 @@ class NeonEnvBuilder: created in the right directory, based on the test name, and it's properly cleaned up after the test has finished. """ + def __init__( self, repo_dir: Path, @@ -590,7 +572,7 @@ class NeonEnvBuilder: elif remote_storage_kind == RemoteStorageKind.REAL_S3: self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable) else: - raise RuntimeError(f'Unknown storage type: {remote_storage_kind}') + raise RuntimeError(f"Unknown storage type: {remote_storage_kind}") def enable_local_fs_remote_storage(self, force_enable=True): """ @@ -598,7 +580,7 @@ class NeonEnvBuilder: Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. 
""" assert force_enable or self.remote_storage is None, "remote storage is enabled already" - self.remote_storage = LocalFsStorage(Path(self.repo_dir / 'local_fs_remote_storage')) + self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage")) def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable=True): """ @@ -611,7 +593,7 @@ class NeonEnvBuilder: mock_region = self.mock_s3_server.region() self.remote_storage_client = boto3.client( - 's3', + "s3", endpoint_url=mock_endpoint, region_name=mock_region, aws_access_key_id=self.mock_s3_server.access_key(), @@ -650,20 +632,22 @@ class NeonEnvBuilder: self.keep_remote_storage_contents = False # construct a prefix inside bucket for the particular test case and test run - self.remote_storage_prefix = f'{self.run_id}/{test_name}' + self.remote_storage_prefix = f"{self.run_id}/{test_name}" self.remote_storage_client = boto3.client( - 's3', + "s3", region_name=region, aws_access_key_id=access_key, aws_secret_access_key=secret_key, aws_session_token=session_token, ) - self.remote_storage = S3Storage(bucket_name=bucket_name, - bucket_region=region, - access_key=access_key, - secret_key=secret_key, - prefix_in_bucket=self.remote_storage_prefix) + self.remote_storage = S3Storage( + bucket_name=bucket_name, + bucket_region=region, + access_key=access_key, + secret_key=secret_key, + prefix_in_bucket=self.remote_storage_prefix, + ) def cleanup_remote_storage(self): # here wee check for true remote storage, no the local one @@ -676,26 +660,28 @@ class NeonEnvBuilder: log.info("keep_remote_storage_contents skipping remote storage cleanup") return - log.info("removing data from test s3 bucket %s by prefix %s", - self.remote_storage.bucket_name, - self.remote_storage_prefix) - paginator = self.remote_storage_client.get_paginator('list_objects_v2') + log.info( + "removing data from test s3 bucket %s by prefix %s", + self.remote_storage.bucket_name, + self.remote_storage_prefix, + ) + paginator = 
self.remote_storage_client.get_paginator("list_objects_v2") pages = paginator.paginate( Bucket=self.remote_storage.bucket_name, Prefix=self.remote_storage_prefix, ) - objects_to_delete = {'Objects': []} + objects_to_delete = {"Objects": []} cnt = 0 - for item in pages.search('Contents'): + for item in pages.search("Contents"): # weirdly when nothing is found it returns [None] if item is None: break - objects_to_delete['Objects'].append({'Key': item['Key']}) + objects_to_delete["Objects"].append({"Key": item["Key"]}) # flush once aws limit reached - if len(objects_to_delete['Objects']) >= 1000: + if len(objects_to_delete["Objects"]) >= 1000: self.remote_storage_client.delete_objects( Bucket=self.remote_storage.bucket_name, Delete=objects_to_delete, @@ -704,9 +690,10 @@ class NeonEnvBuilder: cnt += 1 # flush rest - if len(objects_to_delete['Objects']): - self.remote_storage_client.delete_objects(Bucket=self.remote_storage.bucket_name, - Delete=objects_to_delete) + if len(objects_to_delete["Objects"]): + self.remote_storage_client.delete_objects( + Bucket=self.remote_storage.bucket_name, Delete=objects_to_delete + ) log.info("deleted %s objects from remote storage", cnt) @@ -716,7 +703,7 @@ class NeonEnvBuilder: def __exit__(self, exc_type, exc_value, traceback): # Stop all the nodes. 
if self.env: - log.info('Cleaning up all storage and compute nodes') + log.info("Cleaning up all storage and compute nodes") self.env.postgres.stop_all() for sk in self.env.safekeepers: sk.stop(immediate=True) @@ -757,6 +744,7 @@ class NeonEnv: create_tenant() - initializes a new tenant in the page server, returns the tenant id """ + def __init__(self, config: NeonEnvBuilder): self.repo_dir = config.repo_dir self.rust_log_override = config.rust_log_override @@ -774,15 +762,19 @@ class NeonEnv: self.initial_tenant = uuid.uuid4() # Create a config file corresponding to the options - toml = textwrap.dedent(f""" + toml = textwrap.dedent( + f""" default_tenant_id = '{self.initial_tenant.hex}' - """) + """ + ) - toml += textwrap.dedent(f""" + toml += textwrap.dedent( + f""" [etcd_broker] broker_endpoints = ['{self.broker.client_url()}'] etcd_binary_path = '{self.broker.binary_path}' - """) + """ + ) # Create config for pageserver pageserver_port = PageserverPort( @@ -791,18 +783,20 @@ class NeonEnv: ) pageserver_auth_type = "ZenithJWT" if config.auth_enabled else "Trust" - toml += textwrap.dedent(f""" + toml += textwrap.dedent( + f""" [pageserver] id=1 listen_pg_addr = 'localhost:{pageserver_port.pg}' listen_http_addr = 'localhost:{pageserver_port.http}' auth_type = '{pageserver_auth_type}' - """) + """ + ) # Create a corresponding NeonPageserver object - self.pageserver = NeonPageserver(self, - port=pageserver_port, - config_override=config.pageserver_config_override) + self.pageserver = NeonPageserver( + self, port=pageserver_port, config_override=config.pageserver_config_override + ) # Create config and a Safekeeper object for each safekeeper for i in range(1, config.num_safekeepers + 1): @@ -811,21 +805,29 @@ class NeonEnv: http=self.port_distributor.get_port(), ) id = config.safekeepers_id_start + i # assign ids sequentially - toml += textwrap.dedent(f""" + toml += textwrap.dedent( + f""" [[safekeepers]] id = {id} pg_port = {port.pg} http_port = {port.http} - sync = 
{'true' if config.safekeepers_enable_fsync else 'false'}""") + sync = {'true' if config.safekeepers_enable_fsync else 'false'}""" + ) if config.auth_enabled: - toml += textwrap.dedent(f""" + toml += textwrap.dedent( + """ auth_enabled = true - """) - if bool(self.remote_storage_users - & RemoteStorageUsers.SAFEKEEPER) and self.remote_storage is not None: - toml += textwrap.dedent(f""" + """ + ) + if ( + bool(self.remote_storage_users & RemoteStorageUsers.SAFEKEEPER) + and self.remote_storage is not None + ): + toml += textwrap.dedent( + f""" remote_storage = "{remote_storage_to_toml_inline_table(self.remote_storage)}" - """) + """ + ) safekeeper = Safekeeper(env=self, id=id, port=port) self.safekeepers.append(safekeeper) @@ -841,8 +843,8 @@ class NeonEnv: safekeeper.start() def get_safekeeper_connstrs(self) -> str: - """ Get list of safekeeper endpoints suitable for safekeepers GUC """ - return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers]) + """Get list of safekeeper endpoints suitable for safekeepers GUC""" + return ",".join([f"localhost:{wa.port.pg}" for wa in self.safekeepers]) def timeline_dir(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" @@ -850,8 +852,8 @@ class NeonEnv: @cached_property def auth_keys(self) -> AuthKeys: - pub = (Path(self.repo_dir) / 'auth_public_key.pem').read_bytes() - priv = (Path(self.repo_dir) / 'auth_private_key.pem').read_bytes() + pub = (Path(self.repo_dir) / "auth_public_key.pem").read_bytes() + priv = (Path(self.repo_dir) / "auth_private_key.pem").read_bytes() return AuthKeys(pub=pub, priv=priv) @@ -864,11 +866,11 @@ def _shared_simple_env( run_id: uuid.UUID, ) -> Iterator[NeonEnv]: """ - # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES - is set, this is shared by all tests using `neon_simple_env`. + # Internal fixture backing the `neon_simple_env` fixture. 
If TEST_SHARED_FIXTURES + is set, this is shared by all tests using `neon_simple_env`. """ - if os.environ.get('TEST_SHARED_FIXTURES') is None: + if os.environ.get("TEST_SHARED_FIXTURES") is None: # Create the environment in the per-test output directory repo_dir = os.path.join(get_test_output_dir(request), "repo") else: @@ -877,21 +879,21 @@ def _shared_simple_env( shutil.rmtree(repo_dir, ignore_errors=True) with NeonEnvBuilder( - repo_dir=Path(repo_dir), - port_distributor=port_distributor, - broker=default_broker, - mock_s3_server=mock_s3_server, - run_id=run_id, + repo_dir=Path(repo_dir), + port_distributor=port_distributor, + broker=default_broker, + mock_s3_server=mock_s3_server, + run_id=run_id, ) as builder: env = builder.init_start() # For convenience in tests, create a branch from the freshly-initialized cluster. - env.neon_cli.create_branch('empty', ancestor_branch_name=DEFAULT_BRANCH_NAME) + env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) yield env -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: """ Simple Neon environment, with no authentication and no safekeepers. 
@@ -906,7 +908,7 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: _shared_simple_env.postgres.stop_all() -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def neon_env_builder( test_output_dir, port_distributor: PortDistributor, @@ -932,11 +934,11 @@ def neon_env_builder( # Return the builder to the caller with NeonEnvBuilder( - repo_dir=Path(repo_dir), - port_distributor=port_distributor, - mock_s3_server=mock_s3_server, - broker=default_broker, - run_id=run_id, + repo_dir=Path(repo_dir), + port_distributor=port_distributor, + mock_s3_server=mock_s3_server, + broker=default_broker, + run_id=run_id, ) as builder: yield builder @@ -952,16 +954,16 @@ class NeonPageserverHttpClient(requests.Session): self.auth_token = auth_token if auth_token is not None: - self.headers['Authorization'] = f'Bearer {auth_token}' + self.headers["Authorization"] = f"Bearer {auth_token}" def verbose_error(self, res: requests.Response): try: res.raise_for_status() except requests.RequestException as e: try: - msg = res.json()['msg'] - except: - msg = '' + msg = res.json()["msg"] + except: # noqa: E722 + msg = "" raise NeonPageserverApiException(msg) from e def check_status(self): @@ -978,12 +980,12 @@ class NeonPageserverHttpClient(requests.Session): res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ - 'new_tenant_id': new_tenant_id.hex if new_tenant_id else None, + "new_tenant_id": new_tenant_id.hex if new_tenant_id else None, }, ) self.verbose_error(res) if res.status_code == 409: - raise Exception(f'could not create tenant: already exists for id {new_tenant_id}') + raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") new_tenant_id = res.json() assert isinstance(new_tenant_id, str) return uuid.UUID(new_tenant_id) @@ -1017,41 +1019,39 @@ class NeonPageserverHttpClient(requests.Session): ancestor_timeline_id: Optional[uuid.UUID] = None, ancestor_start_lsn: Optional[str] = None, ) -> Dict[Any, Any]: - 
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline", - json={ - 'new_timeline_id': - new_timeline_id.hex if new_timeline_id else None, - 'ancestor_start_lsn': - ancestor_start_lsn, - 'ancestor_timeline_id': - ancestor_timeline_id.hex if ancestor_timeline_id else None, - }) + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline", + json={ + "new_timeline_id": new_timeline_id.hex if new_timeline_id else None, + "ancestor_start_lsn": ancestor_start_lsn, + "ancestor_timeline_id": ancestor_timeline_id.hex if ancestor_timeline_id else None, + }, + ) self.verbose_error(res) if res.status_code == 409: - raise Exception(f'could not create timeline: already exists for id {new_timeline_id}') + raise Exception(f"could not create timeline: already exists for id {new_timeline_id}") res_json = res.json() assert isinstance(res_json, dict) return res_json - def timeline_detail(self, - tenant_id: uuid.UUID, - timeline_id: uuid.UUID, - include_non_incremental_logical_size: bool = False, - include_non_incremental_physical_size: bool = False) -> Dict[Any, Any]: - - include_non_incremental_logical_size_str = "0" + def timeline_detail( + self, + tenant_id: uuid.UUID, + timeline_id: uuid.UUID, + include_non_incremental_logical_size: bool = False, + include_non_incremental_physical_size: bool = False, + ) -> Dict[Any, Any]: + params = {} if include_non_incremental_logical_size: - include_non_incremental_logical_size_str = "1" - - include_non_incremental_physical_size_str = "0" + params["include-non-incremental-logical-size"] = "yes" if include_non_incremental_physical_size: - include_non_incremental_physical_size_str = "1" + params["include-non-incremental-physical-size"] = "yes" res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}" + - "?include-non-incremental-logical-size={include_non_incremental_logical_size_str}" + - 
"&include-non-incremental-physical-size={include_non_incremental_physical_size_str}") + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}", + params=params, + ) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) @@ -1059,7 +1059,8 @@ class NeonPageserverHttpClient(requests.Session): def timeline_delete(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}") + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}" + ) self.verbose_error(res) res_json = res.json() assert res_json is None @@ -1077,12 +1078,15 @@ class PageserverPort: http: int -CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", - re.MULTILINE) -CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", - re.MULTILINE) -TIMELINE_DATA_EXTRACTOR = re.compile(r"\s(?P[^\s]+)\s\[(?P[^\]]+)\]", - re.MULTILINE) +CREATE_TIMELINE_ID_EXTRACTOR = re.compile( + r"^Created timeline '(?P[^']+)'", re.MULTILINE +) +CREATE_TIMELINE_ID_EXTRACTOR = re.compile( + r"^Created timeline '(?P[^']+)'", re.MULTILINE +) +TIMELINE_DATA_EXTRACTOR = re.compile( + r"\s(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE +) class AbstractNeonCli(abc.ABC): @@ -1091,15 +1095,18 @@ class AbstractNeonCli(abc.ABC): Supports a way to run arbitrary command directly via CLI. Do not use directly, use specific subclasses instead. """ + def __init__(self, env: NeonEnv): self.env = env COMMAND: str = cast(str, None) # To be overwritten by the derived class. 
- def raw_cli(self, - arguments: List[str], - extra_env_vars: Optional[Dict[str, str]] = None, - check_return_code=True) -> 'subprocess.CompletedProcess[str]': + def raw_cli( + self, + arguments: List[str], + extra_env_vars: Optional[Dict[str, str]] = None, + check_return_code=True, + ) -> "subprocess.CompletedProcess[str]": """ Run the command with the specified arguments. @@ -1120,30 +1127,32 @@ class AbstractNeonCli(abc.ABC): bin_neon = os.path.join(str(neon_binpath), self.COMMAND) args = [bin_neon] + arguments - log.info('Running command "{}"'.format(' '.join(args))) + log.info('Running command "{}"'.format(" ".join(args))) log.info(f'Running in "{self.env.repo_dir}"') env_vars = os.environ.copy() - env_vars['NEON_REPO_DIR'] = str(self.env.repo_dir) - env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir) + env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) + env_vars["POSTGRES_DISTRIB_DIR"] = str(pg_distrib_dir) if self.env.rust_log_override is not None: - env_vars['RUST_LOG'] = self.env.rust_log_override + env_vars["RUST_LOG"] = self.env.rust_log_override for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): env_vars[extra_env_key] = extra_env_value # Pass coverage settings - var = 'LLVM_PROFILE_FILE' + var = "LLVM_PROFILE_FILE" val = os.environ.get(var) if val: env_vars[var] = val # Intercept CalledProcessError and print more info - res = subprocess.run(args, - env=env_vars, - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + res = subprocess.run( + args, + env=env_vars, + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) if not res.returncode: log.info(f"Run success: {res.stdout}") elif check_return_code: @@ -1154,10 +1163,9 @@ class AbstractNeonCli(abc.ABC): stderr: {res.stderr} """ log.info(msg) - raise Exception(msg) from subprocess.CalledProcessError(res.returncode, - res.args, - res.stdout, - res.stderr) + raise Exception(msg) from 
subprocess.CalledProcessError( + res.returncode, res.args, res.stdout, res.stderr + ) return res @@ -1167,12 +1175,14 @@ class NeonCli(AbstractNeonCli): Supports main commands via typed methods and a way to run arbitrary command directly via CLI. """ - COMMAND = 'neon_local' + COMMAND = "neon_local" - def create_tenant(self, - tenant_id: Optional[uuid.UUID] = None, - timeline_id: Optional[uuid.UUID] = None, - conf: Optional[Dict[str, str]] = None) -> Tuple[uuid.UUID, uuid.UUID]: + def create_tenant( + self, + tenant_id: Optional[uuid.UUID] = None, + timeline_id: Optional[uuid.UUID] = None, + conf: Optional[Dict[str, str]] = None, + ) -> Tuple[uuid.UUID, uuid.UUID]: """ Creates a new tenant, returns its id and its initial timeline's id. """ @@ -1181,13 +1191,14 @@ class NeonCli(AbstractNeonCli): if timeline_id is None: timeline_id = uuid.uuid4() if conf is None: - res = self.raw_cli([ - 'tenant', 'create', '--tenant-id', tenant_id.hex, '--timeline-id', timeline_id.hex - ]) + res = self.raw_cli( + ["tenant", "create", "--tenant-id", tenant_id.hex, "--timeline-id", timeline_id.hex] + ) else: - res = self.raw_cli([ - 'tenant', 'create', '--tenant-id', tenant_id.hex, '--timeline-id', timeline_id.hex - ] + sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), [])) + res = self.raw_cli( + ["tenant", "create", "--tenant-id", tenant_id.hex, "--timeline-id", timeline_id.hex] + + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) + ) res.check_returncode() return tenant_id, timeline_id @@ -1196,27 +1207,28 @@ class NeonCli(AbstractNeonCli): Update tenant config. 
""" if conf is None: - res = self.raw_cli(['tenant', 'config', '--tenant-id', tenant_id.hex]) + res = self.raw_cli(["tenant", "config", "--tenant-id", tenant_id.hex]) else: res = self.raw_cli( - ['tenant', 'config', '--tenant-id', tenant_id.hex] + - sum(list(map(lambda kv: (['-c', kv[0] + ':' + kv[1]]), conf.items())), [])) + ["tenant", "config", "--tenant-id", tenant_id.hex] + + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) + ) res.check_returncode() - def list_tenants(self) -> 'subprocess.CompletedProcess[str]': - res = self.raw_cli(['tenant', 'list']) + def list_tenants(self) -> "subprocess.CompletedProcess[str]": + res = self.raw_cli(["tenant", "list"]) res.check_returncode() return res - def create_timeline(self, - new_branch_name: str, - tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: + def create_timeline( + self, new_branch_name: str, tenant_id: Optional[uuid.UUID] = None + ) -> uuid.UUID: cmd = [ - 'timeline', - 'create', - '--branch-name', + "timeline", + "create", + "--branch-name", new_branch_name, - '--tenant-id', + "--tenant-id", (tenant_id or self.env.initial_tenant).hex, ] @@ -1227,17 +1239,17 @@ class NeonCli(AbstractNeonCli): created_timeline_id = None if matches is not None: - created_timeline_id = matches.group('timeline_id') + created_timeline_id = matches.group("timeline_id") return uuid.UUID(created_timeline_id) def create_root_branch(self, branch_name: str, tenant_id: Optional[uuid.UUID] = None): cmd = [ - 'timeline', - 'create', - '--branch-name', + "timeline", + "create", + "--branch-name", branch_name, - '--tenant-id', + "--tenant-id", (tenant_id or self.env.initial_tenant).hex, ] @@ -1248,30 +1260,32 @@ class NeonCli(AbstractNeonCli): created_timeline_id = None if matches is not None: - created_timeline_id = matches.group('timeline_id') + created_timeline_id = matches.group("timeline_id") if created_timeline_id is None: - raise Exception('could not find timeline id after `neon timeline create` 
invocation') + raise Exception("could not find timeline id after `neon timeline create` invocation") else: return uuid.UUID(created_timeline_id) - def create_branch(self, - new_branch_name: str = DEFAULT_BRANCH_NAME, - ancestor_branch_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - ancestor_start_lsn: Optional[str] = None) -> uuid.UUID: + def create_branch( + self, + new_branch_name: str = DEFAULT_BRANCH_NAME, + ancestor_branch_name: Optional[str] = None, + tenant_id: Optional[uuid.UUID] = None, + ancestor_start_lsn: Optional[str] = None, + ) -> uuid.UUID: cmd = [ - 'timeline', - 'branch', - '--branch-name', + "timeline", + "branch", + "--branch-name", new_branch_name, - '--tenant-id', + "--tenant-id", (tenant_id or self.env.initial_tenant).hex, ] if ancestor_branch_name is not None: - cmd.extend(['--ancestor-branch-name', ancestor_branch_name]) + cmd.extend(["--ancestor-branch-name", ancestor_branch_name]) if ancestor_start_lsn is not None: - cmd.extend(['--ancestor-start-lsn', ancestor_start_lsn]) + cmd.extend(["--ancestor-start-lsn", ancestor_start_lsn]) res = self.raw_cli(cmd) res.check_returncode() @@ -1280,10 +1294,10 @@ class NeonCli(AbstractNeonCli): created_timeline_id = None if matches is not None: - created_timeline_id = matches.group('timeline_id') + created_timeline_id = matches.group("timeline_id") if created_timeline_id is None: - raise Exception('could not find timeline id after `neon timeline create` invocation') + raise Exception("could not find timeline id after `neon timeline create` invocation") else: return uuid.UUID(created_timeline_id) @@ -1295,52 +1309,60 @@ class NeonCli(AbstractNeonCli): # (L) main [b49f7954224a0ad25cc0013ea107b54b] # (L) ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] res = self.raw_cli( - ['timeline', 'list', '--tenant-id', (tenant_id or self.env.initial_tenant).hex]) + ["timeline", "list", "--tenant-id", (tenant_id or self.env.initial_tenant).hex] + ) timelines_cli = 
sorted( - map(lambda branch_and_id: (branch_and_id[0], branch_and_id[1]), - TIMELINE_DATA_EXTRACTOR.findall(res.stdout))) + map( + lambda branch_and_id: (branch_and_id[0], branch_and_id[1]), + TIMELINE_DATA_EXTRACTOR.findall(res.stdout), + ) + ) return timelines_cli - def init(self, - config_toml: str, - initial_timeline_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]': - with tempfile.NamedTemporaryFile(mode='w+') as tmp: + def init( + self, config_toml: str, initial_timeline_id: Optional[uuid.UUID] = None + ) -> "subprocess.CompletedProcess[str]": + with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(config_toml) tmp.flush() - cmd = ['init', f'--config={tmp.name}'] + cmd = ["init", f"--config={tmp.name}"] if initial_timeline_id: - cmd.extend(['--timeline-id', initial_timeline_id.hex]) + cmd.extend(["--timeline-id", initial_timeline_id.hex]) append_pageserver_param_overrides( params_to_update=cmd, remote_storage=self.env.remote_storage, remote_storage_users=self.env.remote_storage_users, - pageserver_config_override=self.env.pageserver.config_override) + pageserver_config_override=self.env.pageserver.config_override, + ) res = self.raw_cli(cmd) res.check_returncode() return res def pageserver_enabled_features(self) -> Any: - bin_pageserver = os.path.join(str(neon_binpath), 'pageserver') - args = [bin_pageserver, '--enabled-features'] - log.info('Running command "{}"'.format(' '.join(args))) + bin_pageserver = os.path.join(str(neon_binpath), "pageserver") + args = [bin_pageserver, "--enabled-features"] + log.info('Running command "{}"'.format(" ".join(args))) - res = subprocess.run(args, - check=True, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + res = subprocess.run( + args, + check=True, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) log.info(f"pageserver_enabled_features success: {res.stdout}") return json.loads(res.stdout) - def pageserver_start(self, 
overrides=()) -> 'subprocess.CompletedProcess[str]': - start_args = ['pageserver', 'start', *overrides] + def pageserver_start(self, overrides=()) -> "subprocess.CompletedProcess[str]": + start_args = ["pageserver", "start", *overrides] append_pageserver_param_overrides( params_to_update=start_args, remote_storage=self.env.remote_storage, remote_storage_users=self.env.remote_storage_users, - pageserver_config_override=self.env.pageserver.config_override) + pageserver_config_override=self.env.pageserver.config_override, + ) s3_env_vars = None if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage): @@ -1348,29 +1370,29 @@ class NeonCli(AbstractNeonCli): return self.raw_cli(start_args, extra_env_vars=s3_env_vars) - def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]': - cmd = ['pageserver', 'stop'] + def pageserver_stop(self, immediate=False) -> "subprocess.CompletedProcess[str]": + cmd = ["pageserver", "stop"] if immediate: - cmd.extend(['-m', 'immediate']) + cmd.extend(["-m", "immediate"]) log.info(f"Stopping pageserver with {cmd}") return self.raw_cli(cmd) - def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]': + def safekeeper_start(self, id: int) -> "subprocess.CompletedProcess[str]": s3_env_vars = None if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage): s3_env_vars = self.env.remote_storage.access_env_vars() - return self.raw_cli(['safekeeper', 'start', str(id)], extra_env_vars=s3_env_vars) + return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars) - def safekeeper_stop(self, - id: Optional[int] = None, - immediate=False) -> 'subprocess.CompletedProcess[str]': - args = ['safekeeper', 'stop'] + def safekeeper_stop( + self, id: Optional[int] = None, immediate=False + ) -> "subprocess.CompletedProcess[str]": + args = ["safekeeper", "stop"] if id is not None: args.append(str(id)) if immediate: - args.extend(['-m', 
'immediate']) + args.extend(["-m", "immediate"]) return self.raw_cli(args) def pg_create( @@ -1380,19 +1402,19 @@ class NeonCli(AbstractNeonCli): tenant_id: Optional[uuid.UUID] = None, lsn: Optional[str] = None, port: Optional[int] = None, - ) -> 'subprocess.CompletedProcess[str]': + ) -> "subprocess.CompletedProcess[str]": args = [ - 'pg', - 'create', - '--tenant-id', + "pg", + "create", + "--tenant-id", (tenant_id or self.env.initial_tenant).hex, - '--branch-name', + "--branch-name", branch_name, ] if lsn is not None: - args.extend(['--lsn', lsn]) + args.extend(["--lsn", lsn]) if port is not None: - args.extend(['--port', str(port)]) + args.extend(["--port", str(port)]) if node_name is not None: args.append(node_name) @@ -1406,17 +1428,17 @@ class NeonCli(AbstractNeonCli): tenant_id: Optional[uuid.UUID] = None, lsn: Optional[str] = None, port: Optional[int] = None, - ) -> 'subprocess.CompletedProcess[str]': + ) -> "subprocess.CompletedProcess[str]": args = [ - 'pg', - 'start', - '--tenant-id', + "pg", + "start", + "--tenant-id", (tenant_id or self.env.initial_tenant).hex, ] if lsn is not None: - args.append(f'--lsn={lsn}') + args.append(f"--lsn={lsn}") if port is not None: - args.append(f'--port={port}') + args.append(f"--port={port}") if node_name is not None: args.append(node_name) @@ -1430,15 +1452,15 @@ class NeonCli(AbstractNeonCli): tenant_id: Optional[uuid.UUID] = None, destroy=False, check_return_code=True, - ) -> 'subprocess.CompletedProcess[str]': + ) -> "subprocess.CompletedProcess[str]": args = [ - 'pg', - 'stop', - '--tenant-id', + "pg", + "stop", + "--tenant-id", (tenant_id or self.env.initial_tenant).hex, ] if destroy: - args.append('--destroy') + args.append("--destroy") if node_name is not None: args.append(node_name) @@ -1451,12 +1473,12 @@ class WalCraft(AbstractNeonCli): Supports main commands via typed methods and a way to run arbitrary command directly via CLI. 
""" - COMMAND = 'wal_craft' + COMMAND = "wal_craft" def postgres_config(self) -> List[str]: res = self.raw_cli(["print-postgres-config"]) res.check_returncode() - return res.stdout.split('\n') + return res.stdout.split("\n") def in_existing(self, type: str, connection: str) -> None: res = self.raw_cli(["in-existing", type, connection]) @@ -1469,26 +1491,27 @@ class NeonPageserver(PgProtocol): Initializes the repository via `neon init`. """ + def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional[str] = None): - super().__init__(host='localhost', port=port.pg, user='cloud_admin') + super().__init__(host="localhost", port=port.pg, user="cloud_admin") self.env = env self.running = False self.service_port = port self.config_override = config_override - def start(self, overrides=()) -> 'NeonPageserver': + def start(self, overrides=()) -> "NeonPageserver": """ Start the page server. `overrides` allows to add some config to this pageserver start. Returns self. """ - assert self.running == False + assert self.running is False self.env.neon_cli.pageserver_start(overrides=overrides) self.running = True return self - def stop(self, immediate=False) -> 'NeonPageserver': + def stop(self, immediate=False) -> "NeonPageserver": """ Stop the page server. Returns self. 
@@ -1521,31 +1544,33 @@ def append_pageserver_param_overrides( remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) params_to_update.append( - f'--pageserver-config-override=remote_storage={remote_storage_toml_table}') + f"--pageserver-config-override=remote_storage={remote_storage_toml_table}" + ) - env_overrides = os.getenv('ZENITH_PAGESERVER_OVERRIDES') + env_overrides = os.getenv("ZENITH_PAGESERVER_OVERRIDES") if env_overrides is not None: params_to_update += [ - f'--pageserver-config-override={o.strip()}' for o in env_overrides.split(';') + f"--pageserver-config-override={o.strip()}" for o in env_overrides.split(";") ] if pageserver_config_override is not None: params_to_update += [ - f'--pageserver-config-override={o.strip()}' - for o in pageserver_config_override.split(';') + f"--pageserver-config-override={o.strip()}" + for o in pageserver_config_override.split(";") ] class PgBin: - """ A helper class for executing postgres binaries """ + """A helper class for executing postgres binaries""" + def __init__(self, log_dir: Path): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), 'bin') + self.pg_bin_path = os.path.join(str(pg_distrib_dir), "bin") self.env = os.environ.copy() - self.env['LD_LIBRARY_PATH'] = os.path.join(str(pg_distrib_dir), 'lib') + self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), "lib") def _fixpath(self, command: List[str]): - if '/' not in command[0]: + if "/" not in command[0]: command[0] = os.path.join(self.pg_bin_path, command[0]) def _build_env(self, env_add: Optional[Env]) -> Env: @@ -1570,15 +1595,17 @@ class PgBin: """ self._fixpath(command) - log.info('Running command "{}"'.format(' '.join(command))) + log.info('Running command "{}"'.format(" ".join(command))) env = self._build_env(env) subprocess.run(command, env=env, cwd=cwd, check=True) - def run_capture(self, - command: List[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, - **kwargs: Any) -> 
str: + def run_capture( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[str] = None, + **kwargs: Any, + ) -> str: """ Run one of the postgres binaries, with stderr and stdout redirected to a file. @@ -1587,35 +1614,32 @@ class PgBin: """ self._fixpath(command) - log.info('Running command "{}"'.format(' '.join(command))) + log.info('Running command "{}"'.format(" ".join(command))) env = self._build_env(env) - return subprocess_capture(str(self.log_dir), - command, - env=env, - cwd=cwd, - check=True, - **kwargs) + return subprocess_capture( + str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs + ) -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def pg_bin(test_output_dir: Path) -> PgBin: return PgBin(test_output_dir) class VanillaPostgres(PgProtocol): def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): - super().__init__(host='localhost', port=port, dbname='postgres') + super().__init__(host="localhost", port=port, dbname="postgres") self.pgdatadir = pgdatadir self.pg_bin = pg_bin self.running = False if init: - self.pg_bin.run_capture(['initdb', '-D', str(pgdatadir)]) + self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) self.configure([f"port = {port}\n"]) def configure(self, options: List[str]): """Append lines into postgresql.conf file.""" assert not self.running - with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file: + with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: conf_file.write("\n".join(options)) def start(self, log_path: Optional[str] = None): @@ -1626,12 +1650,13 @@ class VanillaPostgres(PgProtocol): log_path = os.path.join(self.pgdatadir, "pg.log") self.pg_bin.run_capture( - ['pg_ctl', '-w', '-D', str(self.pgdatadir), '-l', log_path, 'start']) + ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] + ) def stop(self): assert self.running self.running = False - self.pg_bin.run_capture(['pg_ctl', 
'-w', '-D', str(self.pgdatadir), 'stop']) + self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) def get_subdir_size(self, subdir) -> int: """Return size of pgdatadir subdirectory in bytes.""" @@ -1645,9 +1670,10 @@ class VanillaPostgres(PgProtocol): self.stop() -@pytest.fixture(scope='function') -def vanilla_pg(test_output_dir: Path, - port_distributor: PortDistributor) -> Iterator[VanillaPostgres]: +@pytest.fixture(scope="function") +def vanilla_pg( + test_output_dir: Path, port_distributor: PortDistributor +) -> Iterator[VanillaPostgres]: pgdatadir = test_output_dir / "pgdata-vanilla" pg_bin = PgBin(test_output_dir) port = port_distributor.get_port() @@ -1663,18 +1689,18 @@ class RemotePostgres(PgProtocol): self.running = True def configure(self, options: List[str]): - raise Exception('cannot change configuration of remote Posgres instance') + raise Exception("cannot change configuration of remote Posgres instance") def start(self): - raise Exception('cannot start a remote Postgres instance') + raise Exception("cannot start a remote Postgres instance") def stop(self): - raise Exception('cannot stop a remote Postgres instance') + raise Exception("cannot stop a remote Postgres instance") def get_subdir_size(self, subdir) -> int: # TODO: Could use the server's Generic File Access functions if superuser. 
# See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE - raise Exception('cannot get size of a Postgres instance') + raise Exception("cannot get size of a Postgres instance") def __enter__(self): return self @@ -1684,7 +1710,7 @@ class RemotePostgres(PgProtocol): pass -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: pg_bin = PgBin(test_output_dir) @@ -1696,21 +1722,61 @@ def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: yield remote_pg +class PSQL: + """ + Helper class to make it easier to run psql in the proxy tests. + Copied and modified from PSQL from cloud/tests_e2e/common/psql.py + """ + + path: str + database_url: str + + def __init__( + self, + path: str = "psql", + host: str = "127.0.0.1", + port: int = 5432, + ): + assert shutil.which(path) + + self.path = path + self.database_url = f"postgres://{host}:{port}/main?options=project%3Dgeneric-project-name" + + async def run(self, query=None): + run_args = [self.path, "--no-psqlrc", "--quiet", "--tuples-only", self.database_url] + if query is not None: + run_args += ["--command", query] + + log.info(f"Run psql: {subprocess.list2cmdline(run_args)}") + return await asyncio.create_subprocess_exec( + *run_args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env={"LC_ALL": "C", **os.environ}, # one locale to rule them all + ) + + class NeonProxy(PgProtocol): - def __init__(self, proxy_port: int, http_port: int, auth_endpoint: str): + def __init__(self, proxy_port: int, http_port: int, auth_endpoint=None, mgmt_port=None): super().__init__(dsn=auth_endpoint, port=proxy_port) - self.host = '127.0.0.1' + self.host = "127.0.0.1" self.http_port = http_port self.proxy_port = proxy_port + self.mgmt_port = mgmt_port self.auth_endpoint = auth_endpoint self._popen: Optional[subprocess.Popen[bytes]] = None + self.link_auth_uri_prefix = "http://dummy-uri" def start(self) -> None: + """ + 
Starts a proxy with option '--auth-backend postgres' and a postgres instance already provided though '--auth-endpoint '." + """ assert self._popen is None + assert self.auth_endpoint is not None # Start proxy args = [ - os.path.join(str(neon_binpath), 'proxy'), + os.path.join(neon_binpath, "proxy"), *["--http", f"{self.host}:{self.http_port}"], *["--proxy", f"{self.host}:{self.proxy_port}"], *["--auth-backend", "postgres"], @@ -1719,6 +1785,25 @@ class NeonProxy(PgProtocol): self._popen = subprocess.Popen(args) self._wait_until_ready() + def start_with_link_auth(self) -> None: + """ + Starts a proxy with option '--auth-backend link' and a dummy authentication link '--uri dummy-auth-link'." + """ + assert self._popen is None + + # Start proxy + bin_proxy = os.path.join(str(neon_binpath), "proxy") + args = [bin_proxy] + args.extend(["--http", f"{self.host}:{self.http_port}"]) + args.extend(["--proxy", f"{self.host}:{self.proxy_port}"]) + args.extend(["--mgmt", f"{self.host}:{self.mgmt_port}"]) + args.extend(["--auth-backend", "link"]) + args.extend(["--uri", self.link_auth_uri_prefix]) + arg_str = " ".join(args) + log.info(f"starting proxy with command line ::: {arg_str}") + self._popen = subprocess.Popen(args, stdout=subprocess.PIPE) + self._wait_until_ready() + @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) def _wait_until_ready(self): requests.get(f"http://{self.host}:{self.http_port}/v1/status") @@ -1733,7 +1818,18 @@ class NeonProxy(PgProtocol): self._popen.kill() -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") +def link_proxy(port_distributor) -> Iterator[NeonProxy]: + """Neon proxy that routes through link auth.""" + http_port = port_distributor.get_port() + proxy_port = port_distributor.get_port() + mgmt_port = port_distributor.get_port() + with NeonProxy(proxy_port, http_port, mgmt_port=mgmt_port) as proxy: + proxy.start_with_link_auth() + yield proxy + + +@pytest.fixture(scope="function") def 
static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]: """Neon proxy that routes directly to vanilla postgres.""" @@ -1741,28 +1837,28 @@ def static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]: vanilla_pg.start() vanilla_pg.safe_psql("create user proxy with login superuser password 'password'") - port = vanilla_pg.default_options['port'] - host = vanilla_pg.default_options['host'] - dbname = vanilla_pg.default_options['dbname'] - auth_endpoint = f'postgres://proxy:password@{host}:{port}/{dbname}' + port = vanilla_pg.default_options["port"] + host = vanilla_pg.default_options["host"] + dbname = vanilla_pg.default_options["dbname"] + auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}" proxy_port = port_distributor.get_port() http_port = port_distributor.get_port() - with NeonProxy(proxy_port=proxy_port, http_port=http_port, - auth_endpoint=auth_endpoint) as proxy: + with NeonProxy( + proxy_port=proxy_port, http_port=http_port, auth_endpoint=auth_endpoint + ) as proxy: proxy.start() yield proxy class Postgres(PgProtocol): - """ An object representing a running postgres daemon. """ - def __init__(self, - env: NeonEnv, - tenant_id: uuid.UUID, - port: int, - check_stop_result: bool = True): - super().__init__(host='localhost', port=port, user='cloud_admin', dbname='postgres') + """An object representing a running postgres daemon.""" + + def __init__( + self, env: NeonEnv, tenant_id: uuid.UUID, port: int, check_stop_result: bool = True + ): + super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") self.env = env self.running = False self.node_name: Optional[str] = None # dubious, see asserts below @@ -1778,7 +1874,7 @@ class Postgres(PgProtocol): node_name: Optional[str] = None, lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, - ) -> 'Postgres': + ) -> "Postgres": """ Create the pg data directory. Returns self. 
@@ -1787,13 +1883,11 @@ class Postgres(PgProtocol): if not config_lines: config_lines = [] - self.node_name = node_name or f'{branch_name}_pg_node' - self.env.neon_cli.pg_create(branch_name, - node_name=self.node_name, - tenant_id=self.tenant_id, - lsn=lsn, - port=self.port) - path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name + self.node_name = node_name or f"{branch_name}_pg_node" + self.env.neon_cli.pg_create( + branch_name, node_name=self.node_name, tenant_id=self.tenant_id, lsn=lsn, port=self.port + ) + path = Path("pgdatadirs") / "tenants" / self.tenant_id.hex / self.node_name self.pgdata_dir = os.path.join(self.env.repo_dir, path) if config_lines is None: @@ -1801,12 +1895,12 @@ class Postgres(PgProtocol): # set small 'max_replication_write_lag' to enable backpressure # and make tests more stable. - config_lines = ['max_replication_write_lag=15MB'] + config_lines + config_lines = ["max_replication_write_lag=15MB"] + config_lines self.config(config_lines) return self - def start(self) -> 'Postgres': + def start(self) -> "Postgres": """ Start the Postgres instance. Returns self. 
@@ -1816,32 +1910,30 @@ class Postgres(PgProtocol): log.info(f"Starting postgres node {self.node_name}") - run_result = self.env.neon_cli.pg_start(self.node_name, - tenant_id=self.tenant_id, - port=self.port) + self.env.neon_cli.pg_start(self.node_name, tenant_id=self.tenant_id, port=self.port) self.running = True return self def pg_data_dir_path(self) -> str: - """ Path to data directory """ + """Path to data directory""" assert self.node_name - path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name + path = Path("pgdatadirs") / "tenants" / self.tenant_id.hex / self.node_name return os.path.join(self.env.repo_dir, path) def pg_xact_dir_path(self) -> str: - """ Path to pg_xact dir """ - return os.path.join(self.pg_data_dir_path(), 'pg_xact') + """Path to pg_xact dir""" + return os.path.join(self.pg_data_dir_path(), "pg_xact") def pg_twophase_dir_path(self) -> str: - """ Path to pg_twophase dir """ - return os.path.join(self.pg_data_dir_path(), 'pg_twophase') + """Path to pg_twophase dir""" + return os.path.join(self.pg_data_dir_path(), "pg_twophase") def config_file_path(self) -> str: - """ Path to postgresql.conf """ - return os.path.join(self.pg_data_dir_path(), 'postgresql.conf') + """Path to postgresql.conf""" + return os.path.join(self.pg_data_dir_path(), "postgresql.conf") - def adjust_for_safekeepers(self, safekeepers: str) -> 'Postgres': + def adjust_for_safekeepers(self, safekeepers: str) -> "Postgres": """ Adjust instance config for working with wal acceptors instead of pageserver (pre-configured by CLI) directly. 
@@ -1853,30 +1945,33 @@ class Postgres(PgProtocol): with open(self.config_file_path(), "w") as f: for cfg_line in cfg_lines: # walproposer uses different application_name - if ("synchronous_standby_names" in cfg_line or - # don't repeat safekeepers/wal_acceptors multiple times - "safekeepers" in cfg_line): + if ( + "synchronous_standby_names" in cfg_line + or + # don't repeat safekeepers/wal_acceptors multiple times + "neon.safekeepers" in cfg_line + ): continue f.write(cfg_line) f.write("synchronous_standby_names = 'walproposer'\n") - f.write("safekeepers = '{}'\n".format(safekeepers)) + f.write("neon.safekeepers = '{}'\n".format(safekeepers)) return self - def config(self, lines: List[str]) -> 'Postgres': + def config(self, lines: List[str]) -> "Postgres": """ Add lines to postgresql.conf. Lines should be an array of valid postgresql.conf rows. Returns self. """ - with open(self.config_file_path(), 'a') as conf: + with open(self.config_file_path(), "a") as conf: for line in lines: conf.write(line) - conf.write('\n') + conf.write("\n") return self - def stop(self) -> 'Postgres': + def stop(self) -> "Postgres": """ Stop the Postgres instance if it's running. Returns self. @@ -1884,24 +1979,23 @@ class Postgres(PgProtocol): if self.running: assert self.node_name is not None - self.env.neon_cli.pg_stop(self.node_name, - self.tenant_id, - check_return_code=self.check_stop_result) + self.env.neon_cli.pg_stop( + self.node_name, self.tenant_id, check_return_code=self.check_stop_result + ) self.running = False return self - def stop_and_destroy(self) -> 'Postgres': + def stop_and_destroy(self) -> "Postgres": """ Stop the Postgres instance, then destroy it. Returns self. 
""" assert self.node_name is not None - self.env.neon_cli.pg_stop(self.node_name, - self.tenant_id, - True, - check_return_code=self.check_stop_result) + self.env.neon_cli.pg_stop( + self.node_name, self.tenant_id, True, check_return_code=self.check_stop_result + ) self.node_name = None self.running = False @@ -1913,7 +2007,7 @@ class Postgres(PgProtocol): node_name: Optional[str] = None, lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, - ) -> 'Postgres': + ) -> "Postgres": """ Create a Postgres instance, apply config and then start it. @@ -1941,18 +2035,21 @@ class Postgres(PgProtocol): class PostgresFactory: - """ An object representing multiple running postgres daemons. """ + """An object representing multiple running postgres daemons.""" + def __init__(self, env: NeonEnv): self.env = env self.num_instances = 0 self.instances: List[Postgres] = [] - def create_start(self, - branch_name: str, - node_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - lsn: Optional[str] = None, - config_lines: Optional[List[str]] = None) -> Postgres: + def create_start( + self, + branch_name: str, + node_name: Optional[str] = None, + tenant_id: Optional[uuid.UUID] = None, + lsn: Optional[str] = None, + config_lines: Optional[List[str]] = None, + ) -> Postgres: pg = Postgres( self.env, @@ -1969,12 +2066,14 @@ class PostgresFactory: lsn=lsn, ) - def create(self, - branch_name: str, - node_name: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - lsn: Optional[str] = None, - config_lines: Optional[List[str]] = None) -> Postgres: + def create( + self, + branch_name: str, + node_name: Optional[str] = None, + tenant_id: Optional[uuid.UUID] = None, + lsn: Optional[str] = None, + config_lines: Optional[List[str]] = None, + ) -> Postgres: pg = Postgres( self.env, @@ -1992,7 +2091,7 @@ class PostgresFactory: config_lines=config_lines, ) - def stop_all(self) -> 'PostgresFactory': + def stop_all(self) -> "PostgresFactory": for pg in 
self.instances: pg.stop() @@ -2000,7 +2099,7 @@ class PostgresFactory: def read_pid(path: Path) -> int: - """ Read content of file into number """ + """Read content of file into number""" return int(path.read_text()) @@ -2012,14 +2111,15 @@ class SafekeeperPort: @dataclass class Safekeeper: - """ An object representing a running safekeeper daemon. """ + """An object representing a running safekeeper daemon.""" + env: NeonEnv port: SafekeeperPort id: int running: bool = False - def start(self) -> 'Safekeeper': - assert self.running == False + def start(self) -> "Safekeeper": + assert self.running is False self.env.neon_cli.safekeeper_start(self.id) self.running = True # wait for wal acceptor start by checking its status @@ -2032,22 +2132,22 @@ class Safekeeper: elapsed = time.time() - started_at if elapsed > 3: raise RuntimeError( - f"timed out waiting {elapsed:.0f}s for wal acceptor start: {e}") + f"timed out waiting {elapsed:.0f}s for wal acceptor start: {e}" + ) time.sleep(0.5) else: break # success return self - def stop(self, immediate=False) -> 'Safekeeper': - log.info('Stopping safekeeper {}'.format(self.id)) + def stop(self, immediate=False) -> "Safekeeper": + log.info("Stopping safekeeper {}".format(self.id)) self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False return self - def append_logical_message(self, - tenant_id: uuid.UUID, - timeline_id: uuid.UUID, - request: Dict[str, Any]) -> Dict[str, Any]: + def append_logical_message( + self, tenant_id: uuid.UUID, timeline_id: uuid.UUID, request: Dict[str, Any] + ) -> Dict[str, Any]: """ Send JSON_CTRL query to append LogicalMessage to WAL and modify safekeeper state. 
It will construct LogicalMessage from provided @@ -2104,7 +2204,7 @@ class SafekeeperHttpClient(requests.Session): self.auth_token = auth_token if auth_token is not None: - self.headers['Authorization'] = f'Bearer {auth_token}' + self.headers["Authorization"] = f"Bearer {auth_token}" def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() @@ -2113,21 +2213,25 @@ class SafekeeperHttpClient(requests.Session): res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() resj = res.json() - return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'], - flush_lsn=resj['flush_lsn'], - timeline_start_lsn=resj['timeline_start_lsn'], - backup_lsn=resj['backup_lsn'], - remote_consistent_lsn=resj['remote_consistent_lsn']) + return SafekeeperTimelineStatus( + acceptor_epoch=resj["acceptor_state"]["epoch"], + flush_lsn=resj["flush_lsn"], + timeline_start_lsn=resj["timeline_start_lsn"], + backup_lsn=resj["backup_lsn"], + remote_consistent_lsn=resj["remote_consistent_lsn"], + ) def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body): res = self.post( f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", - json=body) + json=body, + ) res.raise_for_status() def timeline_delete_force(self, tenant_id: str, timeline_id: str) -> Dict[Any, Any]: res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" + ) res.raise_for_status() res_json = res.json() assert isinstance(res_json, dict) @@ -2150,21 +2254,24 @@ class SafekeeperHttpClient(requests.Session): metrics = SafekeeperMetrics() for match in re.finditer( - r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE): + r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} 
(\S+)$', + all_metrics_text, + re.MULTILINE, + ): metrics.flush_lsn_inexact[(match.group(1), match.group(2))] = int(match.group(3)) for match in re.finditer( - r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE): + r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', + all_metrics_text, + re.MULTILINE, + ): metrics.commit_lsn_inexact[(match.group(1), match.group(2))] = int(match.group(3)) return metrics @dataclass class Etcd: - """ An object managing etcd instance """ + """An object managing etcd instance""" + datadir: str port: int peer_port: int @@ -2175,19 +2282,19 @@ class Etcd: self.binary_path = etcd_path() def client_url(self): - return f'http://127.0.0.1:{self.port}' + return f"http://127.0.0.1:{self.port}" def check_status(self): with requests.Session() as s: - s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry + s.mount("http://", requests.adapters.HTTPAdapter(max_retries=1)) # do not retry s.get(f"{self.client_url()}/health").raise_for_status() def try_start(self): if self.handle is not None: - log.debug(f'etcd is already running on port {self.port}') + log.debug(f"etcd is already running on port {self.port}") return - pathlib.Path(self.datadir).mkdir(exist_ok=True) + Path(self.datadir).mkdir(exist_ok=True) if not self.binary_path.is_file(): raise RuntimeError(f"etcd broker binary '{self.binary_path}' is not a file") @@ -2204,7 +2311,7 @@ class Etcd: # Set --quota-backend-bytes to keep the etcd virtual memory # size smaller. Our test etcd clusters are very small. # See https://github.com/etcd-io/etcd/issues/7910 - f"--quota-backend-bytes=100000000" + "--quota-backend-bytes=100000000", ] self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file) @@ -2227,13 +2334,13 @@ class Etcd: self.handle.wait() -def get_test_output_dir(request: Any) -> pathlib.Path: - """ Compute the working directory for an individual test. 
""" +def get_test_output_dir(request: Any) -> Path: + """Compute the working directory for an individual test.""" test_name = request.node.name - test_dir = pathlib.Path(top_output_dir) / test_name.replace("/", "-") - log.info(f'get_test_output_dir is {test_dir}') + test_dir = Path(top_output_dir) / test_name.replace("/", "-") + log.info(f"get_test_output_dir is {test_dir}") # make mypy happy - assert isinstance(test_dir, pathlib.Path) + assert isinstance(test_dir, Path) return test_dir @@ -2246,32 +2353,43 @@ def get_test_output_dir(request: Any) -> pathlib.Path: # scope. So it uses the get_test_output_dir() function to get the path, and # this fixture ensures that the directory exists. That works because # 'autouse' fixtures are run before other fixtures. -@pytest.fixture(scope='function', autouse=True) -def test_output_dir(request: Any) -> pathlib.Path: - """ Create the working directory for an individual test. """ +@pytest.fixture(scope="function", autouse=True) +def test_output_dir(request: Any) -> Iterator[Path]: + """Create the working directory for an individual test.""" # one directory per test test_dir = get_test_output_dir(request) - log.info(f'test_output_dir is {test_dir}') + log.info(f"test_output_dir is {test_dir}") shutil.rmtree(test_dir, ignore_errors=True) test_dir.mkdir() - return test_dir + + yield test_dir + + allure_attach_from_dir(test_dir) -SKIP_DIRS = frozenset(('pg_wal', - 'pg_stat', - 'pg_stat_tmp', - 'pg_subtrans', - 'pg_logical', - 'pg_replslot/wal_proposer_slot')) +SKIP_DIRS = frozenset( + ( + "pg_wal", + "pg_stat", + "pg_stat_tmp", + "pg_subtrans", + "pg_logical", + "pg_replslot/wal_proposer_slot", + ) +) -SKIP_FILES = frozenset(('pg_internal.init', - 'pg.log', - 'zenith.signal', - 'postgresql.conf', - 'postmaster.opts', - 'postmaster.pid', - 'pg_control')) +SKIP_FILES = frozenset( + ( + "pg_internal.init", + "pg.log", + "zenith.signal", + "postgresql.conf", + "postmaster.opts", + "postmaster.pid", + "pg_control", + ) +) def 
should_skip_dir(dirname: str) -> bool: @@ -2283,16 +2401,16 @@ def should_skip_file(filename: str) -> bool: return True # check for temp table files according to https://www.postgresql.org/docs/current/storage-file-layout.html # i e "tBBB_FFF" - if not filename.startswith('t'): + if not filename.startswith("t"): return False - tmp_name = filename[1:].split('_') + tmp_name = filename[1:].split("_") if len(tmp_name) != 2: return False try: list(map(int, tmp_name)) - except: + except: # noqa: E722 return False return True @@ -2300,7 +2418,7 @@ def should_skip_file(filename: str) -> bool: # # Test helpers # -def list_files_to_compare(pgdata_dir: pathlib.Path): +def list_files_to_compare(pgdata_dir: Path): pgdata_files = [] for root, _file, filenames in os.walk(pgdata_dir): for filename in filenames: @@ -2329,7 +2447,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post restored_dir_path.mkdir(exist_ok=True) pg_bin = PgBin(test_output_dir) - psql_path = os.path.join(pg_bin.pg_bin_path, 'psql') + psql_path = os.path.join(pg_bin.pg_bin_path, "psql") cmd = rf""" {psql_path} \ @@ -2341,19 +2459,19 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')} + psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} result = subprocess.run(cmd, env=psql_env, capture_output=True, text=True, shell=True) # Print captured stdout/stderr if basebackup cmd failed. 
if result.returncode != 0: - log.error('Basebackup shell command failed with:') + log.error("Basebackup shell command failed with:") log.error(result.stdout) log.error(result.stderr) assert result.returncode == 0 # list files we're going to compare assert pg.pgdata_dir - pgdata_files = list_files_to_compare(pathlib.Path(pg.pgdata_dir)) + pgdata_files = list_files_to_compare(Path(pg.pgdata_dir)) restored_files = list_files_to_compare(restored_dir_path) # check that file sets are equal @@ -2363,11 +2481,10 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post # filecmp returns (match, mismatch, error) lists # We've already filtered all mismatching files in list_files_to_compare(), # so here expect that the content is identical - (match, mismatch, error) = filecmp.cmpfiles(pg.pgdata_dir, - restored_dir_path, - pgdata_files, - shallow=False) - log.info(f'filecmp result mismatch and error lists:\n\t mismatch={mismatch}\n\t error={error}') + (match, mismatch, error) = filecmp.cmpfiles( + pg.pgdata_dir, restored_dir_path, pgdata_files, shallow=False + ) + log.info(f"filecmp result mismatch and error lists:\n\t mismatch={mismatch}\n\t error={error}") for f in mismatch: @@ -2375,11 +2492,11 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post f2 = os.path.join(restored_dir_path, f) stdout_filename = "{}.filediff".format(f2) - with open(stdout_filename, 'w') as stdout_f: + with open(stdout_filename, "w") as stdout_f: subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True) subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True) - cmd = 'diff {}.hex {}.hex'.format(f1, f2) + cmd = "diff {}.hex {}.hex".format(f1, f2) subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, error) == ([], []) @@ -2403,11 +2520,16 @@ def wait_until(number_of_iterations: int, interval: float, func): raise Exception("timed out while waiting for %s" % func) from last_exception -def 
assert_timeline_local(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID): - timeline_detail = pageserver_http_client.timeline_detail(tenant, timeline) - assert timeline_detail.get('local', {}).get("disk_consistent_lsn"), timeline_detail +def assert_timeline_local( + pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID +): + timeline_detail = pageserver_http_client.timeline_detail( + tenant, + timeline, + include_non_incremental_logical_size=True, + include_non_incremental_physical_size=True, + ) + assert timeline_detail.get("local", {}).get("disk_consistent_lsn"), timeline_detail return timeline_detail @@ -2416,68 +2538,100 @@ def assert_no_in_progress_downloads_for_tenant( tenant: uuid.UUID, ): tenant_status = pageserver_http_client.tenant_status(tenant) - assert tenant_status['has_in_progress_downloads'] is False, tenant_status + assert tenant_status["has_in_progress_downloads"] is False, tenant_status -def remote_consistent_lsn(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID) -> int: +def remote_consistent_lsn( + pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID +) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) - if detail['remote'] is None: + if detail["remote"] is None: # No remote information at all. This happens right after creating # a timeline, before any part of it has been uploaded to remote # storage yet. 
return 0 else: - lsn_str = detail['remote']['remote_consistent_lsn'] + lsn_str = detail["remote"]["remote_consistent_lsn"] assert isinstance(lsn_str, str) return lsn_from_hex(lsn_str) -def wait_for_upload(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int): +def wait_for_upload( + pageserver_http_client: NeonPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID, + lsn: int, +): """waits for local timeline upload up to specified lsn""" for i in range(20): current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) if current_lsn >= lsn: return - log.info("waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1)) + log.info( + "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 + ) + ) time.sleep(1) - raise Exception("timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn))) + raise Exception( + "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn) + ) + ) -def last_record_lsn(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID) -> int: +def last_record_lsn( + pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID +) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) - lsn_str = detail['local']['last_record_lsn'] + lsn_str = detail["local"]["last_record_lsn"] assert isinstance(lsn_str, str) return lsn_from_hex(lsn_str) -def wait_for_last_record_lsn(pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int): +def wait_for_last_record_lsn( + pageserver_http_client: NeonPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID, + lsn: int, +): """waits for 
pageserver to catch up to a certain lsn""" for i in range(10): current_lsn = last_record_lsn(pageserver_http_client, tenant, timeline) if current_lsn >= lsn: return - log.info("waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1)) + log.info( + "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 + ) + ) time.sleep(1) - raise Exception("timed out while waiting for last_record_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn))) + raise Exception( + "timed out while waiting for last_record_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn) + ) + ) def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: uuid.UUID, timeline: uuid.UUID): """Wait for pageserver to catch up the latest flush LSN""" last_flush_lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) + + +def fork_at_current_lsn( + env: NeonEnv, + pg: Postgres, + new_branch_name: str, + ancestor_branch_name: str, + tenant_id: Optional[uuid.UUID] = None, +) -> uuid.UUID: + """ + Create new branch at the last LSN of an existing branch. + The "last LSN" is taken from the given Postgres instance. The pageserver will wait for all the + the WAL up to that LSN to arrive in the pageserver before creating the branch. 
+ """ + current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0] + return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn) diff --git a/test_runner/fixtures/pg_stats.py b/test_runner/fixtures/pg_stats.py index e113d37248..b2e6886eb3 100644 --- a/test_runner/fixtures/pg_stats.py +++ b/test_runner/fixtures/pg_stats.py @@ -18,35 +18,43 @@ class PgStatTable: return f"SELECT {','.join(self.columns)} FROM {self.table} {self.additional_query}" -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def pg_stats_rw() -> List[PgStatTable]: return [ - PgStatTable("pg_stat_database", - ["tup_returned", "tup_fetched", "tup_inserted", "tup_updated", "tup_deleted"], - "WHERE datname='postgres'"), + PgStatTable( + "pg_stat_database", + ["tup_returned", "tup_fetched", "tup_inserted", "tup_updated", "tup_deleted"], + "WHERE datname='postgres'", + ), ] -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def pg_stats_ro() -> List[PgStatTable]: return [ - PgStatTable("pg_stat_database", ["tup_returned", "tup_fetched"], - "WHERE datname='postgres'"), + PgStatTable( + "pg_stat_database", ["tup_returned", "tup_fetched"], "WHERE datname='postgres'" + ), ] -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def pg_stats_wo() -> List[PgStatTable]: return [ - PgStatTable("pg_stat_database", ["tup_inserted", "tup_updated", "tup_deleted"], - "WHERE datname='postgres'"), + PgStatTable( + "pg_stat_database", + ["tup_inserted", "tup_updated", "tup_deleted"], + "WHERE datname='postgres'", + ), ] -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def pg_stats_wal() -> List[PgStatTable]: return [ - PgStatTable("pg_stat_wal", - ["wal_records", "wal_fpi", "wal_bytes", "wal_buffers_full", "wal_write"], - "") + PgStatTable( + "pg_stat_wal", + ["wal_records", "wal_fpi", "wal_bytes", "wal_buffers_full", "wal_write"], + "", + ) ] diff --git a/test_runner/fixtures/slow.py 
b/test_runner/fixtures/slow.py index c20b766a93..94199ae785 100644 --- a/test_runner/fixtures/slow.py +++ b/test_runner/fixtures/slow.py @@ -1,4 +1,5 @@ import pytest + """ This plugin allows tests to be marked as slow using pytest.mark.slow. By default slow tests are excluded. They need to be specifically requested with the --runslow flag in diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index a37d40014c..88bf6d634d 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,23 +1,24 @@ import contextlib import os -import pathlib +import re import shutil import subprocess +import tarfile from pathlib import Path - from typing import Any, List, Tuple -from psycopg2.extensions import cursor +import allure # type: ignore from fixtures.log_helper import log +from psycopg2.extensions import cursor def get_self_dir() -> str: - """ Get the path to the directory where this script lives. """ + """Get the path to the directory where this script lives.""" return os.path.dirname(os.path.abspath(__file__)) def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: - """ Run a process and capture its output + """Run a process and capture its output Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" where "cmd" is the name of the program and NNN is an incrementing @@ -27,14 +28,14 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: Returns basepath for files with captured output. 
""" assert type(cmd) is list - base = os.path.basename(cmd[0]) + '_{}'.format(global_counter()) + base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) basepath = os.path.join(capture_dir, base) - stdout_filename = basepath + '.stdout' - stderr_filename = basepath + '.stderr' + stdout_filename = basepath + ".stdout" + stderr_filename = basepath + ".stderr" try: - with open(stdout_filename, 'w') as stdout_f: - with open(stderr_filename, 'w') as stderr_f: + with open(stdout_filename, "w") as stdout_f: + with open(stderr_filename, "w") as stderr_f: log.info(f'Capturing stdout to "{base}.stdout" and stderr to "{base}.stderr"') subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) finally: @@ -50,7 +51,7 @@ _global_counter = 0 def global_counter() -> int: - """ A really dumb global counter. + """A really dumb global counter. This is useful for giving output files a unique number, so if we run the same command multiple times we can keep their output separate. @@ -61,13 +62,13 @@ def global_counter() -> int: def lsn_to_hex(num: int) -> str: - """ Convert lsn from int to standard hex notation. """ - return "{:X}/{:X}".format(num >> 32, num & 0xffffffff) + """Convert lsn from int to standard hex notation.""" + return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF) def lsn_from_hex(lsn_hex: str) -> int: - """ Convert lsn from hex notation to int. 
""" - l, r = lsn_hex.split('/') + """Convert lsn from hex notation to int.""" + l, r = lsn_hex.split("/") return (int(l, 16) << 32) + int(r, 16) @@ -75,14 +76,16 @@ def print_gc_result(row): log.info("GC duration {elapsed} ms".format_map(row)) log.info( " total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_pitr {layers_needed_by_pitr}" - " needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}" - .format_map(row)) + " needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}".format_map( + row + ) + ) def etcd_path() -> Path: path_output = shutil.which("etcd") if path_output is None: - raise RuntimeError('etcd not found in PATH') + raise RuntimeError("etcd not found in PATH") else: return Path(path_output) @@ -109,13 +112,13 @@ def get_dir_size(path: str) -> int: for name in files: try: totalbytes += os.path.getsize(os.path.join(root, name)) - except FileNotFoundError as e: + except FileNotFoundError: pass # file could be concurrently removed return totalbytes -def get_timeline_dir_size(path: pathlib.Path) -> int: +def get_timeline_dir_size(path: Path) -> int: """Get the timeline directory's total size, which only counts the layer files' size.""" sz = 0 for dir_entry in path.iterdir(): @@ -145,7 +148,12 @@ def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: parts = f_name.split("__") key_parts = parts[0].split("-") lsn_parts = parts[1].split("-") - return int(key_parts[0], 16), int(key_parts[1], 16), int(lsn_parts[0], 16), int(lsn_parts[1], 16) + return ( + int(key_parts[0], 16), + int(key_parts[1], 16), + int(lsn_parts[0], 16), + int(lsn_parts[1], 16), + ) def get_scale_for_db(size_mb: int) -> int: @@ -155,3 +163,36 @@ def get_scale_for_db(size_mb: int) -> int: """ return round(0.06689 * size_mb - 0.5) + + +ATTACHMENT_NAME_REGEX = re.compile( + 
r".+\.log|.+\.stderr|.+\.stdout|.+\.filediff|.+\.metrics|flamegraph\.svg|regression\.diffs" +) + + +def allure_attach_from_dir(dir: Path): + """Attach all non-empty files from `dir` that matches `ATTACHMENT_NAME_REGEX` to Allure report""" + + for attachment in Path(dir).glob("**/*"): + if ATTACHMENT_NAME_REGEX.fullmatch(attachment.name) and attachment.stat().st_size > 0: + source = str(attachment) + name = str(attachment.relative_to(dir)) + + # compress files larger than 1Mb, they're hardly readable in a browser + if attachment.stat().st_size > 1024 * 1024: + source = f"{attachment}.tar.gz" + with tarfile.open(source, "w:gz") as tar: + tar.add(attachment, arcname=attachment.name) + name = f"{name}.tar.gz" + + if source.endswith(".tar.gz"): + attachment_type = "application/gzip" + extension = "tar.gz" + elif source.endswith(".svg"): + attachment_type = "image/svg+xml" + extension = "svg" + else: + attachment_type = "text/plain" + extension = attachment.suffix.removeprefix(".") + + allure.attach.file(source, name, attachment_type, extension) diff --git a/test_runner/neon_regress/README.md b/test_runner/neon_regress/README.md deleted file mode 100644 index b23a55462e..0000000000 --- a/test_runner/neon_regress/README.md +++ /dev/null @@ -1,8 +0,0 @@ -To add a new SQL test - -- add sql script to run to neon_regress/sql/testname.sql -- add expected output to neon_regress/expected/testname.out -- add testname to parallel_schedule - -That's it. -For more complex tests see PostgreSQL regression tests. These works basically the same. 
diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 1d39b0830d..9cb346de47 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -1,28 +1,26 @@ import random -import time import statistics import threading +import time import timeit -import pytest from typing import List + +import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]): - neon_compare.zenbenchmark.record("branch_creation_duration_max", - max(durs), - 's', - MetricReport.LOWER_IS_BETTER) - neon_compare.zenbenchmark.record("branch_creation_duration_avg", - statistics.mean(durs), - 's', - MetricReport.LOWER_IS_BETTER) - neon_compare.zenbenchmark.record("branch_creation_duration_stdev", - statistics.stdev(durs), - 's', - MetricReport.LOWER_IS_BETTER) + neon_compare.zenbenchmark.record( + "branch_creation_duration_max", max(durs), "s", MetricReport.LOWER_IS_BETTER + ) + neon_compare.zenbenchmark.record( + "branch_creation_duration_avg", statistics.mean(durs), "s", MetricReport.LOWER_IS_BETTER + ) + neon_compare.zenbenchmark.record( + "branch_creation_duration_stdev", statistics.stdev(durs), "s", MetricReport.LOWER_IS_BETTER + ) @pytest.mark.parametrize("n_branches", [20]) @@ -37,15 +35,16 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) # Use aggressive GC and checkpoint settings, so GC and compaction happen more often during the test tenant, _ = env.neon_cli.create_tenant( - conf={ - 'gc_period': '5 s', - 'gc_horizon': f'{4 * 1024 ** 2}', - 'checkpoint_distance': f'{2 * 1024 ** 2}', - 'compaction_target_size': f'{1024 ** 2}', - 'compaction_threshold': '2', - # set PITR interval to be small, so we can do GC - 'pitr_interval': '5 s' - }) + conf={ + "gc_period": "5 s", + 
"gc_horizon": f"{4 * 1024 ** 2}", + "checkpoint_distance": f"{2 * 1024 ** 2}", + "compaction_target_size": f"{1024 ** 2}", + "compaction_threshold": "2", + # set PITR interval to be small, so we can do GC + "pitr_interval": "5 s", + } + ) def run_pgbench(branch: str): log.info(f"Start a pgbench workload on branch {branch}") @@ -53,15 +52,15 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) pg = env.postgres.create_start(branch, tenant_id=tenant) connstr = pg.connstr() - pg_bin.run_capture(['pgbench', '-i', connstr]) - pg_bin.run_capture(['pgbench', '-c10', '-T10', connstr]) + pg_bin.run_capture(["pgbench", "-i", connstr]) + pg_bin.run_capture(["pgbench", "-c10", "-T10", connstr]) pg.stop() - env.neon_cli.create_branch('b0', tenant_id=tenant) + env.neon_cli.create_branch("b0", tenant_id=tenant) threads: List[threading.Thread] = [] - threads.append(threading.Thread(target=run_pgbench, args=('b0', ), daemon=True)) + threads.append(threading.Thread(target=run_pgbench, args=("b0",), daemon=True)) threads[-1].start() branch_creation_durations = [] @@ -72,13 +71,13 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) p = random.randint(0, i) timer = timeit.default_timer() - env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(p), tenant_id=tenant) + env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p), tenant_id=tenant) dur = timeit.default_timer() - timer log.info(f"Creating branch b{i+1} took {dur}s") branch_creation_durations.append(dur) - threads.append(threading.Thread(target=run_pgbench, args=(f'b{i+1}', ), daemon=True)) + threads.append(threading.Thread(target=run_pgbench, args=(f"b{i+1}",), daemon=True)) threads[-1].start() for thread in threads: @@ -92,10 +91,10 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): env = neon_compare.env - env.neon_cli.create_branch('b0') + 
env.neon_cli.create_branch("b0") - pg = env.postgres.create_start('b0') - neon_compare.pg_bin.run_capture(['pgbench', '-i', '-s10', pg.connstr()]) + pg = env.postgres.create_start("b0") + neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", pg.connstr()]) branch_creation_durations = [] @@ -103,7 +102,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): # random a source branch p = random.randint(0, i) timer = timeit.default_timer() - env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(p)) + env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p)) dur = timeit.default_timer() - timer branch_creation_durations.append(dur) diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 6a5bad8757..d6e67aa361 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,8 +1,6 @@ from contextlib import closing -from fixtures.neon_fixtures import NeonEnv -from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare + +from fixtures.compare_fixtures import PgCompare # @@ -23,8 +21,8 @@ def test_bulk_insert(neon_with_baseline: PgCompare): cur.execute("create table huge (i int, j int);") # Run INSERT, recording the time and I/O it takes - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('insert'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("insert"): cur.execute("insert into huge values (generate_series(1, 5000000), 0);") env.flush() diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index fe3c3afe37..cef7ce0c6b 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -1,7 +1,7 @@ import timeit -from 
fixtures.benchmark_fixture import MetricReport -import pytest +import pytest +from fixtures.benchmark_fixture import MetricReport from fixtures.neon_fixtures import NeonEnvBuilder # Run bulk tenant creation test. @@ -12,7 +12,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder # 2. Average creation time per tenant -@pytest.mark.parametrize('tenants_count', [1, 5, 10]) +@pytest.mark.parametrize("tenants_count", [1, 5, 10]) def test_bulk_tenant_create( neon_env_builder: NeonEnvBuilder, tenants_count: int, @@ -27,22 +27,26 @@ def test_bulk_tenant_create( start = timeit.default_timer() tenant, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline(f'test_bulk_tenant_create_{tenants_count}_{i}', - tenant_id=tenant) + env.neon_cli.create_timeline( + f"test_bulk_tenant_create_{tenants_count}_{i}", tenant_id=tenant + ) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? - #if use_safekeepers == 'with_sa': + # if use_safekeepers == 'with_sa': # wa_factory.start_n_new(3) - pg_tenant = env.postgres.create_start(f'test_bulk_tenant_create_{tenants_count}_{i}', - tenant_id=tenant) + pg_tenant = env.postgres.create_start( + f"test_bulk_tenant_create_{tenants_count}_{i}", tenant_id=tenant + ) end = timeit.default_timer() time_slices.append(end - start) pg_tenant.stop() - zenbenchmark.record('tenant_creation_time', - sum(time_slices) / len(time_slices), - 's', - report=MetricReport.LOWER_IS_BETTER) + zenbenchmark.record( + "tenant_creation_time", + sum(time_slices) / len(time_slices), + "s", + report=MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/performance/test_compare_pg_stats.py b/test_runner/performance/test_compare_pg_stats.py index b9bca90231..d39ea55fbb 100644 --- a/test_runner/performance/test_compare_pg_stats.py +++ b/test_runner/performance/test_compare_pg_stats.py @@ -6,7 +6,6 @@ from typing import List import pytest from fixtures.compare_fixtures import PgCompare from fixtures.pg_stats import PgStatTable - 
from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix @@ -18,85 +17,96 @@ def get_seeds_matrix(default: int = 100): @pytest.mark.parametrize("seed", get_seeds_matrix()) @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix(5)) -def test_compare_pg_stats_rw_with_pgbench_default(neon_with_baseline: PgCompare, - seed: int, - scale: int, - duration: int, - pg_stats_rw: List[PgStatTable]): +def test_compare_pg_stats_rw_with_pgbench_default( + neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_rw: List[PgStatTable], +): env = neon_with_baseline # initialize pgbench - env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) env.flush() with env.record_pg_stats(pg_stats_rw): env.pg_bin.run_capture( - ['pgbench', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) + ["pgbench", f"-T{duration}", f"--random-seed={seed}", env.pg.connstr()] + ) env.flush() @pytest.mark.parametrize("seed", get_seeds_matrix()) @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix(5)) -def test_compare_pg_stats_wo_with_pgbench_simple_update(neon_with_baseline: PgCompare, - seed: int, - scale: int, - duration: int, - pg_stats_wo: List[PgStatTable]): +def test_compare_pg_stats_wo_with_pgbench_simple_update( + neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_wo: List[PgStatTable], +): env = neon_with_baseline # initialize pgbench - env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) env.flush() with env.record_pg_stats(pg_stats_wo): env.pg_bin.run_capture( - ['pgbench', '-N', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) + ["pgbench", "-N", f"-T{duration}", 
f"--random-seed={seed}", env.pg.connstr()] + ) env.flush() @pytest.mark.parametrize("seed", get_seeds_matrix()) @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix(5)) -def test_compare_pg_stats_ro_with_pgbench_select_only(neon_with_baseline: PgCompare, - seed: int, - scale: int, - duration: int, - pg_stats_ro: List[PgStatTable]): +def test_compare_pg_stats_ro_with_pgbench_select_only( + neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_ro: List[PgStatTable], +): env = neon_with_baseline # initialize pgbench - env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) env.flush() with env.record_pg_stats(pg_stats_ro): env.pg_bin.run_capture( - ['pgbench', '-S', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) + ["pgbench", "-S", f"-T{duration}", f"--random-seed={seed}", env.pg.connstr()] + ) env.flush() @pytest.mark.parametrize("seed", get_seeds_matrix()) @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix(5)) -def test_compare_pg_stats_wal_with_pgbench_default(neon_with_baseline: PgCompare, - seed: int, - scale: int, - duration: int, - pg_stats_wal: List[PgStatTable]): +def test_compare_pg_stats_wal_with_pgbench_default( + neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_wal: List[PgStatTable], +): env = neon_with_baseline # initialize pgbench - env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) env.flush() with env.record_pg_stats(pg_stats_wal): env.pg_bin.run_capture( - ['pgbench', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) + ["pgbench", f"-T{duration}", f"--random-seed={seed}", env.pg.connstr()] + ) env.flush() @pytest.mark.parametrize("n_tables", [1, 
10]) @pytest.mark.parametrize("duration", get_durations_matrix(10)) -def test_compare_pg_stats_wo_with_heavy_write(neon_with_baseline: PgCompare, - n_tables: int, - duration: int, - pg_stats_wo: List[PgStatTable]): +def test_compare_pg_stats_wo_with_heavy_write( + neon_with_baseline: PgCompare, n_tables: int, duration: int, pg_stats_wo: List[PgStatTable] +): env = neon_with_baseline with env.pg.connect().cursor() as cur: for i in range(n_tables): @@ -112,8 +122,7 @@ def test_compare_pg_stats_wo_with_heavy_write(neon_with_baseline: PgCompare, with env.record_pg_stats(pg_stats_wo): threads = [ - threading.Thread(target=start_single_table_workload, args=(i, )) - for i in range(n_tables) + threading.Thread(target=start_single_table_workload, args=(i,)) for i in range(n_tables) ] for thread in threads: diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index ad088684d5..01b2097112 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -1,10 +1,7 @@ from contextlib import closing -from fixtures.neon_fixtures import NeonEnv -from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare from io import BufferedReader, RawIOBase -from itertools import repeat + +from fixtures.compare_fixtures import PgCompare class CopyTestData(RawIOBase): @@ -27,9 +24,9 @@ class CopyTestData(RawIOBase): self.rownum += 1 # Number of bytes to read in this call - l = min(len(self.linebuf) - self.ptr, len(b)) + l = min(len(self.linebuf) - self.ptr, len(b)) # noqa: E741 - b[:l] = self.linebuf[self.ptr:(self.ptr + l)] + b[:l] = self.linebuf[self.ptr : (self.ptr + l)] self.ptr += l return l @@ -52,19 +49,19 @@ def test_copy(neon_with_baseline: PgCompare): # Load data with COPY, recording the time and I/O it takes. # # Since there's no data in the table previously, this extends it. 
- with env.record_pageserver_writes('copy_extend_pageserver_writes'): - with env.record_duration('copy_extend'): - cur.copy_from(copy_test_data(1000000), 'copytest') + with env.record_pageserver_writes("copy_extend_pageserver_writes"): + with env.record_duration("copy_extend"): + cur.copy_from(copy_test_data(1000000), "copytest") env.flush() # Delete most rows, and VACUUM to make the space available for reuse. - with env.record_pageserver_writes('delete_pageserver_writes'): - with env.record_duration('delete'): + with env.record_pageserver_writes("delete_pageserver_writes"): + with env.record_duration("delete"): cur.execute("delete from copytest where i % 100 <> 0;") env.flush() - with env.record_pageserver_writes('vacuum_pageserver_writes'): - with env.record_duration('vacuum'): + with env.record_pageserver_writes("vacuum_pageserver_writes"): + with env.record_duration("vacuum"): cur.execute("vacuum copytest") env.flush() @@ -72,9 +69,9 @@ def test_copy(neon_with_baseline: PgCompare): # by the VACUUM. # # This will also clear all the VM bits. 
- with env.record_pageserver_writes('copy_reuse_pageserver_writes'): - with env.record_duration('copy_reuse'): - cur.copy_from(copy_test_data(1000000), 'copytest') + with env.record_pageserver_writes("copy_reuse_pageserver_writes"): + with env.record_duration("copy_reuse"): + cur.copy_from(copy_test_data(1000000), "copytest") env.flush() env.report_peak_memory_use() diff --git a/test_runner/performance/test_dup_key.py b/test_runner/performance/test_dup_key.py index ee867a9845..81752ae740 100644 --- a/test_runner/performance/test_dup_key.py +++ b/test_runner/performance/test_dup_key.py @@ -1,5 +1,6 @@ -import pytest from contextlib import closing + +import pytest from fixtures.compare_fixtures import PgCompare from pytest_lazyfixture import lazy_fixture # type: ignore @@ -11,22 +12,24 @@ from pytest_lazyfixture import lazy_fixture # type: ignore pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), - ]) + ], +) def test_dup_key(env: PgCompare): # Update the same page many times, then measure read performance with closing(env.pg.connect()) as conn: with conn.cursor() as cur: - cur.execute('drop table if exists t, f;') + cur.execute("drop table if exists t, f;") cur.execute("SET synchronous_commit=off") cur.execute("SET statement_timeout=0") # Write many updates to the same row - with env.record_duration('write'): + with env.record_duration("write"): cur.execute("create table t (i integer, filler text);") - cur.execute('insert into t values (0);') - cur.execute(""" + cur.execute("insert into t values (0);") + cur.execute( + """ do $$ begin for ivar in 1..5000000 loop @@ -38,13 +41,14 @@ begin end loop; end; $$; -""") +""" + ) # Write 3-4 MB to evict t from compute cache - cur.execute('create table f (i integer);') - cur.execute(f'insert into f values 
(generate_series(1,100000));') + cur.execute("create table f (i integer);") + cur.execute("insert into f values (generate_series(1,100000));") # Read - with env.record_duration('read'): - cur.execute('select * from t;') + with env.record_duration("read"): + cur.execute("select * from t;") cur.fetchall() diff --git a/test_runner/performance/test_gist_build.py b/test_runner/performance/test_gist_build.py index 839eb3f57d..311030b99d 100644 --- a/test_runner/performance/test_gist_build.py +++ b/test_runner/performance/test_gist_build.py @@ -1,9 +1,6 @@ -import os from contextlib import closing -from fixtures.benchmark_fixture import MetricReport -from fixtures.neon_fixtures import NeonEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare -from fixtures.log_helper import log + +from fixtures.compare_fixtures import PgCompare # @@ -24,8 +21,8 @@ def test_gist_buffering_build(neon_with_baseline: PgCompare): ) # Build the index. - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('build'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("build"): cur.execute( "create index gist_pointidx2 on gist_point_tbl using gist(p) with (buffering = on)" ) diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py index d3da0310ce..aad6ee667a 100644 --- a/test_runner/performance/test_hot_page.py +++ b/test_runner/performance/test_hot_page.py @@ -1,5 +1,6 @@ -import pytest from contextlib import closing + +import pytest from fixtures.compare_fixtures import PgCompare from pytest_lazyfixture import lazy_fixture # type: ignore @@ -11,27 +12,28 @@ from pytest_lazyfixture import lazy_fixture # type: ignore pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), pytest.param(lazy_fixture("remote_compare"), id="remote", 
marks=pytest.mark.remote_cluster), - ]) + ], +) def test_hot_page(env: PgCompare): # Update the same page many times, then measure read performance num_writes = 1000000 with closing(env.pg.connect()) as conn: with conn.cursor() as cur: - cur.execute('drop table if exists t, f;') + cur.execute("drop table if exists t, f;") # Write many updates to the same row - with env.record_duration('write'): - cur.execute('create table t (i integer);') - cur.execute('insert into t values (0);') + with env.record_duration("write"): + cur.execute("create table t (i integer);") + cur.execute("insert into t values (0);") for i in range(num_writes): - cur.execute(f'update t set i = {i};') + cur.execute(f"update t set i = {i};") # Write 3-4 MB to evict t from compute cache - cur.execute('create table f (i integer);') - cur.execute(f'insert into f values (generate_series(1,100000));') + cur.execute("create table f (i integer);") + cur.execute("insert into f values (generate_series(1,100000));") # Read - with env.record_duration('read'): - cur.execute('select * from t;') + with env.record_duration("read"): + cur.execute("select * from t;") cur.fetchall() diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py index 997c772f88..2f519e152c 100644 --- a/test_runner/performance/test_hot_table.py +++ b/test_runner/performance/test_hot_table.py @@ -1,5 +1,6 @@ -import pytest from contextlib import closing + +import pytest from fixtures.compare_fixtures import PgCompare from pytest_lazyfixture import lazy_fixture # type: ignore @@ -11,7 +12,8 @@ from pytest_lazyfixture import lazy_fixture # type: ignore pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), - ]) + ], +) def test_hot_table(env: PgCompare): # Update a small table many times, then measure 
read performance num_rows = 100000 # Slightly larger than shared buffers size TODO validate @@ -20,17 +22,17 @@ def test_hot_table(env: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: - cur.execute('drop table if exists t;') + cur.execute("drop table if exists t;") # Write many updates to a small table - with env.record_duration('write'): - cur.execute('create table t (i integer primary key);') - cur.execute(f'insert into t values (generate_series(1,{num_rows}));') + with env.record_duration("write"): + cur.execute("create table t (i integer primary key);") + cur.execute(f"insert into t values (generate_series(1,{num_rows}));") for i in range(num_writes): - cur.execute(f'update t set i = {i + num_rows} WHERE i = {i};') + cur.execute(f"update t set i = {i + num_rows} WHERE i = {i};") # Read the table - with env.record_duration('read'): + with env.record_duration("read"): for i in range(num_reads): - cur.execute('select * from t;') + cur.execute("select * from t;") cur.fetchall() diff --git a/test_runner/performance/test_parallel_copy_to.py b/test_runner/performance/test_parallel_copy_to.py index d4e74ce195..b4a25e0edc 100644 --- a/test_runner/performance/test_parallel_copy_to.py +++ b/test_runner/performance/test_parallel_copy_to.py @@ -1,10 +1,8 @@ -from io import BytesIO import asyncio -import asyncpg -from fixtures.neon_fixtures import NeonEnv, Postgres, PgProtocol -from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare +from io import BytesIO + +from fixtures.compare_fixtures import PgCompare +from fixtures.neon_fixtures import PgProtocol async def repeat_bytes(buf, repetitions: int): @@ -16,7 +14,8 @@ async def copy_test_data_to_table(pg: PgProtocol, worker_id: int, table_name: st buf = BytesIO() for i in range(1000): buf.write( - f"{i}\tLoaded by worker {worker_id}. 
Long string to consume some space.\n".encode()) + f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode() + ) buf.seek(0) copy_input = repeat_bytes(buf.read(), 5000) @@ -28,7 +27,7 @@ async def copy_test_data_to_table(pg: PgProtocol, worker_id: int, table_name: st async def parallel_load_different_tables(pg: PgProtocol, n_parallel: int): workers = [] for worker_id in range(n_parallel): - worker = copy_test_data_to_table(pg, worker_id, f'copytest_{worker_id}') + worker = copy_test_data_to_table(pg, worker_id, f"copytest_{worker_id}") workers.append(asyncio.create_task(worker)) # await all workers @@ -43,10 +42,10 @@ def test_parallel_copy_different_tables(neon_with_baseline: PgCompare, n_paralle cur = conn.cursor() for worker_id in range(n_parallel): - cur.execute(f'CREATE TABLE copytest_{worker_id} (i int, t text)') + cur.execute(f"CREATE TABLE copytest_{worker_id} (i int, t text)") - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('load'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("load"): asyncio.run(parallel_load_different_tables(env.pg, n_parallel)) env.flush() @@ -57,7 +56,7 @@ def test_parallel_copy_different_tables(neon_with_baseline: PgCompare, n_paralle async def parallel_load_same_table(pg: PgProtocol, n_parallel: int): workers = [] for worker_id in range(n_parallel): - worker = copy_test_data_to_table(pg, worker_id, f'copytest') + worker = copy_test_data_to_table(pg, worker_id, "copytest") workers.append(asyncio.create_task(worker)) # await all workers @@ -70,10 +69,10 @@ def test_parallel_copy_same_table(neon_with_baseline: PgCompare, n_parallel=5): conn = env.pg.connect() cur = conn.cursor() - cur.execute(f'CREATE TABLE copytest (i int, t text)') + cur.execute("CREATE TABLE copytest (i int, t text)") - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('load'): + with 
env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("load"): asyncio.run(parallel_load_same_table(env.pg, n_parallel)) env.flush() diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 89c510e76e..934642d095 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -30,7 +30,7 @@ def init_pgbench(env: PgCompare, cmdline): # duration is actually a metric and uses float instead of int for timestamp start_timestamp = utc_now_timestamp() t0 = timeit.default_timer() - with env.record_pageserver_writes('init.pageserver_writes'): + with env.record_pageserver_writes("init.pageserver_writes"): out = env.pg_bin.run_capture(cmdline) env.flush() @@ -49,10 +49,12 @@ def init_pgbench(env: PgCompare, cmdline): def run_pgbench(env: PgCompare, prefix: str, cmdline): - with env.record_pageserver_writes(f'{prefix}.pageserver_writes'): + with env.record_pageserver_writes(f"{prefix}.pageserver_writes"): run_start_timestamp = utc_now_timestamp() t0 = timeit.default_timer() - out = env.pg_bin.run_capture(cmdline, ) + out = env.pg_bin.run_capture( + cmdline, + ) run_duration = timeit.default_timer() - t0 run_end_timestamp = utc_now_timestamp() env.flush() @@ -78,40 +80,45 @@ def run_pgbench(env: PgCompare, prefix: str, cmdline): # # Currently, the # of connections is hardcoded at 4 def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: PgBenchLoadType): - env.zenbenchmark.record("scale", scale, '', MetricReport.TEST_PARAM) + env.zenbenchmark.record("scale", scale, "", MetricReport.TEST_PARAM) if workload_type == PgBenchLoadType.INIT: # Run initialize init_pgbench( - env, ['pgbench', f'-s{scale}', '-i', env.pg.connstr(options='-cstatement_timeout=1h')]) + env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options="-cstatement_timeout=1h")] + ) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: # Run simple-update workload - 
run_pgbench(env, - "simple-update", - [ - 'pgbench', - '-N', - '-c4', - f'-T{duration}', - '-P2', - '--progress-timestamp', - env.pg.connstr(), - ]) + run_pgbench( + env, + "simple-update", + [ + "pgbench", + "-N", + "-c4", + f"-T{duration}", + "-P2", + "--progress-timestamp", + env.pg.connstr(), + ], + ) if workload_type == PgBenchLoadType.SELECT_ONLY: # Run SELECT workload - run_pgbench(env, - "select-only", - [ - 'pgbench', - '-S', - '-c4', - f'-T{duration}', - '-P2', - '--progress-timestamp', - env.pg.connstr(), - ]) + run_pgbench( + env, + "select-only", + [ + "pgbench", + "-S", + "-c4", + f"-T{duration}", + "-P2", + "--progress-timestamp", + env.pg.connstr(), + ], + ) env.report_size() @@ -121,12 +128,12 @@ def get_durations_matrix(default: int = 45) -> List[int]: rv = [] for d in durations.split(","): d = d.strip().lower() - if d.endswith('h'): - duration = int(d.removesuffix('h')) * 60 * 60 - elif d.endswith('m'): - duration = int(d.removesuffix('m')) * 60 + if d.endswith("h"): + duration = int(d.removesuffix("h")) * 60 * 60 + elif d.endswith("m"): + duration = int(d.removesuffix("m")) * 60 else: - duration = int(d.removesuffix('s')) + duration = int(d.removesuffix("s")) rv.append(duration) return rv @@ -137,10 +144,10 @@ def get_scales_matrix(default: int = 10) -> List[int]: rv = [] for s in scales.split(","): s = s.strip().lower() - if s.endswith('mb'): - scale = get_scale_for_db(int(s.removesuffix('mb'))) - elif s.endswith('gb'): - scale = get_scale_for_db(int(s.removesuffix('gb')) * 1024) + if s.endswith("mb"): + scale = get_scale_for_db(int(s.removesuffix("mb"))) + elif s.endswith("gb"): + scale = get_scale_for_db(int(s.removesuffix("gb")) * 1024) else: scale = int(s) rv.append(scale) @@ -167,9 +174,9 @@ def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int): @pytest.mark.parametrize("duration", get_durations_matrix()) def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int): 
neon_env_builder.num_safekeepers = 1 - neon_env_builder.pageserver_config_override = ''' + neon_env_builder.pageserver_config_override = """ profiling="page_requests" -''' +""" if not profiling_supported(): pytest.skip("pageserver was built without 'profiling' feature") diff --git a/test_runner/performance/test_random_writes.py b/test_runner/performance/test_random_writes.py index 8931234c51..df766d52da 100644 --- a/test_runner/performance/test_random_writes.py +++ b/test_runner/performance/test_random_writes.py @@ -1,14 +1,8 @@ -import os -from contextlib import closing -from fixtures.benchmark_fixture import MetricReport -from fixtures.neon_fixtures import NeonEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare -from fixtures.log_helper import log - -import psycopg2.extras import random -import time +from contextlib import closing +from fixtures.benchmark_fixture import MetricReport +from fixtures.compare_fixtures import PgCompare from fixtures.utils import query_scalar @@ -43,13 +37,15 @@ def test_random_writes(neon_with_baseline: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: # Create the test table - with env.record_duration('init'): - cur.execute(""" + with env.record_duration("init"): + cur.execute( + """ CREATE TABLE Big( pk integer primary key, count integer default 0 ); - """) + """ + ) # Insert n_rows in batches to avoid query timeouts rows_inserted = 0 @@ -62,7 +58,7 @@ def test_random_writes(neon_with_baseline: PgCompare): # Get table size (can't be predicted because padding and alignment) table_size = query_scalar(cur, "SELECT pg_relation_size('Big')") - env.zenbenchmark.record("table_size", table_size, 'bytes', MetricReport.TEST_PARAM) + env.zenbenchmark.record("table_size", table_size, "bytes", MetricReport.TEST_PARAM) # Decide how much to write, based on knowledge of pageserver implementation. # Avoiding segment collisions maximizes (neon_runtime / vanilla_runtime). 
@@ -72,13 +68,15 @@ def test_random_writes(neon_with_baseline: PgCompare): # The closer this is to 250 MB, the more realistic the test is. effective_checkpoint_distance = table_size * n_writes // n_rows - env.zenbenchmark.record("effective_checkpoint_distance", - effective_checkpoint_distance, - 'bytes', - MetricReport.TEST_PARAM) + env.zenbenchmark.record( + "effective_checkpoint_distance", + effective_checkpoint_distance, + "bytes", + MetricReport.TEST_PARAM, + ) # Update random keys - with env.record_duration('run'): + with env.record_duration("run"): for it in range(n_iterations): for i in range(n_writes): key = random.randint(1, n_rows) diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py index 8d7ad46c1a..c681c50ff5 100644 --- a/test_runner/performance/test_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -1,16 +1,15 @@ # Test sequential scan speed # from contextlib import closing -from dataclasses import dataclass -from fixtures.neon_fixtures import NeonEnv -from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker -from fixtures.compare_fixtures import PgCompare + import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.compare_fixtures import PgCompare +from fixtures.log_helper import log @pytest.mark.parametrize( - 'rows,iters,workers', + "rows,iters,workers", [ # The test table is large enough (3-4 MB) that it doesn't fit in the compute node # cache, so the seqscans go to the page server. 
But small enough that it fits @@ -18,31 +17,34 @@ import pytest pytest.param(100000, 100, 0), # Also test with a larger table, with and without parallelism pytest.param(10000000, 1, 0), - pytest.param(10000000, 1, 4) - ]) + pytest.param(10000000, 1, 4), + ], +) def test_seqscans(neon_with_baseline: PgCompare, rows: int, iters: int, workers: int): env = neon_with_baseline with closing(env.pg.connect()) as conn: with conn.cursor() as cur: - cur.execute('create table t (i integer);') - cur.execute(f'insert into t values (generate_series(1,{rows}));') + cur.execute("create table t (i integer);") + cur.execute(f"insert into t values (generate_series(1,{rows}));") # Verify that the table is larger than shared_buffers - cur.execute(''' + cur.execute( + """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_ize from pg_settings where name = 'shared_buffers' - ''') + """ + ) row = cur.fetchone() assert row is not None shared_buffers = row[0] table_size = row[1] log.info(f"shared_buffers is {shared_buffers}, table size {table_size}") assert int(shared_buffers) < int(table_size) - env.zenbenchmark.record("table_size", table_size, 'bytes', MetricReport.TEST_PARAM) + env.zenbenchmark.record("table_size", table_size, "bytes", MetricReport.TEST_PARAM) cur.execute(f"set max_parallel_workers_per_gather = {workers}") - with env.record_duration('run'): + with env.record_duration("run"): for i in range(iters): - cur.execute('select count(*) from t;') + cur.execute("select count(*) from t;") diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index 1cfd128e9b..e91b180154 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -1,7 +1,8 @@ -import pytest from contextlib import closing -from fixtures.neon_fixtures import NeonEnvBuilder + +import pytest from fixtures.benchmark_fixture import NeonBenchmarker +from fixtures.neon_fixtures import NeonEnvBuilder # 
This test sometimes runs for longer than the global 5 minute timeout. @@ -11,15 +12,15 @@ def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker env = neon_env_builder.init_start() # Start - env.neon_cli.create_branch('test_startup') + env.neon_cli.create_branch("test_startup") with zenbenchmark.record_duration("startup_time"): - pg = env.postgres.create_start('test_startup') + pg = env.postgres.create_start("test_startup") pg.safe_psql("select 1;") # Restart pg.stop_and_destroy() with zenbenchmark.record_duration("restart_time"): - pg.create_start('test_startup') + pg.create_start("test_startup") pg.safe_psql("select 1;") # Fill up @@ -28,8 +29,8 @@ def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker with closing(pg.connect()) as conn: with conn.cursor() as cur: for i in range(num_tables): - cur.execute(f'create table t_{i} (i integer);') - cur.execute(f'insert into t_{i} values (generate_series(1,{num_rows}));') + cur.execute(f"create table t_{i} (i integer);") + cur.execute(f"insert into t_{i} values (generate_series(1,{num_rows}));") # Read with zenbenchmark.record_duration("read_time"): @@ -42,7 +43,7 @@ def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker # Restart pg.stop_and_destroy() with zenbenchmark.record_duration("restart_with_data"): - pg.create_start('test_startup') + pg.create_start("test_startup") pg.safe_psql("select 1;") # Read diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index bbb5ddecab..03d5ba208a 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -10,8 +10,7 @@ from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin from fixtures.utils import lsn_from_hex - -from performance.test_perf_pgbench 
import (get_durations_matrix, get_scales_matrix) +from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix @pytest.fixture(params=["vanilla", "neon_off", "neon_on"]) @@ -30,7 +29,9 @@ def pg_compare(request) -> PgCompare: return fixture else: - assert len(x) == 2, f"request param ({request.param}) should have a format of \ + assert ( + len(x) == 2 + ), f"request param ({request.param}) should have a format of \ `neon_{{safekeepers_enable_fsync}}`" # `NeonCompare` interface @@ -70,8 +71,7 @@ def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_it with env.record_duration("run_duration"): threads = [ - threading.Thread(target=start_single_table_workload, args=(i, )) - for i in range(n_tables) + threading.Thread(target=start_single_table_workload, args=(i,)) for i in range(n_tables) ] for thread in threads: @@ -95,12 +95,14 @@ def test_heavy_write_workload(pg_compare: PgCompare, n_tables: int, scale: int, ) cur.execute(f"INSERT INTO t{i} (key) VALUES (0)") - workload_thread = threading.Thread(target=start_heavy_write_workload, - args=(env, n_tables, scale, num_iters)) + workload_thread = threading.Thread( + target=start_heavy_write_workload, args=(env, n_tables, scale, num_iters) + ) workload_thread.start() - record_thread = threading.Thread(target=record_lsn_write_lag, - args=(env, lambda: workload_thread.is_alive())) + record_thread = threading.Thread( + target=record_lsn_write_lag, args=(env, lambda: workload_thread.is_alive()) + ) record_thread.start() record_read_latency(env, lambda: workload_thread.is_alive(), "SELECT * from t0 where key = 0") @@ -110,14 +112,16 @@ def test_heavy_write_workload(pg_compare: PgCompare, n_tables: int, scale: int, def start_pgbench_simple_update_workload(env: PgCompare, duration: int): with env.record_duration("run_duration"): - env.pg_bin.run_capture([ - 'pgbench', - '-j10', - '-c10', - '-N', - f'-T{duration}', - env.pg.connstr(options="-csynchronous_commit=off") - ]) + 
env.pg_bin.run_capture( + [ + "pgbench", + "-j10", + "-c10", + "-N", + f"-T{duration}", + env.pg.connstr(options="-csynchronous_commit=off"), + ] + ) env.flush() @@ -128,20 +132,22 @@ def test_pgbench_simple_update_workload(pg_compare: PgCompare, scale: int, durat env = pg_compare # initialize pgbench tables - env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) env.flush() - workload_thread = threading.Thread(target=start_pgbench_simple_update_workload, - args=(env, duration)) + workload_thread = threading.Thread( + target=start_pgbench_simple_update_workload, args=(env, duration) + ) workload_thread.start() - record_thread = threading.Thread(target=record_lsn_write_lag, - args=(env, lambda: workload_thread.is_alive())) + record_thread = threading.Thread( + target=record_lsn_write_lag, args=(env, lambda: workload_thread.is_alive()) + ) record_thread.start() - record_read_latency(env, - lambda: workload_thread.is_alive(), - "SELECT * from pgbench_accounts where aid = 1") + record_read_latency( + env, lambda: workload_thread.is_alive(), "SELECT * from pgbench_accounts where aid = 1" + ) workload_thread.join() record_thread.join() @@ -150,13 +156,15 @@ def start_pgbench_intensive_initialization(env: PgCompare, scale: int, done_even with env.record_duration("run_duration"): # Needs to increase the statement timeout (default: 120s) because the # initialization step can be slow with a large scale. 
- env.pg_bin.run_capture([ - 'pgbench', - f'-s{scale}', - '-i', - '-Idtg', - env.pg.connstr(options='-cstatement_timeout=600s') - ]) + env.pg_bin.run_capture( + [ + "pgbench", + f"-s{scale}", + "-i", + "-Idtg", + env.pg.connstr(options="-cstatement_timeout=600s"), + ] + ) done_event.set() @@ -170,12 +178,14 @@ def test_pgbench_intensive_init_workload(pg_compare: PgCompare, scale: int): workload_done_event = threading.Event() - workload_thread = threading.Thread(target=start_pgbench_intensive_initialization, - args=(env, scale, workload_done_event)) + workload_thread = threading.Thread( + target=start_pgbench_intensive_initialization, args=(env, scale, workload_done_event) + ) workload_thread.start() - record_thread = threading.Thread(target=record_lsn_write_lag, - args=(env, lambda: not workload_done_event.is_set())) + record_thread = threading.Thread( + target=record_lsn_write_lag, args=(env, lambda: not workload_done_event.is_set()) + ) record_thread.start() record_read_latency(env, lambda: not workload_done_event.is_set(), "SELECT count(*) from foo") @@ -195,13 +205,15 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte cur.execute("CREATE EXTENSION neon") while run_cond(): - cur.execute(''' + cur.execute( + """ select pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn), pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn)), pg_current_wal_flush_lsn(), received_lsn from backpressure_lsns(); - ''') + """ + ) res = cur.fetchone() lsn_write_lags.append(res[0]) @@ -220,24 +232,29 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte time.sleep(pool_interval) - env.zenbenchmark.record("lsn_write_lag_max", - float(max(lsn_write_lags) / (1024**2)), - "MB", - MetricReport.LOWER_IS_BETTER) - env.zenbenchmark.record("lsn_write_lag_avg", - float(statistics.mean(lsn_write_lags) / (1024**2)), - "MB", - MetricReport.LOWER_IS_BETTER) - env.zenbenchmark.record("lsn_write_lag_stdev", - 
float(statistics.stdev(lsn_write_lags) / (1024**2)), - "MB", - MetricReport.LOWER_IS_BETTER) + env.zenbenchmark.record( + "lsn_write_lag_max", + float(max(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER, + ) + env.zenbenchmark.record( + "lsn_write_lag_avg", + float(statistics.mean(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER, + ) + env.zenbenchmark.record( + "lsn_write_lag_stdev", + float(statistics.stdev(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER, + ) -def record_read_latency(env: PgCompare, - run_cond: Callable[[], bool], - read_query: str, - read_interval: float = 1.0): +def record_read_latency( + env: PgCompare, run_cond: Callable[[], bool], read_query: str, read_interval: float = 1.0 +): read_latencies = [] with env.pg.connect().cursor() as cur: @@ -256,15 +273,12 @@ def record_read_latency(env: PgCompare, time.sleep(read_interval) - env.zenbenchmark.record("read_latency_max", - max(read_latencies), - 's', - MetricReport.LOWER_IS_BETTER) - env.zenbenchmark.record("read_latency_avg", - statistics.mean(read_latencies), - 's', - MetricReport.LOWER_IS_BETTER) - env.zenbenchmark.record("read_latency_stdev", - statistics.stdev(read_latencies), - 's', - MetricReport.LOWER_IS_BETTER) + env.zenbenchmark.record( + "read_latency_max", max(read_latencies), "s", MetricReport.LOWER_IS_BETTER + ) + env.zenbenchmark.record( + "read_latency_avg", statistics.mean(read_latencies), "s", MetricReport.LOWER_IS_BETTER + ) + env.zenbenchmark.record( + "read_latency_stdev", statistics.stdev(read_latencies), "s", MetricReport.LOWER_IS_BETTER + ) diff --git a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py index 1d729fd78f..30c217e392 100644 --- a/test_runner/performance/test_write_amplification.py +++ b/test_runner/performance/test_write_amplification.py @@ -10,12 +10,9 @@ # in LSN order, writing the oldest layer first. 
That creates a new 10 MB image # layer to be created for each of those small updates. This is the Write # Amplification problem at its finest. -import os from contextlib import closing -from fixtures.benchmark_fixture import MetricReport -from fixtures.neon_fixtures import NeonEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare -from fixtures.log_helper import log + +from fixtures.compare_fixtures import PgCompare def test_write_amplification(neon_with_baseline: PgCompare): @@ -23,18 +20,20 @@ def test_write_amplification(neon_with_baseline: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('run'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("run"): # NOTE: Because each iteration updates every table already created, # the runtime and write amplification is O(n^2), where n is the # number of iterations. for i in range(25): - cur.execute(f''' + cur.execute( + f""" CREATE TABLE tbl{i} AS SELECT g as i, 'long string to consume some space' || g as t FROM generate_series(1, 100000) g - ''') + """ + ) cur.execute(f"create index on tbl{i} (i);") for j in range(1, i): cur.execute(f"delete from tbl{j} where i = {i}") diff --git a/test_runner/pg_clients/python/pg8000/pg8000_example.py b/test_runner/pg_clients/python/pg8000/pg8000_example.py index f463867f88..b1d77af5bb 100755 --- a/test_runner/pg_clients/python/pg8000/pg8000_example.py +++ b/test_runner/pg_clients/python/pg8000/pg8000_example.py @@ -1,7 +1,6 @@ #! 
/usr/bin/env python3 import os -import ssl import pg8000.dbapi diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py index a117616358..2dbab19e7a 100644 --- a/test_runner/pg_clients/test_pg_clients.py +++ b/test_runner/pg_clients/test_pg_clients.py @@ -1,6 +1,4 @@ -import os import shutil -import subprocess from pathlib import Path from tempfile import NamedTemporaryFile @@ -18,10 +16,12 @@ from fixtures.utils import subprocess_capture "python/asyncpg", pytest.param( "python/pg8000", # See https://github.com/neondatabase/neon/pull/2008#discussion_r912264281 - marks=pytest.mark.xfail(reason="Handles SSL in incompatible with Neon way")), + marks=pytest.mark.xfail(reason="Handles SSL in incompatible with Neon way"), + ), pytest.param( "swift/PostgresClientKit", # See https://github.com/neondatabase/neon/pull/2008#discussion_r911896592 - marks=pytest.mark.xfail(reason="Neither SNI nor parameters is supported")), + marks=pytest.mark.xfail(reason="Neither SNI nor parameters is supported"), + ), "typescript/postgresql-client", ], ) @@ -31,12 +31,14 @@ def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: st env_file = None with NamedTemporaryFile(mode="w", delete=False) as f: env_file = f.name - f.write(f""" + f.write( + f""" NEON_HOST={conn_options["host"]} NEON_DATABASE={conn_options["dbname"]} NEON_USER={conn_options["user"]} NEON_PASSWORD={conn_options["password"]} - """) + """ + ) image_tag = client.lower() docker_bin = shutil.which("docker") diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py similarity index 55% rename from test_runner/batch_others/test_ancestor_branch.py rename to test_runner/regress/test_ancestor_branch.py index c4d36da043..96612a8aef 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -13,83 +13,90 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): 
# Extend compaction_period and gc_period to disable background compaction and gc. tenant, _ = env.neon_cli.create_tenant( conf={ - 'gc_period': '10 m', - 'gc_horizon': '1048576', - 'checkpoint_distance': '4194304', - 'compaction_period': '10 m', - 'compaction_threshold': '2', - 'compaction_target_size': '4194304', - }) + "gc_period": "10 m", + "gc_horizon": "1048576", + "checkpoint_distance": "4194304", + "compaction_period": "10 m", + "compaction_threshold": "2", + "compaction_target_size": "4194304", + } + ) env.pageserver.safe_psql("failpoints flush-frozen-before-sync=sleep(10000)") - pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) + pg_branch0 = env.postgres.create_start("main", tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() branch0_timeline = query_scalar(branch0_cur, "SHOW neon.timeline_id") log.info(f"b0 timeline {branch0_timeline}") # Create table, and insert 100k rows. - branch0_lsn = query_scalar(branch0_cur, 'SELECT pg_current_wal_insert_lsn()') + branch0_lsn = query_scalar(branch0_cur, "SELECT pg_current_wal_insert_lsn()") log.info(f"b0 at lsn {branch0_lsn}") - branch0_cur.execute('CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)') - branch0_cur.execute(''' + branch0_cur.execute("CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)") + branch0_cur.execute( + """ INSERT INTO foo SELECT '00112233445566778899AABBCCDDEEFF' || ':branch0:' || g FROM generate_series(1, 100000) g - ''') - lsn_100 = query_scalar(branch0_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN after 100k rows: {lsn_100}') + """ + ) + lsn_100 = query_scalar(branch0_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN after 100k rows: {lsn_100}") # Create branch1. 
- env.neon_cli.create_branch('branch1', 'main', tenant_id=tenant, ancestor_start_lsn=lsn_100) - pg_branch1 = env.postgres.create_start('branch1', tenant_id=tenant) + env.neon_cli.create_branch("branch1", "main", tenant_id=tenant, ancestor_start_lsn=lsn_100) + pg_branch1 = env.postgres.create_start("branch1", tenant_id=tenant) log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() branch1_timeline = query_scalar(branch1_cur, "SHOW neon.timeline_id") log.info(f"b1 timeline {branch1_timeline}") - branch1_lsn = query_scalar(branch1_cur, 'SELECT pg_current_wal_insert_lsn()') + branch1_lsn = query_scalar(branch1_cur, "SELECT pg_current_wal_insert_lsn()") log.info(f"b1 at lsn {branch1_lsn}") # Insert 100k rows. - branch1_cur.execute(''' + branch1_cur.execute( + """ INSERT INTO foo SELECT '00112233445566778899AABBCCDDEEFF' || ':branch1:' || g FROM generate_series(1, 100000) g - ''') - lsn_200 = query_scalar(branch1_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN after 200k rows: {lsn_200}') + """ + ) + lsn_200 = query_scalar(branch1_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN after 200k rows: {lsn_200}") # Create branch2. - env.neon_cli.create_branch('branch2', 'branch1', tenant_id=tenant, ancestor_start_lsn=lsn_200) - pg_branch2 = env.postgres.create_start('branch2', tenant_id=tenant) + env.neon_cli.create_branch("branch2", "branch1", tenant_id=tenant, ancestor_start_lsn=lsn_200) + pg_branch2 = env.postgres.create_start("branch2", tenant_id=tenant) log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() branch2_timeline = query_scalar(branch2_cur, "SHOW neon.timeline_id") log.info(f"b2 timeline {branch2_timeline}") - branch2_lsn = query_scalar(branch2_cur, 'SELECT pg_current_wal_insert_lsn()') + branch2_lsn = query_scalar(branch2_cur, "SELECT pg_current_wal_insert_lsn()") log.info(f"b2 at lsn {branch2_lsn}") # Insert 100k rows. 
- branch2_cur.execute(''' + branch2_cur.execute( + """ INSERT INTO foo SELECT '00112233445566778899AABBCCDDEEFF' || ':branch2:' || g FROM generate_series(1, 100000) g - ''') - lsn_300 = query_scalar(branch2_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN after 300k rows: {lsn_300}') + """ + ) + lsn_300 = query_scalar(branch2_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN after 300k rows: {lsn_300}") # Run compaction on branch1. - compact = f'compact {tenant.hex} {branch1_timeline} {lsn_200}' + compact = f"compact {tenant.hex} {branch1_timeline} {lsn_200}" log.info(compact) env.pageserver.safe_psql(compact) - assert query_scalar(branch0_cur, 'SELECT count(*) FROM foo') == 100000 + assert query_scalar(branch0_cur, "SELECT count(*) FROM foo") == 100000 - assert query_scalar(branch1_cur, 'SELECT count(*) FROM foo') == 200000 + assert query_scalar(branch1_cur, "SELECT count(*) FROM foo") == 200000 - assert query_scalar(branch2_cur, 'SELECT count(*) FROM foo') == 300000 + assert query_scalar(branch2_cur, "SELECT count(*) FROM foo") == 300000 diff --git a/test_runner/batch_others/test_auth.py b/test_runner/regress/test_auth.py similarity index 61% rename from test_runner/batch_others/test_auth.py rename to test_runner/regress/test_auth.py index 0fd0a5d7e3..16d6ae45c3 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -1,7 +1,8 @@ from contextlib import closing from uuid import uuid4 -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException + import pytest +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): @@ -23,41 +24,46 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): ps.safe_psql("set FOO", password=tenant_token) ps.safe_psql("set FOO", password=management_token) - new_timeline_id = env.neon_cli.create_branch('test_pageserver_auth', - tenant_id=env.initial_tenant) + new_timeline_id 
= env.neon_cli.create_branch( + "test_pageserver_auth", tenant_id=env.initial_tenant + ) # tenant can create branches - tenant_http_client.timeline_create(tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id) + tenant_http_client.timeline_create( + tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + ) # console can create branches for tenant - management_http_client.timeline_create(tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id) + management_http_client.timeline_create( + tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + ) # fail to create branch using token with different tenant_id - with pytest.raises(NeonPageserverApiException, - match='Forbidden: Tenant id mismatch. Permission denied'): - invalid_tenant_http_client.timeline_create(tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id) + with pytest.raises( + NeonPageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied" + ): + invalid_tenant_http_client.timeline_create( + tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + ) # create tenant using management token management_http_client.tenant_create() # fail to create tenant using tenant token with pytest.raises( - NeonPageserverApiException, - match='Forbidden: Attempt to access management api with tenant scope. Permission denied' + NeonPageserverApiException, + match="Forbidden: Attempt to access management api with tenant scope. 
Permission denied", ): tenant_http_client.tenant_create() -@pytest.mark.parametrize('with_safekeepers', [False, True]) +@pytest.mark.parametrize("with_safekeepers", [False, True]) def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): neon_env_builder.auth_enabled = True if with_safekeepers: neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - branch = f'test_compute_auth_to_pageserver{with_safekeepers}' + branch = f"test_compute_auth_to_pageserver{with_safekeepers}" env.neon_cli.create_branch(branch) pg = env.postgres.create_start(branch) @@ -65,7 +71,7 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder, with_safek with conn.cursor() as cur: # we rely upon autocommit after each statement # as waiting for acceptors happens there - cur.execute('CREATE TABLE t(key int primary key, value text)') + cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (5000050000, ) + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (5000050000,) diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/regress/test_backpressure.py similarity index 76% rename from test_runner/batch_others/test_backpressure.py rename to test_runner/regress/test_backpressure.py index 4ca03b102b..a81fa380a9 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/regress/test_backpressure.py @@ -1,13 +1,13 @@ +import threading +import time from contextlib import closing, contextmanager + import psycopg2.extras import pytest -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log -import time -from fixtures.neon_fixtures import Postgres -import threading +from fixtures.neon_fixtures import NeonEnvBuilder, Postgres -pytest_plugins = ("fixtures.neon_fixtures") +pytest_plugins = 
"fixtures.neon_fixtures" @contextmanager @@ -44,7 +44,8 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv with pg_cur(pg) as cur: while not stop_event.is_set(): try: - cur.execute(''' + cur.execute( + """ select pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn) as received_lsn_lag, pg_wal_lsn_diff(pg_current_wal_flush_lsn(),disk_consistent_lsn) as disk_consistent_lsn_lag, pg_wal_lsn_diff(pg_current_wal_flush_lsn(),remote_consistent_lsn) as remote_consistent_lsn_lag, @@ -52,16 +53,19 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_flush_lsn(),disk_consistent_lsn)), pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_flush_lsn(),remote_consistent_lsn)) from backpressure_lsns(); - ''') + """ + ) res = cur.fetchone() received_lsn_lag = res[0] disk_consistent_lsn_lag = res[1] remote_consistent_lsn_lag = res[2] - log.info(f"received_lsn_lag = {received_lsn_lag} ({res[3]}), " - f"disk_consistent_lsn_lag = {disk_consistent_lsn_lag} ({res[4]}), " - f"remote_consistent_lsn_lag = {remote_consistent_lsn_lag} ({res[5]})") + log.info( + f"received_lsn_lag = {received_lsn_lag} ({res[3]}), " + f"disk_consistent_lsn_lag = {disk_consistent_lsn_lag} ({res[4]}), " + f"remote_consistent_lsn_lag = {remote_consistent_lsn_lag} ({res[5]})" + ) # Since feedback from pageserver is not immediate, we should allow some lag overflow lag_overflow = 5 * 1024 * 1024 # 5MB @@ -71,7 +75,9 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv if max_replication_flush_lag_bytes > 0: assert disk_consistent_lsn_lag < max_replication_flush_lag_bytes + lag_overflow if max_replication_apply_lag_bytes > 0: - assert remote_consistent_lsn_lag < max_replication_apply_lag_bytes + lag_overflow + assert ( + remote_consistent_lsn_lag < max_replication_apply_lag_bytes + lag_overflow + ) time.sleep(polling_interval) @@ -79,7 +85,7 @@ def check_backpressure(pg: Postgres, 
stop_event: threading.Event, polling_interv log.info(f"backpressure check query failed: {e}") stop_event.set() - log.info('check thread stopped') + log.info("check thread stopped") # This test illustrates how to tune backpressure to control the lag @@ -94,10 +100,11 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() # Create a branch for us - env.neon_cli.create_branch('test_backpressure') + env.neon_cli.create_branch("test_backpressure") - pg = env.postgres.create_start('test_backpressure', - config_lines=['max_replication_write_lag=30MB']) + pg = env.postgres.create_start( + "test_backpressure", config_lines=["max_replication_write_lag=30MB"] + ) log.info("postgres is running on 'test_backpressure' branch") # setup check thread @@ -131,23 +138,29 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): rows_inserted += 100000 except Exception as e: if check_thread.is_alive(): - log.info('stopping check thread') + log.info("stopping check thread") check_stop_event.set() check_thread.join() - assert False, f"Exception {e} while inserting rows, but WAL lag is within configured threshold. That means backpressure is not tuned properly" + assert ( + False + ), f"Exception {e} while inserting rows, but WAL lag is within configured threshold. That means backpressure is not tuned properly" else: - assert False, f"Exception {e} while inserting rows and WAL lag overflowed configured threshold. That means backpressure doesn't work." + assert ( + False + ), f"Exception {e} while inserting rows and WAL lag overflowed configured threshold. That means backpressure doesn't work." 
log.info(f"inserted {rows_inserted} rows") if check_thread.is_alive(): - log.info('stopping check thread') + log.info("stopping check thread") check_stop_event.set() check_thread.join() - log.info('check thread stopped') + log.info("check thread stopped") else: - assert False, "WAL lag overflowed configured threshold. That means backpressure doesn't work." + assert ( + False + ), "WAL lag overflowed configured threshold. That means backpressure doesn't work." -#TODO test_backpressure_disk_consistent_lsn_lag. Play with pageserver's checkpoint settings -#TODO test_backpressure_remote_consistent_lsn_lag +# TODO test_backpressure_disk_consistent_lsn_lag. Play with pageserver's checkpoint settings +# TODO test_backpressure_remote_consistent_lsn_lag diff --git a/test_runner/batch_others/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py similarity index 73% rename from test_runner/batch_others/test_basebackup_error.py rename to test_runner/regress/test_basebackup_error.py index 0909ed98a7..81a46ee2f0 100644 --- a/test_runner/batch_others/test_basebackup_error.py +++ b/test_runner/regress/test_basebackup_error.py @@ -1,5 +1,4 @@ import pytest - from fixtures.neon_fixtures import NeonEnv @@ -12,7 +11,7 @@ def test_basebackup_error(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_basebackup_error", "empty") # Introduce failpoint - env.pageserver.safe_psql(f"failpoints basebackup-before-control-file=return") + env.pageserver.safe_psql("failpoints basebackup-before-control-file=return") with pytest.raises(Exception, match="basebackup-before-control-file"): - pg = env.postgres.create_start('test_basebackup_error') + env.postgres.create_start("test_basebackup_error") diff --git a/test_runner/batch_others/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py similarity index 68% rename from test_runner/batch_others/test_branch_and_gc.py rename to test_runner/regress/test_branch_and_gc.py index 8e433f65ad..deb041b5d1 100644 --- 
a/test_runner/batch_others/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -1,6 +1,7 @@ import threading -import pytest import time + +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.utils import lsn_from_hex, query_scalar @@ -49,55 +50,52 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): tenant, _ = env.neon_cli.create_tenant( conf={ # disable background GC - 'gc_period': '10 m', - 'gc_horizon': f'{10 * 1024 ** 3}', - + "gc_period": "10 m", + "gc_horizon": f"{10 * 1024 ** 3}", # small checkpoint distance to create more delta layer files - 'checkpoint_distance': f'{1024 ** 2}', - + "checkpoint_distance": f"{1024 ** 2}", # set the target size to be large to allow the image layer to cover the whole key space - 'compaction_target_size': f'{1024 ** 3}', - + "compaction_target_size": f"{1024 ** 3}", # tweak the default settings to allow quickly create image layers and L1 layers - 'compaction_period': '1 s', - 'compaction_threshold': '2', - 'image_creation_threshold': '1', - + "compaction_period": "1 s", + "compaction_threshold": "2", + "image_creation_threshold": "1", # set PITR interval to be small, so we can do GC - 'pitr_interval': '1 s' - }) + "pitr_interval": "1 s", + } + ) - timeline_main = env.neon_cli.create_timeline(f'test_main', tenant_id=tenant) - pg_main = env.postgres.create_start('test_main', tenant_id=tenant) + timeline_main = env.neon_cli.create_timeline("test_main", tenant_id=tenant) + pg_main = env.postgres.create_start("test_main", tenant_id=tenant) main_cur = pg_main.connect().cursor() main_cur.execute( "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" ) - main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') - lsn1 = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN1: {lsn1}') + main_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") 
+ lsn1 = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN1: {lsn1}") - main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') - lsn2 = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN2: {lsn2}') + main_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") + lsn2 = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN2: {lsn2}") # Set the GC horizon so that lsn1 is inside the horizon, which means # we can create a new branch starting from lsn1. env.pageserver.safe_psql( - f'do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}') + f"do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}" + ) - env.neon_cli.create_branch('test_branch', - 'test_main', - tenant_id=tenant, - ancestor_start_lsn=lsn1) - pg_branch = env.postgres.create_start('test_branch', tenant_id=tenant) + env.neon_cli.create_branch( + "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1 + ) + pg_branch = env.postgres.create_start("test_branch", tenant_id=tenant) branch_cur = pg_branch.connect().cursor() - branch_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') + branch_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") - assert query_scalar(branch_cur, 'SELECT count(*) FROM foo') == 200000 + assert query_scalar(branch_cur, "SELECT count(*) FROM foo") == 200000 # This test simulates a race condition happening when branch creation and GC are performed concurrently. 
@@ -120,38 +118,37 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): tenant, _ = env.neon_cli.create_tenant( conf={ # disable background GC - 'gc_period': '10 m', - 'gc_horizon': f'{10 * 1024 ** 3}', - + "gc_period": "10 m", + "gc_horizon": f"{10 * 1024 ** 3}", # small checkpoint distance to create more delta layer files - 'checkpoint_distance': f'{1024 ** 2}', - + "checkpoint_distance": f"{1024 ** 2}", # set the target size to be large to allow the image layer to cover the whole key space - 'compaction_target_size': f'{1024 ** 3}', - + "compaction_target_size": f"{1024 ** 3}", # tweak the default settings to allow quickly create image layers and L1 layers - 'compaction_period': '1 s', - 'compaction_threshold': '2', - 'image_creation_threshold': '1', - + "compaction_period": "1 s", + "compaction_threshold": "2", + "image_creation_threshold": "1", # set PITR interval to be small, so we can do GC - 'pitr_interval': '0 s' - }) + "pitr_interval": "0 s", + } + ) - b0 = env.neon_cli.create_branch('b0', tenant_id=tenant) - pg0 = env.postgres.create_start('b0', tenant_id=tenant) - res = pg0.safe_psql_many(queries=[ - "CREATE TABLE t(key serial primary key)", - "INSERT INTO t SELECT FROM generate_series(1, 100000)", - "SELECT pg_current_wal_insert_lsn()", - "INSERT INTO t SELECT FROM generate_series(1, 100000)", - ]) + b0 = env.neon_cli.create_branch("b0", tenant_id=tenant) + pg0 = env.postgres.create_start("b0", tenant_id=tenant) + res = pg0.safe_psql_many( + queries=[ + "CREATE TABLE t(key serial primary key)", + "INSERT INTO t SELECT FROM generate_series(1, 100000)", + "SELECT pg_current_wal_insert_lsn()", + "INSERT INTO t SELECT FROM generate_series(1, 100000)", + ] + ) lsn = res[2][0][0] # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the # branch creation task but the individual timeline GC iteration happens *after* # the branch creation task. 
- env.pageserver.safe_psql(f"failpoints before-timeline-gc=sleep(2000)") + env.pageserver.safe_psql("failpoints before-timeline-gc=sleep(2000)") def do_gc(): env.pageserver.safe_psql(f"do_gc {tenant.hex} {b0.hex} 0") @@ -166,6 +163,6 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC. with pytest.raises(Exception, match="invalid branch start lsn"): - env.neon_cli.create_branch('b1', 'b0', tenant_id=tenant, ancestor_start_lsn=lsn) + env.neon_cli.create_branch("b1", "b0", tenant_id=tenant, ancestor_start_lsn=lsn) thread.join() diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/regress/test_branch_behind.py similarity index 56% rename from test_runner/batch_others/test_branch_behind.py rename to test_runner/regress/test_branch_behind.py index 95f478dda8..51946380d2 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -1,8 +1,8 @@ import psycopg2.extras import pytest from fixtures.log_helper import log -from fixtures.utils import print_gc_result, query_scalar from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import print_gc_result, query_scalar # @@ -21,8 +21,8 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() # Branch at the point where only 100 rows were inserted - env.neon_cli.create_branch('test_branch_behind') - pgmain = env.postgres.create_start('test_branch_behind') + env.neon_cli.create_branch("test_branch_behind") + pgmain = env.postgres.create_start("test_branch_behind") log.info("postgres is running on 'test_branch_behind' branch") main_cur = pgmain.connect().cursor() @@ -30,80 +30,86 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): timeline = query_scalar(main_cur, "SHOW neon.timeline_id") # Create table, and insert the first 100 rows - main_cur.execute('CREATE TABLE foo (t text)') + 
main_cur.execute("CREATE TABLE foo (t text)") # keep some early lsn to test branch creation on out of date lsn - gced_lsn = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') + gced_lsn = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") - main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100) g - ''') - lsn_a = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN after 100 rows: {lsn_a}') + """ + ) + lsn_a = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN after 100 rows: {lsn_a}") # Insert some more rows. (This generates enough WAL to fill a few segments.) - main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 200000) g - ''') - lsn_b = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info(f'LSN after 200100 rows: {lsn_b}') + """ + ) + lsn_b = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN after 200100 rows: {lsn_b}") # Branch at the point where only 100 rows were inserted - env.neon_cli.create_branch('test_branch_behind_hundred', - 'test_branch_behind', - ancestor_start_lsn=lsn_a) + env.neon_cli.create_branch( + "test_branch_behind_hundred", "test_branch_behind", ancestor_start_lsn=lsn_a + ) # Insert many more rows. This generates enough WAL to fill a few segments. 
- main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 200000) g - ''') - lsn_c = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') + """ + ) + lsn_c = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") - log.info(f'LSN after 400100 rows: {lsn_c}') + log.info(f"LSN after 400100 rows: {lsn_c}") # Branch at the point where only 200100 rows were inserted - env.neon_cli.create_branch('test_branch_behind_more', - 'test_branch_behind', - ancestor_start_lsn=lsn_b) + env.neon_cli.create_branch( + "test_branch_behind_more", "test_branch_behind", ancestor_start_lsn=lsn_b + ) - pg_hundred = env.postgres.create_start('test_branch_behind_hundred') - pg_more = env.postgres.create_start('test_branch_behind_more') + pg_hundred = env.postgres.create_start("test_branch_behind_hundred") + pg_more = env.postgres.create_start("test_branch_behind_more") # On the 'hundred' branch, we should see only 100 rows hundred_cur = pg_hundred.connect().cursor() - assert query_scalar(hundred_cur, 'SELECT count(*) FROM foo') == 100 + assert query_scalar(hundred_cur, "SELECT count(*) FROM foo") == 100 # On the 'more' branch, we should see 100200 rows more_cur = pg_more.connect().cursor() - assert query_scalar(more_cur, 'SELECT count(*) FROM foo') == 200100 + assert query_scalar(more_cur, "SELECT count(*) FROM foo") == 200100 # All the rows are visible on the main branch - assert query_scalar(main_cur, 'SELECT count(*) FROM foo') == 400100 + assert query_scalar(main_cur, "SELECT count(*) FROM foo") == 400100 # Check bad lsn's for branching # branch at segment boundary - env.neon_cli.create_branch('test_branch_segment_boundary', - 'test_branch_behind', - ancestor_start_lsn="0/3000000") - pg = env.postgres.create_start('test_branch_segment_boundary') - assert pg.safe_psql('SELECT 1')[0][0] == 1 + env.neon_cli.create_branch( + "test_branch_segment_boundary", "test_branch_behind", 
ancestor_start_lsn="0/3000000" + ) + pg = env.postgres.create_start("test_branch_segment_boundary") + assert pg.safe_psql("SELECT 1")[0][0] == 1 # branch at pre-initdb lsn with pytest.raises(Exception, match="invalid branch start lsn"): - env.neon_cli.create_branch('test_branch_preinitdb', ancestor_start_lsn="0/42") + env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn="0/42") # branch at pre-ancestor lsn with pytest.raises(Exception, match="less than timeline ancestor lsn"): - env.neon_cli.create_branch('test_branch_preinitdb', - 'test_branch_behind', - ancestor_start_lsn="0/42") + env.neon_cli.create_branch( + "test_branch_preinitdb", "test_branch_behind", ancestor_start_lsn="0/42" + ) # check that we cannot create branch based on garbage collected data with env.pageserver.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: @@ -114,13 +120,13 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): with pytest.raises(Exception, match="invalid branch start lsn"): # this gced_lsn is pretty random, so if gc is disabled this woudln't fail - env.neon_cli.create_branch('test_branch_create_fail', - 'test_branch_behind', - ancestor_start_lsn=gced_lsn) + env.neon_cli.create_branch( + "test_branch_create_fail", "test_branch_behind", ancestor_start_lsn=gced_lsn + ) # check that after gc everything is still there - assert query_scalar(hundred_cur, 'SELECT count(*) FROM foo') == 100 + assert query_scalar(hundred_cur, "SELECT count(*) FROM foo") == 100 - assert query_scalar(more_cur, 'SELECT count(*) FROM foo') == 200100 + assert query_scalar(more_cur, "SELECT count(*) FROM foo") == 200100 - assert query_scalar(main_cur, 'SELECT count(*) FROM foo') == 400100 + assert query_scalar(main_cur, "SELECT count(*) FROM foo") == 400100 diff --git a/test_runner/batch_others/test_branching.py b/test_runner/regress/test_branching.py similarity index 57% rename from test_runner/batch_others/test_branching.py rename to test_runner/regress/test_branching.py 
index c61bac7a58..0c1490294d 100644 --- a/test_runner/batch_others/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -1,10 +1,11 @@ -from typing import List -import threading -import pytest -from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres -import time import random +import threading +import time +from typing import List + +import pytest from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres from performance.test_perf_pgbench import get_scales_matrix @@ -20,38 +21,37 @@ from performance.test_perf_pgbench import get_scales_matrix @pytest.mark.parametrize("n_branches", [10]) @pytest.mark.parametrize("scale", get_scales_matrix(1)) @pytest.mark.parametrize("ty", ["cascade", "flat"]) -def test_branching_with_pgbench(neon_simple_env: NeonEnv, - pg_bin: PgBin, - n_branches: int, - scale: int, - ty: str): +def test_branching_with_pgbench( + neon_simple_env: NeonEnv, pg_bin: PgBin, n_branches: int, scale: int, ty: str +): env = neon_simple_env # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test tenant, _ = env.neon_cli.create_tenant( - conf={ - 'gc_period': '5 s', - 'gc_horizon': f'{1024 ** 2}', - 'checkpoint_distance': f'{1024 ** 2}', - 'compaction_target_size': f'{1024 ** 2}', - # set PITR interval to be small, so we can do GC - 'pitr_interval': '5 s' - }) + conf={ + "gc_period": "5 s", + "gc_horizon": f"{1024 ** 2}", + "checkpoint_distance": f"{1024 ** 2}", + "compaction_target_size": f"{1024 ** 2}", + # set PITR interval to be small, so we can do GC + "pitr_interval": "5 s", + } + ) def run_pgbench(pg: Postgres): connstr = pg.connstr() log.info(f"Start a pgbench workload on pg {connstr}") - pg_bin.run_capture(['pgbench', '-i', f'-s{scale}', connstr]) - pg_bin.run_capture(['pgbench', '-T15', connstr]) + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", "-T15", connstr]) - env.neon_cli.create_branch('b0', tenant_id=tenant) + 
env.neon_cli.create_branch("b0", tenant_id=tenant) pgs: List[Postgres] = [] - pgs.append(env.postgres.create_start('b0', tenant_id=tenant)) + pgs.append(env.postgres.create_start("b0", tenant_id=tenant)) threads: List[threading.Thread] = [] - threads.append(threading.Thread(target=run_pgbench, args=(pgs[0], ), daemon=True)) + threads.append(threading.Thread(target=run_pgbench, args=(pgs[0],), daemon=True)) threads[-1].start() thread_limit = 4 @@ -62,28 +62,29 @@ def test_branching_with_pgbench(neon_simple_env: NeonEnv, time.sleep(delay) log.info(f"Sleep {delay}s") - # If the number of concurrent threads exceeds a threshold, - # wait for all the threads to finish before spawning a new one. - # Because tests defined in `batch_others` are run concurrently in CI, - # we want to avoid the situation that one test exhausts resources for other tests. + # If the number of concurrent threads exceeds a threshold, wait for + # all the threads to finish before spawning a new one. Because the + # regression tests in this directory are run concurrently in CI, we + # want to avoid the situation that one test exhausts resources for + # other tests. 
if len(threads) >= thread_limit: for thread in threads: thread.join() threads = [] if ty == "cascade": - env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(i), tenant_id=tenant) + env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(i), tenant_id=tenant) else: - env.neon_cli.create_branch('b{}'.format(i + 1), 'b0', tenant_id=tenant) + env.neon_cli.create_branch("b{}".format(i + 1), "b0", tenant_id=tenant) - pgs.append(env.postgres.create_start('b{}'.format(i + 1), tenant_id=tenant)) + pgs.append(env.postgres.create_start("b{}".format(i + 1), tenant_id=tenant)) - threads.append(threading.Thread(target=run_pgbench, args=(pgs[-1], ), daemon=True)) + threads.append(threading.Thread(target=run_pgbench, args=(pgs[-1],), daemon=True)) threads[-1].start() for thread in threads: thread.join() for pg in pgs: - res = pg.safe_psql('SELECT count(*) from pgbench_accounts') - assert res[0] == (100000 * scale, ) + res = pg.safe_psql("SELECT count(*) from pgbench_accounts") + assert res[0] == (100000 * scale,) diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py similarity index 82% rename from test_runner/batch_others/test_broken_timeline.py rename to test_runner/regress/test_broken_timeline.py index b9e5f637ab..c4b23c24b8 100644 --- a/test_runner/batch_others/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -1,12 +1,10 @@ -from typing import List, Tuple -from uuid import UUID -import pytest import concurrent.futures -from contextlib import closing -from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres -from fixtures.log_helper import log import os +from typing import List, Tuple +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres from fixtures.utils import query_scalar @@ -24,7 +22,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): tenant_id = tenant_id_uuid.hex timeline_id = 
timeline_id_uuid.hex - pg = env.postgres.create_start(f'main', tenant_id=tenant_id_uuid) + pg = env.postgres.create_start("main", tenant_id=tenant_id_uuid) with pg.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") @@ -42,7 +40,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # Corrupt metadata file on timeline 1 (tenant1, timeline1, pg1) = tenant_timelines[1] metadata_path = "{}/tenants/{}/timelines/{}/metadata".format(env.repo_dir, tenant1, timeline1) - print(f'overwriting metadata file at {metadata_path}') + print(f"overwriting metadata file at {metadata_path}") f = open(metadata_path, "w") f.write("overwritten with garbage!") f.close() @@ -52,17 +50,17 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): (tenant2, timeline2, pg2) = tenant_timelines[2] timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant2, timeline2) for filename in os.listdir(timeline_path): - if filename.startswith('00000'): + if filename.startswith("00000"): # Looks like a layer file. Remove it - os.remove(f'{timeline_path}/{filename}') + os.remove(f"{timeline_path}/{filename}") # Corrupt layer files file on timeline 3 (tenant3, timeline3, pg3) = tenant_timelines[3] timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant3, timeline3) for filename in os.listdir(timeline_path): - if filename.startswith('00000'): + if filename.startswith("00000"): # Looks like a layer file. 
Corrupt it - f = open(f'{timeline_path}/{filename}', "w") + f = open(f"{timeline_path}/{filename}", "w") f.write("overwritten with garbage!") f.close() @@ -77,7 +75,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): (tenant, timeline, pg) = tenant_timelines[n] with pytest.raises(Exception, match="Cannot load local timeline") as err: pg.start() - log.info(f'compute startup failed as expected: {err}') + log.info(f"compute startup failed as expected: {err}") def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): @@ -87,9 +85,10 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: futures = [ - executor.submit(env.neon_cli.create_timeline, - f"test-create-multiple-timelines-{i}", - tenant_id) for i in range(4) + executor.submit( + env.neon_cli.create_timeline, f"test-create-multiple-timelines-{i}", tenant_id + ) + for i in range(4) ] for future in futures: future.result() @@ -101,7 +100,7 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): tenant_id, _ = env.neon_cli.create_tenant() # Introduce failpoint when creating a new timeline - env.pageserver.safe_psql(f"failpoints before-checkpoint-new-timeline=return") + env.pageserver.safe_psql("failpoints before-checkpoint-new-timeline=return") with pytest.raises(Exception, match="before-checkpoint-new-timeline"): _ = env.neon_cli.create_timeline("test_fix_broken_timelines", tenant_id) diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py new file mode 100644 index 0000000000..f47e4a99bf --- /dev/null +++ b/test_runner/regress/test_clog_truncate.py @@ -0,0 +1,70 @@ +import os +import time + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import query_scalar + + +# +# Test compute node start after clog truncation +# +def test_clog_truncate(neon_simple_env: NeonEnv): + env = neon_simple_env 
+ env.neon_cli.create_branch("test_clog_truncate", "empty") + + # set aggressive autovacuum to make sure that truncation will happen + config = [ + "autovacuum_max_workers=10", + "autovacuum_vacuum_threshold=0", + "autovacuum_vacuum_insert_threshold=0", + "autovacuum_vacuum_cost_delay=0", + "autovacuum_vacuum_cost_limit=10000", + "autovacuum_naptime =1s", + "autovacuum_freeze_max_age=100000", + ] + + pg = env.postgres.create_start("test_clog_truncate", config_lines=config) + log.info("postgres is running on test_clog_truncate branch") + + # Install extension containing function needed for test + pg.safe_psql("CREATE EXTENSION neon_test_utils") + + # Consume many xids to advance clog + with pg.cursor() as cur: + cur.execute("select test_consume_xids(1000*1000*10);") + log.info("xids consumed") + + # call a checkpoint to trigger TruncateSubtrans + cur.execute("CHECKPOINT;") + + # ensure WAL flush + cur.execute("select txid_current()") + log.info(cur.fetchone()) + + # wait for autovacuum to truncate the pg_xact + # XXX Is it worth to add a timeout here? + pg_xact_0000_path = os.path.join(pg.pg_xact_dir_path(), "0000") + log.info(f"pg_xact_0000_path = {pg_xact_0000_path}") + + while os.path.isfile(pg_xact_0000_path): + log.info(f"file exists. 
wait for truncation: {pg_xact_0000_path=}") + time.sleep(5) + + # checkpoint to advance latest lsn + with pg.cursor() as cur: + cur.execute("CHECKPOINT;") + lsn_after_truncation = query_scalar(cur, "select pg_current_wal_insert_lsn()") + + # create new branch after clog truncation and start a compute node on it + log.info(f"create branch at lsn_after_truncation {lsn_after_truncation}") + env.neon_cli.create_branch( + "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation + ) + pg2 = env.postgres.create_start("test_clog_truncate_new") + log.info("postgres is running on test_clog_truncate_new branch") + + # check that new node doesn't contain truncated segment + pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), "0000") + log.info(f"pg_xact_0000_path_new = {pg_xact_0000_path_new}") + assert os.path.isfile(pg_xact_0000_path_new) is False diff --git a/test_runner/batch_others/test_close_fds.py b/test_runner/regress/test_close_fds.py similarity index 76% rename from test_runner/batch_others/test_close_fds.py rename to test_runner/regress/test_close_fds.py index 9521b1bb4a..c7ea37f9c8 100644 --- a/test_runner/batch_others/test_close_fds.py +++ b/test_runner/regress/test_close_fds.py @@ -1,18 +1,18 @@ -from contextlib import closing -import shutil -import time -import subprocess import os.path +import shutil +import subprocess +import time +from contextlib import closing from cached_property import threading -from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv def lsof_path() -> str: path_output = shutil.which("lsof") if path_output is None: - raise RuntimeError('lsof not found in PATH') + raise RuntimeError("lsof not found in PATH") else: return path_output @@ -36,16 +36,18 @@ def test_lsof_pageserver_pid(neon_simple_env: NeonEnv): path = os.path.join(env.repo_dir, "pageserver.pid") lsof = lsof_path() while workload_thread.is_alive(): - res = 
subprocess.run([lsof, path], - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + res = subprocess.run( + [lsof, path], + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) # parse the `lsof` command's output to get only the list of commands - commands = [line.split(' ')[0] for line in res.stdout.strip().split('\n')[1:]] + commands = [line.split(" ")[0] for line in res.stdout.strip().split("\n")[1:]] if len(commands) > 0: log.info(f"lsof commands: {commands}") - assert commands == ['pageserve'] + assert commands == ["pageserve"] time.sleep(1.0) diff --git a/test_runner/batch_others/test_config.py b/test_runner/regress/test_config.py similarity index 70% rename from test_runner/batch_others/test_config.py rename to test_runner/regress/test_config.py index 51deeebeed..3477d96b89 100644 --- a/test_runner/batch_others/test_config.py +++ b/test_runner/regress/test_config.py @@ -1,7 +1,7 @@ from contextlib import closing -from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv # @@ -12,19 +12,21 @@ def test_config(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_config", "empty") # change config - pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1']) - log.info('postgres is running on test_config branch') + pg = env.postgres.create_start("test_config", config_lines=["log_min_messages=debug1"]) + log.info("postgres is running on test_config branch") with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute(''' + cur.execute( + """ SELECT setting FROM pg_settings WHERE source != 'default' AND source != 'override' AND name = 'log_min_messages' - ''') + """ + ) # check that config change was applied - assert cur.fetchone() == ('debug1', ) + assert cur.fetchone() == ("debug1",) diff --git a/test_runner/regress/test_crafted_wal_end.py 
b/test_runner/regress/test_crafted_wal_end.py new file mode 100644 index 0000000000..32e5366945 --- /dev/null +++ b/test_runner/regress/test_crafted_wal_end.py @@ -0,0 +1,71 @@ +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft + +# Restart nodes with WAL end having specially crafted shape, like last record +# crossing segment boundary, to test decoding issues. + + +@pytest.mark.parametrize( + "wal_type", + [ + "simple", + "last_wal_record_xlog_switch", + "last_wal_record_xlog_switch_ends_on_page_boundary", + "last_wal_record_crossing_segment", + "wal_record_crossing_segment_followed_by_small_one", + ], +) +def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_crafted_wal_end") + + pg = env.postgres.create("test_crafted_wal_end") + wal_craft = WalCraft(env) + pg.config(wal_craft.postgres_config()) + pg.start() + res = pg.safe_psql_many( + queries=[ + "CREATE TABLE keys(key int primary key)", + "INSERT INTO keys SELECT generate_series(1, 100)", + "SELECT SUM(key) FROM keys", + ] + ) + assert res[-1][0] == (5050,) + + wal_craft.in_existing(wal_type, pg.connstr()) + + log.info("Restarting all safekeepers and pageservers") + env.pageserver.stop() + env.safekeepers[0].stop() + env.safekeepers[0].start() + env.pageserver.start() + + log.info("Trying more queries") + res = pg.safe_psql_many( + queries=[ + "SELECT SUM(key) FROM keys", + "INSERT INTO keys SELECT generate_series(101, 200)", + "SELECT SUM(key) FROM keys", + ] + ) + assert res[0][0] == (5050,) + assert res[-1][0] == (20100,) + + log.info("Restarting all safekeepers and pageservers (again)") + env.pageserver.stop() + env.safekeepers[0].stop() + env.safekeepers[0].start() + env.pageserver.start() + + log.info("Trying more queries (again)") + res = pg.safe_psql_many( + queries=[ + "SELECT SUM(key) FROM keys", + "INSERT 
INTO keys SELECT generate_series(201, 300)", + "SELECT SUM(key) FROM keys", + ] + ) + assert res[0][0] == (20100,) + assert res[-1][0] == (45150,) diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/regress/test_createdropdb.py similarity index 55% rename from test_runner/batch_others/test_createdropdb.py rename to test_runner/regress/test_createdropdb.py index 0fbf6e2a47..036e50e6e8 100644 --- a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/regress/test_createdropdb.py @@ -1,9 +1,8 @@ import os import pathlib -from contextlib import closing -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.utils import query_scalar @@ -12,35 +11,37 @@ from fixtures.utils import query_scalar # def test_createdb(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch('test_createdb', 'empty') + env.neon_cli.create_branch("test_createdb", "empty") - pg = env.postgres.create_start('test_createdb') + pg = env.postgres.create_start("test_createdb") log.info("postgres is running on 'test_createdb' branch") with pg.cursor() as cur: # Cause a 'relmapper' change in the original branch - cur.execute('VACUUM FULL pg_class') + cur.execute("VACUUM FULL pg_class") - cur.execute('CREATE DATABASE foodb') + cur.execute("CREATE DATABASE foodb") - lsn = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') + lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create a branch - env.neon_cli.create_branch('test_createdb2', 'test_createdb', ancestor_start_lsn=lsn) - pg2 = env.postgres.create_start('test_createdb2') + env.neon_cli.create_branch("test_createdb2", "test_createdb", ancestor_start_lsn=lsn) + pg2 = env.postgres.create_start("test_createdb2") # Test that you can connect to the new database on both branches for db in (pg, pg2): - with db.cursor(dbname='foodb') as cur: + 
with db.cursor(dbname="foodb") as cur: # Check database size in both branches - cur.execute(""" + cur.execute( + """ select pg_size_pretty(pg_database_size('foodb')), pg_size_pretty( sum(pg_relation_size(oid, 'main')) +sum(pg_relation_size(oid, 'vm')) +sum(pg_relation_size(oid, 'fsm')) ) FROM pg_class where relisshared is false - """) + """ + ) res = cur.fetchone() assert res is not None # check that dbsize equals sum of all relation sizes, excluding shared ones @@ -53,51 +54,51 @@ def test_createdb(neon_simple_env: NeonEnv): # def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch('test_dropdb', 'empty') - pg = env.postgres.create_start('test_dropdb') + env.neon_cli.create_branch("test_dropdb", "empty") + pg = env.postgres.create_start("test_dropdb") log.info("postgres is running on 'test_dropdb' branch") with pg.cursor() as cur: - cur.execute('CREATE DATABASE foodb') + cur.execute("CREATE DATABASE foodb") - lsn_before_drop = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') + lsn_before_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") dboid = query_scalar(cur, "SELECT oid FROM pg_database WHERE datname='foodb';") with pg.cursor() as cur: - cur.execute('DROP DATABASE foodb') + cur.execute("DROP DATABASE foodb") - cur.execute('CHECKPOINT') + cur.execute("CHECKPOINT") - lsn_after_drop = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') + lsn_after_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create two branches before and after database drop. 
- env.neon_cli.create_branch('test_before_dropdb', - 'test_dropdb', - ancestor_start_lsn=lsn_before_drop) - pg_before = env.postgres.create_start('test_before_dropdb') + env.neon_cli.create_branch( + "test_before_dropdb", "test_dropdb", ancestor_start_lsn=lsn_before_drop + ) + pg_before = env.postgres.create_start("test_before_dropdb") - env.neon_cli.create_branch('test_after_dropdb', - 'test_dropdb', - ancestor_start_lsn=lsn_after_drop) - pg_after = env.postgres.create_start('test_after_dropdb') + env.neon_cli.create_branch( + "test_after_dropdb", "test_dropdb", ancestor_start_lsn=lsn_after_drop + ) + pg_after = env.postgres.create_start("test_after_dropdb") # Test that database exists on the branch before drop - pg_before.connect(dbname='foodb').close() + pg_before.connect(dbname="foodb").close() # Test that database subdir exists on the branch before drop assert pg_before.pgdata_dir - dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid) + dbpath = pathlib.Path(pg_before.pgdata_dir) / "base" / str(dboid) log.info(dbpath) - assert os.path.isdir(dbpath) == True + assert os.path.isdir(dbpath) is True # Test that database subdir doesn't exist on the branch after drop assert pg_after.pgdata_dir - dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid) + dbpath = pathlib.Path(pg_after.pgdata_dir) / "base" / str(dboid) log.info(dbpath) - assert os.path.isdir(dbpath) == False + assert os.path.isdir(dbpath) is False # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py new file mode 100644 index 0000000000..c5f8246f5b --- /dev/null +++ b/test_runner/regress/test_createuser.py @@ -0,0 +1,28 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import query_scalar + + +# +# Test CREATE USER to check shared catalog restore +# +def 
test_createuser(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_createuser", "empty") + pg = env.postgres.create_start("test_createuser") + log.info("postgres is running on 'test_createuser' branch") + + with pg.cursor() as cur: + # Cause a 'relmapper' change in the original branch + cur.execute("CREATE USER testuser with password %s", ("testpwd",)) + + cur.execute("CHECKPOINT") + + lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") + + # Create a branch + env.neon_cli.create_branch("test_createuser2", "test_createuser", ancestor_start_lsn=lsn) + pg2 = env.postgres.create_start("test_createuser2") + + # Test that you can connect to new branch as a new user + assert pg2.safe_psql("select current_user", user="testuser") == [("testuser",)] diff --git a/test_runner/regress/test_fsm_truncate.py b/test_runner/regress/test_fsm_truncate.py new file mode 100644 index 0000000000..4551ff97e0 --- /dev/null +++ b/test_runner/regress/test_fsm_truncate.py @@ -0,0 +1,10 @@ +from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_fsm_truncate(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_fsm_truncate") + pg = env.postgres.create_start("test_fsm_truncate") + pg.safe_psql( + "CREATE TABLE t1(key int); CREATE TABLE t2(key int); TRUNCATE TABLE t1; TRUNCATE TABLE t2;" + ) diff --git a/test_runner/batch_others/test_fullbackup.py b/test_runner/regress/test_fullbackup.py similarity index 61% rename from test_runner/batch_others/test_fullbackup.py rename to test_runner/regress/test_fullbackup.py index bce085c157..8155f52060 100644 --- a/test_runner/batch_others/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -1,22 +1,28 @@ -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres -from fixtures.neon_fixtures import pg_distrib_dir import os + +from fixtures.log_helper import log +from 
fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + PortDistributor, + VanillaPostgres, + pg_distrib_dir, +) from fixtures.utils import query_scalar, subprocess_capture num_rows = 1000 # Ensure that regular postgres can start from fullbackup -def test_fullbackup(neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - port_distributor: PortDistributor): +def test_fullbackup( + neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor +): neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_fullbackup') - pgmain = env.postgres.create_start('test_fullbackup') + env.neon_cli.create_branch("test_fullbackup") + pgmain = env.postgres.create_start("test_fullbackup") log.info("postgres is running on 'test_fullbackup' branch") with pgmain.cursor() as cur: @@ -24,16 +30,18 @@ def test_fullbackup(neon_env_builder: NeonEnvBuilder, # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") - cur.execute(f'''CREATE TABLE tbl AS SELECT 'long string to consume some space' || g - from generate_series(1,{num_rows}) g''') + cur.execute( + f"""CREATE TABLE tbl AS SELECT 'long string to consume some space' || g + from generate_series(1,{num_rows}) g""" + ) cur.execute("CHECKPOINT") - lsn = query_scalar(cur, 'SELECT pg_current_wal_insert_lsn()') + lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") log.info(f"start_backup_lsn = {lsn}") # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. 
- psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')} + psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} # Get and unpack fullbackup from pageserver restored_dir_path = env.repo_dir / "restored_datadir" @@ -42,13 +50,14 @@ def test_fullbackup(neon_env_builder: NeonEnvBuilder, cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] result_basepath = pg_bin.run_capture(cmd, env=psql_env) tar_output_file = result_basepath + ".stdout" - subprocess_capture(str(env.repo_dir), - ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)]) + subprocess_capture( + str(env.repo_dir), ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)] + ) # HACK # fullbackup returns neon specific pg_control and first WAL segment # use resetwal to overwrite it - pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, 'pg_resetwal') + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") cmd = [pg_resetwal_path, "-D", str(restored_dir_path)] pg_bin.run_capture(cmd, env=psql_env) @@ -56,9 +65,11 @@ def test_fullbackup(neon_env_builder: NeonEnvBuilder, port = port_distributor.get_port() with VanillaPostgres(restored_dir_path, pg_bin, port, init=False) as vanilla_pg: # TODO make port an optional argument - vanilla_pg.configure([ - f"port={port}", - ]) + vanilla_pg.configure( + [ + f"port={port}", + ] + ) vanilla_pg.start() - num_rows_found = vanilla_pg.safe_psql('select count(*) from tbl;', user="cloud_admin")[0][0] + num_rows_found = vanilla_pg.safe_psql("select count(*) from tbl;", user="cloud_admin")[0][0] assert num_rows == num_rows_found diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py similarity index 83% rename from test_runner/batch_others/test_gc_aggressive.py rename to test_runner/regress/test_gc_aggressive.py index d7f6308182..90824f882a 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -1,8 
+1,8 @@ import asyncio import random -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres from fixtures.utils import query_scalar # Test configuration @@ -24,7 +24,7 @@ async def update_table(pg: Postgres): while updates_performed < updates_to_perform: updates_performed += 1 id = random.randrange(1, num_rows) - row = await pg_conn.fetchrow(f'UPDATE foo SET counter = counter + 1 WHERE id = {id}') + await pg_conn.fetchrow(f"UPDATE foo SET counter = counter + 1 WHERE id = {id}") # Perform aggressive GC with 0 horizon @@ -57,24 +57,26 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() env.neon_cli.create_branch("test_gc_aggressive", "main") - pg = env.postgres.create_start('test_gc_aggressive') - log.info('postgres is running on test_gc_aggressive branch') + pg = env.postgres.create_start("test_gc_aggressive") + log.info("postgres is running on test_gc_aggressive branch") with pg.cursor() as cur: timeline = query_scalar(cur, "SHOW neon.timeline_id") # Create table, and insert the first 100 rows - cur.execute('CREATE TABLE foo (id int, counter int, t text)') - cur.execute(f''' + cur.execute("CREATE TABLE foo (id int, counter int, t text)") + cur.execute( + f""" INSERT INTO foo SELECT g, 0, 'long string to consume some space' || g FROM generate_series(1, {num_rows}) g - ''') - cur.execute('CREATE INDEX ON foo(id)') + """ + ) + cur.execute("CREATE INDEX ON foo(id)") asyncio.run(update_and_gc(env, pg, timeline)) - cur.execute('SELECT COUNT(*), SUM(counter) FROM foo') + cur.execute("SELECT COUNT(*), SUM(counter) FROM foo") r = cur.fetchone() assert r is not None assert r == (num_rows, updates_to_perform) diff --git a/test_runner/batch_others/test_import.py b/test_runner/regress/test_import.py similarity index 74% rename 
from test_runner/batch_others/test_import.py rename to test_runner/regress/test_import.py index 039945e5e4..a2671727f7 100644 --- a/test_runner/batch_others/test_import.py +++ b/test_runner/regress/test_import.py @@ -1,17 +1,24 @@ -import re -import pytest -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, Postgres, wait_for_upload, wait_for_last_record_lsn -from fixtures.utils import lsn_from_hex -from uuid import UUID, uuid4 -import os -import tarfile -import shutil -from pathlib import Path import json -from fixtures.utils import subprocess_capture -from fixtures.log_helper import log +import os +import re +import shutil +import tarfile from contextlib import closing -from fixtures.neon_fixtures import pg_distrib_dir +from pathlib import Path +from uuid import UUID, uuid4 + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + Postgres, + pg_distrib_dir, + wait_for_last_record_lsn, + wait_for_upload, +) +from fixtures.utils import lsn_from_hex, subprocess_capture @pytest.mark.timeout(600) @@ -19,9 +26,11 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build # Put data in vanilla pg vanilla_pg.start() vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") - vanilla_pg.safe_psql('''create table t as select 'long string to consume some space' || g - from generate_series(1,300000) g''') - assert vanilla_pg.safe_psql('select count(*) from t') == [(300000, )] + vanilla_pg.safe_psql( + """create table t as select 'long string to consume some space' || g + from generate_series(1,300000) g""" + ) + assert vanilla_pg.safe_psql("select count(*) from t") == [(300000,)] # Take basebackup basebackup_dir = os.path.join(test_output_dir, "basebackup") @@ -29,15 +38,17 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build wal_tar = os.path.join(basebackup_dir, "pg_wal.tar") os.mkdir(basebackup_dir) 
vanilla_pg.safe_psql("CHECKPOINT") - pg_bin.run([ - "pg_basebackup", - "-F", - "tar", - "-d", - vanilla_pg.connstr(), - "-D", - basebackup_dir, - ]) + pg_bin.run( + [ + "pg_basebackup", + "-F", + "tar", + "-d", + vanilla_pg.connstr(), + "-D", + basebackup_dir, + ] + ) # Make corrupt base tar with missing pg_control unpacked_base = os.path.join(basebackup_dir, "unpacked-base") @@ -45,9 +56,11 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build os.mkdir(unpacked_base, 0o750) subprocess_capture(str(test_output_dir), ["tar", "-xf", base_tar, "-C", unpacked_base]) os.remove(os.path.join(unpacked_base, "global/pg_control")) - subprocess_capture(str(test_output_dir), - ["tar", "-cf", "corrupt-base.tar"] + os.listdir(unpacked_base), - cwd=unpacked_base) + subprocess_capture( + str(test_output_dir), + ["tar", "-cf", "corrupt-base.tar"] + os.listdir(unpacked_base), + cwd=unpacked_base, + ) # Get start_lsn and end_lsn with open(os.path.join(basebackup_dir, "backup_manifest")) as f: @@ -65,24 +78,26 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build env.pageserver.http_client().tenant_create(tenant) def import_tar(base, wal): - env.neon_cli.raw_cli([ - "timeline", - "import", - "--tenant-id", - tenant.hex, - "--timeline-id", - timeline.hex, - "--node-name", - node_name, - "--base-lsn", - start_lsn, - "--base-tarfile", - base, - "--end-lsn", - end_lsn, - "--wal-tarfile", - wal, - ]) + env.neon_cli.raw_cli( + [ + "timeline", + "import", + "--tenant-id", + tenant.hex, + "--timeline-id", + timeline.hex, + "--node-name", + node_name, + "--base-lsn", + start_lsn, + "--base-tarfile", + base, + "--end-lsn", + end_lsn, + "--wal-tarfile", + wal, + ] + ) # Importing corrupt backup fails with pytest.raises(Exception): @@ -102,7 +117,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build # Check it worked pg = env.postgres.create_start(node_name, tenant_id=tenant) - assert pg.safe_psql('select 
count(*) from t') == [(300000, )] + assert pg.safe_psql("select count(*) from t") == [(300000,)] @pytest.mark.timeout(600) @@ -111,8 +126,8 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() - timeline = env.neon_cli.create_branch('test_import_from_pageserver_small') - pg = env.postgres.create_start('test_import_from_pageserver_small') + timeline = env.neon_cli.create_branch("test_import_from_pageserver_small") + pg = env.postgres.create_start("test_import_from_pageserver_small") num_rows = 3000 lsn = _generate_data(num_rows, pg) @@ -129,8 +144,8 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() - timeline = env.neon_cli.create_branch('test_import_from_pageserver_multisegment') - pg = env.postgres.create_start('test_import_from_pageserver_multisegment') + timeline = env.neon_cli.create_branch("test_import_from_pageserver_multisegment") + pg = env.postgres.create_start("test_import_from_pageserver_multisegment") # For `test_import_from_pageserver_multisegment`, we want to make sure that the data # is large enough to create multi-segment files. 
Typically, a segment file's size is @@ -139,8 +154,9 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne num_rows = 30000000 lsn = _generate_data(num_rows, pg) - logical_size = env.pageserver.http_client().timeline_detail( - env.initial_tenant, timeline)['local']['current_logical_size'] + logical_size = env.pageserver.http_client().timeline_detail(env.initial_tenant, timeline)[ + "local" + ]["current_logical_size"] log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB") assert logical_size > 1024**3 # = 1GB @@ -148,7 +164,7 @@ def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: Ne # Check if the backup data contains multiple segment files cnt_seg_files = 0 - segfile_re = re.compile('[0-9]+\\.[0-9]+') + segfile_re = re.compile("[0-9]+\\.[0-9]+") with tarfile.open(tar_output_file, "r") as tar_f: for f in tar_f.getnames(): if segfile_re.search(f) is not None: @@ -166,11 +182,13 @@ def _generate_data(num_rows: int, pg: Postgres) -> str: with conn.cursor() as cur: # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") - cur.execute(f'''CREATE TABLE tbl AS SELECT 'long string to consume some space' || g - from generate_series(1,{num_rows}) g''') + cur.execute( + f"""CREATE TABLE tbl AS SELECT 'long string to consume some space' || g + from generate_series(1,{num_rows}) g""" + ) cur.execute("CHECKPOINT") - cur.execute('SELECT pg_current_wal_insert_lsn()') + cur.execute("SELECT pg_current_wal_insert_lsn()") res = cur.fetchone() assert res is not None and isinstance(res[0], str) return res[0] @@ -189,7 +207,7 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. 
- psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')} + psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} # Get a fullbackup from pageserver query = f"fullbackup { env.initial_tenant.hex} {timeline.hex} {lsn}" @@ -201,11 +219,11 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel env.postgres.stop_all() env.pageserver.stop() - dir_to_clear = Path(env.repo_dir) / 'tenants' + dir_to_clear = Path(env.repo_dir) / "tenants" shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) - #start the pageserver again + # start the pageserver again env.pageserver.start() # Import using another tenantid, because we use the same pageserver. @@ -216,20 +234,22 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel node_name = "import_from_pageserver" client = env.pageserver.http_client() client.tenant_create(tenant) - env.neon_cli.raw_cli([ - "timeline", - "import", - "--tenant-id", - tenant.hex, - "--timeline-id", - timeline.hex, - "--node-name", - node_name, - "--base-lsn", - lsn, - "--base-tarfile", - os.path.join(tar_output_file), - ]) + env.neon_cli.raw_cli( + [ + "timeline", + "import", + "--tenant-id", + tenant.hex, + "--timeline-id", + timeline.hex, + "--node-name", + node_name, + "--base-lsn", + lsn, + "--base-tarfile", + os.path.join(tar_output_file), + ] + ) # Wait for data to land in s3 wait_for_last_record_lsn(client, tenant, timeline, lsn_from_hex(lsn)) @@ -237,7 +257,7 @@ def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timel # Check it worked pg = env.postgres.create_start(node_name, tenant_id=tenant) - assert pg.safe_psql('select count(*) from tbl') == [(expected_num_rows, )] + assert pg.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] # Take another fullbackup query = f"fullbackup { tenant.hex} {timeline.hex} {lsn}" diff --git a/test_runner/batch_others/test_large_schema.py b/test_runner/regress/test_large_schema.py similarity 
index 88% rename from test_runner/batch_others/test_large_schema.py rename to test_runner/regress/test_large_schema.py index 18ae0614a9..f14265f6fd 100644 --- a/test_runner/batch_others/test_large_schema.py +++ b/test_runner/regress/test_large_schema.py @@ -1,7 +1,8 @@ -import time import os -from fixtures.neon_fixtures import NeonEnvBuilder +import time + from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder # This test creates large number of tables which cause large catalog. @@ -14,7 +15,7 @@ from fixtures.log_helper import log def test_large_schema(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - pg = env.postgres.create_start('main') + pg = env.postgres.create_start("main") conn = pg.connect() cur = conn.cursor() @@ -22,7 +23,7 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): tables = 2 # 10 is too much for debug build partitions = 1000 for i in range(1, tables + 1): - print(f'iteration {i} / {tables}') + print(f"iteration {i} / {tables}") # Restart compute. Restart is actually not strictly needed. # It is done mostly because this test originally tries to model the problem reported by Ketteq. @@ -52,10 +53,10 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): # It's normal that it takes some time for the pageserver to # restart, and for the connection to fail until it does. It # should eventually recover, so retry until it succeeds. 
- print(f'failed: {error}') + print(f"failed: {error}") if retries < max_retries: retries += 1 - print(f'retry {retries} / {max_retries}') + print(f"retry {retries} / {max_retries}") time.sleep(retry_sleep) continue else: @@ -67,7 +68,7 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): for i in range(1, tables + 1): cur.execute(f"SELECT count(*) FROM t_{i}") - assert cur.fetchone() == (partitions, ) + assert cur.fetchone() == (partitions,) cur.execute("set enable_sort=off") cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid") @@ -77,6 +78,6 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant_id, timeline_id) for filename in os.listdir(timeline_path): - if filename.startswith('00000'): - log.info(f'layer {filename} size is {os.path.getsize(timeline_path + filename)}') + if filename.startswith("00000"): + log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}") assert os.path.getsize(timeline_path + filename) < 512_000_000 diff --git a/test_runner/batch_others/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py similarity index 72% rename from test_runner/batch_others/test_lsn_mapping.py rename to test_runner/regress/test_lsn_mapping.py index d8b207135e..0c1d3648f2 100644 --- a/test_runner/batch_others/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -1,13 +1,7 @@ -from contextlib import closing -from datetime import timedelta, timezone, tzinfo -import math -from uuid import UUID -import psycopg2.extras -import psycopg2.errors -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres -from fixtures.log_helper import log -import time +from datetime import timedelta +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import query_scalar @@ -18,7 +12,7 @@ def 
test_lsn_mapping(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() - new_timeline_id = env.neon_cli.create_branch('test_lsn_mapping') + new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping") pgmain = env.postgres.create_start("test_lsn_mapping") log.info("postgres is running on 'test_lsn_mapping' branch") @@ -35,7 +29,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): for i in range(1000): cur.execute(f"INSERT INTO foo VALUES({i})") # Get the timestamp at UTC - after_timestamp = query_scalar(cur, 'SELECT clock_timestamp()').replace(tzinfo=None) + after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None) tbl.append([i, after_timestamp]) # Execute one more transaction with synchronous_commit enabled, to flush @@ -47,17 +41,17 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): probe_timestamp = tbl[-1][1] + timedelta(hours=1) result = query_scalar( ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" + f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'", ) - assert result == 'future' + assert result == "future" # timestamp too the far history probe_timestamp = tbl[0][1] - timedelta(hours=10) result = query_scalar( ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" + f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'", ) - assert result == 'past' + assert result == "past" # Probe a bunch of timestamps in the valid range for i in range(1, len(tbl), 100): @@ -66,14 +60,14 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Call get_lsn_by_timestamp to get the LSN lsn = query_scalar( ps_cur, - f"get_lsn_by_timestamp {env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'" + f"get_lsn_by_timestamp 
{env.initial_tenant.hex} {new_timeline_id.hex} '{probe_timestamp.isoformat()}Z'", ) # Launch a new read-only node at that LSN, and check that only the rows # that were supposed to be committed at that point in time are visible. - pg_here = env.postgres.create_start(branch_name='test_lsn_mapping', - node_name='test_lsn_mapping_read', - lsn=lsn) + pg_here = env.postgres.create_start( + branch_name="test_lsn_mapping", node_name="test_lsn_mapping_read", lsn=lsn + ) assert pg_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i pg_here.stop_and_destroy() diff --git a/test_runner/batch_others/test_multixact.py b/test_runner/regress/test_multixact.py similarity index 75% rename from test_runner/batch_others/test_multixact.py rename to test_runner/regress/test_multixact.py index dd00066092..635beb16b7 100644 --- a/test_runner/batch_others/test_multixact.py +++ b/test_runner/regress/test_multixact.py @@ -1,5 +1,5 @@ -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.utils import query_scalar @@ -11,18 +11,21 @@ from fixtures.utils import query_scalar # def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env - env.neon_cli.create_branch('test_multixact', 'empty') - pg = env.postgres.create_start('test_multixact') + env.neon_cli.create_branch("test_multixact", "empty") + pg = env.postgres.create_start("test_multixact") log.info("postgres is running on 'test_multixact' branch") cur = pg.connect().cursor() - cur.execute(''' + cur.execute( + """ CREATE TABLE t1(i int primary key); INSERT INTO t1 select * from generate_series(1, 100); - ''') + """ + ) - next_multixact_id_old = query_scalar(cur, - 'SELECT next_multixact_id FROM pg_control_checkpoint()') + next_multixact_id_old = query_scalar( + cur, "SELECT next_multixact_id FROM pg_control_checkpoint()" + ) # Lock entries using parallel connections in a 
round-robin fashion. nclients = 20 @@ -40,17 +43,18 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): for i in range(5000): conn = connections[i % nclients] conn.commit() - conn.cursor().execute('select * from t1 for key share') + conn.cursor().execute("select * from t1 for key share") # We have multixacts now. We can close the connections. for c in connections: c.close() # force wal flush - cur.execute('checkpoint') + cur.execute("checkpoint") cur.execute( - 'SELECT next_multixact_id, pg_current_wal_insert_lsn() FROM pg_control_checkpoint()') + "SELECT next_multixact_id, pg_current_wal_insert_lsn() FROM pg_control_checkpoint()" + ) res = cur.fetchone() assert res is not None next_multixact_id = res[0] @@ -60,12 +64,13 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - env.neon_cli.create_branch('test_multixact_new', 'test_multixact', ancestor_start_lsn=lsn) - pg_new = env.postgres.create_start('test_multixact_new') + env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn) + pg_new = env.postgres.create_start("test_multixact_new") log.info("postgres is running on 'test_multixact_new' branch") next_multixact_id_new = pg_new.safe_psql( - 'SELECT next_multixact_id FROM pg_control_checkpoint()')[0][0] + "SELECT next_multixact_id FROM pg_control_checkpoint()" + )[0][0] # Check that we restored pg_controlfile correctly assert next_multixact_id_new == next_multixact_id diff --git a/test_runner/batch_others/test_neon_cli.py b/test_runner/regress/test_neon_cli.py similarity index 82% rename from test_runner/batch_others/test_neon_cli.py rename to test_runner/regress/test_neon_cli.py index 728bc7b894..1acfa72127 100644 --- a/test_runner/batch_others/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -1,21 +1,29 @@ import uuid -import requests - -from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnv, 
NeonEnvBuilder, NeonPageserverHttpClient from typing import cast +import requests +from fixtures.neon_fixtures import ( + DEFAULT_BRANCH_NAME, + NeonEnv, + NeonEnvBuilder, + NeonPageserverHttpClient, +) -def helper_compare_timeline_list(pageserver_http_client: NeonPageserverHttpClient, - env: NeonEnv, - initial_tenant: uuid.UUID): + +def helper_compare_timeline_list( + pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv, initial_tenant: uuid.UUID +): """ Compare timelines list returned by CLI and directly via API. Filters out timelines created by other tests. """ timelines_api = sorted( - map(lambda t: cast(str, t['timeline_id']), - pageserver_http_client.timeline_list(initial_tenant))) + map( + lambda t: cast(str, t["timeline_id"]), + pageserver_http_client.timeline_list(initial_tenant), + ) + ) timelines_cli = env.neon_cli.list_timelines() assert timelines_cli == env.neon_cli.list_timelines(initial_tenant) @@ -32,12 +40,13 @@ def test_cli_timeline_list(neon_simple_env: NeonEnv): helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a branch for us - main_timeline_id = env.neon_cli.create_branch('test_cli_branch_list_main') + main_timeline_id = env.neon_cli.create_branch("test_cli_branch_list_main") helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a nested branch - nested_timeline_id = env.neon_cli.create_branch('test_cli_branch_list_nested', - 'test_cli_branch_list_main') + nested_timeline_id = env.neon_cli.create_branch( + "test_cli_branch_list_nested", "test_cli_branch_list_main" + ) helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Check that all new branches are visible via CLI @@ -49,7 +58,7 @@ def test_cli_timeline_list(neon_simple_env: NeonEnv): def helper_compare_tenant_list(pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv): tenants = pageserver_http_client.tenant_list() - tenants_api = sorted(map(lambda t: cast(str, t['id']), 
tenants)) + tenants_api = sorted(map(lambda t: cast(str, t["id"]), tenants)) res = env.neon_cli.list_tenants() tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) @@ -97,7 +106,7 @@ def test_cli_ipv4_listeners(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() # Connect to sk port on v4 loopback - res = requests.get(f'http://127.0.0.1:{env.safekeepers[0].port.http}/v1/status') + res = requests.get(f"http://127.0.0.1:{env.safekeepers[0].port.http}/v1/status") assert res.ok # FIXME Test setup is using localhost:xx in ps config. diff --git a/test_runner/batch_others/test_next_xid.py b/test_runner/regress/test_next_xid.py similarity index 82% rename from test_runner/batch_others/test_next_xid.py rename to test_runner/regress/test_next_xid.py index f8d11a9381..698ea0e1d3 100644 --- a/test_runner/batch_others/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -8,15 +8,15 @@ from fixtures.neon_fixtures import NeonEnvBuilder def test_next_xid(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - pg = env.postgres.create_start('main') + pg = env.postgres.create_start("main") conn = pg.connect() cur = conn.cursor() - cur.execute('CREATE TABLE t(x integer)') + cur.execute("CREATE TABLE t(x integer)") iterations = 32 for i in range(1, iterations + 1): - print(f'iteration {i} / {iterations}') + print(f"iteration {i} / {iterations}") # Kill and restart the pageserver. pg.stop() @@ -38,10 +38,10 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder): # It's normal that it takes some time for the pageserver to # restart, and for the connection to fail until it does. It # should eventually recover, so retry until it succeeds. 
- print(f'failed: {error}') + print(f"failed: {error}") if retries < max_retries: retries += 1 - print(f'retry {retries} / {max_retries}') + print(f"retry {retries} / {max_retries}") time.sleep(retry_sleep) continue else: @@ -51,4 +51,4 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder): conn = pg.connect() cur = conn.cursor() cur.execute("SELECT count(*) FROM t") - assert cur.fetchone() == (iterations, ) + assert cur.fetchone() == (iterations,) diff --git a/test_runner/batch_others/test_normal_work.py b/test_runner/regress/test_normal_work.py similarity index 69% rename from test_runner/batch_others/test_normal_work.py rename to test_runner/regress/test_normal_work.py index 5b25691517..002d697288 100644 --- a/test_runner/batch_others/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -1,33 +1,35 @@ +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient -import pytest def check_tenant(env: NeonEnv, pageserver_http: NeonPageserverHttpClient): tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start('main', tenant_id=tenant_id) + pg = env.postgres.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement - res_1 = pg.safe_psql_many(queries=[ - 'CREATE TABLE t(key int primary key, value text)', - 'INSERT INTO t SELECT generate_series(1,100000), \'payload\'', - 'SELECT sum(key) FROM t', - ]) + res_1 = pg.safe_psql_many( + queries=[ + "CREATE TABLE t(key int primary key, value text)", + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + "SELECT sum(key) FROM t", + ] + ) - assert res_1[-1][0] == (5000050000, ) + assert res_1[-1][0] == (5000050000,) # TODO check detach on live instance log.info("stopping compute") pg.stop() log.info("compute stopped") pg.start() - res_2 = pg.safe_psql('SELECT sum(key) FROM t') - assert res_2[0] == (5000050000, ) + res_2 = pg.safe_psql("SELECT sum(key) FROM t") + 
assert res_2[0] == (5000050000,) pg.stop() pageserver_http.tenant_detach(tenant_id) -@pytest.mark.parametrize('num_timelines,num_safekeepers', [(3, 1)]) +@pytest.mark.parametrize("num_timelines,num_safekeepers", [(3, 1)]) def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int): """ Basic test: diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py similarity index 85% rename from test_runner/batch_others/test_old_request_lsn.py rename to test_runner/regress/test_old_request_lsn.py index 78a936af19..257913ef3f 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,7 +1,7 @@ -from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.log_helper import log -from fixtures.utils import print_gc_result, query_scalar import psycopg2.extras +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import print_gc_result, query_scalar # @@ -19,8 +19,8 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() env.neon_cli.create_branch("test_old_request_lsn", "main") - pg = env.postgres.create_start('test_old_request_lsn') - log.info('postgres is running on test_old_request_lsn branch') + pg = env.postgres.create_start("test_old_request_lsn") + log.info("postgres is running on test_old_request_lsn branch") pg_conn = pg.connect() cur = pg_conn.cursor() @@ -33,25 +33,29 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers. 
- cur.execute('CREATE TABLE foo (id int4 PRIMARY KEY, val int, t text)') - cur.execute(''' + cur.execute("CREATE TABLE foo (id int4 PRIMARY KEY, val int, t text)") + cur.execute( + """ INSERT INTO foo SELECT g, 1, 'long string to consume some space' || g FROM generate_series(1, 100000) g - ''') + """ + ) # Verify that the table is larger than shared_buffers, so that the SELECT below # will cause GetPage requests. - cur.execute(''' + cur.execute( + """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize from pg_settings where name = 'shared_buffers' - ''') + """ + ) row = cur.fetchone() assert row is not None - log.info(f'shared_buffers is {row[0]}, table size {row[1]}') + log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) - cur.execute('VACUUM foo') + cur.execute("VACUUM foo") # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. @@ -61,7 +65,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): print_gc_result(row) for j in range(100): - cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;') + cur.execute("UPDATE foo SET val = val + 1 WHERE id = 1;") # All (or at least most of) the updates should've been on the same page, so # that we haven't had to evict any dirty pages for a long time. 
Now run diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py new file mode 100644 index 0000000000..869f53ac0a --- /dev/null +++ b/test_runner/regress/test_pageserver_api.py @@ -0,0 +1,192 @@ +import pathlib +import subprocess +from typing import Optional +from uuid import UUID, uuid4 + +from fixtures.neon_fixtures import ( + DEFAULT_BRANCH_NAME, + NeonEnv, + NeonEnvBuilder, + NeonPageserverHttpClient, + neon_binpath, + pg_distrib_dir, + wait_until, +) +from fixtures.utils import lsn_from_hex + + +# test that we cannot override node id after init +def test_pageserver_init_node_id(neon_simple_env: NeonEnv): + repo_dir = neon_simple_env.repo_dir + pageserver_config = repo_dir / "pageserver.toml" + pageserver_bin = pathlib.Path(neon_binpath) / "pageserver" + + def run_pageserver(args): + return subprocess.run( + [str(pageserver_bin), "-D", str(repo_dir), *args], + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + # remove initial config + pageserver_config.unlink() + + bad_init = run_pageserver(["--init", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']) + assert ( + bad_init.returncode == 1 + ), "pageserver should not be able to init new config without the node id" + assert "missing id" in bad_init.stderr + assert not pageserver_config.exists(), "config file should not be created after init error" + + completed_init = run_pageserver( + ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + ) + assert ( + completed_init.returncode == 0 + ), "pageserver should be able to create a new config with the node id given" + assert pageserver_config.exists(), "config file should be created successfully" + + bad_reinit = run_pageserver( + ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + ) + assert ( + bad_reinit.returncode == 1 + ), "pageserver should not be able to init new config without the node id" + assert "already exists, cannot 
init it" in bad_reinit.stderr + + bad_update = run_pageserver(["--update-config", "-c", "id = 3"]) + assert bad_update.returncode == 1, "pageserver should not allow updating node id" + assert "has node id already, it cannot be overridden" in bad_update.stderr + + +def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): + client.check_status() + + # check initial tenant is there + assert initial_tenant.hex in {t["id"] for t in client.tenant_list()} + + # create new tenant and check it is also there + tenant_id = uuid4() + client.tenant_create(tenant_id) + assert tenant_id.hex in {t["id"] for t in client.tenant_list()} + + timelines = client.timeline_list(tenant_id) + assert len(timelines) == 0, "initial tenant should not have any timelines" + + # create timeline + timeline_id = uuid4() + client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id) + + timelines = client.timeline_list(tenant_id) + assert len(timelines) > 0 + + # check it is there + assert timeline_id.hex in {b["timeline_id"] for b in client.timeline_list(tenant_id)} + for timeline in timelines: + timeline_id_str = str(timeline["timeline_id"]) + timeline_details = client.timeline_detail( + tenant_id=tenant_id, + timeline_id=UUID(timeline_id_str), + include_non_incremental_logical_size=True, + ) + + assert timeline_details["tenant_id"] == tenant_id.hex + assert timeline_details["timeline_id"] == timeline_id_str + + local_timeline_details = timeline_details.get("local") + assert local_timeline_details is not None + assert local_timeline_details["timeline_state"] == "Loaded" + + +def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): + env = neon_simple_env + with env.pageserver.http_client() as client: + tenant_id, timeline_id = env.neon_cli.create_tenant() + + timeline_details = client.timeline_detail( + tenant_id=tenant_id, timeline_id=timeline_id, include_non_incremental_logical_size=True + ) + + assert ( + 
timeline_details.get("wal_source_connstr") is None + ), "Should not be able to connect to WAL streaming without PG compute node running" + assert ( + timeline_details.get("last_received_msg_lsn") is None + ), "Should not be able to connect to WAL streaming without PG compute node running" + assert ( + timeline_details.get("last_received_msg_ts") is None + ), "Should not be able to connect to WAL streaming without PG compute node running" + + +def expect_updated_msg_lsn( + client: NeonPageserverHttpClient, + tenant_id: UUID, + timeline_id: UUID, + prev_msg_lsn: Optional[int], +) -> int: + timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) + + # a successful `timeline_details` response must contain the below fields + local_timeline_details = timeline_details["local"] + assert "wal_source_connstr" in local_timeline_details.keys() + assert "last_received_msg_lsn" in local_timeline_details.keys() + assert "last_received_msg_ts" in local_timeline_details.keys() + + assert ( + local_timeline_details["last_received_msg_lsn"] is not None + ), "the last received message's LSN is empty" + + last_msg_lsn = lsn_from_hex(local_timeline_details["last_received_msg_lsn"]) + assert ( + prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn + ), f"the last received message's LSN {last_msg_lsn} hasn't been updated \ + compared to the previous message's LSN {prev_msg_lsn}" + + return last_msg_lsn + + +# Test the WAL-receiver related fields in the response to `timeline_details` API call +# +# These fields used to be returned by a separate API call, but they're part of +# `timeline_details` now. +def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): + env = neon_simple_env + with env.pageserver.http_client() as client: + tenant_id, timeline_id = env.neon_cli.create_tenant() + pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) + + # Wait to make sure that we get a latest WAL receiver data. 
+ # We need to wait here because it's possible that we don't have access to + # the latest WAL yet, when the `timeline_detail` API is first called. + # See: https://github.com/neondatabase/neon/issues/1768. + lsn = wait_until( + number_of_iterations=5, + interval=1, + func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None), + ) + + # Make a DB modification then expect getting a new WAL receiver's data. + pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn), + ) + + +def test_pageserver_http_api_client(neon_simple_env: NeonEnv): + env = neon_simple_env + with env.pageserver.http_client() as client: + check_client(client, env.initial_tenant) + + +def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + + management_token = env.auth_keys.generate_management_token() + + with env.pageserver.http_client(auth_token=management_token) as client: + check_client(client, env.initial_tenant) diff --git a/test_runner/batch_others/test_pageserver_catchup.py b/test_runner/regress/test_pageserver_catchup.py similarity index 78% rename from test_runner/batch_others/test_pageserver_catchup.py rename to test_runner/regress/test_pageserver_catchup.py index dd24351e17..cba3203591 100644 --- a/test_runner/batch_others/test_pageserver_catchup.py +++ b/test_runner/regress/test_pageserver_catchup.py @@ -9,24 +9,27 @@ def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder) neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_pageserver_catchup_while_compute_down') + env.neon_cli.create_branch("test_pageserver_catchup_while_compute_down") # Make shared_buffers large to ensure we won't query pageserver while it is down. 
- pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down', - config_lines=['shared_buffers=512MB']) + pg = env.postgres.create_start( + "test_pageserver_catchup_while_compute_down", config_lines=["shared_buffers=512MB"] + ) pg_conn = pg.connect() cur = pg_conn.cursor() # Create table, and insert some rows. - cur.execute('CREATE TABLE foo (t text)') - cur.execute(''' + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 10000) g - ''') + """ + ) cur.execute("SELECT count(*) FROM foo") - assert cur.fetchone() == (10000, ) + assert cur.fetchone() == (10000,) # Stop and restart pageserver. This is a more or less graceful shutdown, although # the page server doesn't currently have a shutdown routine so there's no difference @@ -35,11 +38,13 @@ def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder) # insert some more rows # since pageserver is shut down, these will be only on safekeepers - cur.execute(''' + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 10000) g - ''') + """ + ) # stop safekeepers gracefully env.safekeepers[0].stop() @@ -54,11 +59,11 @@ def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder) env.safekeepers[2].start() # restart compute node - pg.stop_and_destroy().create_start('test_pageserver_catchup_while_compute_down') + pg.stop_and_destroy().create_start("test_pageserver_catchup_while_compute_down") # Ensure that basebackup went correct and pageserver returned all data pg_conn = pg.connect() cur = pg_conn.cursor() cur.execute("SELECT count(*) FROM foo") - assert cur.fetchone() == (20000, ) + assert cur.fetchone() == (20000,) diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py similarity index 87% rename from test_runner/batch_others/test_pageserver_restart.py 
rename to test_runner/regress/test_pageserver_restart.py index c656469cb7..e2bd8be9b7 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -1,5 +1,5 @@ -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder # Test restarting page server, while safekeeper and compute node keep @@ -7,8 +7,8 @@ from fixtures.log_helper import log def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_pageserver_restart') - pg = env.postgres.create_start('test_pageserver_restart') + env.neon_cli.create_branch("test_pageserver_restart") + pg = env.postgres.create_start("test_pageserver_restart") pg_conn = pg.connect() cur = pg_conn.cursor() @@ -17,18 +17,22 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point # of this test. 
- cur.execute('CREATE TABLE foo (t text)') - cur.execute(''' + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g - ''') + """ + ) # Verify that the table is larger than shared_buffers - cur.execute(''' + cur.execute( + """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize from pg_settings where name = 'shared_buffers' - ''') + """ + ) row = cur.fetchone() assert row is not None log.info(f"shared_buffers is {row[0]}, table size {row[1]}") @@ -49,7 +53,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): cur = pg_conn.cursor() cur.execute("SELECT count(*) FROM foo") - assert cur.fetchone() == (100000, ) + assert cur.fetchone() == (100000,) # Stop the page server by force, and restart it env.pageserver.stop() diff --git a/test_runner/batch_others/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py similarity index 87% rename from test_runner/batch_others/test_parallel_copy.py rename to test_runner/regress/test_parallel_copy.py index 55947fe427..59f19026cc 100644 --- a/test_runner/batch_others/test_parallel_copy.py +++ b/test_runner/regress/test_parallel_copy.py @@ -1,7 +1,8 @@ -from io import BytesIO import asyncio -from fixtures.neon_fixtures import NeonEnv, Postgres +from io import BytesIO + from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, Postgres async def repeat_bytes(buf, repetitions: int): @@ -13,7 +14,8 @@ async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str) buf = BytesIO() for i in range(1000): buf.write( - f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode()) + f"{i}\tLoaded by worker {worker_id}. 
Long string to consume some space.\n".encode() + ) buf.seek(0) copy_input = repeat_bytes(buf.read(), 5000) @@ -30,7 +32,7 @@ async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str) async def parallel_load_same_table(pg: Postgres, n_parallel: int): workers = [] for worker_id in range(n_parallel): - worker = copy_test_data_to_table(pg, worker_id, f'copytest') + worker = copy_test_data_to_table(pg, worker_id, "copytest") workers.append(asyncio.create_task(worker)) # await all workers @@ -41,13 +43,13 @@ async def parallel_load_same_table(pg: Postgres, n_parallel: int): def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): env = neon_simple_env env.neon_cli.create_branch("test_parallel_copy", "empty") - pg = env.postgres.create_start('test_parallel_copy') + pg = env.postgres.create_start("test_parallel_copy") log.info("postgres is running on 'test_parallel_copy' branch") # Create test table conn = pg.connect() cur = conn.cursor() - cur.execute(f'CREATE TABLE copytest (i int, t text)') + cur.execute("CREATE TABLE copytest (i int, t text)") # Run COPY TO to load the table with parallel connections. asyncio.run(parallel_load_same_table(pg, n_parallel)) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py new file mode 100644 index 0000000000..119528b8f9 --- /dev/null +++ b/test_runner/regress/test_pg_regress.py @@ -0,0 +1,159 @@ +# +# This file runs pg_regress-based tests. +# +import os +from pathlib import Path + +import pytest +from fixtures.neon_fixtures import NeonEnv, base_dir, check_restored_datadir_content, pg_distrib_dir + + +# Run the main PostgreSQL regression tests, in src/test/regress. +# +# This runs for a long time, especially in debug mode, so use a larger-than-default +# timeout. 
+@pytest.mark.timeout(1800) +def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): + env = neon_simple_env + + env.neon_cli.create_branch("test_pg_regress", "empty") + # Connect to postgres and create a database called "regression". + pg = env.postgres.create_start("test_pg_regress") + pg.safe_psql("CREATE DATABASE regression") + + # Create some local directories for pg_regress to run in. + runpath = test_output_dir / "regress" + (runpath / "testtablespace").mkdir(parents=True) + + # Compute all the file locations that pg_regress will need. + build_path = os.path.join(pg_distrib_dir, "build/src/test/regress") + src_path = os.path.join(base_dir, "vendor/postgres/src/test/regress") + bindir = os.path.join(pg_distrib_dir, "bin") + schedule = os.path.join(src_path, "parallel_schedule") + pg_regress = os.path.join(build_path, "pg_regress") + + pg_regress_command = [ + pg_regress, + '--bindir=""', + "--use-existing", + "--bindir={}".format(bindir), + "--dlpath={}".format(build_path), + "--schedule={}".format(schedule), + "--inputdir={}".format(src_path), + ] + + env_vars = { + "PGPORT": str(pg.default_options["port"]), + "PGUSER": pg.default_options["user"], + "PGHOST": pg.default_options["host"], + } + + # Run the command. + # We don't capture the output. It's not too chatty, and it always + # logs the exact same data to `regression.out` anyway. + with capsys.disabled(): + pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) + + # checkpoint one more time to ensure that the lsn we get is the latest one + pg.safe_psql("CHECKPOINT") + + # Check that we restore the content of the datadir correctly + check_restored_datadir_content(test_output_dir, env, pg) + + +# Run the PostgreSQL "isolation" tests, in src/test/isolation. +# +# This runs for a long time, especially in debug mode, so use a larger-than-default +# timeout. 
+@pytest.mark.timeout(1800) +def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): + env = neon_simple_env + + env.neon_cli.create_branch("test_isolation", "empty") + # Connect to postgres and create a database called "regression". + # isolation tests use prepared transactions, so enable them + pg = env.postgres.create_start("test_isolation", config_lines=["max_prepared_transactions=100"]) + pg.safe_psql("CREATE DATABASE isolation_regression") + + # Create some local directories for pg_isolation_regress to run in. + runpath = test_output_dir / "regress" + (runpath / "testtablespace").mkdir(parents=True) + + # Compute all the file locations that pg_isolation_regress will need. + build_path = os.path.join(pg_distrib_dir, "build/src/test/isolation") + src_path = os.path.join(base_dir, "vendor/postgres/src/test/isolation") + bindir = os.path.join(pg_distrib_dir, "bin") + schedule = os.path.join(src_path, "isolation_schedule") + pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") + + pg_isolation_regress_command = [ + pg_isolation_regress, + "--use-existing", + "--bindir={}".format(bindir), + "--dlpath={}".format(build_path), + "--inputdir={}".format(src_path), + "--schedule={}".format(schedule), + ] + + env_vars = { + "PGPORT": str(pg.default_options["port"]), + "PGUSER": pg.default_options["user"], + "PGHOST": pg.default_options["host"], + } + + # Run the command. + # We don't capture the output. It's not too chatty, and it always + # logs the exact same data to `regression.out` anyway. + with capsys.disabled(): + pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) + + +# Run extra Neon-specific pg_regress-based tests. The tests and their +# schedule file are in the sql_regress/ directory. 
+def test_sql_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): + env = neon_simple_env + + env.neon_cli.create_branch("test_sql_regress", "empty") + # Connect to postgres and create a database called "regression". + pg = env.postgres.create_start("test_sql_regress") + pg.safe_psql("CREATE DATABASE regression") + + # Create some local directories for pg_regress to run in. + runpath = test_output_dir / "regress" + (runpath / "testtablespace").mkdir(parents=True) + + # Compute all the file locations that pg_regress will need. + # This test runs neon specific tests + build_path = os.path.join(pg_distrib_dir, "build/src/test/regress") + src_path = os.path.join(base_dir, "test_runner/sql_regress") + bindir = os.path.join(pg_distrib_dir, "bin") + schedule = os.path.join(src_path, "parallel_schedule") + pg_regress = os.path.join(build_path, "pg_regress") + + pg_regress_command = [ + pg_regress, + "--use-existing", + "--bindir={}".format(bindir), + "--dlpath={}".format(build_path), + "--schedule={}".format(schedule), + "--inputdir={}".format(src_path), + ] + + env_vars = { + "PGPORT": str(pg.default_options["port"]), + "PGUSER": pg.default_options["user"], + "PGHOST": pg.default_options["host"], + } + + # Run the command. + # We don't capture the output. It's not too chatty, and it always + # logs the exact same data to `regression.out` anyway. 
+ with capsys.disabled(): + pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) + + # checkpoint one more time to ensure that the lsn we get is the latest one + pg.safe_psql("CHECKPOINT") + pg.safe_psql("select pg_current_wal_insert_lsn()")[0][0] + + # Check that we restore the content of the datadir correctly + check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py similarity index 68% rename from test_runner/batch_others/test_pitr_gc.py rename to test_runner/regress/test_pitr_gc.py index d63fc4b584..1fc18ebbc4 100644 --- a/test_runner/batch_others/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -2,8 +2,8 @@ from contextlib import closing import psycopg2.extras from fixtures.log_helper import log -from fixtures.utils import print_gc_result, query_scalar from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import print_gc_result, query_scalar # @@ -14,10 +14,12 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 # Set pitr interval such that we need to keep the data - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" + neon_env_builder.pageserver_config_override = ( + "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" + ) env = neon_env_builder.init_start() - pgmain = env.postgres.create_start('main') + pgmain = env.postgres.create_start("main") log.info("postgres is running on 'main' branch") main_pg_conn = pgmain.connect() @@ -25,30 +27,32 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): timeline = query_scalar(main_cur, "SHOW neon.timeline_id") # Create table - main_cur.execute('CREATE TABLE foo (t text)') + main_cur.execute("CREATE TABLE foo (t text)") for i in range(10000): - main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space'; - ''') + """ + ) if i == 99: # keep some early 
lsn to test branch creation after GC - main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()') + main_cur.execute("SELECT pg_current_wal_insert_lsn(), txid_current()") res = main_cur.fetchone() assert res is not None lsn_a = res[0] xid_a = res[1] - log.info(f'LSN after 100 rows: {lsn_a} xid {xid_a}') + log.info(f"LSN after 100 rows: {lsn_a} xid {xid_a}") - main_cur.execute('SELECT pg_current_wal_insert_lsn(), txid_current()') + main_cur.execute("SELECT pg_current_wal_insert_lsn(), txid_current()") res = main_cur.fetchone() assert res is not None debug_lsn = res[0] debug_xid = res[1] - log.info(f'LSN after 10000 rows: {debug_lsn} xid {debug_xid}') + log.info(f"LSN after 10000 rows: {debug_lsn} xid {debug_xid}") # run GC with closing(env.pageserver.connect()) as psconn: @@ -61,16 +65,16 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # Branch at the point where only 100 rows were inserted # It must have been preserved by PITR setting - env.neon_cli.create_branch('test_pitr_gc_hundred', 'main', ancestor_start_lsn=lsn_a) + env.neon_cli.create_branch("test_pitr_gc_hundred", "main", ancestor_start_lsn=lsn_a) - pg_hundred = env.postgres.create_start('test_pitr_gc_hundred') + pg_hundred = env.postgres.create_start("test_pitr_gc_hundred") # On the 'hundred' branch, we should see only 100 rows hundred_pg_conn = pg_hundred.connect() hundred_cur = hundred_pg_conn.cursor() - hundred_cur.execute('SELECT count(*) FROM foo') - assert hundred_cur.fetchone() == (100, ) + hundred_cur.execute("SELECT count(*) FROM foo") + assert hundred_cur.fetchone() == (100,) # All the rows are visible on the main branch - main_cur.execute('SELECT count(*) FROM foo') - assert main_cur.fetchone() == (10000, ) + main_cur.execute("SELECT count(*) FROM foo") + assert main_cur.fetchone() == (10000,) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py new file mode 100644 index 0000000000..bd02841dc0 --- /dev/null +++ b/test_runner/regress/test_proxy.py 
@@ -0,0 +1,142 @@ +import json +import subprocess +from urllib.parse import urlparse + +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres + + +def test_proxy_select_1(static_proxy): + static_proxy.safe_psql("select 1", options="project=generic-project-name") + + +def test_password_hack(static_proxy): + user = "borat" + password = "password" + static_proxy.safe_psql( + f"create role {user} with login password '{password}'", options="project=irrelevant" + ) + + # Note the format of `magic`! + magic = f"project=irrelevant;{password}" + static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) + + # Must also check that invalid magic won't be accepted. + with pytest.raises(psycopg2.errors.OperationalError): + magic = "broken" + static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) + + +def get_session_id_from_uri_line(uri_prefix, uri_line): + assert uri_prefix in uri_line + + url_parts = urlparse(uri_line) + psql_session_id = url_parts.path[1:] + assert psql_session_id.isalnum(), "session_id should only contain alphanumeric chars." + link_auth_uri_prefix = uri_line[: -len(url_parts.path)] + # invariant: the prefix must match the uri_prefix. + assert ( + link_auth_uri_prefix == uri_prefix + ), f"Line='{uri_line}' should contain a http auth link of form '{uri_prefix}/'." + # invariant: the entire link_auth_uri should be on its own line, module spaces. 
+ assert " ".join(uri_line.split(" ")) == f"{uri_prefix}/{psql_session_id}" + + return psql_session_id + + +def create_and_send_db_info(local_vanilla_pg, psql_session_id, mgmt_port): + pg_user = "proxy" + pg_password = "password" + + local_vanilla_pg.start() + query = f"create user {pg_user} with login superuser password '{pg_password}'" + local_vanilla_pg.safe_psql(query) + + port = local_vanilla_pg.default_options["port"] + host = local_vanilla_pg.default_options["host"] + dbname = local_vanilla_pg.default_options["dbname"] + + db_info_dict = { + "session_id": psql_session_id, + "result": { + "Success": { + "host": host, + "port": port, + "dbname": dbname, + "user": pg_user, + "password": pg_password, + } + }, + } + db_info_str = json.dumps(db_info_dict) + cmd_args = [ + "psql", + "-h", + "127.0.0.1", # localhost + "-p", + f"{mgmt_port}", + "-c", + db_info_str, + ] + + log.info(f"Sending to proxy the user and db info: {' '.join(cmd_args)}") + p = subprocess.Popen(cmd_args, stdout=subprocess.PIPE) + out, err = p.communicate() + assert "ok" in str(out) + + +async def get_uri_line_from_process_welcome_notice(link_auth_uri_prefix, proc): + """ + Returns the line from the welcome notice from proc containing link_auth_uri_prefix. + :param link_auth_uri_prefix: the uri prefix used to indicate the line of interest + :param proc: the process to read the welcome message from. + :return: a line containing the full link authentication uri. + """ + max_num_lines_of_welcome_message = 15 + for attempt in range(max_num_lines_of_welcome_message): + raw_line = await proc.stderr.readline() + line = raw_line.decode("utf-8").strip() + if link_auth_uri_prefix in line: + return line + assert False, f"did not find line containing '{link_auth_uri_prefix}'" + + +@pytest.mark.asyncio +async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy): + """ + Test copied and modified from: test_project_psql_link_auth test from cloud/tests_e2e/tests/test_project.py + Step 1. 
establish connection to the proxy + Step 2. retrieve session_id: + Step 2.1: read welcome message + Step 2.2: parse session_id + Step 3. create a vanilla_pg and send user and db info via command line (using Popen) a psql query via mgmt port to proxy. + Step 4. assert that select 1 has been executed correctly. + """ + + psql = PSQL( + host=link_proxy.host, + port=link_proxy.proxy_port, + ) + proc = await psql.run("select 42") + + uri_prefix = link_proxy.link_auth_uri_prefix + line_str = await get_uri_line_from_process_welcome_notice(uri_prefix, proc) + + psql_session_id = get_session_id_from_uri_line(uri_prefix, line_str) + log.info(f"Parsed psql_session_id='{psql_session_id}' from Neon welcome message.") + + create_and_send_db_info(vanilla_pg, psql_session_id, link_proxy.mgmt_port) + + out = (await proc.stdout.read()).decode("utf-8").strip() + assert out == "42" + + +# Pass extra options to the server. +def test_proxy_options(static_proxy): + with static_proxy.connect(options="project=irrelevant -cproxytest.option=value") as conn: + with conn.cursor() as cur: + cur.execute("SHOW proxytest.option") + value = cur.fetchall()[0][0] + assert value == "value" diff --git a/test_runner/batch_others/test_read_validation.py b/test_runner/regress/test_read_validation.py similarity index 84% rename from test_runner/batch_others/test_read_validation.py rename to test_runner/regress/test_read_validation.py index 4be7af4c10..beaae0351b 100644 --- a/test_runner/batch_others/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -1,14 +1,11 @@ from contextlib import closing -from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log - -from psycopg2.errors import UndefinedTable -from psycopg2.errors import IoError - +from fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar +from psycopg2.errors import IoError, UndefinedTable -pytest_plugins = ("fixtures.neon_fixtures") +pytest_plugins = "fixtures.neon_fixtures" 
extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"] @@ -47,13 +44,15 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Test table is populated, validating buffer cache") cache_entries = query_scalar( - c, - "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + ) assert cache_entries > 0, "No buffers cached for the test relation" c.execute( - "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}" - .format(relfilenode)) + "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}".format( + relfilenode + ) + ) reln = c.fetchone() assert reln is not None @@ -62,21 +61,23 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select clear_buffer_cache()") cache_entries = query_scalar( - c, - "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + ) assert cache_entries == 0, "Failed to clear buffer cache" log.info("Cache is clear, reading stale page version") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))" - .format(first[0])) + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))".format( + first[0] + ) + ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn" cache_entries = query_scalar( - c, - "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + ) assert cache_entries == 0, "relation buffers detected after invalidation" log.info("Cache is clear, reading latest page version without cache") @@ -88,8 +89,8 @@ def test_read_validation(neon_simple_env: NeonEnv): assert second == direct_latest, 
"Failed fetch page at latest lsn" cache_entries = query_scalar( - c, - "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode)) + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + ) assert cache_entries == 0, "relation buffers detected after invalidation" log.info( @@ -97,8 +98,10 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))" - .format(reln[0], reln[1], reln[2], first[0])) + "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( + reln[0], reln[1], reln[2], first[0] + ) + ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -107,20 +110,24 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))" - .format(reln[0], reln[1], reln[2])) + "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))".format( + reln[0], reln[1], reln[2] + ) + ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" - c.execute('drop table foo;') + c.execute("drop table foo;") log.info( "Relation dropped, attempting reading stale page version without cache using relation identifiers" ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))" - .format(reln[0], reln[1], reln[2], first[0])) + "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( + reln[0], reln[1], reln[2], first[0] + ) + ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" diff --git a/test_runner/batch_others/test_readonly_node.py b/test_runner/regress/test_readonly_node.py similarity index 50% rename from 
test_runner/batch_others/test_readonly_node.py rename to test_runner/regress/test_readonly_node.py index 82fc6329cf..0bd78c62a3 100644 --- a/test_runner/batch_others/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -12,81 +12,87 @@ from fixtures.utils import query_scalar # def test_readonly_node(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch('test_readonly_node', 'empty') - pgmain = env.postgres.create_start('test_readonly_node') + env.neon_cli.create_branch("test_readonly_node", "empty") + pgmain = env.postgres.create_start("test_readonly_node") log.info("postgres is running on 'test_readonly_node' branch") main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() # Create table, and insert the first 100 rows - main_cur.execute('CREATE TABLE foo (t text)') + main_cur.execute("CREATE TABLE foo (t text)") - main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100) g - ''') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_a = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info('LSN after 100 rows: ' + lsn_a) + """ + ) + main_cur.execute("SELECT pg_current_wal_insert_lsn()") + lsn_a = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info("LSN after 100 rows: " + lsn_a) # Insert some more rows. (This generates enough WAL to fill a few segments.) - main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 200000) g - ''') - lsn_b = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info('LSN after 200100 rows: ' + lsn_b) + """ + ) + lsn_b = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info("LSN after 200100 rows: " + lsn_b) # Insert many more rows. This generates enough WAL to fill a few segments. 
- main_cur.execute(''' + main_cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 200000) g - ''') + """ + ) - lsn_c = query_scalar(main_cur, 'SELECT pg_current_wal_insert_lsn()') - log.info('LSN after 400100 rows: ' + lsn_c) + lsn_c = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info("LSN after 400100 rows: " + lsn_c) # Create first read-only node at the point where only 100 rows were inserted - pg_hundred = env.postgres.create_start(branch_name='test_readonly_node', - node_name='test_readonly_node_hundred', - lsn=lsn_a) + pg_hundred = env.postgres.create_start( + branch_name="test_readonly_node", node_name="test_readonly_node_hundred", lsn=lsn_a + ) # And another at the point where 200100 rows were inserted - pg_more = env.postgres.create_start(branch_name='test_readonly_node', - node_name='test_readonly_node_more', - lsn=lsn_b) + pg_more = env.postgres.create_start( + branch_name="test_readonly_node", node_name="test_readonly_node_more", lsn=lsn_b + ) # On the 'hundred' node, we should see only 100 rows hundred_pg_conn = pg_hundred.connect() hundred_cur = hundred_pg_conn.cursor() - hundred_cur.execute('SELECT count(*) FROM foo') - assert hundred_cur.fetchone() == (100, ) + hundred_cur.execute("SELECT count(*) FROM foo") + assert hundred_cur.fetchone() == (100,) # On the 'more' node, we should see 100200 rows more_pg_conn = pg_more.connect() more_cur = more_pg_conn.cursor() - more_cur.execute('SELECT count(*) FROM foo') - assert more_cur.fetchone() == (200100, ) + more_cur.execute("SELECT count(*) FROM foo") + assert more_cur.fetchone() == (200100,) # All the rows are visible on the main branch - main_cur.execute('SELECT count(*) FROM foo') - assert main_cur.fetchone() == (400100, ) + main_cur.execute("SELECT count(*) FROM foo") + assert main_cur.fetchone() == (400100,) # Check creating a node at segment boundary - pg = env.postgres.create_start(branch_name='test_readonly_node', - 
node_name='test_branch_segment_boundary', - lsn='0/3000000') + pg = env.postgres.create_start( + branch_name="test_readonly_node", node_name="test_branch_segment_boundary", lsn="0/3000000" + ) cur = pg.connect().cursor() - cur.execute('SELECT 1') - assert cur.fetchone() == (1, ) + cur.execute("SELECT 1") + assert cur.fetchone() == (1,) # Create node at pre-initdb lsn with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN should fail - env.postgres.create_start(branch_name='test_readonly_node', - node_name='test_readonly_node_preinitdb', - lsn='0/42') + env.postgres.create_start( + branch_name="test_readonly_node", node_name="test_readonly_node_preinitdb", lsn="0/42" + ) diff --git a/test_runner/batch_others/test_recovery.py b/test_runner/regress/test_recovery.py similarity index 87% rename from test_runner/batch_others/test_recovery.py rename to test_runner/regress/test_recovery.py index 5ba783b802..6aa8b4e9be 100644 --- a/test_runner/batch_others/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -1,11 +1,9 @@ -import os import time -import psycopg2.extras -import json -from ast import Assert from contextlib import closing -from fixtures.neon_fixtures import NeonEnvBuilder + +import psycopg2.extras from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder # @@ -21,17 +19,17 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): # Check if failpoints enables. 
Otherwise the test doesn't make sense f = env.neon_cli.pageserver_enabled_features() - assert "failpoints" in f["features"], "Build pageserver with --features=failpoints option to run this test" + assert ( + "failpoints" in f["features"] + ), "Build pageserver with --features=failpoints option to run this test" neon_env_builder.start() # Create a branch for us env.neon_cli.create_branch("test_pageserver_recovery", "main") - pg = env.postgres.create_start('test_pageserver_recovery') + pg = env.postgres.create_start("test_pageserver_recovery") log.info("postgres is running on 'test_pageserver_recovery' branch") - connstr = pg.connstr() - with closing(pg.connect()) as conn: with conn.cursor() as cur: with closing(env.pageserver.connect()) as psconn: @@ -62,4 +60,4 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute("select count(*) from foo") - assert cur.fetchone() == (100000, ) + assert cur.fetchone() == (100000,) diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/regress/test_remote_storage.py similarity index 70% rename from test_runner/batch_others/test_remote_storage.py rename to test_runner/regress/test_remote_storage.py index ca46010dca..0015c75670 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -1,14 +1,24 @@ # It's possible to run any regular test with the local fs remote storage via # env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... 
-import shutil, os -from pathlib import Path +import os +import shutil import time +from pathlib import Path from uuid import UUID -from fixtures.neon_fixtures import NeonEnvBuilder, RemoteStorageKind, assert_timeline_local, available_remote_storages, wait_until, wait_for_last_record_lsn, wait_for_upload -from fixtures.log_helper import log -from fixtures.utils import lsn_from_hex, query_scalar + import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + RemoteStorageKind, + assert_timeline_local, + available_remote_storages, + wait_for_last_record_lsn, + wait_for_upload, + wait_until, +) +from fixtures.utils import lsn_from_hex, query_scalar # @@ -28,26 +38,26 @@ import pytest # * queries the specific data, ensuring that it matches the one stored before # # The tests are done for all types of remote storage pageserver supports. -@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages()) +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) def test_remote_storage_backup_and_restore( neon_env_builder: NeonEnvBuilder, - remote_storatge_kind: RemoteStorageKind, + remote_storage_kind: RemoteStorageKind, ): # Use this test to check more realistic SK ids: some etcd key parsing bugs were related, # and this test needs SK to write data to pageserver, so it will be visible neon_env_builder.safekeepers_id_start = 12 neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storatge_kind, - test_name='test_remote_storage_backup_and_restore', + remote_storage_kind=remote_storage_kind, + test_name="test_remote_storage_backup_and_restore", ) data_id = 1 - data_secret = 'very secret secret' + data_secret = "very secret secret" ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() - pg = env.postgres.create_start('main') + pg = env.postgres.create_start("main") client = env.pageserver.http_client() @@ -58,10 +68,12 @@ def 
test_remote_storage_backup_and_restore( for checkpoint_number in checkpoint_numbers: with pg.cursor() as cur: - cur.execute(f''' + cur.execute( + f""" CREATE TABLE t{checkpoint_number}(id int primary key, secret text); INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); - ''') + """ + ) current_lsn = lsn_from_hex(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) # wait until pageserver receives that data @@ -70,16 +82,16 @@ def test_remote_storage_backup_and_restore( # run checkpoint manually to be sure that data landed in remote storage env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") - log.info(f'waiting for checkpoint {checkpoint_number} upload') + log.info(f"waiting for checkpoint {checkpoint_number} upload") # wait until pageserver successfully uploaded a checkpoint to remote storage wait_for_upload(client, UUID(tenant_id), UUID(timeline_id), current_lsn) - log.info(f'upload of checkpoint {checkpoint_number} is done') + log.info(f"upload of checkpoint {checkpoint_number} is done") ##### Stop the first pageserver instance, erase all its data env.postgres.stop_all() env.pageserver.stop() - dir_to_clear = Path(env.repo_dir) / 'tenants' + dir_to_clear = Path(env.repo_dir) / "tenants" shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) @@ -87,7 +99,7 @@ def test_remote_storage_backup_and_restore( env.pageserver.start() # Introduce failpoint in download - env.pageserver.safe_psql(f"failpoints remote-storage-download-pre-rename=return") + env.pageserver.safe_psql("failpoints remote-storage-download-pre-rename=return") client.tenant_attach(UUID(tenant_id)) @@ -100,8 +112,8 @@ def test_remote_storage_backup_and_restore( detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) log.info("Timeline detail with active failpoint: %s", detail) - assert detail['local'] is None - assert detail['remote']['awaits_download'] + assert detail["local"] is None + assert detail["remote"]["awaits_download"] # trigger 
temporary download files removal env.pageserver.stop() @@ -110,19 +122,24 @@ def test_remote_storage_backup_and_restore( client.tenant_attach(UUID(tenant_id)) log.info("waiting for timeline redownload") - wait_until(number_of_iterations=20, - interval=1, - func=lambda: assert_timeline_local(client, UUID(tenant_id), UUID(timeline_id))) + wait_until( + number_of_iterations=20, + interval=1, + func=lambda: assert_timeline_local(client, UUID(tenant_id), UUID(timeline_id)), + ) detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) - assert detail['local'] is not None + assert detail["local"] is not None log.info("Timeline detail after attach completed: %s", detail) - assert lsn_from_hex(detail['local']['last_record_lsn']) >= current_lsn, 'current db Lsn should should not be less than the one stored on remote storage' - assert not detail['remote']['awaits_download'] + assert ( + lsn_from_hex(detail["local"]["last_record_lsn"]) >= current_lsn + ), "current db Lsn should should not be less than the one stored on remote storage" + assert not detail["remote"]["awaits_download"] - pg = env.postgres.create_start('main') + pg = env.postgres.create_start("main") with pg.cursor() as cur: for checkpoint_number in checkpoint_numbers: - assert query_scalar(cur, - f'SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};' - ) == f'{data_secret}|{checkpoint_number}' + assert ( + query_scalar(cur, f"SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};") + == f"{data_secret}|{checkpoint_number}" + ) diff --git a/test_runner/batch_others/test_setup.py b/test_runner/regress/test_setup.py similarity index 100% rename from test_runner/batch_others/test_setup.py rename to test_runner/regress/test_setup.py diff --git a/test_runner/batch_others/test_subxacts.py b/test_runner/regress/test_subxacts.py similarity index 74% rename from test_runner/batch_others/test_subxacts.py rename to test_runner/regress/test_subxacts.py index d06877825e..42234bf535 100644 --- 
a/test_runner/batch_others/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -1,5 +1,5 @@ -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content # Test subtransactions @@ -11,28 +11,30 @@ from fixtures.log_helper import log def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env env.neon_cli.create_branch("test_subxacts", "empty") - pg = env.postgres.create_start('test_subxacts') + pg = env.postgres.create_start("test_subxacts") log.info("postgres is running on 'test_subxacts' branch") pg_conn = pg.connect() cur = pg_conn.cursor() - cur.execute(''' + cur.execute( + """ CREATE TABLE t1(i int, j int); - ''') + """ + ) - cur.execute('select pg_switch_wal();') + cur.execute("select pg_switch_wal();") # Issue 100 transactions, with 1000 subtransactions in each. for i in range(100): - cur.execute('begin') + cur.execute("begin") for j in range(1000): - cur.execute(f'savepoint sp{j}') - cur.execute(f'insert into t1 values ({i}, {j})') - cur.execute('commit') + cur.execute(f"savepoint sp{j}") + cur.execute(f"insert into t1 values ({i}, {j})") + cur.execute("commit") # force wal flush - cur.execute('checkpoint') + cur.execute("checkpoint") # Check that we can restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/batch_others/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py similarity index 75% rename from test_runner/batch_others/test_tenant_conf.py rename to test_runner/regress/test_tenant_conf.py index d25aad742e..d496edd6dc 100644 --- a/test_runner/batch_others/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -1,28 +1,28 @@ from contextlib import closing -import pytest import psycopg2.extras - -from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log +from 
fixtures.neon_fixtures import NeonEnvBuilder def test_tenant_config(neon_env_builder: NeonEnvBuilder): # set some non-default global config - neon_env_builder.pageserver_config_override = ''' + neon_env_builder.pageserver_config_override = """ page_cache_size=444; wait_lsn_timeout='111 s'; -tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' +tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" env = neon_env_builder.init_start() """Test per tenant configuration""" - tenant, _ = env.neon_cli.create_tenant(conf={ - 'checkpoint_distance': '20000', - 'gc_period': '30sec', - }) + tenant, _ = env.neon_cli.create_tenant( + conf={ + "checkpoint_distance": "20000", + "gc_period": "30sec", + } + ) - env.neon_cli.create_timeline(f'test_tenant_conf', tenant_id=tenant) - pg = env.postgres.create_start( + env.neon_cli.create_timeline("test_tenant_conf", tenant_id=tenant) + env.postgres.create_start( "test_tenant_conf", "main", tenant, @@ -36,7 +36,8 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' pscur.execute(f"show {env.initial_tenant.hex}") res = pscur.fetchone() assert all( - i in res.items() for i in { + i in res.items() + for i in { "checkpoint_distance": 10000, "compaction_target_size": 1048576, "compaction_period": 1, @@ -44,8 +45,9 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' "gc_horizon": 67108864, "gc_period": 100, "image_creation_threshold": 3, - "pitr_interval": 2592000 - }.items()) + "pitr_interval": 2592000, + }.items() + ) # check the configuration of the new tenant with closing(env.pageserver.connect()) as psconn: @@ -54,7 +56,8 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' res = pscur.fetchone() log.info(f"res: {res}") assert all( - i in res.items() for i in { + i in res.items() + for i in { "checkpoint_distance": 20000, "compaction_target_size": 1048576, "compaction_period": 1, @@ -62,15 
+65,18 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' "gc_horizon": 67108864, "gc_period": 30, "image_creation_threshold": 3, - "pitr_interval": 2592000 - }.items()) + "pitr_interval": 2592000, + }.items() + ) # update the config and ensure that it has changed - env.neon_cli.config_tenant(tenant_id=tenant, - conf={ - 'checkpoint_distance': '15000', - 'gc_period': '80sec', - }) + env.neon_cli.config_tenant( + tenant_id=tenant, + conf={ + "checkpoint_distance": "15000", + "gc_period": "80sec", + }, + ) with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: @@ -78,7 +84,8 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' res = pscur.fetchone() log.info(f"after config res: {res}") assert all( - i in res.items() for i in { + i in res.items() + for i in { "checkpoint_distance": 15000, "compaction_target_size": 1048576, "compaction_period": 1, @@ -86,8 +93,9 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' "gc_horizon": 67108864, "gc_period": 80, "image_creation_threshold": 3, - "pitr_interval": 2592000 - }.items()) + "pitr_interval": 2592000, + }.items() + ) # restart the pageserver and ensure that the config is still correct env.pageserver.stop() @@ -99,7 +107,8 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' res = pscur.fetchone() log.info(f"after restart res: {res}") assert all( - i in res.items() for i in { + i in res.items() + for i in { "checkpoint_distance": 15000, "compaction_target_size": 1048576, "compaction_period": 1, @@ -107,5 +116,6 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' "gc_horizon": 67108864, "gc_period": 80, "image_creation_threshold": 3, - "pitr_interval": 2592000 - }.items()) + "pitr_interval": 2592000, + }.items() + ) diff --git a/test_runner/batch_others/test_tenant_detach.py 
b/test_runner/regress/test_tenant_detach.py similarity index 66% rename from test_runner/batch_others/test_tenant_detach.py rename to test_runner/regress/test_tenant_detach.py index afc4f89bbf..f1b30429bf 100644 --- a/test_runner/batch_others/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -1,9 +1,9 @@ +import uuid from threading import Thread from uuid import uuid4 -import uuid + import psycopg2 import pytest - from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException @@ -11,7 +11,7 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiExc def do_gc_target(env: NeonEnv, tenant_id: uuid.UUID, timeline_id: uuid.UUID): """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: - env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0') + env.pageserver.safe_psql(f"do_gc {tenant_id.hex} {timeline_id.hex} 0") except Exception as e: log.error("do_gc failed: %s", e) @@ -22,8 +22,10 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # first check for non existing tenant tenant_id = uuid4() - with pytest.raises(expected_exception=NeonPageserverApiException, - match=f'Tenant not found for id {tenant_id.hex}'): + with pytest.raises( + expected_exception=NeonPageserverApiException, + match=f"Tenant not found for id {tenant_id.hex}", + ): pageserver_http.tenant_detach(tenant_id) # create new nenant @@ -32,17 +34,20 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # assert tenant exists on disk assert (env.repo_dir / "tenants" / tenant_id.hex).exists() - pg = env.postgres.create_start('main', tenant_id=tenant_id) + pg = env.postgres.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement - pg.safe_psql_many(queries=[ - 'CREATE TABLE t(key int primary key, value text)', - 'INSERT INTO t SELECT generate_series(1,100000), \'payload\'', - ]) + pg.safe_psql_many( 
+ queries=[ + "CREATE TABLE t(key int primary key, value text)", + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + ] + ) # gc should not try to even start - with pytest.raises(expected_exception=psycopg2.DatabaseError, - match='gc target timeline does not exist'): - env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {uuid4().hex} 0') + with pytest.raises( + expected_exception=psycopg2.DatabaseError, match="gc target timeline does not exist" + ): + env.pageserver.safe_psql(f"do_gc {tenant_id.hex} {uuid4().hex} 0") # try to concurrently run gc and detach gc_thread = Thread(target=lambda: do_gc_target(env, tenant_id, timeline_id)) @@ -67,6 +72,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # check that nothing is left on disk for deleted tenant assert not (env.repo_dir / "tenants" / tenant_id.hex).exists() - with pytest.raises(expected_exception=psycopg2.DatabaseError, - match=f'Tenant {tenant_id.hex} not found'): - env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0') + with pytest.raises( + expected_exception=psycopg2.DatabaseError, match=f"Tenant {tenant_id.hex} not found" + ): + env.pageserver.safe_psql(f"do_gc {tenant_id.hex} {timeline_id.hex} 0") diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py similarity index 76% rename from test_runner/batch_others/test_tenant_relocation.py rename to test_runner/regress/test_tenant_relocation.py index eb65e2e3b5..4d949e0c13 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -14,7 +14,6 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient, - PageserverPort, PortDistributor, Postgres, assert_no_in_progress_downloads_for_tenant, @@ -34,12 +33,14 @@ def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @contextmanager -def new_pageserver_helper(new_pageserver_dir: pathlib.Path, - pageserver_bin: 
pathlib.Path, - remote_storage_mock_path: pathlib.Path, - pg_port: int, - http_port: int, - broker: Optional[Etcd]): +def new_pageserver_helper( + new_pageserver_dir: pathlib.Path, + pageserver_bin: pathlib.Path, + remote_storage_mock_path: pathlib.Path, + pg_port: int, + http_port: int, + broker: Optional[Etcd], +): """ cannot use NeonPageserver yet because it depends on neon cli which currently lacks support for multiple pageservers @@ -47,18 +48,20 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, # actually run new pageserver cmd = [ str(pageserver_bin), - '--workdir', + "--workdir", str(new_pageserver_dir), - '--daemonize', - '--update-config', + "--daemonize", + "--update-config", f"-c listen_pg_addr='localhost:{pg_port}'", f"-c listen_http_addr='localhost:{http_port}'", f"-c pg_distrib_dir='{pg_distrib_dir}'", - f"-c id=2", + "-c id=2", f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", ] if broker is not None: - cmd.append(f"-c broker_endpoints=['{broker.client_url()}']", ) + cmd.append( + f"-c broker_endpoints=['{broker.client_url()}']", + ) log.info("starting new pageserver %s", cmd) out = subprocess.check_output(cmd, text=True) @@ -67,7 +70,7 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, yield finally: log.info("stopping new pageserver") - pid = int((new_pageserver_dir / 'pageserver.pid').read_text()) + pid = int((new_pageserver_dir / "pageserver.pid").read_text()) os.kill(pid, signal.SIGQUIT) @@ -88,7 +91,7 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve with pg_cur(pg) as cur: cur.execute("INSERT INTO load VALUES ('some payload')") inserted_ctr += 1 - except: + except: # noqa: E722 if not failed: log.info("load failed") failed = True @@ -105,7 +108,7 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve log.info("successfully recovered %s", inserted_ctr) failed = False load_ok_event.set() - log.info('load thread stopped') + log.info("load 
thread stopped") def populate_branch( @@ -123,8 +126,10 @@ def populate_branch( cur.execute("SELECT pg_current_wal_flush_lsn()") log.info("pg_current_wal_flush_lsn() %s", lsn_from_hex(cur.fetchone()[0])) - log.info("timeline detail %s", - ps_http.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)) + log.info( + "timeline detail %s", + ps_http.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id), + ) # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -133,7 +138,7 @@ def populate_branch( cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'some payload'") if expected_sum is not None: cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (expected_sum, ) + assert cur.fetchone() == (expected_sum,) cur.execute("SELECT pg_current_wal_flush_lsn()") current_lsn = lsn_from_hex(cur.fetchone()[0]) @@ -166,34 +171,41 @@ def check_timeline_attached( # when load is active these checks can break because lsns are not static # so lets check with some margin - assert_abs_margin_ratio(lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']), - lsn_from_hex(old_timeline_detail['local']['disk_consistent_lsn']), - 0.03) + assert_abs_margin_ratio( + lsn_from_hex(new_timeline_detail["local"]["disk_consistent_lsn"]), + lsn_from_hex(old_timeline_detail["local"]["disk_consistent_lsn"]), + 0.03, + ) - assert_abs_margin_ratio(lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']), - old_current_lsn, - 0.03) + assert_abs_margin_ratio( + lsn_from_hex(new_timeline_detail["local"]["disk_consistent_lsn"]), old_current_lsn, 0.03 + ) -def switch_pg_to_new_pageserver(env: NeonEnv, - pg: Postgres, - new_pageserver_port: int, - tenant_id: UUID, - timeline_id: UUID) -> pathlib.Path: +def switch_pg_to_new_pageserver( + env: NeonEnv, pg: Postgres, new_pageserver_port: int, tenant_id: UUID, timeline_id: UUID +) -> pathlib.Path: pg.stop() pg_config_file_path = pathlib.Path(pg.config_file_path()) - 
pg_config_file_path.open('a').write( - f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_port}'") + pg_config_file_path.open("a").write( + f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_port}'" + ) pg.start() - timeline_to_detach_local_path = env.repo_dir / 'tenants' / tenant_id.hex / 'timelines' / timeline_id.hex + timeline_to_detach_local_path = ( + env.repo_dir / "tenants" / tenant_id.hex / "timelines" / timeline_id.hex + ) files_before_detach = os.listdir(timeline_to_detach_local_path) - assert 'metadata' in files_before_detach, f'Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ - but got: {files_before_detach}' - assert len(files_before_detach) >= 2, f'Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ - but got {files_before_detach}' + assert ( + "metadata" in files_before_detach + ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ + but got: {files_before_detach}" + assert ( + len(files_before_detach) >= 2 + ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ + but got {files_before_detach}" return timeline_to_detach_local_path @@ -202,39 +214,44 @@ def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path with pg_cur(pg) as cur: # check that data is still there cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (sum_before_migration, ) + assert cur.fetchone() == (sum_before_migration,) # check that we can write new data cur.execute("INSERT INTO t SELECT generate_series(1001,2000), 'some payload'") cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (sum_before_migration + 1500500, ) + assert cur.fetchone() == (sum_before_migration + 1500500,) - assert not os.path.exists(old_local_path), f'After detach, local timeline dir {old_local_path} should be removed' + assert not os.path.exists( + old_local_path 
+ ), f"After detach, local timeline dir {old_local_path} should be removed" @pytest.mark.parametrize( - 'method', + "method", [ # A minor migration involves no storage breaking changes. # It is done by attaching the tenant to a new pageserver. - 'minor', + "minor", # A major migration involves exporting a postgres datadir # basebackup and importing it into the new pageserver. # This kind of migration can tolerate breaking changes # to storage format - 'major', - ]) -@pytest.mark.parametrize('with_load', ['with_load', 'without_load']) -def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, - port_distributor: PortDistributor, - test_output_dir, - method: str, - with_load: str): + "major", + ], +) +@pytest.mark.parametrize("with_load", ["with_load", "without_load"]) +def test_tenant_relocation( + neon_env_builder: NeonEnvBuilder, + port_distributor: PortDistributor, + test_output_dir, + method: str, + with_load: str, +): neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() # create folder for remote storage mock - remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage' + remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage" # we use two branches to check that they are both relocated # first branch is used for load, compute for second one is used to @@ -242,12 +259,15 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, pageserver_http = env.pageserver.http_client() - tenant_id, initial_timeline_id = env.neon_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) + tenant_id, initial_timeline_id = env.neon_cli.create_tenant( + UUID("74ee8b079a0e437eb0afea7d26a07209") + ) log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id) - pg_main = env.postgres.create_start(branch_name='test_tenant_relocation_main', - tenant_id=tenant_id) + pg_main = env.postgres.create_start( + 
branch_name="test_tenant_relocation_main", tenant_id=tenant_id + ) timeline_id_main, current_lsn_main = populate_branch( pg_main, @@ -263,8 +283,9 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, ancestor_start_lsn=lsn_to_hex(current_lsn_main), tenant_id=tenant_id, ) - pg_second = env.postgres.create_start(branch_name='test_tenant_relocation_second', - tenant_id=tenant_id) + pg_second = env.postgres.create_start( + branch_name="test_tenant_relocation_second", tenant_id=tenant_id + ) timeline_id_second, current_lsn_second = populate_branch( pg_second, @@ -281,7 +302,7 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_second, current_lsn_second) timeline_detail_second = assert_timeline_local(pageserver_http, tenant_id, timeline_id_second) - if with_load == 'with_load': + if with_load == "with_load": # create load table with pg_cur(pg_main) as cur: cur.execute("CREATE TABLE load(value text)") @@ -317,22 +338,24 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, log.info("inititalizing new pageserver") # bootstrap second pageserver - new_pageserver_dir = env.repo_dir / 'new_pageserver' + new_pageserver_dir = env.repo_dir / "new_pageserver" new_pageserver_dir.mkdir() new_pageserver_pg_port = port_distributor.get_port() new_pageserver_http_port = port_distributor.get_port() log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port) - pageserver_bin = pathlib.Path(neon_binpath) / 'pageserver' + pageserver_bin = pathlib.Path(neon_binpath) / "pageserver" new_pageserver_http = NeonPageserverHttpClient(port=new_pageserver_http_port, auth_token=None) - with new_pageserver_helper(new_pageserver_dir, - pageserver_bin, - remote_storage_mock_path, - new_pageserver_pg_port, - new_pageserver_http_port, - neon_env_builder.broker): + with new_pageserver_helper( + new_pageserver_dir, + pageserver_bin, + remote_storage_mock_path, + 
new_pageserver_pg_port, + new_pageserver_http_port, + neon_env_builder.broker, + ): # Migrate either by attaching from s3 or import/export basebackup if method == "major": @@ -367,13 +390,16 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, # check that it shows that download is in progress tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id) - assert tenant_status.get('has_in_progress_downloads'), tenant_status + assert tenant_status.get("has_in_progress_downloads"), tenant_status # wait until tenant is downloaded - wait_until(number_of_iterations=10, - interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant( - new_pageserver_http, tenant_id)) + wait_until( + number_of_iterations=10, + interval=1, + func=lambda: assert_no_in_progress_downloads_for_tenant( + new_pageserver_http, tenant_id + ), + ) check_timeline_attached( new_pageserver_http, @@ -392,10 +418,10 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, ) # rewrite neon cli config to use new pageserver for basebackup to start new compute - cli_config_lines = (env.repo_dir / 'config').read_text().splitlines() + cli_config_lines = (env.repo_dir / "config").read_text().splitlines() cli_config_lines[-2] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'" cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'" - (env.repo_dir / 'config').write_text('\n'.join(cli_config_lines)) + (env.repo_dir / "config").write_text("\n".join(cli_config_lines)) old_local_path_main = switch_pg_to_new_pageserver( env, @@ -423,7 +449,8 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, # ensure that we can successfully read all relations on the new pageserver with pg_cur(pg_second) as cur: - cur.execute(''' + cur.execute( + """ DO $$ DECLARE r RECORD; @@ -435,18 +462,19 @@ def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname; END LOOP; END$$; - ''') + """ + 
) - if with_load == 'with_load': + if with_load == "with_load": assert load_ok_event.wait(3) - log.info('stopping load thread') + log.info("stopping load thread") load_stop_event.set() load_thread.join(timeout=10) - log.info('load thread stopped') + log.info("load thread stopped") # bring old pageserver back for clean shutdown via neon cli # new pageserver will be shut down by the context manager - cli_config_lines = (env.repo_dir / 'config').read_text().splitlines() + cli_config_lines = (env.repo_dir / "config").read_text().splitlines() cli_config_lines[-2] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'" cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'" - (env.repo_dir / 'config').write_text('\n'.join(cli_config_lines)) + (env.repo_dir / "config").write_text("\n".join(cli_config_lines)) diff --git a/test_runner/batch_others/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py similarity index 94% rename from test_runner/batch_others/test_tenant_tasks.py rename to test_runner/regress/test_tenant_tasks.py index fae2a2199d..8617bc8ea9 100644 --- a/test_runner/batch_others/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,9 +1,9 @@ -from fixtures.neon_fixtures import NeonEnvBuilder, wait_until from uuid import UUID -import time + +from fixtures.neon_fixtures import NeonEnvBuilder, wait_until -def get_only_element(l): +def get_only_element(l): # noqa: E741 assert len(l) == 1 return l[0] @@ -45,9 +45,9 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() - timeline = env.neon_cli.create_timeline(name, tenant_id=tenant) + env.neon_cli.create_timeline(name, tenant_id=tenant) pg = env.postgres.create_start(name, tenant_id=tenant) - assert (get_state(tenant) == "Active") + assert get_state(tenant) == "Active" # Stop compute pg.stop() diff --git a/test_runner/batch_others/test_tenants.py 
b/test_runner/regress/test_tenants.py similarity index 72% rename from test_runner/batch_others/test_tenants.py rename to test_runner/regress/test_tenants.py index 8d73d8185c..0e0cd44471 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,15 +1,15 @@ +import os from contextlib import closing from datetime import datetime -import os -import pytest -from fixtures.neon_fixtures import NeonEnvBuilder +import pytest from fixtures.log_helper import log from fixtures.metrics import parse_metrics +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.utils import lsn_to_hex -@pytest.mark.parametrize('with_safekeepers', [False, True]) +@pytest.mark.parametrize("with_safekeepers", [False, True]) def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): if with_safekeepers: neon_env_builder.num_safekeepers = 3 @@ -19,17 +19,19 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: tenant_1, _ = env.neon_cli.create_tenant() tenant_2, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', - tenant_id=tenant_1) - env.neon_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', - tenant_id=tenant_2) + env.neon_cli.create_timeline( + f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_1 + ) + env.neon_cli.create_timeline( + f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_2 + ) pg_tenant1 = env.postgres.create_start( - f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', + f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_1, ) pg_tenant2 = env.postgres.create_start( - f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', + f"test_tenants_normal_work_with_safekeepers{with_safekeepers}", tenant_id=tenant_2, ) @@ -41,7 +43,7 @@ def 
test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (5000050000, ) + assert cur.fetchone() == (5000050000,) def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): @@ -51,11 +53,11 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): tenant_1, _ = env.neon_cli.create_tenant() tenant_2, _ = env.neon_cli.create_tenant() - timeline_1 = env.neon_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_1) - timeline_2 = env.neon_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_2) + timeline_1 = env.neon_cli.create_timeline("test_metrics_normal_work", tenant_id=tenant_1) + timeline_2 = env.neon_cli.create_timeline("test_metrics_normal_work", tenant_id=tenant_2) - pg_tenant1 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_1) - pg_tenant2 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_2) + pg_tenant1 = env.postgres.create_start("test_metrics_normal_work", tenant_id=tenant_1) + pg_tenant2 = env.postgres.create_start("test_metrics_normal_work", tenant_id=tenant_2) for pg in [pg_tenant1, pg_tenant2]: with closing(pg.connect()) as conn: @@ -63,29 +65,28 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (5000050000, ) + assert cur.fetchone() == (5000050000,) collected_metrics = { "pageserver": env.pageserver.http_client().get_metrics(), } for sk in env.safekeepers: - collected_metrics[f'safekeeper{sk.id}'] = sk.http_client().get_metrics_str() + collected_metrics[f"safekeeper{sk.id}"] = sk.http_client().get_metrics_str() for name in 
collected_metrics: - basepath = os.path.join(neon_env_builder.repo_dir, f'{name}.metrics') + basepath = os.path.join(neon_env_builder.repo_dir, f"{name}.metrics") - with open(basepath, 'w') as stdout_f: + with open(basepath, "w") as stdout_f: print(collected_metrics[name], file=stdout_f, flush=True) all_metrics = [parse_metrics(m, name) for name, m in collected_metrics.items()] ps_metrics = all_metrics[0] sk_metrics = all_metrics[1:] - ttids = [{ - 'tenant_id': tenant_1.hex, 'timeline_id': timeline_1.hex - }, { - 'tenant_id': tenant_2.hex, 'timeline_id': timeline_2.hex - }] + ttids = [ + {"tenant_id": tenant_1.hex, "timeline_id": timeline_1.hex}, + {"tenant_id": tenant_2.hex, "timeline_id": timeline_2.hex}, + ] # Test metrics per timeline for tt in ttids: @@ -105,7 +106,8 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): log.info(f"Checking common metrics for {metrics.name}") log.info( - f"process_cpu_seconds_total: {metrics.query_one('process_cpu_seconds_total').value}") + f"process_cpu_seconds_total: {metrics.query_one('process_cpu_seconds_total').value}" + ) log.info(f"process_threads: {int(metrics.query_one('process_threads').value)}") log.info( f"process_resident_memory_bytes (MB): {metrics.query_one('process_resident_memory_bytes').value / 1024 / 1024}" diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py similarity index 75% rename from test_runner/batch_others/test_tenants_with_remote_storage.py rename to test_runner/regress/test_tenants_with_remote_storage.py index 636616a45b..083150e12a 100644 --- a/test_runner/batch_others/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -7,28 +7,35 @@ # import asyncio -from contextlib import closing from typing import List, Tuple from uuid import UUID import pytest - -from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres, RemoteStorageKind, 
available_remote_storages, wait_for_last_record_lsn, wait_for_upload +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + Postgres, + RemoteStorageKind, + available_remote_storages, + wait_for_last_record_lsn, + wait_for_upload, +) from fixtures.utils import lsn_from_hex async def tenant_workload(env: NeonEnv, pg: Postgres): - pageserver_conn = await env.pageserver.connect_async() + await env.pageserver.connect_async() pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show neon.tenant_id") - timeline_id = await pg_conn.fetchval("show neon.timeline_id") + await pg_conn.fetchval("show neon.tenant_id") + await pg_conn.fetchval("show neon.timeline_id") await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") for i in range(1, 100): await pg_conn.execute( - f"INSERT INTO t SELECT {i}*1000 + g, 'payload' from generate_series(1,1000) g") + f"INSERT INTO t SELECT {i}*1000 + g, 'payload' from generate_series(1,1000) g" + ) # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -46,11 +53,11 @@ async def all_tenants_workload(env: NeonEnv, tenants_pgs): await asyncio.gather(*workers) -@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages()) -def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind): +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storatge_kind, - test_name='test_tenants_many', + remote_storage_kind=remote_storage_kind, + test_name="test_tenants_many", ) env = neon_env_builder.init_start() @@ -61,12 +68,13 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: Re # Use a tiny checkpoint distance, to create a lot of layers quickly tenant, _ = env.neon_cli.create_tenant( conf={ - 
'checkpoint_distance': '5000000', - }) - env.neon_cli.create_timeline(f'test_tenants_many', tenant_id=tenant) + "checkpoint_distance": "5000000", + } + ) + env.neon_cli.create_timeline("test_tenants_many", tenant_id=tenant) pg = env.postgres.create_start( - f'test_tenants_many', + "test_tenants_many", tenant_id=tenant, ) tenants_pgs.append((tenant, pg)) @@ -77,7 +85,8 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: Re pageserver_http = env.pageserver.http_client() for tenant, pg in tenants_pgs: res = pg.safe_psql_many( - ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"]) + ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"] + ) tenant_id = res[0][0][0] timeline_id = res[1][0][0] current_lsn = lsn_from_hex(res[2][0][0]) diff --git a/test_runner/batch_others/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py similarity index 56% rename from test_runner/batch_others/test_timeline_delete.py rename to test_runner/regress/test_timeline_delete.py index 594475faf4..7a55ffb769 100644 --- a/test_runner/batch_others/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -1,6 +1,6 @@ from uuid import uuid4 -import pytest +import pytest from fixtures.neon_fixtures import NeonEnv, NeonPageserverApiException, wait_until @@ -17,44 +17,57 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # for non existing tenant: invalid_tenant_id = uuid4() - with pytest.raises(NeonPageserverApiException, - match=f"Tenant {invalid_tenant_id.hex} not found in local tenant state"): + with pytest.raises( + NeonPageserverApiException, + match=f"Tenant {invalid_tenant_id.hex} not found in local tenant state", + ): ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) # construct pair of branches to validate that pageserver prohibits # deletion of ancestor timelines when they have child branches parent_timeline_id = 
env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty") - leaf_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_branch1", - "test_ancestor_branch_delete_parent") + leaf_timeline_id = env.neon_cli.create_branch( + "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent" + ) ps_http = env.pageserver.http_client() - with pytest.raises(NeonPageserverApiException, - match="Cannot detach timeline which has child timelines"): + with pytest.raises( + NeonPageserverApiException, match="Cannot detach timeline which has child timelines" + ): - timeline_path = env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / parent_timeline_id.hex + timeline_path = ( + env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / parent_timeline_id.hex + ) assert timeline_path.exists() ps_http.timeline_delete(env.initial_tenant, parent_timeline_id) assert not timeline_path.exists() - timeline_path = env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / leaf_timeline_id.hex + timeline_path = ( + env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / leaf_timeline_id.hex + ) assert timeline_path.exists() # retry deletes when compaction or gc is running in pageserver - wait_until(number_of_iterations=3, - interval=0.2, - func=lambda: ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)) + wait_until( + number_of_iterations=3, + interval=0.2, + func=lambda: ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id), + ) assert not timeline_path.exists() # check 404 - with pytest.raises(NeonPageserverApiException, - match="is not found neither locally nor remotely"): + with pytest.raises( + NeonPageserverApiException, match="is not found neither locally nor remotely" + ): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) # FIXME leaves tenant without timelines, should we prevent deletion of root timeline? 
- wait_until(number_of_iterations=3, - interval=0.2, - func=lambda: ps_http.timeline_delete(env.initial_tenant, parent_timeline_id)) + wait_until( + number_of_iterations=3, + interval=0.2, + func=lambda: ps_http.timeline_delete(env.initial_tenant, parent_timeline_id), + ) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/regress/test_timeline_size.py similarity index 63% rename from test_runner/batch_others/test_timeline_size.py rename to test_runner/regress/test_timeline_size.py index 6e1168e38f..f6b665ec8c 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1,24 +1,33 @@ -from contextlib import closing +import math import random -from uuid import UUID import re -import psycopg2.extras -import psycopg2.errors -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local, wait_for_last_flush_lsn -from fixtures.log_helper import log import time +from contextlib import closing +from uuid import UUID +import psycopg2.errors +import psycopg2.extras +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + Postgres, + assert_timeline_local, + wait_for_last_flush_lsn, +) from fixtures.utils import get_timeline_dir_size def test_timeline_size(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch('test_timeline_size', 'empty') + new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") client = env.pageserver.http_client() timeline_details = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - assert timeline_details['local']['current_logical_size'] == timeline_details['local'][ - 'current_logical_size_non_incremental'] + assert ( + timeline_details["local"]["current_logical_size"] + == timeline_details["local"]["current_logical_size_non_incremental"] + ) pgmain = env.postgres.create_start("test_timeline_size") log.info("postgres is 
running on 'test_timeline_size' branch") @@ -28,32 +37,40 @@ def test_timeline_size(neon_simple_env: NeonEnv): cur.execute("SHOW neon.timeline_id") cur.execute("CREATE TABLE foo (t text)") - cur.execute(""" + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 10) g - """) + """ + ) res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res['local'] - assert local_details["current_logical_size"] == local_details[ - "current_logical_size_non_incremental"] + local_details = res["local"] + assert ( + local_details["current_logical_size"] + == local_details["current_logical_size_non_incremental"] + ) cur.execute("TRUNCATE foo") res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res['local'] - assert local_details["current_logical_size"] == local_details[ - "current_logical_size_non_incremental"] + local_details = res["local"] + assert ( + local_details["current_logical_size"] + == local_details["current_logical_size_non_incremental"] + ) def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch('test_timeline_size', 'empty') + new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") client = env.pageserver.http_client() timeline_details = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - assert timeline_details['local']['current_logical_size'] == timeline_details['local'][ - 'current_logical_size_non_incremental'] + assert ( + timeline_details["local"]["current_logical_size"] + == timeline_details["local"]["current_logical_size_non_incremental"] + ) pgmain = env.postgres.create_start("test_timeline_size") log.info("postgres is running on 'test_timeline_size' branch") @@ -63,32 +80,40 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): cur.execute("SHOW neon.timeline_id") res = assert_timeline_local(client, 
env.initial_tenant, new_timeline_id) - local_details = res['local'] - assert local_details["current_logical_size"] == local_details[ - "current_logical_size_non_incremental"] + local_details = res["local"] + assert ( + local_details["current_logical_size"] + == local_details["current_logical_size_non_incremental"] + ) - cur.execute('CREATE DATABASE foodb') - with closing(pgmain.connect(dbname='foodb')) as conn: + cur.execute("CREATE DATABASE foodb") + with closing(pgmain.connect(dbname="foodb")) as conn: with conn.cursor() as cur2: cur2.execute("CREATE TABLE foo (t text)") - cur2.execute(""" + cur2.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 10) g - """) + """ + ) res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res['local'] - assert local_details["current_logical_size"] == local_details[ - "current_logical_size_non_incremental"] + local_details = res["local"] + assert ( + local_details["current_logical_size"] + == local_details["current_logical_size_non_incremental"] + ) - cur.execute('DROP DATABASE foodb') + cur.execute("DROP DATABASE foodb") res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - local_details = res['local'] - assert local_details["current_logical_size"] == local_details[ - "current_logical_size_non_incremental"] + local_details = res["local"] + assert ( + local_details["current_logical_size"] + == local_details["current_logical_size_non_incremental"] + ) # wait until received_lsn_lag is 0 @@ -100,14 +125,17 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 elapsed = time.time() - started_at if elapsed > timeout: raise RuntimeError( - f"timed out waiting for pageserver to reach pg_current_wal_flush_lsn()") + "timed out waiting for pageserver to reach pg_current_wal_flush_lsn()" + ) - res = pgmain.safe_psql(''' + res = pgmain.safe_psql( + """ SELECT pg_size_pretty(pg_cluster_size()), 
pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag FROM backpressure_lsns(); - ''')[0] + """ + )[0] log.info(f"pg_cluster_size = {res[0]}, received_lsn_lag = {res[1]}") received_lsn_lag = res[1] @@ -116,17 +144,19 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - new_timeline_id = env.neon_cli.create_branch('test_timeline_size_quota') + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota") client = env.pageserver.http_client() res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) - assert res['local']["current_logical_size"] == res['local'][ - "current_logical_size_non_incremental"] + assert ( + res["local"]["current_logical_size"] == res["local"]["current_logical_size_non_incremental"] + ) pgmain = env.postgres.create_start( "test_timeline_size_quota", # Set small limit for the test - config_lines=['neon.max_cluster_size=30MB']) + config_lines=["neon.max_cluster_size=30MB"], + ) log.info("postgres is running on 'test_timeline_size_quota' branch") with closing(pgmain.connect()) as conn: @@ -139,19 +169,23 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): # Insert many rows. 
This query must fail because of space limit try: - cur.execute(''' + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g - ''') + """ + ) wait_for_pageserver_catchup(pgmain) - cur.execute(''' + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 500000) g - ''') + """ + ) # If we get here, the timeline size limit failed log.error("Query unexpectedly succeeded") @@ -161,17 +195,19 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): log.info(f"Query expectedly failed with: {err}") # drop table to free space - cur.execute('DROP TABLE foo') + cur.execute("DROP TABLE foo") wait_for_pageserver_catchup(pgmain) # create it again and insert some rows. This query must succeed cur.execute("CREATE TABLE foo (t text)") - cur.execute(''' + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 10000) g - ''') + """ + ) wait_for_pageserver_catchup(pgmain) @@ -182,15 +218,17 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): def test_timeline_physical_size_init(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_init') + new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init") pg = env.postgres.create_start("test_timeline_physical_size_init") - pg.safe_psql_many([ - "CREATE TABLE foo (t text)", - """INSERT INTO foo + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + """INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 1000) g""", - ]) + ] + ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) @@ -203,15 +241,17 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv): def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = 
env.neon_cli.create_branch('test_timeline_physical_size_post_checkpoint') + new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_checkpoint") pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint") - pg.safe_psql_many([ - "CREATE TABLE foo (t text)", - """INSERT INTO foo + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + """INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 1000) g""", - ]) + ] + ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") @@ -222,19 +262,23 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): # Disable background compaction as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed - neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='10m'}" + neon_env_builder.pageserver_config_override = ( + "tenant_config={checkpoint_distance=100000, compaction_period='10m'}" + ) env = neon_env_builder.init_start() - new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_post_compaction') + new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") pg = env.postgres.create_start("test_timeline_physical_size_post_compaction") - pg.safe_psql_many([ - "CREATE TABLE foo (t text)", - """INSERT INTO foo + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + """INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g""", - ]) + ] + ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") @@ -246,29 +290,32 @@ def 
test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed - neon_env_builder.pageserver_config_override = \ - "tenant_config={checkpoint_distance=100000, compaction_period='10m', gc_period='10m', pitr_interval='1s'}" + neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='10m', gc_period='10m', pitr_interval='1s'}" env = neon_env_builder.init_start() - new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_post_gc') + new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") pg = env.postgres.create_start("test_timeline_physical_size_post_gc") - pg.safe_psql_many([ - "CREATE TABLE foo (t text)", - """INSERT INTO foo + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + """INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g""", - ]) + ] + ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") - pg.safe_psql(""" + pg.safe_psql( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g - """) + """ + ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") @@ -278,18 +325,22 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): assert_physical_size(env, env.initial_tenant, new_timeline_id) -def test_timeline_physical_size_metric(neon_simple_env: NeonEnv): +# The timeline logical and physical sizes are also exposed as prometheus metrics. +# Test the metrics. 
+def test_timeline_size_metrics(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_metric') - pg = env.postgres.create_start("test_timeline_physical_size_metric") + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_metrics") + pg = env.postgres.create_start("test_timeline_size_metrics") - pg.safe_psql_many([ - "CREATE TABLE foo (t text)", - """INSERT INTO foo + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + """INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g""", - ]) + ] + ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}") @@ -299,14 +350,36 @@ def test_timeline_physical_size_metric(neon_simple_env: NeonEnv): matches = re.search( f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$', metrics, - re.MULTILINE) + re.MULTILINE, + ) assert matches - - # assert that the metric matches the actual physical size on disk tl_physical_size_metric = int(matches.group(1)) + + # assert that the physical size metric matches the actual physical size on disk timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id) assert tl_physical_size_metric == get_timeline_dir_size(timeline_path) + # Check that the logical size metric is sane, and matches + matches = re.search( + f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$', + metrics, + re.MULTILINE, + ) + assert matches + tl_logical_size_metric = int(matches.group(1)) + + # An empty database is around 8 MB. There at least 3 databases, 'postgres', + # 'template0', 'template1'. So the total size should be about 32 MB. This isn't + # very accurate and can change with different PostgreSQL versions, so allow a + # couple of MB of slack. 
+ assert math.isclose(tl_logical_size_metric, 32 * 1024 * 1024, abs_tol=2 * 1024 * 1024) + + # The sum of the sizes of all databases, as seen by pg_database_size(), should also + # be close. Again allow some slack, the logical size metric includes some things like + # the SLRUs that are not included in pg_database_size(). + dbsize_sum = pg.safe_psql("select sum(pg_database_size(oid)) from pg_database")[0][0] + assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024) + def test_tenant_physical_size(neon_simple_env: NeonEnv): random.seed(100) @@ -317,8 +390,8 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): tenant, timeline = env.neon_cli.create_tenant() def get_timeline_physical_size(timeline: UUID): - res = client.timeline_detail(tenant, timeline) - return res['local']['current_physical_size_non_incremental'] + res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True) + return res["local"]["current_physical_size_non_incremental"] timeline_total_size = get_timeline_physical_size(timeline) for i in range(10): @@ -327,10 +400,12 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): timeline = env.neon_cli.create_branch(f"test_tenant_physical_size_{i}", tenant_id=tenant) pg = env.postgres.create_start(f"test_tenant_physical_size_{i}", tenant_id=tenant) - pg.safe_psql_many([ - "CREATE TABLE foo (t text)", - f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g", - ]) + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g", + ] + ) wait_for_last_flush_lsn(env, pg, tenant, timeline) env.pageserver.safe_psql(f"checkpoint {tenant.hex} {timeline.hex}") @@ -339,7 +414,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): pg.stop() - tenant_physical_size = int(client.tenant_status(tenant_id=tenant)['current_physical_size']) + tenant_physical_size 
= int(client.tenant_status(tenant_id=tenant)["current_physical_size"]) assert tenant_physical_size == timeline_total_size @@ -349,6 +424,8 @@ def assert_physical_size(env: NeonEnv, tenant_id: UUID, timeline_id: UUID): client = env.pageserver.http_client() res = assert_timeline_local(client, tenant_id, timeline_id) timeline_path = env.timeline_dir(tenant_id, timeline_id) - assert res["local"]["current_physical_size"] == res["local"][ - "current_physical_size_non_incremental"] + assert ( + res["local"]["current_physical_size"] + == res["local"]["current_physical_size_non_incremental"] + ) assert res["local"]["current_physical_size"] == get_timeline_dir_size(timeline_path) diff --git a/test_runner/batch_others/test_twophase.py b/test_runner/regress/test_twophase.py similarity index 74% rename from test_runner/batch_others/test_twophase.py rename to test_runner/regress/test_twophase.py index 04e3d0b7bc..f3b0f9ca06 100644 --- a/test_runner/batch_others/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -1,7 +1,7 @@ import os -from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn # @@ -10,37 +10,37 @@ from fixtures.log_helper import log def test_twophase(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_twophase", "empty") - pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5']) + pg = env.postgres.create_start("test_twophase", config_lines=["max_prepared_transactions=5"]) log.info("postgres is running on 'test_twophase' branch") conn = pg.connect() cur = conn.cursor() - cur.execute('CREATE TABLE foo (t text)') + cur.execute("CREATE TABLE foo (t text)") # Prepare a transaction that will insert a row - cur.execute('BEGIN') + cur.execute("BEGIN") cur.execute("INSERT INTO foo VALUES ('one')") cur.execute("PREPARE TRANSACTION 'insert_one'") # Prepare another transaction that will insert a row - 
cur.execute('BEGIN') + cur.execute("BEGIN") cur.execute("INSERT INTO foo VALUES ('two')") cur.execute("PREPARE TRANSACTION 'insert_two'") # Prepare a transaction that will insert a row - cur.execute('BEGIN') + cur.execute("BEGIN") cur.execute("INSERT INTO foo VALUES ('three')") cur.execute("PREPARE TRANSACTION 'insert_three'") # Prepare another transaction that will insert a row - cur.execute('BEGIN') + cur.execute("BEGIN") cur.execute("INSERT INTO foo VALUES ('four')") cur.execute("PREPARE TRANSACTION 'insert_four'") # On checkpoint state data copied to files in # pg_twophase directory and fsynced - cur.execute('CHECKPOINT') + cur.execute("CHECKPOINT") twophase_files = os.listdir(pg.pg_twophase_dir_path()) log.info(twophase_files) @@ -48,19 +48,19 @@ def test_twophase(neon_simple_env: NeonEnv): cur.execute("COMMIT PREPARED 'insert_three'") cur.execute("ROLLBACK PREPARED 'insert_four'") - cur.execute('CHECKPOINT') + cur.execute("CHECKPOINT") twophase_files = os.listdir(pg.pg_twophase_dir_path()) log.info(twophase_files) assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - env.neon_cli.create_branch("test_twophase_prepared", "test_twophase") + fork_at_current_lsn(env, pg, "test_twophase_prepared", "test_twophase") # Start compute on the new branch pg2 = env.postgres.create_start( - 'test_twophase_prepared', - config_lines=['max_prepared_transactions=5'], + "test_twophase_prepared", + config_lines=["max_prepared_transactions=5"], ) # Check that we restored only needed twophase files @@ -76,9 +76,9 @@ def test_twophase(neon_simple_env: NeonEnv): cur2.execute("COMMIT PREPARED 'insert_one'") cur2.execute("ROLLBACK PREPARED 'insert_two'") - cur2.execute('SELECT * FROM foo') - assert cur2.fetchall() == [('one', ), ('three', )] + cur2.execute("SELECT * FROM foo") + assert cur2.fetchall() == [("one",), ("three",)] # Only one committed insert is visible on the original branch - cur.execute('SELECT * FROM foo') - assert cur.fetchall() == 
[('three', )] + cur.execute("SELECT * FROM foo") + assert cur.fetchall() == [("three",)] diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/regress/test_vm_bits.py similarity index 54% rename from test_runner/batch_others/test_vm_bits.py rename to test_runner/regress/test_vm_bits.py index 29b55f5b8c..16a870471b 100644 --- a/test_runner/batch_others/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -1,5 +1,5 @@ -from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn # @@ -10,48 +10,50 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_vm_bit_clear", "empty") - pg = env.postgres.create_start('test_vm_bit_clear') + pg = env.postgres.create_start("test_vm_bit_clear") log.info("postgres is running on 'test_vm_bit_clear' branch") pg_conn = pg.connect() cur = pg_conn.cursor() # Install extension containing function needed for test - cur.execute('CREATE EXTENSION neon_test_utils') + cur.execute("CREATE EXTENSION neon_test_utils") # Create a test table and freeze it to set the VM bit. - cur.execute('CREATE TABLE vmtest_delete (id integer PRIMARY KEY)') - cur.execute('INSERT INTO vmtest_delete VALUES (1)') - cur.execute('VACUUM FREEZE vmtest_delete') + cur.execute("CREATE TABLE vmtest_delete (id integer PRIMARY KEY)") + cur.execute("INSERT INTO vmtest_delete VALUES (1)") + cur.execute("VACUUM FREEZE vmtest_delete") - cur.execute('CREATE TABLE vmtest_update (id integer PRIMARY KEY)') - cur.execute('INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g') - cur.execute('VACUUM FREEZE vmtest_update') + cur.execute("CREATE TABLE vmtest_update (id integer PRIMARY KEY)") + cur.execute("INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g") + cur.execute("VACUUM FREEZE vmtest_update") # DELETE and UPDATE the rows. 
- cur.execute('DELETE FROM vmtest_delete WHERE id = 1') - cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1') + cur.execute("DELETE FROM vmtest_delete WHERE id = 1") + cur.execute("UPDATE vmtest_update SET id = 5000 WHERE id = 1") # Branch at this point, to test that later - env.neon_cli.create_branch("test_vm_bit_clear_new", "test_vm_bit_clear") + fork_at_current_lsn(env, pg, "test_vm_bit_clear_new", "test_vm_bit_clear") # Clear the buffer cache, to force the VM page to be re-fetched from # the page server - cur.execute('SELECT clear_buffer_cache()') + cur.execute("SELECT clear_buffer_cache()") # Check that an index-only scan doesn't see the deleted row. If the # clearing of the VM bit was not replayed correctly, this would incorrectly # return deleted row. - cur.execute(''' + cur.execute( + """ set enable_seqscan=off; set enable_indexscan=on; set enable_bitmapscan=off; - ''') + """ + ) - cur.execute('SELECT * FROM vmtest_delete WHERE id = 1') - assert (cur.fetchall() == []) - cur.execute('SELECT * FROM vmtest_update WHERE id = 1') - assert (cur.fetchall() == []) + cur.execute("SELECT * FROM vmtest_delete WHERE id = 1") + assert cur.fetchall() == [] + cur.execute("SELECT * FROM vmtest_update WHERE id = 1") + assert cur.fetchall() == [] cur.close() @@ -61,19 +63,21 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # a dirty VM page is evicted. If the VM bit was not correctly cleared by the # earlier WAL record, the full-page image hides the problem. Starting a new # server at the right point-in-time avoids that full-page image. 
- pg_new = env.postgres.create_start('test_vm_bit_clear_new') + pg_new = env.postgres.create_start("test_vm_bit_clear_new") log.info("postgres is running on 'test_vm_bit_clear_new' branch") pg_new_conn = pg_new.connect() cur_new = pg_new_conn.cursor() - cur_new.execute(''' + cur_new.execute( + """ set enable_seqscan=off; set enable_indexscan=on; set enable_bitmapscan=off; - ''') + """ + ) - cur_new.execute('SELECT * FROM vmtest_delete WHERE id = 1') - assert (cur_new.fetchall() == []) - cur_new.execute('SELECT * FROM vmtest_update WHERE id = 1') - assert (cur_new.fetchall() == []) + cur_new.execute("SELECT * FROM vmtest_delete WHERE id = 1") + assert cur_new.fetchall() == [] + cur_new.execute("SELECT * FROM vmtest_update WHERE id = 1") + assert cur_new.fetchall() == [] diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py similarity index 76% rename from test_runner/batch_others/test_wal_acceptor.py rename to test_runner/regress/test_wal_acceptor.py index b6f914858e..28daeb18ed 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1,42 +1,59 @@ -import pathlib -import pytest -import random -import time import os +import pathlib +import random import shutil import signal import subprocess import sys import threading +import time import uuid - from contextlib import closing from dataclasses import dataclass, field from pathlib import Path -from fixtures.neon_fixtures import NeonPageserver, PgBin, Etcd, Postgres, RemoteStorageKind, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, available_remote_storages, neon_binpath, PgProtocol, wait_for_last_record_lsn, wait_for_upload -from fixtures.utils import get_dir_size, lsn_to_hex, lsn_from_hex, query_scalar -from fixtures.log_helper import log -from typing import List, Optional, Any +from typing import Any, List, Optional from uuid import uuid4 +import pytest +from 
fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Etcd, + NeonEnv, + NeonEnvBuilder, + NeonPageserver, + PgBin, + PgProtocol, + PortDistributor, + Postgres, + RemoteStorageKind, + RemoteStorageUsers, + Safekeeper, + SafekeeperPort, + available_remote_storages, + neon_binpath, + wait_for_last_record_lsn, + wait_for_upload, +) +from fixtures.utils import get_dir_size, lsn_from_hex, lsn_to_hex, query_scalar -def wait_lsn_force_checkpoint(tenant_id: str, - timeline_id: str, - pg: Postgres, - ps: NeonPageserver, - pageserver_conn_options={}): - lsn = lsn_from_hex(pg.safe_psql('SELECT pg_current_wal_flush_lsn()')[0][0]) + +def wait_lsn_force_checkpoint( + tenant_id: str, timeline_id: str, pg: Postgres, ps: NeonPageserver, pageserver_conn_options={} +): + lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) log.info(f"pg_current_wal_flush_lsn is {lsn_to_hex(lsn)}, waiting for it on pageserver") auth_token = None - if 'password' in pageserver_conn_options: - auth_token = pageserver_conn_options['password'] + if "password" in pageserver_conn_options: + auth_token = pageserver_conn_options["password"] # wait for the pageserver to catch up - wait_for_last_record_lsn(ps.http_client(auth_token=auth_token), - uuid.UUID(hex=tenant_id), - uuid.UUID(hex=timeline_id), - lsn) + wait_for_last_record_lsn( + ps.http_client(auth_token=auth_token), + uuid.UUID(hex=tenant_id), + uuid.UUID(hex=timeline_id), + lsn, + ) # force checkpoint to advance remote_consistent_lsn with closing(ps.connect(**pageserver_conn_options)) as psconn: @@ -44,10 +61,12 @@ def wait_lsn_force_checkpoint(tenant_id: str, pscur.execute(f"checkpoint {tenant_id} {timeline_id}") # ensure that remote_consistent_lsn is advanced - wait_for_upload(ps.http_client(auth_token=auth_token), - uuid.UUID(hex=tenant_id), - uuid.UUID(hex=timeline_id), - lsn) + wait_for_upload( + ps.http_client(auth_token=auth_token), + uuid.UUID(hex=tenant_id), + uuid.UUID(hex=timeline_id), + lsn, + ) 
@dataclass @@ -89,7 +108,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): with env.pageserver.http_client() as pageserver_http: timeline_details = [ pageserver_http.timeline_detail( - tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name]) + tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name] + ) for branch_name in branch_names ] # All changes visible to pageserver (last_record_lsn) should be @@ -105,14 +125,14 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): for timeline_detail in timeline_details: timeline_id: str = timeline_detail["timeline_id"] - local_timeline_detail = timeline_detail.get('local') + local_timeline_detail = timeline_detail.get("local") if local_timeline_detail is None: log.debug(f"Timeline {timeline_id} is not present locally, skipping") continue m = TimelineMetrics( timeline_id=timeline_id, - last_record_lsn=lsn_from_hex(local_timeline_detail['last_record_lsn']), + last_record_lsn=lsn_from_hex(local_timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)]) @@ -120,14 +140,20 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): # Invariant. May be < when transaction is in progress. - assert commit_lsn <= flush_lsn, f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + assert ( + commit_lsn <= flush_lsn + ), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" # We only call collect_metrics() after a transaction is confirmed by # the compute node, which only happens after a consensus of safekeepers # has confirmed the transaction. We assume majority consensus here. 
- assert (2 * sum(m.last_record_lsn <= lsn - for lsn in m.flush_lsns) > neon_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" - assert (2 * sum(m.last_record_lsn <= lsn - for lsn in m.commit_lsns) > neon_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + assert ( + 2 * sum(m.last_record_lsn <= lsn for lsn in m.flush_lsns) + > neon_env_builder.num_safekeepers + ), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + assert ( + 2 * sum(m.last_record_lsn <= lsn for lsn in m.commit_lsns) + > neon_env_builder.num_safekeepers + ), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" timeline_metrics.append(m) log.info(f"{message}: {timeline_metrics}") return timeline_metrics @@ -154,9 +180,11 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): while not self.should_stop.is_set(): collect_metrics("during INSERT INTO") time.sleep(1) - except: - log.error("MetricsChecker's thread failed, the test will be failed on .stop() call", - exc_info=True) + except: # noqa: E722 + log.error( + "MetricsChecker's thread failed, the test will be failed on .stop() call", + exc_info=True, + ) # We want to preserve traceback as well as the exception exc_type, exc_value, exc_tb = sys.exc_info() assert exc_type @@ -183,7 +211,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): # Check data for 2/3 timelines for pg in pgs[:-1]: res = pg.safe_psql("SELECT sum(key) FROM t") - assert res[0] == (5000050000, ) + assert res[0] == (5000050000,) final_m = collect_metrics("after SELECT") # Assume that LSNs (a) behave similarly in all timelines; and (b) INSERT INTO alters LSN significantly. 
@@ -208,8 +236,8 @@ def test_restarts(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = n_acceptors env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_safekeepers_restarts') - pg = env.postgres.create_start('test_safekeepers_restarts') + env.neon_cli.create_branch("test_safekeepers_restarts") + pg = env.postgres.create_start("test_safekeepers_restarts") # we rely upon autocommit after each statement # as waiting for acceptors happens there @@ -217,9 +245,9 @@ def test_restarts(neon_env_builder: NeonEnvBuilder): cur = pg_conn.cursor() failed_node = None - cur.execute('CREATE TABLE t(key int primary key, value text)') + cur.execute("CREATE TABLE t(key int primary key, value text)") for i in range(n_inserts): - cur.execute("INSERT INTO t values (%s, 'payload');", (i + 1, )) + cur.execute("INSERT INTO t values (%s, 'payload');", (i + 1,)) if random.random() <= fault_probability: if failed_node is None: @@ -228,7 +256,7 @@ def test_restarts(neon_env_builder: NeonEnvBuilder): else: failed_node.start() failed_node = None - assert query_scalar(cur, 'SELECT sum(key) FROM t') == 500500 + assert query_scalar(cur, "SELECT sum(key) FROM t") == 500500 # Test that safekeepers push their info to the broker and learn peer status from it @@ -238,7 +266,7 @@ def test_broker(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_broker", "main") - pg = env.postgres.create_start('test_broker') + pg = env.postgres.create_start("test_broker") pg.safe_psql("CREATE TABLE t(key int primary key, value text)") # learn neon timeline from compute @@ -260,9 +288,10 @@ def test_broker(neon_env_builder: NeonEnvBuilder): while True: stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] if all( - lsn_from_hex(s_after.remote_consistent_lsn) > lsn_from_hex( - s_before.remote_consistent_lsn) for s_after, - s_before in zip(stat_after, stat_before)): + 
lsn_from_hex(s_after.remote_consistent_lsn) + > lsn_from_hex(s_before.remote_consistent_lsn) + for s_after, s_before in zip(stat_after, stat_before) + ): break elapsed = time.time() - started_at if elapsed > 20: @@ -273,7 +302,7 @@ def test_broker(neon_env_builder: NeonEnvBuilder): # Test that old WAL consumed by peers and pageserver is removed from safekeepers. -@pytest.mark.parametrize('auth_enabled', [False, True]) +@pytest.mark.parametrize("auth_enabled", [False, True]) def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.num_safekeepers = 2 # to advance remote_consistent_lsn @@ -281,16 +310,18 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_safekeepers_wal_removal') - pg = env.postgres.create_start('test_safekeepers_wal_removal') + env.neon_cli.create_branch("test_safekeepers_wal_removal") + pg = env.postgres.create_start("test_safekeepers_wal_removal") # Note: it is important to insert at least two segments, as currently # control file is synced roughly once in segment range and WAL is not # removed until all horizons are persisted. 
- pg.safe_psql_many([ - 'CREATE TABLE t(key int primary key, value text)', - "INSERT INTO t SELECT generate_series(1,200000), 'payload'", - ]) + pg.safe_psql_many( + [ + "CREATE TABLE t(key int primary key, value text)", + "INSERT INTO t SELECT generate_series(1,200000), 'payload'", + ] + ) tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] @@ -298,12 +329,12 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # force checkpoint to advance remote_consistent_lsn pageserver_conn_options = {} if auth_enabled: - pageserver_conn_options['password'] = env.auth_keys.generate_tenant_token(tenant_id) + pageserver_conn_options["password"] = env.auth_keys.generate_tenant_token(tenant_id) wait_lsn_force_checkpoint(tenant_id, timeline_id, pg, env.pageserver, pageserver_conn_options) # We will wait for first segment removal. Make sure they exist for starter. first_segments = [ - os.path.join(sk.data_dir(), tenant_id, timeline_id, '000000010000000000000001') + os.path.join(sk.data_dir(), tenant_id, timeline_id, "000000010000000000000001") for sk in env.safekeepers ] assert all(os.path.exists(p) for p in first_segments) @@ -312,25 +343,33 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): http_cli = env.safekeepers[0].http_client() else: http_cli = env.safekeepers[0].http_client( - auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + auth_token=env.auth_keys.generate_tenant_token(tenant_id) + ) http_cli_other = env.safekeepers[0].http_client( - auth_token=env.auth_keys.generate_tenant_token(uuid4().hex)) + auth_token=env.auth_keys.generate_tenant_token(uuid4().hex) + ) http_cli_noauth = env.safekeepers[0].http_client() # Pretend WAL is offloaded to s3. 
if auth_enabled: - old_backup_lsn = http_cli.timeline_status(tenant_id=tenant_id, - timeline_id=timeline_id).backup_lsn - assert 'FFFFFFFF/FEFFFFFF' != old_backup_lsn + old_backup_lsn = http_cli.timeline_status( + tenant_id=tenant_id, timeline_id=timeline_id + ).backup_lsn + assert "FFFFFFFF/FEFFFFFF" != old_backup_lsn for cli in [http_cli_other, http_cli_noauth]: - with pytest.raises(cli.HTTPError, match='Forbidden|Unauthorized'): - cli.record_safekeeper_info(tenant_id, - timeline_id, {'backup_lsn': 'FFFFFFFF/FEFFFFFF'}) - assert old_backup_lsn == http_cli.timeline_status(tenant_id=tenant_id, - timeline_id=timeline_id).backup_lsn - http_cli.record_safekeeper_info(tenant_id, timeline_id, {'backup_lsn': 'FFFFFFFF/FEFFFFFF'}) - assert 'FFFFFFFF/FEFFFFFF' == http_cli.timeline_status(tenant_id=tenant_id, - timeline_id=timeline_id).backup_lsn + with pytest.raises(cli.HTTPError, match="Forbidden|Unauthorized"): + cli.record_safekeeper_info( + tenant_id, timeline_id, {"backup_lsn": "FFFFFFFF/FEFFFFFF"} + ) + assert ( + old_backup_lsn + == http_cli.timeline_status(tenant_id=tenant_id, timeline_id=timeline_id).backup_lsn + ) + http_cli.record_safekeeper_info(tenant_id, timeline_id, {"backup_lsn": "FFFFFFFF/FEFFFFFF"}) + assert ( + "FFFFFFFF/FEFFFFFF" + == http_cli.timeline_status(tenant_id=tenant_id, timeline_id=timeline_id).backup_lsn + ) # wait till first segment is removed on all safekeepers started_at = time.time() @@ -355,7 +394,8 @@ def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end): elapsed = time.time() - started_at if elapsed > 30: raise RuntimeError( - f"timed out waiting {elapsed:.0f}s for segment ending at {seg_end} get offloaded") + f"timed out waiting {elapsed:.0f}s for segment ending at {seg_end} get offloaded" + ) time.sleep(0.5) @@ -364,8 +404,9 @@ def wait_wal_trim(tenant_id, timeline_id, sk, target_size): http_cli = sk.http_client() while True: tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = 
get_dir_size(os.path.join(sk.data_dir(), tenant_id, - timeline_id)) / 1024 / 1024 + sk_wal_size = ( + get_dir_size(os.path.join(sk.data_dir(), tenant_id, timeline_id)) / 1024 / 1024 + ) log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size:.2f}MB status={tli_status}") if sk_wal_size <= target_size: @@ -379,21 +420,21 @@ def wait_wal_trim(tenant_id, timeline_id, sk, target_size): time.sleep(0.5) -@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages()) -def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind): +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storatge_kind, - test_name='test_safekeepers_wal_backup', + remote_storage_kind=remote_storage_kind, + test_name="test_safekeepers_wal_backup", ) neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_safekeepers_wal_backup') - pg = env.postgres.create_start('test_safekeepers_wal_backup') + env.neon_cli.create_branch("test_safekeepers_wal_backup") + pg = env.postgres.create_start("test_safekeepers_wal_backup") # learn neon timeline from compute tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] @@ -401,11 +442,11 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: Remo pg_conn = pg.connect() cur = pg_conn.cursor() - cur.execute('create table t(key int, value text)') + cur.execute("create table t(key int, value text)") # Shut down subsequently each of safekeepers and fill a segment while sk is # down; ensure segment gets offloaded by others. 
- offloaded_seg_end = ['0/2000000', '0/3000000', '0/4000000'] + offloaded_seg_end = ["0/2000000", "0/3000000", "0/4000000"] for victim, seg_end in zip(env.safekeepers, offloaded_seg_end): victim.stop() # roughly fills one segment @@ -419,36 +460,36 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: Remo # put one of safekeepers down again env.safekeepers[0].stop() # restart postgres - pg.stop_and_destroy().create_start('test_safekeepers_wal_backup') + pg.stop_and_destroy().create_start("test_safekeepers_wal_backup") # and ensure offloading still works with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute("insert into t select generate_series(1,250000), 'payload'") - wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], '0/5000000') + wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], "0/5000000") -@pytest.mark.parametrize('remote_storatge_kind', available_remote_storages()) -def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: RemoteStorageKind): +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storatge_kind, - test_name='test_s3_wal_replay', + remote_storage_kind=remote_storage_kind, + test_name="test_s3_wal_replay", ) neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_s3_wal_replay') + env.neon_cli.create_branch("test_s3_wal_replay") env.pageserver.stop() - pageserver_tenants_dir = os.path.join(env.repo_dir, 'tenants') - pageserver_fresh_copy = os.path.join(env.repo_dir, 'tenants_fresh') + pageserver_tenants_dir = os.path.join(env.repo_dir, "tenants") + pageserver_fresh_copy = os.path.join(env.repo_dir, "tenants_fresh") log.info(f"Creating a copy 
of pageserver in a fresh state at {pageserver_fresh_copy}") shutil.copytree(pageserver_tenants_dir, pageserver_fresh_copy) env.pageserver.start() - pg = env.postgres.create_start('test_s3_wal_replay') + pg = env.postgres.create_start("test_s3_wal_replay") # learn neon timeline from compute tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] @@ -462,7 +503,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: R cur.execute("insert into t values (1, 'payload')") expected_sum += 1 - offloaded_seg_end = ['0/3000000'] + offloaded_seg_end = ["0/3000000"] for seg_end in offloaded_seg_end: # roughly fills two segments cur.execute("insert into t select generate_series(1,500000), 'payload'") @@ -476,28 +517,30 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: R # advance remote_consistent_lsn to trigger WAL trimming # this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push etcd updates env.safekeepers[0].http_client().record_safekeeper_info( - tenant_id, timeline_id, {'remote_consistent_lsn': offloaded_seg_end[-1]}) + tenant_id, timeline_id, {"remote_consistent_lsn": offloaded_seg_end[-1]} + ) for sk in env.safekeepers: # require WAL to be trimmed, so no more than one segment is left on disk wait_wal_trim(tenant_id, timeline_id, sk, 16 * 1.5) - last_lsn = query_scalar(cur, 'SELECT pg_current_wal_flush_lsn()') + last_lsn = query_scalar(cur, "SELECT pg_current_wal_flush_lsn()") pageserver_lsn = env.pageserver.http_client().timeline_detail( - uuid.UUID(tenant_id), uuid.UUID((timeline_id)))["local"]["last_record_lsn"] + uuid.UUID(tenant_id), uuid.UUID((timeline_id)) + )["local"]["last_record_lsn"] lag = lsn_from_hex(last_lsn) - lsn_from_hex(pageserver_lsn) log.info( - f'Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb' + f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 
1024}kb" ) # replace pageserver with a fresh copy pg.stop_and_destroy() env.pageserver.stop() - log.info(f'Removing current pageserver state at {pageserver_tenants_dir}') + log.info(f"Removing current pageserver state at {pageserver_tenants_dir}") shutil.rmtree(pageserver_tenants_dir) - log.info(f'Copying fresh pageserver state from {pageserver_fresh_copy}') + log.info(f"Copying fresh pageserver state from {pageserver_fresh_copy}") shutil.move(pageserver_fresh_copy, pageserver_tenants_dir) # start pageserver and wait for replay @@ -509,39 +552,43 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storatge_kind: R while True: elapsed = time.time() - started_at if elapsed > wait_lsn_timeout: - raise RuntimeError(f'Timed out waiting for WAL redo') + raise RuntimeError("Timed out waiting for WAL redo") pageserver_lsn = env.pageserver.http_client().timeline_detail( - uuid.UUID(tenant_id), uuid.UUID((timeline_id)))["local"]["last_record_lsn"] + uuid.UUID(tenant_id), uuid.UUID((timeline_id)) + )["local"]["last_record_lsn"] lag = lsn_from_hex(last_lsn) - lsn_from_hex(pageserver_lsn) if time.time() > last_debug_print + 10 or lag <= 0: last_debug_print = time.time() - log.info(f'Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb') + log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb") if lag <= 0: break time.sleep(1) - log.info(f'WAL redo took {elapsed} s') + log.info(f"WAL redo took {elapsed} s") # verify data - pg.create_start('test_s3_wal_replay') + pg.create_start("test_s3_wal_replay") assert pg.safe_psql("select sum(key) from t")[0][0] == expected_sum class ProposerPostgres(PgProtocol): """Object for running postgres without NeonEnv""" - def __init__(self, - pgdata_dir: str, - pg_bin, - timeline_id: uuid.UUID, - tenant_id: uuid.UUID, - listen_addr: str, - port: int): - super().__init__(host=listen_addr, port=port, user='cloud_admin', dbname='postgres') + + def __init__( + self, + pgdata_dir: str, + pg_bin, + 
timeline_id: uuid.UUID, + tenant_id: uuid.UUID, + listen_addr: str, + port: int, + ): + super().__init__(host=listen_addr, port=port, user="cloud_admin", dbname="postgres") self.pgdata_dir: str = pgdata_dir self.pg_bin: PgBin = pg_bin @@ -551,15 +598,15 @@ class ProposerPostgres(PgProtocol): self.port: int = port def pg_data_dir_path(self) -> str: - """ Path to data directory """ + """Path to data directory""" return self.pgdata_dir def config_file_path(self) -> str: - """ Path to postgresql.conf """ - return os.path.join(self.pgdata_dir, 'postgresql.conf') + """Path to postgresql.conf""" + return os.path.join(self.pgdata_dir, "postgresql.conf") def create_dir_config(self, safekeepers: str): - """ Create dir and config for running --sync-safekeepers """ + """Create dir and config for running --sync-safekeepers""" pathlib.Path(self.pg_data_dir_path()).mkdir(exist_ok=True) with open(self.config_file_path(), "w") as f: @@ -568,8 +615,8 @@ class ProposerPostgres(PgProtocol): "shared_preload_libraries = 'neon'\n", f"neon.timeline_id = '{self.timeline_id.hex}'\n", f"neon.tenant_id = '{self.tenant_id.hex}'\n", - f"neon.pageserver_connstring = ''\n", - f"safekeepers = '{safekeepers}'\n", + "neon.pageserver_connstring = ''\n", + f"neon.safekeepers = '{safekeepers}'\n", f"listen_addresses = '{self.listen_addr}'\n", f"port = '{self.port}'\n", ] @@ -588,36 +635,36 @@ class ProposerPostgres(PgProtocol): } basepath = self.pg_bin.run_capture(command, env) - stdout_filename = basepath + '.stdout' + stdout_filename = basepath + ".stdout" - with open(stdout_filename, 'r') as stdout_f: + with open(stdout_filename, "r") as stdout_f: stdout = stdout_f.read() return stdout.strip("\n ") def initdb(self): - """ Run initdb """ + """Run initdb""" args = ["initdb", "-U", "cloud_admin", "-D", self.pg_data_dir_path()] self.pg_bin.run(args) def start(self): - """ Start postgres with pg_ctl """ + """Start postgres with pg_ctl""" log_path = os.path.join(self.pg_data_dir_path(), "pg.log") args = 
["pg_ctl", "-D", self.pg_data_dir_path(), "-l", log_path, "-w", "start"] self.pg_bin.run(args) def stop(self): - """ Stop postgres with pg_ctl """ + """Stop postgres with pg_ctl""" args = ["pg_ctl", "-D", self.pg_data_dir_path(), "-m", "immediate", "-w", "stop"] self.pg_bin.run(args) # insert wal in all safekeepers and run sync on proposer -def test_sync_safekeepers(neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - port_distributor: PortDistributor): +def test_sync_safekeepers( + neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor +): # We don't really need the full environment for this test, just the # safekeepers would be enough. @@ -629,12 +676,9 @@ def test_sync_safekeepers(neon_env_builder: NeonEnvBuilder, # write config for proposer pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata") - pg = ProposerPostgres(pgdata_dir, - pg_bin, - timeline_id, - tenant_id, - '127.0.0.1', - port_distributor.get_port()) + pg = ProposerPostgres( + pgdata_dir, pg_bin, timeline_id, tenant_id, "127.0.0.1", port_distributor.get_port() + ) pg.create_dir_config(env.get_safekeeper_connstrs()) # valid lsn, which is not in the segment start, nor in zero segment @@ -669,13 +713,13 @@ def test_sync_safekeepers(neon_env_builder: NeonEnvBuilder, assert all(lsn_after_sync == lsn for lsn in lsn_after_append) -@pytest.mark.parametrize('auth_enabled', [False, True]) +@pytest.mark.parametrize("auth_enabled", [False, True]) def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_timeline_status') - pg = env.postgres.create_start('test_timeline_status') + env.neon_cli.create_branch("test_timeline_status") + pg = env.postgres.create_start("test_timeline_status") wa = env.safekeepers[0] @@ -690,7 +734,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): wa_http_cli = 
wa.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) wa_http_cli.check_status() wa_http_cli_bad = wa.http_client( - auth_token=env.auth_keys.generate_tenant_token(uuid4().hex)) + auth_token=env.auth_keys.generate_tenant_token(uuid4().hex) + ) wa_http_cli_bad.check_status() wa_http_cli_noauth = wa.http_client() wa_http_cli_noauth.check_status() @@ -702,7 +747,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): if auth_enabled: for cli in [wa_http_cli_bad, wa_http_cli_noauth]: - with pytest.raises(cli.HTTPError, match='Forbidden|Unauthorized'): + with pytest.raises(cli.HTTPError, match="Forbidden|Unauthorized"): cli.timeline_status(tenant_id, timeline_id) pg.safe_psql("create table t(i int)") @@ -720,19 +765,23 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): class SafekeeperEnv: - def __init__(self, - repo_dir: Path, - port_distributor: PortDistributor, - pg_bin: PgBin, - num_safekeepers: int = 1): + def __init__( + self, + repo_dir: Path, + port_distributor: PortDistributor, + pg_bin: PgBin, + num_safekeepers: int = 1, + ): self.repo_dir = repo_dir self.port_distributor = port_distributor - self.broker = Etcd(datadir=os.path.join(self.repo_dir, "etcd"), - port=self.port_distributor.get_port(), - peer_port=self.port_distributor.get_port()) + self.broker = Etcd( + datadir=os.path.join(self.repo_dir, "etcd"), + port=self.port_distributor.get_port(), + peer_port=self.port_distributor.get_port(), + ) self.pg_bin = pg_bin self.num_safekeepers = num_safekeepers - self.bin_safekeeper = os.path.join(str(neon_binpath), 'safekeeper') + self.bin_safekeeper = os.path.join(str(neon_binpath), "safekeeper") self.safekeepers: Optional[List[subprocess.CompletedProcess[Any]]] = None self.postgres: Optional[ProposerPostgres] = None self.tenant_id: Optional[uuid.UUID] = None @@ -778,23 +827,25 @@ class SafekeeperEnv: str(i), "--broker-endpoints", self.broker.client_url(), - "--daemonize" + 
"--daemonize", ] log.info(f'Running command "{" ".join(args)}"') return subprocess.run(args, check=True) def get_safekeeper_connstrs(self): - return ','.join([sk_proc.args[2] for sk_proc in self.safekeepers]) + return ",".join([sk_proc.args[2] for sk_proc in self.safekeepers]) def create_postgres(self): pgdata_dir = os.path.join(self.repo_dir, "proposer_pgdata") - pg = ProposerPostgres(pgdata_dir, - self.pg_bin, - self.timeline_id, - self.tenant_id, - "127.0.0.1", - self.port_distributor.get_port()) + pg = ProposerPostgres( + pgdata_dir, + self.pg_bin, + self.timeline_id, + self.tenant_id, + "127.0.0.1", + self.port_distributor.get_port(), + ) pg.initdb() pg.create_dir_config(self.get_safekeeper_connstrs()) return pg @@ -811,7 +862,7 @@ class SafekeeperEnv: return self def __exit__(self, exc_type, exc_value, traceback): - log.info('Cleaning up all safekeeper and compute nodes') + log.info("Cleaning up all safekeeper and compute nodes") # Stop all the nodes if self.postgres is not None: @@ -821,9 +872,9 @@ class SafekeeperEnv: self.kill_safekeeper(sk_proc.args[6]) -def test_safekeeper_without_pageserver(test_output_dir: str, - port_distributor: PortDistributor, - pg_bin: PgBin): +def test_safekeeper_without_pageserver( + test_output_dir: str, port_distributor: PortDistributor, pg_bin: PgBin +): # Create the environment in the test-specific output dir repo_dir = Path(os.path.join(test_output_dir, "repo")) @@ -845,19 +896,19 @@ def test_safekeeper_without_pageserver(test_output_dir: str, def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str: - return ','.join([f'localhost:{sk.port.pg}' for sk in env.safekeepers if sk.id in sk_names]) + return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names]) def execute_payload(pg: Postgres): with closing(pg.connect()) as conn: with conn.cursor() as cur: # we rely upon autocommit after each statement # as waiting for acceptors 
happens there - cur.execute('CREATE TABLE IF NOT EXISTS t(key int, value text)') + cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") cur.execute("INSERT INTO t VALUES (0, 'something')") - sum_before = query_scalar(cur, 'SELECT SUM(key) FROM t') + sum_before = query_scalar(cur, "SELECT SUM(key) FROM t") cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - sum_after = query_scalar(cur, 'SELECT SUM(key) FROM t') + sum_after = query_scalar(cur, "SELECT SUM(key) FROM t") assert sum_after == sum_before + 5000050000 def show_statuses(safekeepers: List[Safekeeper], tenant_id: str, timeline_id: str): @@ -871,12 +922,12 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 4 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_replace_safekeeper') + env.neon_cli.create_branch("test_replace_safekeeper") log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() active_safekeepers = [1, 2, 3] - pg = env.postgres.create('test_replace_safekeeper') + pg = env.postgres.create("test_replace_safekeeper") pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) pg.start() @@ -914,7 +965,7 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) log.info("Recreate postgres to replace failed sk1 with new sk4") - pg.stop_and_destroy().create('test_replace_safekeeper') + pg.stop_and_destroy().create("test_replace_safekeeper") active_safekeepers = [2, 3, 4] env.safekeepers[3].start() pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) @@ -934,16 +985,16 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): # of WAL segments. 
def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): # used to calculate delta in collect_stats - last_lsn = .0 + last_lsn = 0.0 # returns LSN and pg_wal size, all in MB def collect_stats(pg: Postgres, cur, enable_logs=True): nonlocal last_lsn assert pg.pgdata_dir is not None - log.info('executing INSERT to generate WAL') + log.info("executing INSERT to generate WAL") current_lsn = lsn_from_hex(query_scalar(cur, "select pg_current_wal_lsn()")) / 1024 / 1024 - pg_wal_size = get_dir_size(os.path.join(pg.pgdata_dir, 'pg_wal')) / 1024 / 1024 + pg_wal_size = get_dir_size(os.path.join(pg.pgdata_dir, "pg_wal")) / 1024 / 1024 if enable_logs: log.info(f"LSN delta: {current_lsn - last_lsn} MB, current WAL size: {pg_wal_size} MB") last_lsn = current_lsn @@ -956,15 +1007,16 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_wal_deleted_after_broadcast') + env.neon_cli.create_branch("test_wal_deleted_after_broadcast") # Adjust checkpoint config to prevent keeping old WAL segments pg = env.postgres.create_start( - 'test_wal_deleted_after_broadcast', - config_lines=['min_wal_size=32MB', 'max_wal_size=32MB', 'log_checkpoints=on']) + "test_wal_deleted_after_broadcast", + config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"], + ) pg_conn = pg.connect() cur = pg_conn.cursor() - cur.execute('CREATE TABLE t(key int, value text)') + cur.execute("CREATE TABLE t(key int, value text)") collect_stats(pg, cur) @@ -973,15 +1025,15 @@ def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): generate_wal(cur) collect_stats(pg, cur) - log.info('executing checkpoint') - cur.execute('CHECKPOINT') + log.info("executing checkpoint") + cur.execute("CHECKPOINT") wal_size_after_checkpoint = collect_stats(pg, cur)[1] # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) assert 
wal_size_after_checkpoint < 16 * 2.5 -@pytest.mark.parametrize('auth_enabled', [False, True]) +@pytest.mark.parametrize("auth_enabled", [False, True]) def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.num_safekeepers = 1 neon_env_builder.auth_enabled = auth_enabled @@ -989,25 +1041,25 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # Create two tenants: one will be deleted, other should be preserved. tenant_id = env.initial_tenant.hex - timeline_id_1 = env.neon_cli.create_branch('br1').hex # Active, delete explicitly - timeline_id_2 = env.neon_cli.create_branch('br2').hex # Inactive, delete explicitly - timeline_id_3 = env.neon_cli.create_branch('br3').hex # Active, delete with the tenant - timeline_id_4 = env.neon_cli.create_branch('br4').hex # Inactive, delete with the tenant + timeline_id_1 = env.neon_cli.create_branch("br1").hex # Active, delete explicitly + timeline_id_2 = env.neon_cli.create_branch("br2").hex # Inactive, delete explicitly + timeline_id_3 = env.neon_cli.create_branch("br3").hex # Active, delete with the tenant + timeline_id_4 = env.neon_cli.create_branch("br4").hex # Inactive, delete with the tenant tenant_id_other_uuid, timeline_id_other_uuid = env.neon_cli.create_tenant() tenant_id_other = tenant_id_other_uuid.hex timeline_id_other = timeline_id_other_uuid.hex # Populate branches - pg_1 = env.postgres.create_start('br1') - pg_2 = env.postgres.create_start('br2') - pg_3 = env.postgres.create_start('br3') - pg_4 = env.postgres.create_start('br4') - pg_other = env.postgres.create_start('main', tenant_id=uuid.UUID(hex=tenant_id_other)) + pg_1 = env.postgres.create_start("br1") + pg_2 = env.postgres.create_start("br2") + pg_3 = env.postgres.create_start("br3") + pg_4 = env.postgres.create_start("br4") + pg_other = env.postgres.create_start("main", tenant_id=uuid.UUID(hex=tenant_id_other)) for pg in [pg_1, pg_2, pg_3, pg_4, pg_other]: with closing(pg.connect()) as conn: 
with conn.cursor() as cur: - cur.execute('CREATE TABLE t(key int primary key)') + cur.execute("CREATE TABLE t(key int primary key)") sk = env.safekeepers[0] sk_data_dir = Path(sk.data_dir()) if not auth_enabled: @@ -1016,7 +1068,8 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): else: sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) sk_http_other = sk.http_client( - auth_token=env.auth_keys.generate_tenant_token(tenant_id_other)) + auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) + ) sk_http_noauth = sk.http_client() assert (sk_data_dir / tenant_id / timeline_id_1).is_dir() assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() @@ -1034,7 +1087,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): for pg in [pg_1, pg_3, pg_other]: with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute('INSERT INTO t (key) VALUES (1)') + cur.execute("INSERT INTO t (key) VALUES (1)") # Remove initial tenant's br1 (active) assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == { @@ -1049,7 +1102,8 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # Ensure repeated deletion succeeds assert sk_http.timeline_delete_force(tenant_id, timeline_id_1) == { - "dir_existed": False, "was_active": False + "dir_existed": False, + "was_active": False, } assert not (sk_data_dir / tenant_id / timeline_id_1).exists() assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() @@ -1060,9 +1114,9 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): if auth_enabled: # Ensure we cannot delete the other tenant for sk_h in [sk_http, sk_http_noauth]: - with pytest.raises(sk_h.HTTPError, match='Forbidden|Unauthorized'): + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): assert sk_h.timeline_delete_force(tenant_id_other, timeline_id_other) - with pytest.raises(sk_h.HTTPError, 
match='Forbidden|Unauthorized'): + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): assert sk_h.tenant_delete_force(tenant_id_other) assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() @@ -1078,7 +1132,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() # Remove non-existing branch, should succeed - assert sk_http.timeline_delete_force(tenant_id, '00' * 16) == { + assert sk_http.timeline_delete_force(tenant_id, "00" * 16) == { "dir_existed": False, "was_active": False, } @@ -1107,4 +1161,4 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): sk_http_other.timeline_status(tenant_id_other, timeline_id_other) with closing(pg_other.connect()) as conn: with conn.cursor() as cur: - cur.execute('INSERT INTO t (key) VALUES (123)') + cur.execute("INSERT INTO t (key) VALUES (123)") diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py similarity index 67% rename from test_runner/batch_others/test_wal_acceptor_async.py rename to test_runner/regress/test_wal_acceptor_async.py index e1d3ba0919..83285e0cbe 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -1,17 +1,16 @@ import asyncio -import uuid - -import asyncpg import random import time - -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper -from fixtures.log_helper import getLogger -from fixtures.utils import lsn_from_hex, lsn_to_hex -from typing import List, Optional +import uuid from dataclasses import dataclass +from typing import List, Optional -log = getLogger('root.safekeeper_async') +import asyncpg +from fixtures.log_helper import getLogger +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper +from fixtures.utils import lsn_from_hex, lsn_to_hex + +log = 
getLogger("root.safekeeper_async") class BankClient(object): @@ -21,21 +20,22 @@ class BankClient(object): self.init_amount = init_amount async def initdb(self): - await self.conn.execute('DROP TABLE IF EXISTS bank_accs') - await self.conn.execute('CREATE TABLE bank_accs(uid int primary key, amount int)') + await self.conn.execute("DROP TABLE IF EXISTS bank_accs") + await self.conn.execute("CREATE TABLE bank_accs(uid int primary key, amount int)") await self.conn.execute( - ''' + """ INSERT INTO bank_accs SELECT *, $1 FROM generate_series(0, $2) - ''', + """, self.init_amount, - self.n_accounts - 1) - await self.conn.execute('DROP TABLE IF EXISTS bank_log') - await self.conn.execute('CREATE TABLE bank_log(from_uid int, to_uid int, amount int)') + self.n_accounts - 1, + ) + await self.conn.execute("DROP TABLE IF EXISTS bank_log") + await self.conn.execute("CREATE TABLE bank_log(from_uid int, to_uid int, amount int)") async def check_invariant(self): - row = await self.conn.fetchrow('SELECT sum(amount) AS sum FROM bank_accs') - assert row['sum'] == self.n_accounts * self.init_amount + row = await self.conn.fetchrow("SELECT sum(amount) AS sum FROM bank_accs") + assert row["sum"] == self.n_accounts * self.init_amount async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount): @@ -45,17 +45,17 @@ async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount): async with conn.transaction(): await conn.execute( - 'UPDATE bank_accs SET amount = amount + ($1) WHERE uid = $2', + "UPDATE bank_accs SET amount = amount + ($1) WHERE uid = $2", amount, to_uid, ) await conn.execute( - 'UPDATE bank_accs SET amount = amount - ($1) WHERE uid = $2', + "UPDATE bank_accs SET amount = amount - ($1) WHERE uid = $2", amount, from_uid, ) await conn.execute( - 'INSERT INTO bank_log VALUES ($1, $2, $3)', + "INSERT INTO bank_log VALUES ($1, $2, $3)", from_uid, to_uid, amount, @@ -80,12 +80,12 @@ class WorkerStats(object): assert all(cnt > 0 for cnt in 
self.counters) progress = sum(self.counters) - log.info('All workers made {} transactions'.format(progress)) + log.info("All workers made {} transactions".format(progress)) async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accounts, max_transfer): pg_conn = await pg.connect_async() - log.debug('Started worker {}'.format(worker_id)) + log.debug("Started worker {}".format(worker_id)) while stats.running: from_uid = random.randint(0, n_accounts - 1) @@ -95,19 +95,21 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou await bank_transfer(pg_conn, from_uid, to_uid, amount) stats.inc_progress(worker_id) - log.debug('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid)) + log.debug("Executed transfer({}) {} => {}".format(amount, from_uid, to_uid)) - log.debug('Finished worker {}'.format(worker_id)) + log.debug("Finished worker {}".format(worker_id)) await pg_conn.close() -async def wait_for_lsn(safekeeper: Safekeeper, - tenant_id: str, - timeline_id: str, - wait_lsn: str, - polling_interval=1, - timeout=60): +async def wait_for_lsn( + safekeeper: Safekeeper, + tenant_id: str, + timeline_id: str, + wait_lsn: str, + polling_interval=1, + timeout=60, +): """ Poll flush_lsn from safekeeper until it's greater or equal than provided wait_lsn. 
To do that, timeline_status is fetched from @@ -119,7 +121,7 @@ async def wait_for_lsn(safekeeper: Safekeeper, flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn log.info( - f'Safekeeper at port {safekeeper.port.pg} has flush_lsn {flush_lsn}, waiting for lsn {wait_lsn}' + f"Safekeeper at port {safekeeper.port.pg} has flush_lsn {flush_lsn}, waiting for lsn {wait_lsn}" ) while lsn_from_hex(wait_lsn) > lsn_from_hex(flush_lsn): @@ -131,22 +133,24 @@ async def wait_for_lsn(safekeeper: Safekeeper, await asyncio.sleep(polling_interval) flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn - log.debug(f'safekeeper port={safekeeper.port.pg} flush_lsn={flush_lsn} wait_lsn={wait_lsn}') + log.debug(f"safekeeper port={safekeeper.port.pg} flush_lsn={flush_lsn} wait_lsn={wait_lsn}") # This test will run several iterations and check progress in each of them. # On each iteration 1 acceptor is stopped, and 2 others should allow # background workers execute transactions. In the end, state should remain # consistent. -async def run_restarts_under_load(env: NeonEnv, - pg: Postgres, - acceptors: List[Safekeeper], - n_workers=10, - n_accounts=100, - init_amount=100000, - max_transfer=100, - period_time=4, - iterations=10): +async def run_restarts_under_load( + env: NeonEnv, + pg: Postgres, + acceptors: List[Safekeeper], + n_workers=10, + n_accounts=100, + init_amount=100000, + max_transfer=100, + period_time=4, + iterations=10, +): # Set timeout for this test at 5 minutes. It should be enough for test to complete, # taking into account that this timeout is checked only at the beginning of every iteration. 
test_timeout_at = time.monotonic() + 5 * 60 @@ -166,20 +170,21 @@ async def run_restarts_under_load(env: NeonEnv, workers.append(asyncio.create_task(worker)) for it in range(iterations): - assert time.monotonic() < test_timeout_at, 'test timed out' + assert time.monotonic() < test_timeout_at, "test timed out" victim_idx = it % len(acceptors) victim = acceptors[victim_idx] victim.stop() - flush_lsn = await pg_conn.fetchval('SELECT pg_current_wal_flush_lsn()') + flush_lsn = await pg_conn.fetchval("SELECT pg_current_wal_flush_lsn()") flush_lsn = lsn_to_hex(flush_lsn) - log.info(f'Postgres flush_lsn {flush_lsn}') + log.info(f"Postgres flush_lsn {flush_lsn}") pageserver_lsn = env.pageserver.http_client().timeline_detail( - uuid.UUID(tenant_id), uuid.UUID((timeline_id)))["local"]["last_record_lsn"] + uuid.UUID(tenant_id), uuid.UUID((timeline_id)) + )["local"]["last_record_lsn"] sk_ps_lag = lsn_from_hex(flush_lsn) - lsn_from_hex(pageserver_lsn) - log.info(f'Pageserver last_record_lsn={pageserver_lsn} lag={sk_ps_lag / 1024}kb') + log.info(f"Pageserver last_record_lsn={pageserver_lsn} lag={sk_ps_lag / 1024}kb") # Wait until alive safekeepers catch up with postgres for idx, safekeeper in enumerate(acceptors): @@ -193,7 +198,7 @@ async def run_restarts_under_load(env: NeonEnv, victim.start() - log.info('Iterations are finished, exiting coroutines...') + log.info("Iterations are finished, exiting coroutines...") stats.running = False # await all workers await asyncio.gather(*workers) @@ -207,10 +212,11 @@ def test_restarts_under_load(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_safekeepers_restarts_under_load') + env.neon_cli.create_branch("test_safekeepers_restarts_under_load") # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long - pg = env.postgres.create_start('test_safekeepers_restarts_under_load', - 
config_lines=['max_replication_write_lag=1MB']) + pg = env.postgres.create_start( + "test_safekeepers_restarts_under_load", config_lines=["max_replication_write_lag=1MB"] + ) asyncio.run(run_restarts_under_load(env, pg, env.safekeepers)) @@ -222,15 +228,17 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_restarts_frequent_checkpoints') + env.neon_cli.create_branch("test_restarts_frequent_checkpoints") # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long - pg = env.postgres.create_start('test_restarts_frequent_checkpoints', - config_lines=[ - 'max_replication_write_lag=1MB', - 'min_wal_size=32MB', - 'max_wal_size=32MB', - 'log_checkpoints=on' - ]) + pg = env.postgres.create_start( + "test_restarts_frequent_checkpoints", + config_lines=[ + "max_replication_write_lag=1MB", + "min_wal_size=32MB", + "max_wal_size=32MB", + "log_checkpoints=on", + ], + ) # we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments # are not removed before broadcasted to all safekeepers, with the help of replication slot @@ -244,51 +252,51 @@ def postgres_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): port=env.port_distributor.get_port(), # In these tests compute has high probability of terminating on its own # before our stop() due to lost consensus leadership. 
- check_stop_result=False) + check_stop_result=False, + ) # embed current time in node name - node_name = pgdir_name or f'pg_node_{time.time()}' - return pg.create_start(branch_name=branch, - node_name=node_name, - config_lines=['log_statement=all']) + node_name = pgdir_name or f"pg_node_{time.time()}" + return pg.create_start( + branch_name=branch, node_name=node_name, config_lines=["log_statement=all"] + ) -async def exec_compute_query(env: NeonEnv, - branch: str, - query: str, - pgdir_name: Optional[str] = None): +async def exec_compute_query( + env: NeonEnv, branch: str, query: str, pgdir_name: Optional[str] = None +): with postgres_create_start(env, branch=branch, pgdir_name=pgdir_name) as pg: before_conn = time.time() conn = await pg.connect_async() res = await conn.fetch(query) await conn.close() after_conn = time.time() - log.info(f'{query} took {after_conn - before_conn}s') + log.info(f"{query} took {after_conn - before_conn}s") return res -async def run_compute_restarts(env: NeonEnv, - queries=16, - batch_insert=10000, - branch='test_compute_restarts'): +async def run_compute_restarts( + env: NeonEnv, queries=16, batch_insert=10000, branch="test_compute_restarts" +): cnt = 0 sum = 0 - await exec_compute_query(env, branch, 'CREATE TABLE t (i int)') + await exec_compute_query(env, branch, "CREATE TABLE t (i int)") for i in range(queries): if i % 4 == 0: await exec_compute_query( - env, branch, f'INSERT INTO t SELECT 1 FROM generate_series(1, {batch_insert})') + env, branch, f"INSERT INTO t SELECT 1 FROM generate_series(1, {batch_insert})" + ) sum += batch_insert cnt += batch_insert elif (i % 4 == 1) or (i % 4 == 3): # Note that select causes lots of FPI's and increases probability of safekeepers # standing at different LSNs after compute termination. 
- actual_sum = (await exec_compute_query(env, branch, 'SELECT SUM(i) FROM t'))[0][0] - assert actual_sum == sum, f'Expected sum={sum}, actual={actual_sum}' + actual_sum = (await exec_compute_query(env, branch, "SELECT SUM(i) FROM t"))[0][0] + assert actual_sum == sum, f"Expected sum={sum}, actual={actual_sum}" elif i % 4 == 2: - await exec_compute_query(env, branch, 'UPDATE t SET i = i + 1') + await exec_compute_query(env, branch, "UPDATE t SET i = i + 1") sum += cnt @@ -297,7 +305,7 @@ def test_compute_restarts(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_compute_restarts') + env.neon_cli.create_branch("test_compute_restarts") asyncio.run(run_compute_restarts(env)) @@ -315,7 +323,7 @@ class BackgroundCompute(object): async def run(self): if self.running: - raise Exception('BackgroundCompute is already running') + raise Exception("BackgroundCompute is already running") self.running = True i = 0 @@ -327,17 +335,17 @@ class BackgroundCompute(object): res = await exec_compute_query( self.env, self.branch, - f'INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key', - pgdir_name=f'bgcompute{self.index}_key{verify_key}', + f"INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key", + pgdir_name=f"bgcompute{self.index}_key{verify_key}", ) - log.info(f'result: {res}') + log.info(f"result: {res}") if len(res) != 1: - raise Exception('No result returned') + raise Exception("No result returned") if res[0][0] != verify_key: - raise Exception('Wrong result returned') + raise Exception("Wrong result returned") self.successful_queries.append(verify_key) except Exception as e: - log.info(f'BackgroundCompute {self.index} query failed: {e}') + log.info(f"BackgroundCompute {self.index} query failed: {e}") # With less sleep, there is a very big chance of not committing # anything or only 1 xact during 
test run. @@ -345,14 +353,12 @@ class BackgroundCompute(object): self.running = False -async def run_concurrent_computes(env: NeonEnv, - num_computes=10, - run_seconds=20, - branch='test_concurrent_computes'): +async def run_concurrent_computes( + env: NeonEnv, num_computes=10, run_seconds=20, branch="test_concurrent_computes" +): await exec_compute_query( - env, - branch, - 'CREATE TABLE query_log (t timestamp default now(), index int, verify_key int)') + env, branch, "CREATE TABLE query_log (t timestamp default now(), index int, verify_key int)" + ) computes = [BackgroundCompute(i, env, branch) for i in range(num_computes)] background_tasks = [asyncio.create_task(compute.run()) for compute in computes] @@ -367,13 +373,17 @@ async def run_concurrent_computes(env: NeonEnv, # work for some time with only one compute -- it should be able to make some xacts TIMEOUT_SECONDS = computes[0].MAX_QUERY_GAP_SECONDS + 3 initial_queries_by_0 = len(computes[0].successful_queries) - log.info(f'Waiting for another query by computes[0], ' - f'it already had {initial_queries_by_0}, timeout is {TIMEOUT_SECONDS}s') + log.info( + f"Waiting for another query by computes[0], " + f"it already had {initial_queries_by_0}, timeout is {TIMEOUT_SECONDS}s" + ) for _ in range(10 * TIMEOUT_SECONDS): current_queries_by_0 = len(computes[0].successful_queries) - initial_queries_by_0 if current_queries_by_0 >= 1: - log.info(f'Found {current_queries_by_0} successful queries ' - f'by computes[0], completing the test') + log.info( + f"Found {current_queries_by_0} successful queries " + f"by computes[0], completing the test" + ) break await asyncio.sleep(0.1) else: @@ -382,12 +392,14 @@ async def run_concurrent_computes(env: NeonEnv, await asyncio.gather(background_tasks[0]) - result = await exec_compute_query(env, branch, 'SELECT * FROM query_log') + result = await exec_compute_query(env, branch, "SELECT * FROM query_log") # we should have inserted something while single compute was running - 
log.info(f'Executed {len(result)} queries, {current_queries_by_0} of them ' - f'by computes[0] after we started stopping the others') + log.info( + f"Executed {len(result)} queries, {current_queries_by_0} of them " + f"by computes[0] after we started stopping the others" + ) for row in result: - log.info(f'{row[0]} {row[1]} {row[2]}') + log.info(f"{row[0]} {row[1]} {row[2]}") # ensure everything reported as committed wasn't lost for compute in computes: @@ -402,16 +414,15 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_concurrent_computes') + env.neon_cli.create_branch("test_concurrent_computes") asyncio.run(run_concurrent_computes(env)) # Stop safekeeper and check that query cannot be executed while safekeeper is down. # Query will insert a single row into a table. -async def check_unavailability(sk: Safekeeper, - conn: asyncpg.Connection, - key: int, - start_delay_sec: int = 2): +async def check_unavailability( + sk: Safekeeper, conn: asyncpg.Connection, key: int, start_delay_sec: int = 2 +): # shutdown one of two acceptors, that is, majority sk.stop() @@ -431,7 +442,7 @@ async def run_unavailability(env: NeonEnv, pg: Postgres): conn = await pg.connect_async() # check basic work with table - await conn.execute('CREATE TABLE t(key int primary key, value text)') + await conn.execute("CREATE TABLE t(key int primary key, value text)") await conn.execute("INSERT INTO t values (1, 'payload')") # stop safekeeper and check that query cannot be executed while safekeeper is down @@ -443,7 +454,7 @@ async def run_unavailability(env: NeonEnv, pg: Postgres): # check that we can execute queries after restart await conn.execute("INSERT INTO t values (4, 'payload')") - result_sum = await conn.fetchval('SELECT sum(key) FROM t') + result_sum = await conn.fetchval("SELECT sum(key) FROM t") assert result_sum == 10 @@ -452,8 +463,8 @@ def 
test_unavailability(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 2 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_safekeepers_unavailability') - pg = env.postgres.create_start('test_safekeepers_unavailability') + env.neon_cli.create_branch("test_safekeepers_unavailability") + pg = env.postgres.create_start("test_safekeepers_unavailability") asyncio.run(run_unavailability(env, pg)) @@ -473,20 +484,20 @@ async def xmas_garland(safekeepers: List[Safekeeper], data: RaceConditionTest): if random.random() >= 0.5: victims.append(sk) log.info( - f'Iteration {data.iteration}: stopping {list(map(lambda sk: sk.id, victims))} safekeepers' + f"Iteration {data.iteration}: stopping {list(map(lambda sk: sk.id, victims))} safekeepers" ) for v in victims: v.stop() await asyncio.sleep(1) for v in victims: v.start() - log.info(f'Iteration {data.iteration} finished') + log.info(f"Iteration {data.iteration} finished") await asyncio.sleep(1) async def run_race_conditions(env: NeonEnv, pg: Postgres): conn = await pg.connect_async() - await conn.execute('CREATE TABLE t(key int primary key, value text)') + await conn.execute("CREATE TABLE t(key int primary key, value text)") data = RaceConditionTest(0, False) bg_xmas = asyncio.create_task(xmas_garland(env.safekeepers, data)) @@ -501,9 +512,9 @@ async def run_race_conditions(env: NeonEnv, pg: Postgres): expected_sum += i i += 1 - log.info(f'Executed {i-1} queries') + log.info(f"Executed {i-1} queries") - res = await conn.fetchval('SELECT sum(key) FROM t') + res = await conn.fetchval("SELECT sum(key) FROM t") assert res == expected_sum data.is_stopped = True @@ -516,8 +527,8 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_safekeepers_race_conditions') - pg = env.postgres.create_start('test_safekeepers_race_conditions') + 
env.neon_cli.create_branch("test_safekeepers_race_conditions") + pg = env.postgres.create_start("test_safekeepers_race_conditions") asyncio.run(run_race_conditions(env, pg)) @@ -527,13 +538,15 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder): async def run_wal_lagging(env: NeonEnv, pg: Postgres): def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str: # use ports 10, 11 and 12 to simulate unavailable safekeepers - return ','.join([ - f'localhost:{sk.port.pg if active else 10 + i}' - for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk)) - ]) + return ",".join( + [ + f"localhost:{sk.port.pg if active else 10 + i}" + for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk)) + ] + ) conn = await pg.connect_async() - await conn.execute('CREATE TABLE t(key int primary key, value text)') + await conn.execute("CREATE TABLE t(key int primary key, value text)") await conn.close() pg.stop() @@ -552,7 +565,7 @@ async def run_wal_lagging(env: NeonEnv, pg: Postgres): continue pg.adjust_for_safekeepers(safekeepers_guc(env, active_sk)) - log.info(f'Iteration {it}: {active_sk}') + log.info(f"Iteration {it}: {active_sk}") pg.start() conn = await pg.connect_async() @@ -569,9 +582,9 @@ async def run_wal_lagging(env: NeonEnv, pg: Postgres): pg.start() conn = await pg.connect_async() - log.info(f'Executed {i-1} queries') + log.info(f"Executed {i-1} queries") - res = await conn.fetchval('SELECT sum(key) FROM t') + res = await conn.fetchval("SELECT sum(key) FROM t") assert res == expected_sum @@ -581,7 +594,7 @@ def test_wal_lagging(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch('test_wal_lagging') - pg = env.postgres.create_start('test_wal_lagging') + env.neon_cli.create_branch("test_wal_lagging") + pg = env.postgres.create_start("test_wal_lagging") asyncio.run(run_wal_lagging(env, pg)) diff --git a/test_runner/regress/test_wal_restore.py 
b/test_runner/regress/test_wal_restore.py new file mode 100644 index 0000000000..0847b5a505 --- /dev/null +++ b/test_runner/regress/test_wal_restore.py @@ -0,0 +1,39 @@ +import os +from pathlib import Path + +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + PortDistributor, + VanillaPostgres, + base_dir, + pg_distrib_dir, +) + + +def test_wal_restore( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + test_output_dir: Path, + port_distributor: PortDistributor, +): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_wal_restore") + pg = env.postgres.create_start("test_wal_restore") + pg.safe_psql("create table t as select generate_series(1,300000)") + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + env.neon_cli.pageserver_stop() + port = port_distributor.get_port() + data_dir = test_output_dir / "pgsql.restored" + with VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored: + pg_bin.run_capture( + [ + os.path.join(base_dir, "libs/utils/scripts/restore_from_wal.sh"), + os.path.join(pg_distrib_dir, "bin"), + str(test_output_dir / "repo" / "safekeepers" / "sk1" / str(tenant_id) / "*"), + str(data_dir), + str(port), + ] + ) + restored.start() + assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] diff --git a/test_runner/neon_regress/.gitignore b/test_runner/sql_regress/.gitignore similarity index 100% rename from test_runner/neon_regress/.gitignore rename to test_runner/sql_regress/.gitignore diff --git a/test_runner/sql_regress/README.md b/test_runner/sql_regress/README.md new file mode 100644 index 0000000000..1ae8aaf61a --- /dev/null +++ b/test_runner/sql_regress/README.md @@ -0,0 +1,13 @@ +Simple tests that only need a PostgreSQL connection to run. +These are run by the regress/test_pg_regress.py test, which uses +the PostgreSQL pg_regress utility. 
+ +To add a new SQL test: + +- add sql script to run to sql_regress/sql/testname.sql +- add expected output to sql_regress/expected/testname.out +- add testname to parallel_schedule + +That's it. +For more complex tests see PostgreSQL regression tests in src/test/regress. +These work basically the same. diff --git a/test_runner/neon_regress/expected/.gitignore b/test_runner/sql_regress/expected/.gitignore similarity index 100% rename from test_runner/neon_regress/expected/.gitignore rename to test_runner/sql_regress/expected/.gitignore diff --git a/test_runner/neon_regress/expected/neon-cid.out b/test_runner/sql_regress/expected/neon-cid.out similarity index 100% rename from test_runner/neon_regress/expected/neon-cid.out rename to test_runner/sql_regress/expected/neon-cid.out diff --git a/test_runner/neon_regress/expected/neon-clog.out b/test_runner/sql_regress/expected/neon-clog.out similarity index 100% rename from test_runner/neon_regress/expected/neon-clog.out rename to test_runner/sql_regress/expected/neon-clog.out diff --git a/test_runner/neon_regress/expected/neon-rel-truncate.out b/test_runner/sql_regress/expected/neon-rel-truncate.out similarity index 100% rename from test_runner/neon_regress/expected/neon-rel-truncate.out rename to test_runner/sql_regress/expected/neon-rel-truncate.out diff --git a/test_runner/neon_regress/expected/neon-vacuum-full.out b/test_runner/sql_regress/expected/neon-vacuum-full.out similarity index 100% rename from test_runner/neon_regress/expected/neon-vacuum-full.out rename to test_runner/sql_regress/expected/neon-vacuum-full.out diff --git a/test_runner/neon_regress/parallel_schedule b/test_runner/sql_regress/parallel_schedule similarity index 100% rename from test_runner/neon_regress/parallel_schedule rename to test_runner/sql_regress/parallel_schedule diff --git a/test_runner/neon_regress/sql/.gitignore b/test_runner/sql_regress/sql/.gitignore similarity index 100% rename from test_runner/neon_regress/sql/.gitignore rename 
to test_runner/sql_regress/sql/.gitignore diff --git a/test_runner/neon_regress/sql/neon-cid.sql b/test_runner/sql_regress/sql/neon-cid.sql similarity index 100% rename from test_runner/neon_regress/sql/neon-cid.sql rename to test_runner/sql_regress/sql/neon-cid.sql diff --git a/test_runner/neon_regress/sql/neon-clog.sql b/test_runner/sql_regress/sql/neon-clog.sql similarity index 100% rename from test_runner/neon_regress/sql/neon-clog.sql rename to test_runner/sql_regress/sql/neon-clog.sql diff --git a/test_runner/neon_regress/sql/neon-rel-truncate.sql b/test_runner/sql_regress/sql/neon-rel-truncate.sql similarity index 100% rename from test_runner/neon_regress/sql/neon-rel-truncate.sql rename to test_runner/sql_regress/sql/neon-rel-truncate.sql diff --git a/test_runner/neon_regress/sql/neon-vacuum-full.sql b/test_runner/sql_regress/sql/neon-vacuum-full.sql similarity index 100% rename from test_runner/neon_regress/sql/neon-vacuum-full.sql rename to test_runner/sql_regress/sql/neon-vacuum-full.sql diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index 3960546689..0281f4f48b 100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -1,8 +1,9 @@ -import pytest import os -from fixtures.neon_fixtures import NeonEnv +import pytest from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv + """ Use this test to see what happens when tests fail. @@ -13,8 +14,9 @@ Set the environment variable RUN_BROKEN to see this test run (and fail, and hopefully not leave any server processes behind). 
""" -run_broken = pytest.mark.skipif(os.environ.get('RUN_BROKEN') is None, - reason="only used for testing the fixtures") +run_broken = pytest.mark.skipif( + os.environ.get("RUN_BROKEN") is None, reason="only used for testing the fixtures" +) @run_broken @@ -23,7 +25,7 @@ def test_broken(neon_simple_env: NeonEnv, pg_bin): env.neon_cli.create_branch("test_broken", "empty") env.postgres.create_start("test_broken") - log.info('postgres is running') + log.info("postgres is running") - log.info('THIS NEXT COMMAND WILL FAIL:') - pg_bin.run('pgbench -i_am_a_broken_test'.split()) + log.info("THIS NEXT COMMAND WILL FAIL:") + pg_bin.run("pgbench -i_am_a_broken_test".split()) diff --git a/vendor/postgres b/vendor/postgres index 49015ce98f..22d9ead36b 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 49015ce98f550d4fc08d3c1fe348faa71a15f51b +Subproject commit 22d9ead36beeab6b6a99c64f9b0b1576927ad91b diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 4dc7e4e157..bfe61b9ced 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -16,6 +16,7 @@ publish = false [dependencies] ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } +bstr = { version = "0.2", features = ["lazy_static", "regex-automata", "serde", "serde1", "serde1-nostd", "std", "unicode"] } bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } @@ -25,7 +26,7 @@ futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink" futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] } futures-util = { version = "0.3", default-features = false, features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", 
"futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] } generic-array = { version = "0.14", default-features = false, features = ["more_lengths"] } -hashbrown = { version = "0.11", features = ["ahash", "inline-more", "raw"] } +hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } hex = { version = "0.4", features = ["alloc", "serde", "std"] } hyper = { version = "0.14", features = ["client", "full", "h2", "http1", "http2", "runtime", "server", "socket2", "stream", "tcp"] } indexmap = { version = "1", default-features = false, features = ["std"] } @@ -40,12 +41,13 @@ num-traits = { version = "0.2", features = ["i128", "std"] } prost = { version = "0.10", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } +regex-automata = { version = "0.1", features = ["regex-syntax", "std"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } -time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "quickcheck", "quickcheck-dep", "std", "time-macros"] } +time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", 
"sync", "time", "tokio-macros", "winapi"] } -tokio-util = { version = "0.7", features = ["codec", "io"] } +tokio-util = { version = "0.7", features = ["codec", "io", "tracing"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } tracing-core = { version = "0.1", features = ["lazy_static", "std", "valuable"] } @@ -55,7 +57,7 @@ anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } -hashbrown = { version = "0.11", features = ["ahash", "inline-more", "raw"] } +hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] }