diff --git a/.dockerignore b/.dockerignore index 960588b6f2..ae0ad8fd77 100644 --- a/.dockerignore +++ b/.dockerignore @@ -14,10 +14,12 @@ !pgxn/ !proxy/ !safekeeper/ +!s3_scrubber/ !storage_broker/ !trace/ !vendor/postgres-v14/ !vendor/postgres-v15/ +!vendor/postgres-v16/ !workspace_hack/ !neon_local/ !scripts/ninstall.sh diff --git a/.github/actionlint.yml b/.github/actionlint.yml new file mode 100644 index 0000000000..fddd2f980d --- /dev/null +++ b/.github/actionlint.yml @@ -0,0 +1,8 @@ +self-hosted-runner: + labels: + - gen3 + - large + - small + - us-east-2 +config-variables: + - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 013b446307..8dfa6c465f 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -70,6 +70,9 @@ runs: name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }} path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} prefix: latest + # The lack of compatibility snapshot (for example, for the new Postgres version) + # shouldn't fail the whole job. Only relevant test should fail. + skip-if-does-not-exist: true - name: Checkout if: inputs.needs_postgres_source == 'true' diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml new file mode 100644 index 0000000000..584828c1d0 --- /dev/null +++ b/.github/workflows/actionlint.yml @@ -0,0 +1,31 @@ +name: Lint GitHub Workflows + +on: + push: + branches: + - main + - release + paths: + - '.github/workflows/*.ya?ml' + pull_request: + paths: + - '.github/workflows/*.ya?ml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +jobs: + actionlint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: reviewdog/action-actionlint@v1 + env: + # SC2046 - Quote this to prevent word splitting. - https://www.shellcheck.net/wiki/SC2046 + # SC2086 - Double quote to prevent globbing and word splitting. - https://www.shellcheck.net/wiki/SC2086 + SHELLCHECK_OPTS: --exclude=SC2046,SC2086 + with: + fail_on_error: true + filter_mode: nofilter + level: error diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index ac9e908c09..5b21011b83 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -2,7 +2,9 @@ name: Handle `approved-for-ci-run` label # This workflow helps to run CI pipeline for PRs made by external contributors (from forks). on: - pull_request: + pull_request_target: + branches: + - main types: # Default types that triggers a workflow ([1]): # - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request @@ -14,42 +16,103 @@ on: # Actual magic happens here: - labeled +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: "ci-run/pr-${{ github.event.pull_request.number }}" + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. +permissions: {} + +defaults: + run: + shell: bash -euo pipefail {0} jobs: remove-label: # Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR. # The PR should be reviewed and labelled manually again. 
-    runs-on: [ ubuntu-latest ]
+    permissions:
+      pull-requests: write # For `gh pr edit`

     if: |
       contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
       contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')

+    runs-on: ubuntu-latest
+
     steps:
       - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"

-  create-branch:
-    # Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
+  create-or-update-pr-for-ci-run:
+    # Create a local PR for an `approved-for-ci-run` labelled PR to run the CI pipeline in it.

-    runs-on: [ ubuntu-latest ]
+    permissions:
+      pull-requests: write # for `gh pr edit`
+      # For `git push` and `gh pr create` we use CI_ACCESS_TOKEN

     if: |
       github.event.action == 'labeled' &&
       contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')

+    runs-on: ubuntu-latest
+
     steps:
       - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"

       - uses: actions/checkout@v3
         with:
           ref: main
+          token: ${{ secrets.CI_ACCESS_TOKEN }}

       - run: gh pr checkout "${PR_NUMBER}"

-      - run: git checkout -b "ci-run/pr-${PR_NUMBER}"
+      - run: git checkout -b "${BRANCH}"

-      - run: git push --force origin "ci-run/pr-${PR_NUMBER}"
+      - run: git push --force origin "${BRANCH}"
+
+      - name: Create a Pull Request for CI run (if required)
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          cat << EOF > body.md
+          This Pull Request is created automatically to run the CI pipeline for #${PR_NUMBER}
+
+          Please do not alter or merge/close it.
+
+          Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
+          EOF
+
+          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
+          if [ -z "${ALREADY_CREATED}" ]; then
+            gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
+              --body-file "body.md" \
+              --head "${BRANCH}" \
+              --base "main" \
+              --draft
+          fi
+
+  cleanup:
+    # Close PRs and delete branches if the original PR is closed.
+ + permissions: + contents: write # for `--delete-branch` flag in `gh pr close` + pull-requests: write # for `gh pr close` + + if: | + github.event.action == 'closed' && + github.event.pull_request.head.repo.full_name != github.repository + + runs-on: ubuntu-latest + + steps: + - name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch + run: | + CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --json 'closed' --jq '.[].closed')" + if [ "${CLOSED}" == "false" ]; then + gh pr --repo "${GITHUB_REPOSITORY}" close "${BRANCH}" --delete-branch + fi diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 172b904331..e2f15d96db 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -117,6 +117,7 @@ jobs: outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }} + tpch-compare-matrix: ${{ steps.tpch-compare-matrix.outputs.matrix }} steps: - name: Generate matrix for pgbench benchmark @@ -136,11 +137,11 @@ jobs: }' if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, + matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, { "platform": "rds-aurora", "db_size": "50gb"}]') fi - echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT + echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT - name: Generate matrix for OLAP benchmarks id: olap-compare-matrix @@ -152,11 +153,30 @@ jobs: }' if [ "$(date +%A)" = "Saturday" ]; then - matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres" }, + matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, { "platform": "rds-aurora" }]') fi - echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT + echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT + + - name: Generate matrix for TPC-H benchmarks + id: tpch-compare-matrix + run: | + matrix='{ + "platform": [ + "neon-captest-reuse" + ], + "scale": [ + "10" + ] + }' + + if [ "$(date +%A)" = "Saturday" ]; then + matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, + { "platform": "rds-aurora", "scale": "10" }]') + fi + + echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT pgbench-compare: needs: [ generate-matrices ] @@ -233,7 +253,11 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - psql ${CONNSTR} -c "SELECT version();" + QUERY="SELECT version();" + if [[ "${PLATFORM}" = "neon"* ]]; then + QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + fi + psql ${CONNSTR} -c "${QUERY}" - name: Benchmark init uses: ./.github/actions/run-python-test-set @@ -358,7 +382,11 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - psql ${CONNSTR} -c "SELECT version();" + QUERY="SELECT version();" + if [[ "${PLATFORM}" = "neon"* ]]; then + QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + fi + psql ${CONNSTR} -c "${QUERY}" - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set @@ -372,6 +400,7 @@ jobs: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + TEST_OLAP_SCALE: 10 - name: Create Allure report if: ${{ !cancelled() }} @@ -398,7 +427,7 @@ jobs: strategy: fail-fast: 
false
-      matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
+      matrix: ${{ fromJson(needs.generate-matrices.outputs.tpch-compare-matrix) }}

     env:
       POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
@@ -407,6 +436,7 @@
       BUILD_TYPE: remote
       SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
       PLATFORM: ${{ matrix.platform }}
+      TEST_OLAP_SCALE: ${{ matrix.scale }}

     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
@@ -428,18 +458,17 @@
           ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
           echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH

-      - name: Set up Connection String
-        id: set-up-connstr
+      - name: Get Connstring Secret Name
         run: |
           case "${PLATFORM}" in
           neon-captest-reuse)
-            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }}
+            ENV_PLATFORM=CAPTEST_TPCH
             ;;
           rds-aurora)
-            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR }}
+            ENV_PLATFORM=RDS_AURORA_TPCH
             ;;
           rds-postgres)
-            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }}
+            ENV_PLATFORM=RDS_POSTGRES_TPCH
             ;;
           *)
             echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
@@ -447,9 +476,21 @@
             ;;
           esac

+          CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR"
+          echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV
+
+      - name: Set up Connection String
+        id: set-up-connstr
+        run: |
+          CONNSTR=${{ secrets[env.CONNSTR_SECRET_NAME] }}
+
           echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-          psql ${CONNSTR} -c "SELECT version();"
+          QUERY="SELECT version();"
+          if [[ "${PLATFORM}" = "neon"* ]]; then
+            QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          fi
+          psql ${CONNSTR} -c "${QUERY}"

       - name: Run TPC-H benchmark
         uses: ./.github/actions/run-python-test-set
@@ -463,6 +504,7 @@
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
           BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+          TEST_OLAP_SCALE: ${{ matrix.scale }}

       - name: Create Allure report
         if: ${{ !cancelled() }}
@@ -534,7 +576,11 @@
           echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-          psql ${CONNSTR} -c "SELECT version();"
+          QUERY="SELECT version();"
+          if [[ "${PLATFORM}" = "neon"* ]]; then
+            QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          fi
+          psql ${CONNSTR} -c "${QUERY}"

       - name: Run user examples
         uses: ./.github/actions/run-python-test-set
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 1ec2a65a89..7271a8d29f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -5,7 +5,6 @@ on:
     branches:
       - main
       - release
-      - ci-run/pr-*
   pull_request:

 defaults:
@@ -24,7 +23,30 @@ env:
   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

 jobs:
+  check-permissions:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Disallow PRs from forks
+        if: |
+          github.event_name == 'pull_request' &&
+          github.event.pull_request.head.repo.full_name != github.repository
+
+        run: |
+          if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
+            MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
+          else
+            MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
+          fi
+
+          echo >&2 "We don't run CI for PRs from forks"
+          echo >&2 "${MESSAGE}"
+
+          exit 1
+
+
   tag:
+    needs: [
check-permissions ] runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned outputs: @@ -53,6 +75,7 @@ jobs: id: build-tag check-codestyle-python: + needs: [ check-permissions ] runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned @@ -85,6 +108,7 @@ jobs: run: poetry run mypy . check-codestyle-rust: + needs: [ check-permissions ] runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned @@ -151,6 +175,7 @@ jobs: run: cargo deny check build-neon: + needs: [ check-permissions ] runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned @@ -187,7 +212,7 @@ jobs: # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603 FAILED=false - for postgres in postgres-v14 postgres-v15; do + for postgres in postgres-v14 postgres-v15 postgres-v16; do expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"') actual=$(git rev-parse "HEAD:vendor/${postgres}") if [ "${expected}" != "${actual}" ]; then @@ -209,6 +234,10 @@ jobs: id: pg_v15_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + - name: Set pg 16 revision for caching + id: pg_v16_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + # Set some environment variables used by all the steps. # # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. @@ -229,10 +258,12 @@ jobs: cov_prefix="" CARGO_FLAGS="--locked --release" fi - echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV - echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV - echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV - echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV + { + echo "cov_prefix=${cov_prefix}" + echo "CARGO_FEATURES=${CARGO_FEATURES}" + echo "CARGO_FLAGS=${CARGO_FLAGS}" + echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" + } >> $GITHUB_ENV # Disabled for now # Don't include the ~/.cargo/registry/src directory. 
It contains just @@ -267,6 +298,13 @@ jobs: path: pg_install/v15 key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v16 build + id: cache_pg_16 + uses: actions/cache@v3 + with: + path: pg_install/v16 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' run: mold -run make postgres-v14 -j$(nproc) @@ -275,6 +313,10 @@ jobs: if: steps.cache_pg_15.outputs.cache-hit != 'true' run: mold -run make postgres-v15 -j$(nproc) + - name: Build postgres v16 + if: steps.cache_pg_16.outputs.cache-hit != 'true' + run: mold -run make postgres-v16 -j$(nproc) + - name: Build neon extensions run: mold -run make neon-pg-ext -j$(nproc) @@ -348,17 +390,17 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: + needs: [ check-permissions, build-neon ] runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned # Default shared memory is 64mb options: --init --shm-size=512mb - needs: [ build-neon ] strategy: fail-fast: false matrix: build_type: [ debug, release ] - pg_version: [ v14, v15 ] + pg_version: [ v14, v15, v16 ] steps: - name: Checkout uses: actions/checkout@v3 @@ -386,12 +428,12 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: + needs: [ check-permissions, build-neon ] runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned # Default shared memory is 64mb options: --init --shm-size=512mb - needs: [ build-neon ] if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: fail-fast: false @@ -418,12 +460,13 @@ jobs: # while coverage is currently collected for the debug ones create-test-report: + needs: [ check-permissions, regress-tests, coverage-report, benchmarks ] + if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ regress-tests, benchmarks ] - if: ${{ !cancelled() }} steps: - uses: actions/checkout@v3 @@ -449,42 +492,40 @@ jobs: reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", } + const coverage = { + coverageUrl: "${{ needs.coverage-report.outputs.coverage-html }}", + summaryJsonUrl: "${{ needs.coverage-report.outputs.coverage-json }}", + } + const script = require("./scripts/comment-test-report.js") await script({ github, context, fetch, report, + coverage, }) coverage-report: + needs: [ check-permissions, regress-tests ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ regress-tests ] strategy: fail-fast: false matrix: build_type: [ debug ] + outputs: + coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }} + coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }} steps: - name: Checkout uses: actions/checkout@v3 with: submodules: true - fetch-depth: 1 - -# Disabled for now -# - name: Restore cargo deps cache -# id: cache_cargo -# uses: actions/cache@v3 -# with: -# path: | -# ~/.cargo/registry/ -# !~/.cargo/registry/src -# ~/.cargo/git/ -# target/ -# key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ 
hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} + fetch-depth: 0 - name: Get Neon artifact uses: ./.github/actions/download @@ -527,13 +568,45 @@ jobs: REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT + - name: Build coverage report NEW + id: upload-coverage-report-new + env: + BUCKET: neon-github-public-dev + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + run: | + BASELINE="$(git merge-base HEAD origin/main)" + CURRENT="${COMMIT_SHA}" + + cp /tmp/coverage/report/lcov.info ./${CURRENT}.info + + GENHTML_ARGS="--ignore-errors path,unmapped,empty --synthesize-missing --demangle-cpp rustfilt --output-directory lcov-html ${CURRENT}.info" + + # Use differential coverage if the baseline coverage exists. + # It can be missing if the coverage repoer wasn't uploaded yet or tests has failed on BASELINE commit. + if aws s3 cp --only-show-errors s3://${BUCKET}/code-coverage/${BASELINE}/lcov.info ./${BASELINE}.info; then + git diff ${BASELINE} ${CURRENT} -- '*.rs' > baseline-current.diff + + GENHTML_ARGS="--baseline-file ${BASELINE}.info --diff-file baseline-current.diff ${GENHTML_ARGS}" + fi + + genhtml ${GENHTML_ARGS} + + aws s3 cp --only-show-errors --recursive ./lcov-html/ s3://${BUCKET}/code-coverage/${COMMIT_SHA}/lcov + + REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/index.html + echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT + + REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json + echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT + - uses: actions/github-script@v6 env: REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }} + REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: script: | - const { REPORT_URL, COMMIT_SHA } = process.env + const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env await github.rest.repos.createCommitStatus({ owner: context.repo.owner, @@ -544,12 +617,21 @@ jobs: context: 'Code coverage report', }) + await github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: `${COMMIT_SHA}`, + state: 'success', + target_url: `${REPORT_URL_NEW}`, + context: 'Code coverage report NEW', + }) + trigger-e2e-tests: + needs: [ check-permissions, promote-images, tag ] runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init - needs: [ promote-images, tag ] steps: - name: Set PR's status to pending and request a remote CI test run: | @@ -590,8 +672,8 @@ jobs: }" neon-image: + needs: [ check-permissions, tag ] runs-on: [ self-hosted, gen3, large ] - needs: [ tag ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: @@ -638,7 +720,7 @@ jobs: compute-tools-image: runs-on: [ self-hosted, gen3, large ] - needs: [ tag ] + needs: [ check-permissions, tag ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: @@ -683,17 +765,17 @@ jobs: run: rm -rf ~/.ecr compute-node-image: + needs: [ check-permissions, tag ] runs-on: [ self-hosted, gen3, large ] container: image: gcr.io/kaniko-project/executor:v1.9.2-debug # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... 
failed: Temporary failure in name resolution."" # Should be prevented by https://github.com/neondatabase/neon/issues/4281 options: --add-host=download.osgeo.org:140.211.15.30 - needs: [ tag ] strategy: fail-fast: false matrix: - version: [ v14, v15 ] + version: [ v14, v15, v16 ] defaults: run: shell: sh -eu {0} @@ -742,17 +824,17 @@ jobs: run: rm -rf ~/.ecr vm-compute-node-image: + needs: [ check-permissions, tag, compute-node-image ] runs-on: [ self-hosted, gen3, large ] - needs: [ tag, compute-node-image ] strategy: fail-fast: false matrix: - version: [ v14, v15 ] + version: [ v14, v15, v16 ] defaults: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.17.5 + VM_BUILDER_VERSION: v0.17.11 steps: - name: Checkout @@ -784,7 +866,7 @@ jobs: docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} test-images: - needs: [ tag, neon-image, compute-node-image, compute-tools-image ] + needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ] runs-on: [ self-hosted, gen3, small ] steps: @@ -827,8 +909,8 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml down promote-images: + needs: [ check-permissions, tag, test-images, vm-compute-node-image ] runs-on: [ self-hosted, gen3, small ] - needs: [ tag, test-images, vm-compute-node-image ] container: golang:1.19-bullseye # Don't add if-condition here. # The job should always be run because we have dependant other jobs that shouldn't be skipped @@ -848,6 +930,7 @@ jobs: run: | crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 + crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16 - name: Add latest tag to images if: | @@ -860,6 +943,8 @@ jobs: crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest - name: Push images to production ECR if: | @@ -872,6 +957,8 @@ jobs: crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 
093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest - name: Configure Docker Hub login run: | @@ -883,6 +970,7 @@ jobs: run: | crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} + crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} - name: Push latest tags to Docker Hub if: | @@ -895,21 +983,19 @@ jobs: crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest - name: Cleanup ECR folder run: rm -rf ~/.ecr - build-private-extensions: - runs-on: [ self-hosted, gen3, small ] - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - options: --init - needs: [ tag ] + trigger-custom-extensions-build-and-wait: + needs: [ check-permissions, tag ] + runs-on: ubuntu-latest steps: - name: Set PR's status to pending and request a remote CI test run: | - COMMIT_SHA=${{ github.event.pull_request.head.sha }} - COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + COMMIT_SHA=${{ github.event.pull_request.head.sha || github.sha }} REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions" curl -f -X POST \ @@ -939,11 +1025,50 @@ jobs: } }" + - name: Wait for extension build to finish + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + TIMEOUT=1800 # 30 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer + INTERVAL=15 # try each N seconds + + last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context + + for ((i=0; i <= TIMEOUT; i+=INTERVAL)); do + sleep $INTERVAL + + # Get statuses for the latest commit in the PR / branch + gh api \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha || github.sha }}" > statuses.json + + # Get the latest status for the "build-and-upload-extensions" context + last_status=$(jq --raw-output '[.[] | select(.context == "build-and-upload-extensions")] | sort_by(.created_at)[-1].state' statuses.json) + if [ "${last_status}" = "pending" ]; then + # Extension build is still in progress. + continue + elif [ "${last_status}" = "success" ]; then + # Extension build is successful. + exit 0 + else + # Status is neither "pending" nor "success", exit the loop and fail the job. + break + fi + done + + # Extension build failed, print `statuses.json` for debugging and fail the job. + jq '.' 
statuses.json + + echo >&2 "Status of extension build is '${last_status}' != 'success'" + exit 1 + deploy: + needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ] + if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' + runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - needs: [ promote-images, tag, regress-tests ] - if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: - name: Fix git ownership run: | @@ -966,8 +1091,9 @@ jobs: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then - gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} + gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" @@ -981,20 +1107,35 @@ jobs: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 script: | - github.rest.git.createRef({ + await github.rest.git.createRef({ owner: context.repo.owner, repo: context.repo.repo, ref: "refs/tags/${{ needs.tag.outputs.build-tag }}", sha: context.sha, }) + - name: Create GitHub release + if: github.ref_name == 'release' + uses: actions/github-script@v6 + with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 + script: | + await github.rest.repos.createRelease({ + owner: context.repo.owner, + repo: context.repo.repo, + tag_name: "${{ needs.tag.outputs.build-tag }}", + generate_release_notes: true, + }) + promote-compatibility-data: + needs: [ check-permissions, promote-images, tag, regress-tests ] + if: github.ref_name == 'release' + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init - needs: [ promote-images, tag, regress-tests ] - if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' steps: - name: Promote compatibility snapshot for the release env: @@ -1002,7 +1143,7 @@ jobs: PREFIX: artifacts/latest run: | # Update compatibility snapshot for the release - for pg_version in v14 v15; do + for pg_version in v14 v15 v16; do for build_type in debug release; do OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index a21ddb0414..8a1e4571fd 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -4,7 +4,6 @@ on: push: branches: - main - - ci-run/pr-* pull_request: defaults: @@ -39,7 +38,7 @@ jobs: fetch-depth: 1 - name: Install macOS postgres dependencies - run: brew install flex bison openssl protobuf + run: brew install flex bison 
openssl protobuf icu4c pkg-config - name: Set pg 14 revision for caching id: pg_v14_rev @@ -49,6 +48,10 @@ jobs: id: pg_v15_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + - name: Set pg 16 revision for caching + id: pg_v16_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + - name: Cache postgres v14 build id: cache_pg_14 uses: actions/cache@v3 @@ -63,6 +66,13 @@ jobs: path: pg_install/v15 key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v16 build + id: cache_pg_16 + uses: actions/cache@v3 + with: + path: pg_install/v16 + key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Set extra env for macOS run: | echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV @@ -86,6 +96,10 @@ jobs: if: steps.cache_pg_15.outputs.cache-hit != 'true' run: make postgres-v15 -j$(nproc) + - name: Build postgres v16 + if: steps.cache_pg_16.outputs.cache-hit != 'true' + run: make postgres-v16 -j$(nproc) + - name: Build neon extensions run: make neon-pg-ext -j$(nproc) diff --git a/.github/workflows/release-notify.yml b/.github/workflows/release-notify.yml new file mode 100644 index 0000000000..ba396dba74 --- /dev/null +++ b/.github/workflows/release-notify.yml @@ -0,0 +1,29 @@ +name: Notify Slack channel about upcoming release + +concurrency: + group: ${{ github.workflow }}-${{ github.event.number }} + cancel-in-progress: true + +on: + pull_request: + branches: + - release + types: + # Default types that triggers a workflow: + # - https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request + - opened + - synchronize + - reopened + # Additional types that we want to handle: + - closed + +jobs: + notify: + runs-on: [ ubuntu-latest ] + + steps: + - uses: neondatabase/dev-actions/release-pr-notify@main + with: + slack-token: ${{ secrets.SLACK_BOT_TOKEN }} + slack-channel-id: ${{ vars.SLACK_UPCOMING_RELEASE_CHANNEL_ID || 'C05QQ9J1BRC' }} # if not set, then `#test-release-notifications` + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 595ee05514..36af98f96e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -2,16 +2,19 @@ name: Create Release Branch on: schedule: - - cron: '0 10 * * 2' + - cron: '0 7 * * 2' workflow_dispatch: jobs: create_release_branch: - runs-on: [ubuntu-latest] + runs-on: [ ubuntu-latest ] + + permissions: + contents: write # for `git push` steps: - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: ref: main @@ -26,9 +29,16 @@ jobs: run: git push origin releases/${{ steps.date.outputs.date }} - name: Create pull request into release - uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - head: releases/${{ steps.date.outputs.date }} - base: release - title: Release ${{ steps.date.outputs.date }} + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + cat << EOF > body.md + ## Release ${{ steps.date.outputs.date }} + + **Please merge this PR using 'Create a merge commit'!** + EOF + + gh pr create --title "Release ${{ steps.date.outputs.date }}" \ + --body-file "body.md" \ + --head "releases/${{ steps.date.outputs.date }}" \ + --base "release" diff --git a/.gitmodules b/.gitmodules index 
081a404135..1d925674a1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,3 +6,7 @@ path = vendor/postgres-v15 url = https://github.com/neondatabase/postgres.git branch = REL_15_STABLE_neon +[submodule "vendor/postgres-v16"] + path = vendor/postgres-v16 + url = https://github.com/neondatabase/postgres.git + branch = REL_16_STABLE_neon diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c5b3ff7459..5de7842f1a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,3 +27,28 @@ your patch's fault. Help to fix the root cause if something else has broken the CI, before pushing. *Happy Hacking!* + +# How to run a CI pipeline on Pull Requests from external contributors +_An instruction for maintainers_ + +## TL;DR: +- Review the PR +- If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then: + - Press the "Approve and run" button in GitHub UI + - Add the `approved-for-ci-run` label to the PR + +Repeat all steps after any change to the PR. +- When the changes are ready to get merged — merge the original PR (not the internal one) + +## Longer version: + +GitHub Actions triggered by the `pull_request` event don't share repository secrets with the forks (for security reasons). +So, passing the CI pipeline on Pull Requests from external contributors is impossible. + +We're using the following approach to make it work: +- After the review, assign the `approved-for-ci-run` label to the PR if changes look safe +- A GitHub Action will create an internal branch and a new PR with the same changes (for example, for a PR `#1234`, it'll be a branch `ci-run/pr-1234`) +- Because the PR is created from the internal branch, it is able to access repository secrets (that's why it's crucial to make sure that the PR doesn't contain any malicious code that could expose our secrets or intentionally harm the CI) +- The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review) + +For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml) diff --git a/Cargo.lock b/Cargo.lock index 867008808b..2055f001af 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -227,6 +227,7 @@ checksum = "de3d533e0263bf453cc80af4c8bcc4d64e2aca293bd16f81633a36f1bf4a97cb" dependencies = [ "aws-credential-types", "aws-http", + "aws-sdk-sso", "aws-sdk-sts", "aws-smithy-async", "aws-smithy-client", @@ -237,12 +238,15 @@ dependencies = [ "aws-types", "bytes", "fastrand 2.0.0", + "hex", "http", "hyper", + "ring", "time", "tokio", "tower", "tracing", + "zeroize", ] [[package]] @@ -332,6 +336,30 @@ dependencies = [ "url", ] +[[package]] +name = "aws-sdk-sso" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f41bf2c28d32dbb9894a8fcfcb148265d034d3f4a170552a47553a09de890895" +dependencies = [ + "aws-credential-types", + "aws-http", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-client", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "http", + "regex", + "tokio-stream", + "tracing", +] + [[package]] name = "aws-sdk-sts" version = "0.29.0" @@ -608,7 +636,7 @@ dependencies = [ "sha1", "sync_wrapper", "tokio", - "tokio-tungstenite 0.20.0", + "tokio-tungstenite", "tower", "tower-layer", "tower-service", @@ -1029,6 +1057,8 @@ dependencies = [ "comfy-table", "compute_api", "git-version", + "hex", + "hyper", "nix 0.26.2", "once_cell", "pageserver_api", @@ -1044,6 
+1074,7 @@ dependencies = [ "storage_broker", "tar", "thiserror", + "tokio", "toml", "tracing", "url", @@ -1777,6 +1808,16 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" +[[package]] +name = "histogram" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e673d137229619d5c2c8903b6ed5852b43636c0017ff2e66b1aafb8ccf04b80b" +dependencies = [ + "serde", + "thiserror", +] + [[package]] name = "hmac" version = "0.12.1" @@ -1900,15 +1941,15 @@ dependencies = [ [[package]] name = "hyper-tungstenite" -version = "0.9.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "880b8b1c98a5ec2a505c7c90db6d3f6f1f480af5655d9c5b55facc9382a5a5b5" +checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9" dependencies = [ "hyper", - "pin-project", + "pin-project-lite", "tokio", - "tokio-tungstenite 0.18.0", - "tungstenite 0.18.0", + "tokio-tungstenite", + "tungstenite", ] [[package]] @@ -2867,9 +2908,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.9" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" [[package]] name = "pin-utils" @@ -3685,6 +3726,43 @@ version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" +[[package]] +name = "s3_scrubber" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-stream", + "aws-config", + "aws-sdk-s3", + "aws-smithy-http", + "aws-types", + "bincode", + "bytes", + "chrono", + "clap", + "crc32c", + "either", + "futures-util", + "hex", + "histogram", + "itertools", + "pageserver", + "rand", + "reqwest", + "serde", + "serde_json", + "serde_with", + "thiserror", + "tokio", + "tokio-rustls", + "tokio-stream", + "tracing", + "tracing-appender", + "tracing-subscriber", + "utils", + "workspace_hack", +] + [[package]] name = "safekeeper" version = "0.1.0" @@ -4563,18 +4641,6 @@ dependencies = [ "xattr", ] -[[package]] -name = "tokio-tungstenite" -version = "0.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54319c93411147bced34cb5609a80e0a8e44c5999c93903a81cd866630ec0bfd" -dependencies = [ - "futures-util", - "log", - "tokio", - "tungstenite 0.18.0", -] - [[package]] name = "tokio-tungstenite" version = "0.20.0" @@ -4584,7 +4650,7 @@ dependencies = [ "futures-util", "log", "tokio", - "tungstenite 0.20.0", + "tungstenite", ] [[package]] @@ -4768,6 +4834,17 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-appender" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09d48f71a791638519505cefafe162606f706c25592e4bde4d97600c0195312e" +dependencies = [ + "crossbeam-channel", + "time", + "tracing-subscriber", +] + [[package]] name = "tracing-attributes" version = "0.1.24" @@ -4888,28 +4965,9 @@ checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" [[package]] name = "tungstenite" -version = "0.18.0" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30ee6ab729cd4cf0fd55218530c4522ed30b7b6081752839b68fcec8d0960788" -dependencies = [ - "base64 0.13.1", - "byteorder", - "bytes", - 
"http", - "httparse", - "log", - "rand", - "sha1", - "thiserror", - "url", - "utf-8", -] - -[[package]] -name = "tungstenite" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e862a1c4128df0112ab625f55cd5c934bcb4312ba80b39ae4b4835a3fd58e649" +checksum = "9e3dac10fd62eaf6617d3a904ae222845979aec67c615d1c842b4002c7666fb9" dependencies = [ "byteorder", "bytes", @@ -5506,6 +5564,10 @@ name = "workspace_hack" version = "0.1.0" dependencies = [ "anyhow", + "aws-config", + "aws-runtime", + "aws-sigv4", + "aws-smithy-http", "axum", "base64 0.21.1", "bytes", @@ -5514,7 +5576,6 @@ dependencies = [ "clap", "clap_builder", "crossbeam-utils", - "digest", "either", "fail", "futures", @@ -5523,6 +5584,7 @@ dependencies = [ "futures-executor", "futures-sink", "futures-util", + "hex", "hyper", "itertools", "libc", @@ -5546,6 +5608,7 @@ dependencies = [ "socket2 0.4.9", "syn 1.0.109", "syn 2.0.28", + "time", "tokio", "tokio-rustls", "tokio-util", @@ -5554,7 +5617,9 @@ dependencies = [ "tower", "tracing", "tracing-core", + "tungstenite", "url", + "uuid", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index d545be266f..4fe3069822 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,4 +1,5 @@ [workspace] +resolver = "2" members = [ "compute_tools", "control_plane", @@ -7,6 +8,7 @@ members = [ "proxy", "safekeeper", "storage_broker", + "s3_scrubber", "workspace_hack", "trace", "libs/compute_api", @@ -76,7 +78,7 @@ hostname = "0.3.1" humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.9" +hyper-tungstenite = "0.11" inotify = "0.10.2" itertools = "0.10" jsonwebtoken = "8" diff --git a/Dockerfile b/Dockerfile index 1c447b2db9..eb4c4bba25 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,6 +12,7 @@ WORKDIR /home/nonroot COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 +COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh @@ -39,6 +40,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server COPY --chown=nonroot . . # Show build caching stats to check if it was used in the end. @@ -65,6 +67,7 @@ RUN set -e \ && apt install -y \ libreadline-dev \ libseccomp-dev \ + libicu67 \ openssl \ ca-certificates \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ @@ -81,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ +COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. 
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 0416a98450..55eb9b7411 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -74,8 +74,8 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar ENV PATH "/usr/local/pgsql/bin:$PATH" -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \ - echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \ +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ + echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ @@ -124,8 +124,21 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install -y ninja-build python3-dev libncurses5 binutils clang -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \ - echo "1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 plv8.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export PLV8_VERSION=3.1.5 \ + export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \ + ;; \ + "v16") \ + export PLV8_VERSION=3.1.8 \ + export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \ + echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \ mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -172,8 +185,8 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz cp -R /h3/usr / && \ rm -rf build -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \ - echo "c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 h3-pg.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ + echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -243,8 +256,8 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214 FROM build-deps AS hypopg-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \ - echo "e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e hypopg.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ + echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -307,8 +320,8 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta FROM build-deps AS ip4r-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O ip4r.tar.gz && \ - echo "78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 ip4r.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ + echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -323,8 +336,8 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O i FROM build-deps AS prefix-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \ - echo "38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e prefix.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ + echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -339,8 +352,8 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O pr FROM build-deps AS hll-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \ - echo "9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 hll.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ + echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -355,8 +368,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar FROM build-deps AS plpgsql-check-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \ - echo "9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b plpgsql_check.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \ + echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -371,12 +384,21 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz FROM build-deps AS timescaledb-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +ARG PG_VERSION ENV PATH "/usr/local/pgsql/bin:$PATH" -RUN apt-get update && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export TIMESCALEDB_VERSION=2.10.1 \ + export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ + ;; \ + *) \ + echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \ + esac && \ + apt-get update && \ apt-get install -y cmake && \ - wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \ - echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \ + wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ + echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . 
&& \ ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ @@ -405,6 +427,10 @@ RUN case "${PG_VERSION}" in \ export PG_HINT_PLAN_VERSION=15_1_5_0 \ export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \ ;; \ + "v16") \ + export PG_HINT_PLAN_VERSION=16_1_6_0 \ + export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ + ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ ;; \ @@ -452,8 +478,8 @@ FROM build-deps AS pg-cron-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O pg_cron.tar.gz && \ - echo "6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 pg_cron.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ + echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -479,8 +505,8 @@ RUN apt-get update && \ libfreetype6-dev ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \ - echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ + echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ @@ -551,12 +577,19 @@ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \ - echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export PG_EMBEDDING_VERSION=0.3.5 \ + export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ + ;; \ + *) \ + echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \ + esac && \ + wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \ + echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \ mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control + make -j $(getconf _NPROCESSORS_ONLN) install ######################################################################################### # @@ -584,6 +617,10 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre # Layer "rust extensions" # This layer is used to build `pgx` deps # +# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from +# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx +# dependency on all the rust extension that depend on it, too. +# ######################################################################################### FROM build-deps AS rust-extensions-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -598,7 +635,17 @@ USER nonroot WORKDIR /home/nonroot ARG PG_VERSION -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + ;; \ + "v16") \ + echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \ + ;; \ + esac && \ + curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ @@ -615,10 +662,21 @@ USER root ######################################################################################### FROM rust-extensions-build AS pg-jsonschema-pg-build +ARG PG_VERSION # caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023 # there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5 -RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + ;; \ + "v16") \ + echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \ echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -633,12 +691,23 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421e ######################################################################################### FROM rust-extensions-build AS pg-graphql-pg-build +ARG PG_VERSION # b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch) # Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compliation errors in # pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the # same 1.1 version we've used before. 
-RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + ;; \ + "v16") \ + echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \ echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -656,9 +725,20 @@ RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367 ######################################################################################### FROM rust-extensions-build AS pg-tiktoken-pg-build +ARG PG_VERSION # 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023 -RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + ;; \ + "v16") \ + echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \ echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ cargo pgx install --release && \ @@ -672,8 +752,19 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405 ######################################################################################### FROM rust-extensions-build AS pg-pgx-ulid-build +ARG PG_VERSION -RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + ;; \ + "v16") \ + echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \ echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -726,6 +817,20 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_utils \ -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/neon_rmgr \ + -s install && \ + case "${PG_VERSION}" in \ + "v14" | "v15") \ + ;; \ + "v16") \ + echo "Skipping HNSW for PostgreSQL 16" && exit 0 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ + esac && \ make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/hnsw \ diff --git a/Makefile b/Makefile index 0768b64502..33b5dcad99 100644 --- a/Makefile +++ b/Makefile @@ -29,6 +29,7 @@ else ifeq ($(UNAME_S),Darwin) # It can be configured with OPENSSL_PREFIX variable OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib + PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: @@ -83,6 +84,8 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status: # I'm not sure why it wouldn't work, but this is the only place (apart from # the "build-all-versions" entry points) where direct mention of PostgreSQL # versions is used. +.PHONY: postgres-configure-v16 +postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status .PHONY: postgres-configure-v15 postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status .PHONY: postgres-configure-v14 @@ -118,6 +121,10 @@ postgres-clean-%: $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean +.PHONY: postgres-check-% +postgres-check-%: postgres-% + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check + .PHONY: neon-pg-ext-% neon-pg-ext-%: postgres-% +@echo "Compiling neon $*" @@ -130,6 +137,11 @@ neon-pg-ext-%: postgres-% $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install + +@echo "Compiling neon_rmgr $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install +@echo "Compiling neon_test_utils $*" mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ @@ -140,11 +152,6 @@ neon-pg-ext-%: postgres-% $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install - +@echo "Compiling hnsw $*" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$* - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install .PHONY: neon-pg-ext-clean-% neon-pg-ext-clean-%: @@ -160,35 
+167,43 @@ neon-pg-ext-clean-%: $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ - -C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean .PHONY: neon-pg-ext neon-pg-ext: \ neon-pg-ext-v14 \ - neon-pg-ext-v15 + neon-pg-ext-v15 \ + neon-pg-ext-v16 .PHONY: neon-pg-ext-clean neon-pg-ext-clean: \ neon-pg-ext-clean-v14 \ - neon-pg-ext-clean-v15 + neon-pg-ext-clean-v15 \ + neon-pg-ext-clean-v16 # shorthand to build all Postgres versions .PHONY: postgres postgres: \ postgres-v14 \ - postgres-v15 + postgres-v15 \ + postgres-v16 .PHONY: postgres-headers postgres-headers: \ postgres-headers-v14 \ - postgres-headers-v15 + postgres-headers-v15 \ + postgres-headers-v16 .PHONY: postgres-clean postgres-clean: \ postgres-clean-v14 \ - postgres-clean-v15 + postgres-clean-v15 \ + postgres-clean-v16 + +.PHONY: postgres-check +postgres-check: \ + postgres-check-v14 \ + postgres-check-v15 \ + postgres-check-v16 # This doesn't remove the effects of 'configure'. .PHONY: clean diff --git a/README.md b/README.md index 5780608949..75fad605c5 100644 --- a/README.md +++ b/README.md @@ -29,18 +29,18 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ -libcurl4-openssl-dev openssl python-poetry +libcurl4-openssl-dev openssl python-poetry lsof libicu-dev ``` * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ - protobuf-devel libcurl-devel openssl poetry + protobuf-devel libcurl-devel openssl poetry lsof libicu-devel ``` * On Arch based systems, these packages are needed: ```bash pacman -S base-devel readline zlib libseccomp openssl clang \ -postgresql-libs cmake postgresql protobuf curl +postgresql-libs cmake postgresql protobuf curl lsof ``` Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases). @@ -55,7 +55,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 1. 
Install XCode and dependencies ``` xcode-select --install -brew install protobuf openssl flex bison +brew install protobuf openssl flex bison icu4c pkg-config # add openssl to PATH, required for ed25519 keys generation in neon_local echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 0000000000..d788afc84d --- /dev/null +++ b/clippy.toml @@ -0,0 +1,5 @@ +disallowed-methods = [ + "tokio::task::block_in_place", + # Allow this for now, to deny it later once we stop using Handle::block_on completely + # "tokio::runtime::Handle::block_on", +] diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index 6f52004fa8..d76eaad0a0 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,7 +1,7 @@ use anyhow::{anyhow, Ok, Result}; use postgres::Client; use tokio_postgres::NoTls; -use tracing::{error, instrument}; +use tracing::{error, instrument, warn}; use crate::compute::ComputeNode; @@ -55,13 +55,24 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> { ON CONFLICT (id) DO UPDATE SET updated_at = now();"; - let result = client.simple_query(query).await?; - - if result.len() != 1 { - return Err(anyhow::format_err!( - "expected 1 query result, but got {}", - result.len() - )); + match client.simple_query(query).await { + Result::Ok(result) => { + if result.len() != 1 { + return Err(anyhow::anyhow!( + "expected 1 query results, but got {}", + result.len() + )); + } + } + Err(err) => { + if let Some(state) = err.code() { + if state == &tokio_postgres::error::SqlState::DISK_FULL { + warn!("Tenant disk is full"); + return Ok(()); + } + } + return Err(err.into()); + } } Ok(()) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 2da671a149..bc48a2110d 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -46,8 +46,6 @@ pub fn write_postgres_conf( writeln!(file, "{}", conf)?; } - write!(file, "{}", &spec.cluster.settings.as_pg_settings())?; - // Add options for connecting to storage writeln!(file, "# Neon storage settings")?; if let Some(s) = &spec.pageserver_connstring { diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 54c22026e7..3d7ed8c360 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -74,6 +74,7 @@ More specifically, here is an example ext_index.json use anyhow::Context; use anyhow::{self, Result}; use compute_api::spec::RemoteExtSpec; +use regex::Regex; use remote_storage::*; use serde_json; use std::io::Read; @@ -106,16 +107,71 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String { pub fn get_pg_version(pgbin: &str) -> String { // pg_config --version returns a (platform specific) human readable string - // such as "PostgreSQL 15.4". We parse this to v14/v15 + // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc. let human_version = get_pg_config("--version", pgbin); - if human_version.contains("15") { - return "v15".to_string(); - } else if human_version.contains("14") { - return "v14".to_string(); + return parse_pg_version(&human_version).to_string(); +} + +fn parse_pg_version(human_version: &str) -> &str { + // Normal releases have version strings like "PostgreSQL 15.4". But there + // are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL + // 16beta2" or "PostgreSQL 17rc1". 
And with the --with-extra-version + configure option, you can tack any string to the version number, + e.g. "PostgreSQL 15.4foobar". + match Regex::new(r"^PostgreSQL (?<major>\d+).+") + .unwrap() + .captures(human_version) + { + Some(captures) if captures.len() == 2 => match &captures["major"] { + "14" => return "v14", + "15" => return "v15", + "16" => return "v16", + _ => {} + }, + _ => {} } panic!("Unsuported postgres version {human_version}"); } +#[cfg(test)] +mod tests { + use super::parse_pg_version; + + #[test] + fn test_parse_pg_version() { + assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15"); + assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15"); + assert_eq!( + parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"), + "v15" + ); + + assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14"); + assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14"); + assert_eq!( + parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"), + "v14" + ); + + assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16"); + assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16"); + assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16"); + assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16"); + } + + #[test] + #[should_panic] + fn test_parse_pg_unsupported_version() { + parse_pg_version("PostgreSQL 13.14"); + } + + #[test] + #[should_panic] + fn test_parse_pg_incorrect_version_format() { + parse_pg_version("PostgreSQL 14"); + } +} + // download the archive for a given extension, // unzip it, and place files in the appropriate locations (share/lib) pub async fn download_extension( diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index a571628770..8851be1ec1 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -1,4 +1,6 @@ use std::convert::Infallible; +use std::net::IpAddr; +use std::net::Ipv6Addr; use std::net::SocketAddr; use std::sync::Arc; use std::thread; @@ -298,7 +300,9 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> { // Main Hyper HTTP server function that runs it and blocks waiting on it forever. #[tokio::main] async fn serve(port: u16, state: Arc<ComputeNode>) { - let addr = SocketAddr::from(([0, 0, 0, 0], port)); + // this usually binds to both IPv4 and IPv6 on linux + // see e.g. https://github.com/rust-lang/rust/pull/34440 + let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port); let make_service = make_service_fn(move |_conn| { let state = state.clone(); diff --git a/compute_tools/src/params.rs b/compute_tools/src/params.rs index 0ce01ff478..4ccb403ca6 100644 --- a/compute_tools/src/params.rs +++ b/compute_tools/src/params.rs @@ -6,4 +6,4 @@ pub const DEFAULT_LOG_LEVEL: &str = "info"; // https://www.postgresql.org/docs/15/auth-password.html // // So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
-pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5"; +pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\tall\t\tmd5"; diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index d2c99c5f36..ec685915f9 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -12,6 +12,8 @@ git-version.workspace = true nix.workspace = true once_cell.workspace = true postgres.workspace = true +hex.workspace = true +hyper.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } serde.workspace = true @@ -20,6 +22,7 @@ serde_with.workspace = true tar.workspace = true thiserror.workspace = true toml.workspace = true +tokio.workspace = true url.workspace = true # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api # instead, so that recompile times are better. diff --git a/control_plane/simple.conf b/control_plane/simple.conf index 243e13f3d3..0ad90a4618 100644 --- a/control_plane/simple.conf +++ b/control_plane/simple.conf @@ -1,6 +1,7 @@ # Minimal neon environment with one safekeeper. This is equivalent to the built-in # defaults that you get with no --config -[pageserver] +[[pageservers]] +id=1 listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' pg_auth_type = 'Trust' diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs new file mode 100644 index 0000000000..f0e649cfa8 --- /dev/null +++ b/control_plane/src/attachment_service.rs @@ -0,0 +1,105 @@ +use crate::{background_process, local_env::LocalEnv}; +use anyhow::anyhow; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; +use std::{path::PathBuf, process::Child}; +use utils::id::{NodeId, TenantId}; + +pub struct AttachmentService { + env: LocalEnv, + listen: String, + path: PathBuf, +} + +const COMMAND: &str = "attachment_service"; + +#[serde_as] +#[derive(Serialize, Deserialize)] +pub struct AttachHookRequest { + #[serde_as(as = "DisplayFromStr")] + pub tenant_id: TenantId, + pub pageserver_id: Option<NodeId>, +} + +#[derive(Serialize, Deserialize)] +pub struct AttachHookResponse { + pub gen: Option<u32>, +} + +impl AttachmentService { + pub fn from_env(env: &LocalEnv) -> Self { + let path = env.base_data_dir.join("attachments.json"); + + // Makes no sense to construct this if pageservers aren't going to use it: assume + // pageservers have control plane API set + let listen_url = env.control_plane_api.clone().unwrap(); + + let listen = format!( + "{}:{}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + ); + + Self { + env: env.clone(), + path, + listen, + } + } + + fn pid_file(&self) -> PathBuf { + self.env.base_data_dir.join("attachment_service.pid") + } + + pub fn start(&self) -> anyhow::Result<Child> { + let path_str = self.path.to_string_lossy(); + + background_process::start_process( + COMMAND, + &self.env.base_data_dir, + &self.env.attachment_service_bin(), + ["-l", &self.listen, "-p", &path_str], + [], + background_process::InitialPidFile::Create(&self.pid_file()), + // TODO: a real status check + || Ok(true), + ) + } + + pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { + background_process::stop_process(immediate, COMMAND, &self.pid_file()) + } + + /// Call into the attach_hook API, for use before handing out attachments to pageservers + pub fn attach_hook( + &self, + tenant_id: TenantId, + pageserver_id: NodeId, + ) -> anyhow::Result<Option<u32>> { + use hyper::StatusCode; + + let url = self + .env + .control_plane_api +
.clone() + .unwrap() + .join("attach_hook") + .unwrap(); + let client = reqwest::blocking::ClientBuilder::new() + .build() + .expect("Failed to construct http client"); + + let request = AttachHookRequest { + tenant_id, + pageserver_id: Some(pageserver_id), + }; + + let response = client.post(url).json(&request).send()?; + if response.status() != StatusCode::OK { + return Err(anyhow!("Unexpected status {}", response.status())); + } + + let response = response.json::<AttachHookResponse>()?; + Ok(response.gen) + } +} diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs new file mode 100644 index 0000000000..e879646b63 --- /dev/null +++ b/control_plane/src/bin/attachment_service.rs @@ -0,0 +1,273 @@ +/// The attachment service mimics the aspects of the control plane API +/// that are required for a pageserver to operate. +/// +/// This enables running & testing pageservers without a full-blown +/// deployment of the Neon cloud platform. +/// +use anyhow::anyhow; +use clap::Parser; +use hex::FromHex; +use hyper::StatusCode; +use hyper::{Body, Request, Response}; +use serde::{Deserialize, Serialize}; +use std::path::{Path, PathBuf}; +use std::{collections::HashMap, sync::Arc}; +use utils::logging::{self, LogFormat}; + +use utils::{ + http::{ + endpoint::{self}, + error::ApiError, + json::{json_request, json_response}, + RequestExt, RouterBuilder, + }, + id::{NodeId, TenantId}, + tcp_listener, +}; + +use pageserver_api::control_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse, + ValidateResponseTenant, +}; + +use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse}; + +#[derive(Parser)] +#[command(author, version, about, long_about = None)] +#[command(arg_required_else_help(true))] +struct Cli { + /// Host and port to listen on, like `127.0.0.1:1234` + #[arg(short, long)] + listen: std::net::SocketAddr, + + /// Path to the .json file to store state (will be created if it doesn't exist) + #[arg(short, long)] + path: PathBuf, +} + +// The persistent state of each Tenant +#[derive(Serialize, Deserialize, Clone)] +struct TenantState { + // Currently attached pageserver + pageserver: Option<NodeId>, + + // Latest generation number: next time we attach, increment this + // and use the incremented number when attaching + generation: u32, +} + +fn to_hex_map<S, V>(input: &HashMap<TenantId, V>, serializer: S) -> Result<S::Ok, S::Error> +where + S: serde::Serializer, + V: Clone + Serialize, +{ + let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone())); + + transformed + .collect::<HashMap<String, V>>() + .serialize(serializer) +} + +fn from_hex_map<'de, D, V>(deserializer: D) -> Result<HashMap<TenantId, V>, D::Error> +where + D: serde::de::Deserializer<'de>, + V: Deserialize<'de>, +{ + let hex_map = HashMap::<String, V>::deserialize(deserializer)?; + hex_map + .into_iter() + .map(|(k, v)| { + TenantId::from_hex(k) + .map(|k| (k, v)) + .map_err(serde::de::Error::custom) + }) + .collect() +} + +// Top level state available to all HTTP handlers +#[derive(Serialize, Deserialize)] +struct PersistentState { + #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")] + tenants: HashMap<TenantId, TenantState>, + + #[serde(skip)] + path: PathBuf, +} + +impl PersistentState { + async fn save(&self) -> anyhow::Result<()> { + let bytes = serde_json::to_vec(self)?; + tokio::fs::write(&self.path, &bytes).await?; + + Ok(()) + } + + async fn load(path: &Path) -> anyhow::Result<Self> { + let bytes = tokio::fs::read(path).await?; + let mut decoded = serde_json::from_slice::<Self>(&bytes)?; + decoded.path =
path.to_owned(); + Ok(decoded) + } + + async fn load_or_new(path: &Path) -> Self { + match Self::load(path).await { + Ok(s) => { + tracing::info!("Loaded state file at {}", path.display()); + s + } + Err(e) + if e.downcast_ref::<std::io::Error>() + .map(|e| e.kind() == std::io::ErrorKind::NotFound) + .unwrap_or(false) => + { + tracing::info!("Will create state file at {}", path.display()); + Self { + tenants: HashMap::new(), + path: path.to_owned(), + } + } + Err(e) => { + panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display()) + } + } + } +} + +/// State available to HTTP request handlers +#[derive(Clone)] +struct State { + inner: Arc<tokio::sync::RwLock<PersistentState>>, +} + +impl State { + fn new(persistent_state: PersistentState) -> State { + Self { + inner: Arc::new(tokio::sync::RwLock::new(persistent_state)), + } + } +} + +#[inline(always)] +fn get_state(request: &Request<Body>) -> &State { + request + .data::<Arc<State>>() + .expect("unknown state type") + .as_ref() +} + +/// Pageserver calls into this on startup, to learn which tenants it should attach +async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> { + let reattach_req = json_request::<ReAttachRequest>(&mut req).await?; + + let state = get_state(&req).inner.clone(); + let mut locked = state.write().await; + + let mut response = ReAttachResponse { + tenants: Vec::new(), + }; + for (t, state) in &mut locked.tenants { + if state.pageserver == Some(reattach_req.node_id) { + state.generation += 1; + response.tenants.push(ReAttachResponseTenant { + id: *t, + generation: state.generation, + }); + } + } + + locked.save().await.map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + +/// Pageserver calls into this before doing deletions, to confirm that it still +/// holds the latest generation for the tenants with deletions enqueued +async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> { + let validate_req = json_request::<ValidateRequest>(&mut req).await?; + + let locked = get_state(&req).inner.read().await; + + let mut response = ValidateResponse { + tenants: Vec::new(), + }; + + for req_tenant in validate_req.tenants { + if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) { + let valid = tenant_state.generation == req_tenant.gen; + response.tenants.push(ValidateResponseTenant { + id: req_tenant.id, + valid, + }); + } + } + + json_response(StatusCode::OK, response) +} +/// Call into this before attaching a tenant to a pageserver, to acquire a generation number +/// (in the real control plane this is unnecessary, because the same program is managing +/// generation numbers and doing attachments).
+async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> { + let attach_req = json_request::<AttachHookRequest>(&mut req).await?; + + let state = get_state(&req).inner.clone(); + let mut locked = state.write().await; + + let tenant_state = locked + .tenants + .entry(attach_req.tenant_id) + .or_insert_with(|| TenantState { + pageserver: attach_req.pageserver_id, + generation: 0, + }); + + if attach_req.pageserver_id.is_some() { + tenant_state.generation += 1; + } + let generation = tenant_state.generation; + + locked.save().await.map_err(ApiError::InternalServerError)?; + + json_response( + StatusCode::OK, + AttachHookResponse { + gen: attach_req.pageserver_id.map(|_| generation), + }, + ) +} + +fn make_router(persistent_state: PersistentState) -> RouterBuilder<Body, ApiError> { + endpoint::make_router() + .data(Arc::new(State::new(persistent_state))) + .post("/re-attach", handle_re_attach) + .post("/validate", handle_validate) + .post("/attach_hook", handle_attach_hook) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + logging::init( + LogFormat::Plain, + logging::TracingErrorLayerEnablement::Disabled, + )?; + + let args = Cli::parse(); + tracing::info!( + "Starting, state at {}, listening on {}", + args.path.to_string_lossy(), + args.listen + ); + + let persistent_state = PersistentState::load_or_new(&args.path).await; + + let http_listener = tcp_listener::bind(args.listen)?; + let router = make_router(persistent_state) + .build() + .map_err(|err| anyhow!(err))?; + let service = utils::http::RouterService::new(router).unwrap(); + let server = hyper::Server::from_tcp(http_listener)?.serve(service); + + tracing::info!("Serving on {0}", args.listen); + server.await?; + + Ok(()) +} diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index ef308cb2d2..4cdb91bfd2 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,6 +8,7 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; use compute_api::spec::ComputeMode; +use control_plane::attachment_service::AttachmentService; use control_plane::endpoint::ComputeControlPlane; use control_plane::local_env::LocalEnv; use control_plane::pageserver::PageServerNode; @@ -43,14 +44,18 @@ project_git_version!(GIT_VERSION); const DEFAULT_PG_VERSION: &str = "15"; +const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/"; + fn default_conf() -> String { format!( r#" # Default built-in configuration, defined in main.rs +control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}' + [broker] listen_addr = '{DEFAULT_BROKER_ADDR}' -[pageserver] +[[pageservers]] id = {DEFAULT_PAGESERVER_ID} listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}' listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}' @@ -61,6 +66,7 @@ http_auth_type = '{trust_auth}' id = {DEFAULT_SAFEKEEPER_ID} pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} + "#, trust_auth = AuthType::Trust, ) @@ -107,6 +113,7 @@ fn main() -> Result<()> { "start" => handle_start_all(sub_args, &env), "stop" => handle_stop_all(sub_args, &env), "pageserver" => handle_pageserver(sub_args, &env), + "attachment_service" => handle_attachment_service(sub_args, &env), "safekeeper" => handle_safekeeper(sub_args, &env), "endpoint" => handle_endpoint(sub_args, &env), "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"), @@ -252,7 +259,7 @@ fn get_timeline_infos( env: &local_env::LocalEnv, tenant_id: &TenantId, ) -> Result<HashMap<TimelineId, TimelineInfo>> { -
Ok(PageServerNode::from_env(env) + Ok(get_default_pageserver(env) .timeline_list(tenant_id)? .into_iter() .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) @@ -313,17 +320,30 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { .context("Failed to initialize neon repository")?; // Initialize pageserver, create initial tenant and timeline. - let pageserver = PageServerNode::from_env(&env); - pageserver - .initialize(&pageserver_config_overrides(init_match)) - .unwrap_or_else(|e| { - eprintln!("pageserver init failed: {e:?}"); - exit(1); - }); + for ps_conf in &env.pageservers { + PageServerNode::from_env(&env, ps_conf) + .initialize(&pageserver_config_overrides(init_match)) + .unwrap_or_else(|e| { + eprintln!("pageserver init failed: {e:?}"); + exit(1); + }); + } Ok(env) } +/// The default pageserver is the one where CLI tenant/timeline operations are sent by default. +/// For typical interactive use, one would just run with a single pageserver. Scenarios with +/// tenant/timeline placement across multiple pageservers are managed by python test code rather +/// than this CLI. +fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode { + let ps_conf = env + .pageservers + .first() + .expect("Config is validated to contain at least one pageserver"); + PageServerNode::from_env(env, ps_conf) +} + fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { init_match .get_many::("pageserver-config-override") @@ -334,7 +354,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { } fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> { - let pageserver = PageServerNode::from_env(env); + let pageserver = get_default_pageserver(env); match tenant_match.subcommand() { Some(("list", _)) => { for t in pageserver.tenant_list()? { @@ -342,13 +362,25 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an } } Some(("create", create_match)) => { - let initial_tenant_id = parse_tenant_id(create_match)?; let tenant_conf: HashMap<_, _> = create_match .get_many::("config") .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) .unwrap_or_default(); - let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?; - println!("tenant {new_tenant_id} successfully created on the pageserver"); + + // If tenant ID was not specified, generate one + let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate); + + let generation = if env.control_plane_api.is_some() { + // We must register the tenant with the attachment service, so + // that when the pageserver restarts, it will be re-attached. + let attachment_service = AttachmentService::from_env(env); + attachment_service.attach_hook(tenant_id, pageserver.conf.id)? 
+ } else { + None + }; + + pageserver.tenant_create(tenant_id, generation, tenant_conf)?; + println!("tenant {tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant let new_timeline_id = parse_timeline_id(create_match)?; @@ -358,7 +390,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an .context("Failed to parse postgres version from the argument string")?; let timeline_info = pageserver.timeline_create( - new_tenant_id, + tenant_id, new_timeline_id, None, None, @@ -369,17 +401,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an env.register_branch_mapping( DEFAULT_BRANCH_NAME.to_string(), - new_tenant_id, + tenant_id, new_timeline_id, )?; println!( - "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}", + "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}", ); if create_match.get_flag("set-default") { - println!("Setting tenant {new_tenant_id} as a default one"); - env.default_tenant_id = Some(new_tenant_id); + println!("Setting tenant {tenant_id} as a default one"); + env.default_tenant_id = Some(tenant_id); } } Some(("set-default", set_default_match)) => { @@ -407,7 +439,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an } fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { - let pageserver = PageServerNode::from_env(env); + let pageserver = get_default_pageserver(env); match timeline_match.subcommand() { Some(("list", list_match)) => { @@ -484,6 +516,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - None, pg_version, ComputeMode::Primary, + DEFAULT_PAGESERVER_ID, )?; println!("Done"); } @@ -537,7 +570,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( Some(ep_subcommand_data) => ep_subcommand_data, None => bail!("no endpoint subcommand provided"), }; - let mut cplane = ComputeControlPlane::load(env.clone())?; // All subcommands take an optional --tenant-id option @@ -634,6 +666,13 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .copied() .unwrap_or(false); + let pageserver_id = + if let Some(id_str) = sub_args.get_one::("endpoint-pageserver-id") { + NodeId(id_str.parse().context("while parsing pageserver id")?) + } else { + DEFAULT_PAGESERVER_ID + }; + let mode = match (lsn, hot_standby) { (Some(lsn), false) => ComputeMode::Static(lsn), (None, true) => ComputeMode::Replica, @@ -649,6 +688,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( http_port, pg_version, mode, + pageserver_id, )?; } "start" => { @@ -658,6 +698,13 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .get_one::("endpoint_id") .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?; + let pageserver_id = + if let Some(id_str) = sub_args.get_one::("endpoint-pageserver-id") { + NodeId(id_str.parse().context("while parsing pageserver id")?) + } else { + DEFAULT_PAGESERVER_ID + }; + let remote_ext_config = sub_args.get_one::("remote-ext-config"); // If --safekeepers argument is given, use only the listed safekeeper nodes. 
@@ -677,7 +724,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( let endpoint = cplane.endpoints.get(endpoint_id.as_str()); - let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) { + let ps_conf = env.get_pageserver_conf(pageserver_id)?; + let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) { let claims = Claims::new(Some(tenant_id), Scope::Tenant); Some(env.generate_auth_token(&claims)?) @@ -744,6 +792,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( http_port, pg_version, mode, + pageserver_id, )?; ep.start(&auth_token, safekeepers, remote_ext_config)?; } @@ -768,51 +817,94 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( } fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let pageserver = PageServerNode::from_env(env); + fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result { + let node_id = if let Some(id_str) = args.get_one::("pageserver-id") { + NodeId(id_str.parse().context("while parsing pageserver id")?) + } else { + DEFAULT_PAGESERVER_ID + }; + + Ok(PageServerNode::from_env( + env, + env.get_pageserver_conf(node_id)?, + )) + } match sub_match.subcommand() { - Some(("start", start_match)) => { - if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) { + Some(("start", subcommand_args)) => { + if let Err(e) = get_pageserver(env, subcommand_args)? + .start(&pageserver_config_overrides(subcommand_args)) + { eprintln!("pageserver start failed: {e}"); exit(1); } } + Some(("stop", subcommand_args)) => { + let immediate = subcommand_args + .get_one::("stop-mode") + .map(|s| s.as_str()) + == Some("immediate"); + + if let Err(e) = get_pageserver(env, subcommand_args)?.stop(immediate) { + eprintln!("pageserver stop failed: {}", e); + exit(1); + } + } + + Some(("restart", subcommand_args)) => { + let pageserver = get_pageserver(env, subcommand_args)?; + //TODO what shutdown strategy should we use here? + if let Err(e) = pageserver.stop(false) { + eprintln!("pageserver stop failed: {}", e); + exit(1); + } + + if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) { + eprintln!("pageserver start failed: {e}"); + exit(1); + } + } + + Some(("status", subcommand_args)) => { + match get_pageserver(env, subcommand_args)?.check_status() { + Ok(_) => println!("Page server is up and running"), + Err(err) => { + eprintln!("Page server is not available: {}", err); + exit(1); + } + } + } + + Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name), + None => bail!("no pageserver subcommand provided"), + } + Ok(()) +} + +fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { + let svc = AttachmentService::from_env(env); + match sub_match.subcommand() { + Some(("start", _start_match)) => { + if let Err(e) = svc.start() { + eprintln!("start failed: {e}"); + exit(1); + } + } + Some(("stop", stop_match)) => { let immediate = stop_match .get_one::("stop-mode") .map(|s| s.as_str()) == Some("immediate"); - if let Err(e) = pageserver.stop(immediate) { - eprintln!("pageserver stop failed: {}", e); + if let Err(e) = svc.stop(immediate) { + eprintln!("stop failed: {}", e); exit(1); } } - - Some(("restart", restart_match)) => { - //TODO what shutdown strategy should we use here? 
- if let Err(e) = pageserver.stop(false) { - eprintln!("pageserver stop failed: {}", e); - exit(1); - } - - if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) { - eprintln!("pageserver start failed: {e}"); - exit(1); - } - } - - Some(("status", _)) => match PageServerNode::from_env(env).check_status() { - Ok(_) => println!("Page server is up and running"), - Err(err) => { - eprintln!("Page server is not available: {}", err); - exit(1); - } - }, - - Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name), - None => bail!("no pageserver subcommand provided"), + Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name), + None => bail!("no attachment_service subcommand provided"), } Ok(()) } @@ -897,11 +989,23 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow broker::start_broker_process(env)?; - let pageserver = PageServerNode::from_env(env); - if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { - eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e); - try_stop_all(env, true); - exit(1); + // Only start the attachment service if the pageserver is configured to need it + if env.control_plane_api.is_some() { + let attachment_service = AttachmentService::from_env(env); + if let Err(e) = attachment_service.start() { + eprintln!("attachment_service start failed: {:#}", e); + try_stop_all(env, true); + exit(1); + } + } + + for ps_conf in &env.pageservers { + let pageserver = PageServerNode::from_env(env, ps_conf); + if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { + eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); + try_stop_all(env, true); + exit(1); + } } for node in env.safekeepers.iter() { @@ -925,8 +1029,6 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result< } fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { - let pageserver = PageServerNode::from_env(env); - // Stop all endpoints match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { @@ -941,8 +1043,11 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { } } - if let Err(e) = pageserver.stop(immediate) { - eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e); + for ps_conf in &env.pageservers { + let pageserver = PageServerNode::from_env(env, ps_conf); + if let Err(e) = pageserver.stop(immediate) { + eprintln!("pageserver {} stop failed: {:#}", ps_conf.id, e); + } } for node in env.safekeepers.iter() { @@ -955,6 +1060,13 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { if let Err(e) = broker::stop_broker_process(env) { eprintln!("neon broker stop failed: {e:#}"); } + + if env.control_plane_api.is_some() { + let attachment_service = AttachmentService::from_env(env); + if let Err(e) = attachment_service.stop(immediate) { + eprintln!("attachment service stop failed: {e:#}"); + } + } } fn cli() -> Command { @@ -969,6 +1081,16 @@ fn cli() -> Command { let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); + // --id, when using a pageserver command + let pageserver_id_arg = Arg::new("pageserver-id") + .long("id") + .help("pageserver id") + .required(false); + // --pageserver-id when using a non-pageserver command + let endpoint_pageserver_id_arg = Arg::new("endpoint-pageserver-id") + .long("pageserver-id") + .required(false); + let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt") .short('e') .long("safekeeper-extra-opt") @@ 
-1133,10 +1255,24 @@ fn cli() -> Command { .arg_required_else_help(true) .about("Manage pageserver") .subcommand(Command::new("status")) + .arg(pageserver_id_arg.clone()) + .subcommand(Command::new("start").about("Start local pageserver") + .arg(pageserver_id_arg.clone()) + .arg(pageserver_config_args.clone())) + .subcommand(Command::new("stop").about("Stop local pageserver") + .arg(pageserver_id_arg.clone()) + .arg(stop_mode_arg.clone())) + .subcommand(Command::new("restart").about("Restart local pageserver") + .arg(pageserver_id_arg.clone()) + .arg(pageserver_config_args.clone())) + ) + .subcommand( + Command::new("attachment_service") + .arg_required_else_help(true) + .about("Manage attachment_service") .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) .subcommand(Command::new("stop").about("Stop local pageserver") .arg(stop_mode_arg.clone())) - .subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone())) ) .subcommand( Command::new("safekeeper") @@ -1172,6 +1308,7 @@ fn cli() -> Command { .arg(lsn_arg.clone()) .arg(pg_port_arg.clone()) .arg(http_port_arg.clone()) + .arg(endpoint_pageserver_id_arg.clone()) .arg( Arg::new("config-only") .help("Don't do basebackup, create endpoint directory with only config files") @@ -1189,6 +1326,7 @@ fn cli() -> Command { .arg(lsn_arg) .arg(pg_port_arg) .arg(http_port_arg) + .arg(endpoint_pageserver_id_arg.clone()) .arg(pg_version_arg) .arg(hot_standby_arg) .arg(safekeepers_arg) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 4ed03c8771..cba364c049 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -70,6 +70,7 @@ pub struct EndpointConf { http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, + pageserver_id: NodeId, } // @@ -82,19 +83,16 @@ pub struct ComputeControlPlane { pub endpoints: BTreeMap>, env: LocalEnv, - pageserver: Arc, } impl ComputeControlPlane { // Load current endpoints from the endpoints/ subdirectories pub fn load(env: LocalEnv) -> Result { - let pageserver = Arc::new(PageServerNode::from_env(&env)); - let mut endpoints = BTreeMap::default(); for endpoint_dir in std::fs::read_dir(env.endpoints_path()) .with_context(|| format!("failed to list {}", env.endpoints_path().display()))? 
{ - let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?; + let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?; endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep)); } @@ -102,7 +100,6 @@ impl ComputeControlPlane { base_port: 55431, endpoints, env, - pageserver, }) } @@ -125,15 +122,18 @@ impl ComputeControlPlane { http_port: Option, pg_version: u32, mode: ComputeMode, + pageserver_id: NodeId, ) -> Result> { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); + let pageserver = + PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?); let ep = Arc::new(Endpoint { endpoint_id: endpoint_id.to_owned(), pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port), http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port), env: self.env.clone(), - pageserver: Arc::clone(&self.pageserver), + pageserver, timeline_id, mode, tenant_id, @@ -159,6 +159,7 @@ impl ComputeControlPlane { pg_port, pg_version, skip_pg_catalog_updates: true, + pageserver_id, })?, )?; std::fs::write( @@ -193,18 +194,14 @@ pub struct Endpoint { // These are not part of the endpoint as such, but the environment // the endpoint runs in. pub env: LocalEnv, - pageserver: Arc, + pageserver: PageServerNode, // Optimizations skip_pg_catalog_updates: bool, } impl Endpoint { - fn from_dir_entry( - entry: std::fs::DirEntry, - env: &LocalEnv, - pageserver: &Arc, - ) -> Result { + fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result { if !entry.file_type()?.is_dir() { anyhow::bail!( "Endpoint::from_dir_entry failed: '{}' is not a directory", @@ -220,12 +217,15 @@ impl Endpoint { let conf: EndpointConf = serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; + let pageserver = + PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?); + Ok(Endpoint { pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port), http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port), endpoint_id, env: env.clone(), - pageserver: Arc::clone(pageserver), + pageserver, timeline_id: conf.timeline_id, mode: conf.mode, tenant_id: conf.tenant_id, diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index a773b8dcc3..7592880402 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -7,6 +7,7 @@ // local installations. // +pub mod attachment_service; mod background_process; pub mod broker; pub mod endpoint; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 9e42c2e333..45a7469787 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -68,11 +68,17 @@ pub struct LocalEnv { pub broker: NeonBroker, - pub pageserver: PageServerConf, + /// This Vec must always contain at least one pageserver + pub pageservers: Vec, #[serde(default)] pub safekeepers: Vec, + // Control plane location: if None, we will not run attachment_service. If set, this will + // be propagated into each pageserver's configuration. + #[serde(default)] + pub control_plane_api: Option, + /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. 
#[serde(default)] // A `HashMap>` would be more appropriate here, @@ -176,32 +182,28 @@ impl LocalEnv { pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); + #[allow(clippy::manual_range_patterns)] match pg_version { - 14 => Ok(path.join(format!("v{pg_version}"))), - 15 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { - match pg_version { - 14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), - 15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), - _ => bail!("Unsupported postgres version: {}", pg_version), - } + Ok(self.pg_distrib_dir(pg_version)?.join("bin")) } pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { - match pg_version { - 14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), - 15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), - _ => bail!("Unsupported postgres version: {}", pg_version), - } + Ok(self.pg_distrib_dir(pg_version)?.join("lib")) } pub fn pageserver_bin(&self) -> PathBuf { self.neon_distrib_dir.join("pageserver") } + pub fn attachment_service_bin(&self) -> PathBuf { + self.neon_distrib_dir.join("attachment_service") + } + pub fn safekeeper_bin(&self) -> PathBuf { self.neon_distrib_dir.join("safekeeper") } @@ -214,15 +216,23 @@ impl LocalEnv { self.base_data_dir.join("endpoints") } - // TODO: move pageserver files into ./pageserver - pub fn pageserver_data_dir(&self) -> PathBuf { - self.base_data_dir.clone() + pub fn pageserver_data_dir(&self, pageserver_id: NodeId) -> PathBuf { + self.base_data_dir + .join(format!("pageserver_{pageserver_id}")) } pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf { self.base_data_dir.join("safekeepers").join(data_dir_name) } + pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> { + if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) { + Ok(conf) + } else { + bail!("could not find pageserver {id}") + } + } + pub fn register_branch_mapping( &mut self, branch_name: String, @@ -299,6 +309,10 @@ impl LocalEnv { env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); } + if env.pageservers.is_empty() { + anyhow::bail!("Configuration must contain at least one pageserver"); + } + env.base_data_dir = base_path(); Ok(env) @@ -331,7 +345,7 @@ impl LocalEnv { // We read that in, in `create_config`, and fill any missing defaults. Then it's saved // to .neon/config. TODO: We lose any formatting and comments along the way, which is // a bit sad. - let mut conf_content = r#"# This file describes a locale deployment of the page server + let mut conf_content = r#"# This file describes a local deployment of the page server # and safekeeeper node. It is read by the 'neon_local' command-line # utility. 
"# @@ -461,9 +475,9 @@ impl LocalEnv { } fn auth_keys_needed(&self) -> bool { - self.pageserver.pg_auth_type == AuthType::NeonJWT - || self.pageserver.http_auth_type == AuthType::NeonJWT - || self.safekeepers.iter().any(|sk| sk.auth_enabled) + self.pageservers.iter().any(|ps| { + ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT + }) || self.safekeepers.iter().any(|sk| sk.auth_enabled) } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 2ff09021e5..a6b675fdb5 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -27,6 +27,7 @@ use utils::{ lsn::Lsn, }; +use crate::local_env::PageServerConf; use crate::{background_process, local_env::LocalEnv}; #[derive(Error, Debug)] @@ -76,43 +77,40 @@ impl ResponseErrorMessageExt for Response { #[derive(Debug)] pub struct PageServerNode { pub pg_connection_config: PgConnectionConfig, + pub conf: PageServerConf, pub env: LocalEnv, pub http_client: Client, pub http_base_url: String, } impl PageServerNode { - pub fn from_env(env: &LocalEnv) -> PageServerNode { - let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr) - .expect("Unable to parse listen_pg_addr"); + pub fn from_env(env: &LocalEnv, conf: &PageServerConf) -> PageServerNode { + let (host, port) = + parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr"); let port = port.unwrap_or(5432); Self { pg_connection_config: PgConnectionConfig::new_host_port(host, port), + conf: conf.clone(), env: env.clone(), http_client: Client::new(), - http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr), + http_base_url: format!("http://{}/v1", conf.listen_http_addr), } } // pageserver conf overrides defined by neon_local configuration. fn neon_local_overrides(&self) -> Vec { - let id = format!("id={}", self.env.pageserver.id); + let id = format!("id={}", self.conf.id); // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. 
let pg_distrib_dir_param = format!( "pg_distrib_dir='{}'", self.env.pg_distrib_dir_raw().display() ); - let http_auth_type_param = - format!("http_auth_type='{}'", self.env.pageserver.http_auth_type); - let listen_http_addr_param = format!( - "listen_http_addr='{}'", - self.env.pageserver.listen_http_addr - ); + let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type); + let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr); - let pg_auth_type_param = format!("pg_auth_type='{}'", self.env.pageserver.pg_auth_type); - let listen_pg_addr_param = - format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr); + let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type); + let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr); let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); @@ -126,10 +124,18 @@ impl PageServerNode { broker_endpoint_param, ]; - if self.env.pageserver.http_auth_type != AuthType::Trust - || self.env.pageserver.pg_auth_type != AuthType::Trust + if let Some(control_plane_api) = &self.env.control_plane_api { + overrides.push(format!( + "control_plane_api='{}'", + control_plane_api.as_str() + )); + } + + if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust { - overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned()); + // Keys are generated in the toplevel repo dir, pageservers' workdirs + // are one level below that, so refer to keys with ../ + overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned()); } overrides } @@ -137,16 +143,12 @@ impl PageServerNode { /// Initializes a pageserver node by creating its config with the overrides provided. pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> { // First, run `pageserver --init` and wait for it to write a config into FS and exit. - self.pageserver_init(config_overrides).with_context(|| { - format!( - "Failed to run init for pageserver node {}", - self.env.pageserver.id, - ) - }) + self.pageserver_init(config_overrides) + .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id,)) } pub fn repo_path(&self) -> PathBuf { - self.env.pageserver_data_dir() + self.env.pageserver_data_dir(self.conf.id) } /// The pid file is created by the pageserver process, with its pid stored inside. 
@@ -162,7 +164,7 @@ impl PageServerNode { fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { let datadir = self.repo_path(); - let node_id = self.env.pageserver.id; + let node_id = self.conf.id; println!( "Initializing pageserver node {} at '{}' in {:?}", node_id, @@ -171,6 +173,10 @@ impl PageServerNode { ); io::stdout().flush()?; + if !datadir.exists() { + std::fs::create_dir(&datadir)?; + } + let datadir_path_str = datadir.to_str().with_context(|| { format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}") })?; @@ -201,7 +207,7 @@ impl PageServerNode { let datadir = self.repo_path(); print!( "Starting pageserver node {} at '{}' in {:?}", - self.env.pageserver.id, + self.conf.id, self.pg_connection_config.raw_address(), datadir ); @@ -210,7 +216,7 @@ impl PageServerNode { let datadir_path_str = datadir.to_str().with_context(|| { format!( "Cannot start pageserver node {} in path that has no string representation: {:?}", - self.env.pageserver.id, datadir, + self.conf.id, datadir, ) })?; let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); @@ -254,7 +260,7 @@ impl PageServerNode { // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper // needs a token, and how to generate that token, seems independent to whether // the pageserver requires a token in incoming requests. - Ok(if self.env.pageserver.http_auth_type != AuthType::Trust { + Ok(if self.conf.http_auth_type != AuthType::Trust { // Generate a token to connect from the pageserver to a safekeeper let token = self .env @@ -279,7 +285,7 @@ impl PageServerNode { pub fn page_server_psql_client(&self) -> anyhow::Result { let mut config = self.pg_connection_config.clone(); - if self.env.pageserver.pg_auth_type == AuthType::NeonJWT { + if self.conf.pg_auth_type == AuthType::NeonJWT { let token = self .env .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; @@ -290,7 +296,7 @@ impl PageServerNode { fn http_request(&self, method: Method, url: U) -> anyhow::Result { let mut builder = self.http_client.request(method, url); - if self.env.pageserver.http_auth_type == AuthType::NeonJWT { + if self.conf.http_auth_type == AuthType::NeonJWT { let token = self .env .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?; @@ -316,7 +322,8 @@ impl PageServerNode { pub fn tenant_create( &self, - new_tenant_id: Option, + new_tenant_id: TenantId, + generation: Option, settings: HashMap<&str, &str>, ) -> anyhow::Result { let mut settings = settings.clone(); @@ -382,11 +389,9 @@ impl PageServerNode { .context("Failed to parse 'gc_feedback' as bool")?, }; - // If tenant ID was not specified, generate one - let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate()); - let request = models::TenantCreateRequest { new_tenant_id, + generation, config, }; if !settings.is_empty() { diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 9de5277bf1..e18b0f9176 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -30,7 +30,7 @@ cleanup() { echo "clean up containers if exists" cleanup -for pg_version in 14 15; do +for pg_version in 14 15 16; do echo "start containers (pg_version=$pg_version)." 
PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d
diff --git a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md
new file mode 100644
index 0000000000..2c6b46eabe
--- /dev/null
+++ b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md
@@ -0,0 +1,281 @@
+
+# Crash-Consistent Layer Map Updates By Leveraging `index_part.json`
+
+* Created on: Aug 23, 2023
+* Author: Christian Schwarz
+
+## Summary
+
+This RFC describes a simple scheme to make layer map updates crash consistent by leveraging the `index_part.json` in remote storage.
+Without such a mechanism, crashes can induce certain edge cases in which broadly held assumptions about system invariants don't hold.
+
+## Motivation
+
+### Background
+
+We can currently easily make complex, atomic updates to the layer map by means of an RwLock.
+If we crash or restart the pageserver, we reconstruct the layer map from:
+1. local timeline directory contents
+2. remote `index_part.json` contents.
+
+The function that is responsible for this is called `Timeline::load_layer_map()`.
+The reconciliation process's behavior is the following:
+* local-only files will become part of the layer map as local-only layers and are rescheduled for upload
+* for a file that is, by its name, present both locally and in the remote `index_part.json`, but where the local file has a different size (future: checksum) than the remote file, we will delete the local file and leave the remote file as a `RemoteLayer` in the layer map.
+
+### The Problem
+
+There are cases where we need to make an atomic update to the layer map that involves **more than one layer**.
+The best example is compaction, where we need to insert the L1 layers generated from the L0 layers, and remove the L0 layers.
+As stated above, making the update to the layer map in an atomic way is trivial.
+But there is no system call API to make an atomic update to a directory that involves more than one file rename and deletion.
+Currently, we issue the system calls one by one and hope we don't crash.
+
+What happens if we crash and restart in the middle of that system call sequence?
+We will reconstruct the layer map according to the reconciliation process, taking as input whatever transitory state the timeline directory ended up in.
+
+We cannot roll back or complete the timeline directory update during which we crashed, because we keep no record of the changes we plan to make.
+
+### Problem's Implications For Compaction
+
+The implications of the above are primarily problematic for compaction, specifically the part of it that compacts L0 layers into L1 layers.
+
+Remember that compaction takes a set of L0 layers and reshuffles the delta records in them into L1 layer files.
+Once the L1 layer files are written to disk, it atomically removes the L0 layers from the layer map and adds the L1 layers to the layer map.
+It then deletes the L0 layers locally, and schedules an upload of the L1 layers and an updated index part.
+
+If we crash before deleting L0s, but after writing out L1s, the next compaction after restart will re-digest the L0s and produce new L1s.
+This means the compaction after restart will **overwrite** the previously written L1s.
+Currently, we also schedule an S3 upload of the overwritten L1.
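To make the failure window concrete, here is a minimal sketch of the kind of multi-call directory update compaction has to perform; the helper name, signature, and file naming are illustrative assumptions, not the actual pageserver code:

```rust
use std::fs;
use std::io;
use std::path::Path;

/// Each rename/remove below is atomic on its own, but the sequence as a whole is not:
/// a crash between any two calls leaves the timeline directory in an intermediate
/// state (e.g. new L1s in place, old L0s not yet deleted) that the reconciliation
/// described above will happily load from after restart.
fn apply_compaction_result(
    timeline_dir: &Path,
    new_l1s: &[&str],
    old_l0s: &[&str],
) -> io::Result<()> {
    for &l1 in new_l1s {
        // Move the freshly written L1 from its temporary location into place.
        fs::rename(
            timeline_dir.join(format!("{l1}.temp")),
            timeline_dir.join(l1),
        )?;
    }
    for &l0 in old_l0s {
        // Delete the now-redundant L0s; crash here and they re-appear as compaction
        // input after restart, producing L1s that overwrite the ones renamed above.
        fs::remove_file(timeline_dir.join(l0))?;
    }
    Ok(())
}
```

Any crash-recovery scheme therefore has to tolerate every prefix of this sequence having been applied.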
+
+If the compaction algorithm doesn't change between the two compaction runs, is deterministic, and uses the same set of L0s as input, then the second run will produce identical L1s and the overwrites will go unnoticed.
+
+*However*:
+1. the file size of the overwritten L1s may not be identical, and
+2. the bit pattern of the overwritten L1s may not be identical, and
+3. in the future, we may want to make the compaction code non-deterministic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite.
+
+The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted).
+
+For example, if an unresponsive node A becomes active again after the control plane has relocated the tenant to a new node B, node A may overwrite some L1s.
+But node B based its world view on the version of node A's `index_part.json` from _before_ the overwrite.
+That earlier `index_part.json` contained the file size of the pre-overwrite L1.
+If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1.
+Effectively, the data in the L1 has become inaccessible to node B.
+If node B already uploaded an index part itself, all subsequent attachments will use node B's index part and run into the same problem.
+
+If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems.
+
+In case of (1) and (2), where we know that the logical content of the layers is still the same, we can recover by manually patching the `index_part.json` of the new node to the overwritten L1's file size / checksum.
+
+But if (3) ever happens, the logical content may be different, and we could have truly lost data.
+
+Given the above considerations, we should avoid making the correctness of split-brain protection dependent on overwrites preserving _logical_ layer file contents.
+**It is a much cleaner separation of concerns to require that layer files are truly immutable in S3, i.e., PUT once and then only DELETEd, never overwritten (overPUTted).**
+
+## Design
+
+Instead of reconciling a layer map from local timeline directory contents and the remote index part, this RFC proposes to view the remote index part as authoritative during timeline load.
+Local layer files will be recognized if they match what's listed in the remote index part, and removed otherwise.
+
+During **timeline load**, the only thing that matters is the remote index part content.
+Essentially, timeline load becomes much like attach, except we don't need to prefix-list the remote timelines.
+The local timeline dir's `metadata` file does not matter.
+The layer files in the local timeline dir are seen as a nice-to-have cache of layer files that are in the remote index part.
+Any layer files in the local timeline dir that aren't in the remote index part are removed during startup.
+`Timeline::load_layer_map()` no longer "merges" local timeline dir contents with the remote index part.
+Instead, it treats the remote index part as the authoritative layer map.
+If the local timeline dir contains a layer that is in the remote index part, that's nice, and we'll re-use it if the file size (and, in the future, checksum) matches what's stated in the index part.
+If it doesn't match, we remove the file from the local timeline dir.
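As a concrete illustration of that rule, here is a minimal sketch of the load-time reconciliation, ignoring the `metadata` file and other non-layer entries; the `RemoteIndex` type and the function signature are assumptions for illustration, not the actual pageserver types:

```rust
use std::collections::HashMap;
use std::fs;
use std::io;
use std::path::Path;

/// Illustrative stand-in for the relevant part of `index_part.json`:
/// layer file name -> expected file size in bytes.
type RemoteIndex = HashMap<String, u64>;

/// The remote index part is authoritative: a local layer file is only kept if it is
/// listed there and its size matches; everything else is removed from the timeline dir.
/// Returns the names of the layers we can serve from the local cache.
fn reconcile_local_layers(timeline_dir: &Path, index: &RemoteIndex) -> io::Result<Vec<String>> {
    let mut resident = Vec::new();
    for entry in fs::read_dir(timeline_dir)? {
        let entry = entry?;
        if !entry.file_type()?.is_file() {
            continue;
        }
        let name = entry.file_name().to_string_lossy().into_owned();
        let local_size = entry.metadata()?.len();
        match index.get(&name) {
            // Listed in the index part with a matching size: reuse the local copy.
            Some(&expected_size) if expected_size == local_size => resident.push(name),
            // Not listed, or size mismatch: the remote index part wins, drop the file.
            _ => fs::remove_file(entry.path())?,
        }
    }
    Ok(resident)
}
```

Layers that are listed in the index part but not kept locally simply remain remote and are downloaded on demand later, just like after an attach.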
+
+After load, **at runtime**, nothing changes compared to what we did before this RFC.
+The procedure for single- and multi-object changes is reproduced here for reference:
+* For any new layers that the change adds:
+  * Write them to a temporary location.
+  * While holding the layer map lock:
+    * Move them to the final location.
+    * Insert them into the layer map.
+* Make the S3 changes.
+  We won't reproduce the remote timeline client method calls here because these are subject to change.
+  Instead, we reproduce the sequence of S3 changes that must result for a given single-/multi-object change:
+  * PUT the layer files inserted by the change.
+  * PUT an index part that has the insertions and deletions of the change.
+  * DELETE the layer files that are deleted by the change.
+
+Note that it is safe for the DELETE to be deferred arbitrarily.
+* If it never happens, we leak the object, but that's not a correctness concern.
+* As of #4938, we don't schedule the remote timeline client operation for deletion immediately, but only when we drop the `LayerInner`.
+* With the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919), the deletions will be written to the deletion queue for processing when it's safe to do so (see that RFC for details).
+
+## How This Solves The Problem
+
+If we crash before we've finished the S3 changes, then timeline load will reset the layer map to the state that's in the S3 index part.
+The S3 change sequence above is obviously crash-consistent.
+If we crash before the index part PUT, then we leak the inserted layer files to S3.
+If we crash after the index part PUT, we leak the to-be-DELETEd layer files to S3.
+Leaking is fine; it's a pre-existing condition and not addressed in this RFC.
+
+Multi-object changes that previously created and removed files in the timeline dir are now atomic because the layer map updates are atomic and crash consistent:
+* atomic layer map update at runtime, currently by using an RwLock in write mode
+* atomic `index_part.json` update in S3, as per the guarantee that an S3 PUT is atomic
+* local timeline dir state:
+  * irrelevant for layer map content => irrelevant for atomic updates / crash consistency
+  * if we crash after the index part PUT, local layer files will be used, so no on-demand downloads are needed for them
+  * if we crash before the index part PUT, local layer files will be deleted
+
+## Trade-Offs
+
+### Fundamental
+
+If we crash before finishing the index part PUT, we lose all the work that hasn't reached the S3 `index_part.json`:
+* wal ingest: we lose not-yet-uploaded L0s; load on the **safekeepers** + work for the pageserver
+* compaction: we lose the entire compaction iteration's work and need to re-do it
+* gc: no change to what we have today
+
+If the work is still deemed necessary after restart, the restarted pageserver will re-do this work.
+The amount of work to be re-done is capped by how far the S3 state lags behind the local changes.
+Assuming the upload queue allows for unlimited queue depth (that's what it does today), this means:
+* on-demand downloads that were needed to do the work are likely still present, not lost
+* wal ingest: currently unbounded
+* L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()`
+  * The compaction threshold is 10 L0s and each L0 can be up to 256M in size. The target size for L1 is 128M.
+  * In practice, most L0s are tiny due to the 10-minute `DEFAULT_CHECKPOINT_TIMEOUT`.
+* image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))`
+  * I have no intuition for how expensive / long-running it is in reality.
+* gc: `update_gc_info` work (not substantial, AFAIK)
+
+To limit the amount of lost upload and ingest work, we can limit the upload queue depth (see suggestions in the next sub-section).
+However, to limit the amount of lost CPU work, we would need a way to make the compaction/image-layer-generation algorithms interruptible & resumable.
+We aren't there yet; the need for it is tracked by [#4580](https://github.com/neondatabase/neon/issues/4580).
+However, this RFC is not constraining the design space either.
+
+### Practical
+
+#### Pageserver Restarts
+
+Pageserver crashes are very rare; it would likely be acceptable to re-do the lost work in that case.
+However, regular pageserver restarts happen frequently, e.g., during weekly deploys.
+
+In general, a pageserver restart faces the problem of tenants that "take too long" to shut down.
+They are a problem because other tenants that shut down quickly are unavailable while we wait for the slow tenants to shut down.
+We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per the `pageserver.service` unit file).
+A longer budget would expose tenants that are done early to a longer downtime.
+A shorter budget would risk throwing away more work that'd have to be re-done after restart.
+
+In the context of this RFC, killing the process would mean losing the work that hasn't made it to S3.
+We can mitigate this problem as follows:
+0. initially, by accepting that we need to do the work again
+1. short-term, by introducing measures to cap the amount of in-flight work:
+
+   - cap the upload queue length and use backpressure to slow down compaction
+   - disable compaction/image-layer-generation X minutes before `systemctl restart pageserver`
+   - introduce a read-only shutdown state for tenants that are fast to shut down;
+     that state would be equivalent to the state of a tenant in hot standby / read-only mode.
+
+2. mid-term, by not restarting the pageserver in place, but using [*seamless tenant migration*](https://github.com/neondatabase/neon/pull/5029) to drain a pageserver's tenants before we restart it.
+
+#### `disk_consistent_lsn` can go backwards
+
+`disk_consistent_lsn` can go backwards across restarts if we crash before we've finished the index part PUT.
+Nobody should care about it, because the only thing that matters is `remote_consistent_lsn`.
+Compute certainly doesn't care about `disk_consistent_lsn`.
+
+
+## Side-Effects Of This Design
+
+* The local `metadata` file is basically reduced to a cache of which timelines exist for this tenant; i.e., we can avoid a `ListObjects` request for a tenant's timelines during tenant load.
+
+## Limitations
+
+Multi-object changes that span multiple timelines aren't covered by this RFC.
+That's fine because we currently don't need them, as evidenced by the absence
+of a pageserver operation that holds multiple timelines' layer map locks at a time.
+
+## Impacted components
+
+Primarily pageservers.
+
+Safekeepers will experience more load when we need to re-ingest WAL because we've thrown away work.
+No changes to safekeepers are needed.
+
+## Alternatives considered
+
+### Alternative 1: WAL
+
+We could have a local WAL for timeline dir changes, as proposed in [#4418](https://github.com/neondatabase/neon/issues/4418) and partially implemented in [#4422](https://github.com/neondatabase/neon/pull/4422).
+The WAL would be used to
+1. make multi-object changes atomic
+2. replace `reconcile_with_remote()` reconciliation: scheduling of layer uploads would be part of WAL replay.
+
+The WAL is appealing in a local-first world, but it's much more complex than the design described above:
+* New on-disk state to get right.
+* Forward- and backward-compatibility development costs in the future.
+
+### Alternative 2: Flow Everything Through `index_part.json`
+
+We could have gone to the other extreme and **only** update the layer map whenever we've PUT `index_part.json`.
+I.e., the layer map would always be the last-persisted S3 state.
+That's axiomatically beautiful, not least because it fully separates the layer file production and consumption paths (=> [layer file spreading proposal](https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=4)).
+And it might make hot standbys / read-only pageservers less of a special case in the future.
+
+But I have some uncertainties with regard to WAL ingestion, because it needs to be able to do some reads for the logical size feedback to safekeepers.
+
+And it's silly that we wouldn't be able to use the results of compaction or image layer generation before we're done with the upload.
+
+Lastly, a temporarily clogged-up upload queue (e.g. S3 is down) shouldn't immediately render ingestion unavailable.
+
+### Alternative 3: Sequence Numbers For Layers
+
+Instead of what's proposed in this RFC, we could use unique numbers to identify layer files:
+
+```
+# before
+tenants/$tenant/timelines/$timeline/$key_and_lsn_range
+# after
+tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range
+```
+
+To guarantee uniqueness, the unique number is a sequence number, stored in `index_part.json`.
+
+This alternative does not solve atomic layer map updates.
+In our crash-during-compaction scenario above, the compaction run after the crash would not overwrite the L1s, but write/PUT new files with new sequence numbers.
+In fact, this alternative makes things worse because the data is now duplicated between the old L1s and the newly written L1 layer files.
+We'd need to write a deduplication pass that checks whether perfectly overlapping layers have identical contents.
+
+However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC.
+
+So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3).
+But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact makes them much more acute.
+The proposed design in this RFC addresses both.
+
+So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top.
+That way, we avoid a phase where the crash-during-compaction problem is acute.
+ +## Related issues + +- https://github.com/neondatabase/neon/issues/4749 +- https://github.com/neondatabase/neon/issues/4418 + - https://github.com/neondatabase/neon/pull/4422 +- https://github.com/neondatabase/neon/issues/5077 +- https://github.com/neondatabase/neon/issues/4088 + - (re)resolutions: + - https://github.com/neondatabase/neon/pull/4696 + - https://github.com/neondatabase/neon/pull/4094 + - https://neondb.slack.com/archives/C033QLM5P7D/p1682519017949719 + +Note that the test case introduced in https://github.com/neondatabase/neon/pull/4696/files#diff-13114949d1deb49ae394405d4c49558adad91150ba8a34004133653a8a5aeb76 will produce L1s with the same logical content, but, as outlined in the last paragraph of the _Problem Statement_ section above, we don't want to make that assumption in order to fix the problem. + + +## Implementation Plan + +1. Remove support for `remote_storage=None`, because we now rely on the existence of an index part. + + - The nasty part here is to fix all the tests that fiddle with the local timeline directory. + Possibly they are just irrelevant with this change, but, each case will require inspection. + +2. Implement the design above. + + - Initially, ship without the mitigations for restart and accept we will do some work twice. + - Measure the impact and implement one of the mitigations. + diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs index 418885a1b0..7b133c61af 100644 --- a/libs/consumption_metrics/src/lib.rs +++ b/libs/consumption_metrics/src/lib.rs @@ -3,9 +3,9 @@ //! use chrono::{DateTime, Utc}; use rand::Rng; -use serde::Serialize; +use serde::{Deserialize, Serialize}; -#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] +#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] #[serde(tag = "type")] pub enum EventType { #[serde(rename = "absolute")] @@ -27,7 +27,8 @@ impl EventType { } pub fn incremental_timerange(&self) -> Option>> { - // these can most likely be thought of as Range or RangeFull + // these can most likely be thought of as Range or RangeFull, at least pageserver creates + // incremental ranges where the stop and next start are equal. use EventType::*; match self { Incremental { @@ -41,15 +42,25 @@ impl EventType { pub fn is_incremental(&self) -> bool { matches!(self, EventType::Incremental { .. }) } + + /// Returns the absolute time, or for incremental ranges, the stop time. + pub fn recorded_at(&self) -> &DateTime { + use EventType::*; + + match self { + Absolute { time } => time, + Incremental { stop_time, .. } => stop_time, + } + } } -#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] -pub struct Event { +#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] +pub struct Event { #[serde(flatten)] #[serde(rename = "type")] pub kind: EventType, - pub metric: &'static str, + pub metric: Metric, pub idempotency_key: String, pub value: u64, @@ -58,12 +69,38 @@ pub struct Event { } pub fn idempotency_key(node_id: &str) -> String { - format!( - "{}-{}-{:04}", - Utc::now(), - node_id, - rand::thread_rng().gen_range(0..=9999) - ) + IdempotencyKey::generate(node_id).to_string() +} + +/// Downstream users will use these to detect upload retries. 
+pub struct IdempotencyKey<'a> { + now: chrono::DateTime, + node_id: &'a str, + nonce: u16, +} + +impl std::fmt::Display for IdempotencyKey<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}-{}-{:04}", self.now, self.node_id, self.nonce) + } +} + +impl<'a> IdempotencyKey<'a> { + pub fn generate(node_id: &'a str) -> Self { + IdempotencyKey { + now: Utc::now(), + node_id, + nonce: rand::thread_rng().gen_range(0..=9999), + } + } + + pub fn for_tests(now: DateTime, node_id: &'a str, nonce: u16) -> Self { + IdempotencyKey { + now, + node_id, + nonce, + } + } } pub const CHUNK_SIZE: usize = 1000; diff --git a/libs/pageserver_api/src/control_api.rs b/libs/pageserver_api/src/control_api.rs new file mode 100644 index 0000000000..a54fee47a5 --- /dev/null +++ b/libs/pageserver_api/src/control_api.rs @@ -0,0 +1,52 @@ +//! Types in this file are for pageserver's upward-facing API calls to the control plane, +//! required for acquiring and validating tenant generation numbers. +//! +//! See docs/rfcs/025-generation-numbers.md + +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; +use utils::id::{NodeId, TenantId}; + +#[derive(Serialize, Deserialize)] +pub struct ReAttachRequest { + pub node_id: NodeId, +} + +#[serde_as] +#[derive(Serialize, Deserialize)] +pub struct ReAttachResponseTenant { + #[serde_as(as = "DisplayFromStr")] + pub id: TenantId, + pub generation: u32, +} + +#[derive(Serialize, Deserialize)] +pub struct ReAttachResponse { + pub tenants: Vec, +} + +#[serde_as] +#[derive(Serialize, Deserialize)] +pub struct ValidateRequestTenant { + #[serde_as(as = "DisplayFromStr")] + pub id: TenantId, + pub gen: u32, +} + +#[derive(Serialize, Deserialize)] +pub struct ValidateRequest { + pub tenants: Vec, +} + +#[derive(Serialize, Deserialize)] +pub struct ValidateResponse { + pub tenants: Vec, +} + +#[serde_as] +#[derive(Serialize, Deserialize)] +pub struct ValidateResponseTenant { + #[serde_as(as = "DisplayFromStr")] + pub id: TenantId, + pub valid: bool, +} diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 4890d54f36..d844021785 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -1,6 +1,7 @@ use const_format::formatcp; /// Public API types +pub mod control_api; pub mod models; pub mod reltag; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 2f4c21326e..f354296be2 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -194,10 +194,22 @@ pub struct TimelineCreateRequest { pub struct TenantCreateRequest { #[serde_as(as = "DisplayFromStr")] pub new_tenant_id: TenantId, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub generation: Option, #[serde(flatten)] pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } +#[serde_as] +#[derive(Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantLoadRequest { + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub generation: Option, +} + impl std::ops::Deref for TenantCreateRequest { type Target = TenantConfig; @@ -241,15 +253,6 @@ pub struct StatusResponse { pub id: NodeId, } -impl TenantCreateRequest { - pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest { - TenantCreateRequest { - new_tenant_id, - config: TenantConfig::default(), - } - } -} - #[serde_as] #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] @@ 
-293,9 +296,11 @@ impl TenantConfigRequest { } } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Deserialize)] pub struct TenantAttachRequest { pub config: TenantAttachConfig, + #[serde(default)] + pub generation: Option, } /// Newtype to enforce deny_unknown_fields on TenantConfig for @@ -376,6 +381,8 @@ pub struct TimelineInfo { pub pg_version: u32, pub state: TimelineState, + + pub walreceiver_status: String, } #[derive(Debug, Clone, Serialize)] diff --git a/libs/postgres_ffi/README.md b/libs/postgres_ffi/README.md index de046eb3da..ae949d2da6 100644 --- a/libs/postgres_ffi/README.md +++ b/libs/postgres_ffi/README.md @@ -10,9 +10,11 @@ should be auto-generated too, but that's a TODO. The PostgreSQL on-disk file format is not portable across different CPU architectures and operating systems. It is also subject to change in each major PostgreSQL version. Currently, this module supports -PostgreSQL v14 and v15: bindings and code that depends on them are version-specific. -This code is organized in modules: `postgres_ffi::v14` and `postgres_ffi::v15` -Version independend code is explicitly exported into shared `postgres_ffi`. +PostgreSQL v14, v15 and v16: bindings and code that depends on them are +version-specific. +This code is organized in modules `postgres_ffi::v14`, `postgres_ffi::v15` and +`postgres_ffi::v16`. Version independent code is explicitly exported into +shared `postgres_ffi`. TODO: Currently, there is also some code that deals with WAL records diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index f7e39751ef..8e6761d6d3 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> { PathBuf::from("pg_install") }; - for pg_version in &["v14", "v15"] { + for pg_version in &["v14", "v15", "v16"] { let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); if pg_install_dir_versioned.is_relative() { let cwd = env::current_dir().context("Failed to get current_dir")?; @@ -125,6 +125,7 @@ fn main() -> anyhow::Result<()> { .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") .allowlist_type("PageHeaderData") .allowlist_type("DBState") + .allowlist_type("RelMapFile") // Because structs are used for serialization, tell bindgen to emit // explicit padding fields. .explicit_padding(true) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index cc115664d5..c9e5df9f04 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -51,11 +51,59 @@ macro_rules! for_all_postgres_versions { ($macro:tt) => { $macro!(v14); $macro!(v15); + $macro!(v16); }; } for_all_postgres_versions! { postgres_ffi } +/// dispatch_pgversion +/// +/// Run a code block in a context where the postgres_ffi bindings for a +/// specific (supported) PostgreSQL version are `use`-ed in scope under the pgv +/// identifier. +/// If the provided pg_version is not supported, we panic!(), unless the +/// optional third argument was provided (in which case that code will provide +/// the default handling instead). +/// +/// Use like +/// +/// dispatch_pgversion!(my_pgversion, { pgv::constants::XLOG_DBASE_CREATE }) +/// dispatch_pgversion!(my_pgversion, pgv::constants::XLOG_DBASE_CREATE) +/// +/// Other uses are for macro-internal purposes only and strictly unsupported. +/// +#[macro_export] +macro_rules! 
dispatch_pgversion { + ($version:expr, $code:expr) => { + dispatch_pgversion!($version, $code, panic!("Unknown PostgreSQL version {}", $version)) + }; + ($version:expr, $code:expr, $invalid_pgver_handling:expr) => { + dispatch_pgversion!( + $version => $code, + default = $invalid_pgver_handling, + pgversions = [ + 14 : v14, + 15 : v15, + 16 : v16, + ] + ) + }; + ($pgversion:expr => $code:expr, + default = $default:expr, + pgversions = [$($sv:literal : $vsv:ident),+ $(,)?]) => { + match ($pgversion) { + $($sv => { + use $crate::$vsv as pgv; + $code + },)+ + _ => { + $default + } + } + }; +} + pub mod pg_constants; pub mod relfile_utils; @@ -90,13 +138,7 @@ pub use v14::xlog_utils::XLogFileName; pub use v14::bindings::DBState_DB_SHUTDOWNED; pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result { - match version { - 14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0), - 15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0 - || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0 - || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0), - _ => anyhow::bail!("Unknown version {}", version), - } + dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info))) } pub fn generate_wal_segment( @@ -107,11 +149,11 @@ pub fn generate_wal_segment( ) -> Result { assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE)); - match pg_version { - 14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn), - 15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn), - _ => Err(SerializeError::BadInput), - } + dispatch_pgversion!( + pg_version, + pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn), + Err(SerializeError::BadInput) + ) } pub fn generate_pg_control( @@ -120,11 +162,11 @@ pub fn generate_pg_control( lsn: Lsn, pg_version: u32, ) -> anyhow::Result<(Bytes, u64)> { - match pg_version { - 14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), - 15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), - _ => anyhow::bail!("Unknown version {}", pg_version), - } + dispatch_pgversion!( + pg_version, + pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), + anyhow::bail!("Unknown version {}", pg_version) + ) } // PG timeline is always 1, changing it doesn't have any useful meaning in Neon. @@ -196,8 +238,6 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber { } pub mod waldecoder { - - use crate::{v14, v15}; use bytes::{Buf, Bytes, BytesMut}; use std::num::NonZeroU32; use thiserror::Error; @@ -248,22 +288,17 @@ pub mod waldecoder { } pub fn poll_decode(&mut self) -> Result, WalDecodeError> { - match self.pg_version { - // This is a trick to support both versions simultaneously. - // See WalStreamDecoderHandler comments. 
- 14 => { - use self::v14::waldecoder_handler::WalStreamDecoderHandler; + dispatch_pgversion!( + self.pg_version, + { + use pgv::waldecoder_handler::WalStreamDecoderHandler; self.poll_decode_internal() - } - 15 => { - use self::v15::waldecoder_handler::WalStreamDecoderHandler; - self.poll_decode_internal() - } - _ => Err(WalDecodeError { + }, + Err(WalDecodeError { msg: format!("Unknown version {}", self.pg_version), lsn: self.lsn, - }), - } + }) + ) } } } diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 9c39b46cc1..9690dc0eb6 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -137,9 +137,12 @@ pub const XLOG_HEAP_INSERT: u8 = 0x00; pub const XLOG_HEAP_DELETE: u8 = 0x10; pub const XLOG_HEAP_UPDATE: u8 = 0x20; pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40; +pub const XLOG_HEAP_LOCK: u8 = 0x60; pub const XLOG_HEAP_INIT_PAGE: u8 = 0x80; pub const XLOG_HEAP2_VISIBLE: u8 = 0x40; pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50; +pub const XLOG_HEAP2_LOCK_UPDATED: u8 = 0x60; +pub const XLH_LOCK_ALL_FROZEN_CLEARED: u8 = 0x01; pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8; pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; @@ -163,6 +166,20 @@ pub const RM_HEAP2_ID: u8 = 9; pub const RM_HEAP_ID: u8 = 10; pub const RM_LOGICALMSG_ID: u8 = 21; +// from neon_rmgr.h +pub const RM_NEON_ID: u8 = 134; + +pub const XLOG_NEON_HEAP_INIT_PAGE: u8 = 0x80; + +pub const XLOG_NEON_HEAP_INSERT: u8 = 0x00; +pub const XLOG_NEON_HEAP_DELETE: u8 = 0x10; +pub const XLOG_NEON_HEAP_UPDATE: u8 = 0x20; +pub const XLOG_NEON_HEAP_HOT_UPDATE: u8 = 0x30; +pub const XLOG_NEON_HEAP_LOCK: u8 = 0x40; +pub const XLOG_NEON_HEAP_MULTI_INSERT: u8 = 0x50; + +pub const XLOG_NEON_HEAP_VISIBLE: u8 = 0x40; + // from xlogreader.h pub const XLR_INFO_MASK: u8 = 0x0F; pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs index 810898ee80..32f8f51114 100644 --- a/libs/postgres_ffi/src/pg_constants_v14.rs +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -3,3 +3,8 @@ pub const XLOG_DBASE_DROP: u8 = 0x10; pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ +pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ + +pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { + (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0 +} diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs index 6fa5eb008c..626a23c7ea 100644 --- a/libs/postgres_ffi/src/pg_constants_v15.rs +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -1,10 +1,18 @@ pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; -pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10; pub const XLOG_DBASE_DROP: u8 = 0x20; pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ + +pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ + +pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { + const 
ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; + + (bimg_info & ANY_COMPRESS_FLAG) != 0 +} diff --git a/libs/postgres_ffi/src/pg_constants_v16.rs b/libs/postgres_ffi/src/pg_constants_v16.rs new file mode 100644 index 0000000000..587be71cb3 --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v16.rs @@ -0,0 +1,18 @@ +pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; + +pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10; +pub const XLOG_DBASE_DROP: u8 = 0x20; + +pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ +pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ + +pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */ + +pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { + const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; + + (bimg_info & ANY_COMPRESS_FLAG) != 0 +} diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index d4aed88048..fb627ca258 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -49,9 +49,9 @@ impl Conf { pub fn pg_distrib_dir(&self) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); + #[allow(clippy::manual_range_patterns)] match self.pg_version { - 14 => Ok(path.join(format!("v{}", self.pg_version))), - 15 => Ok(path.join(format!("v{}", self.pg_version))), + 14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))), _ => bail!("Unsupported postgres version: {}", self.pg_version), } } @@ -250,11 +250,18 @@ fn craft_internal( let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?; let last_lsn = match last_lsn { None => client.pg_current_wal_insert_lsn()?, - Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) 
{ - Ordering::Less => bail!("Some records were inserted after the crafted WAL"), - Ordering::Equal => last_lsn, - Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), - }, + Some(last_lsn) => { + let insert_lsn = client.pg_current_wal_insert_lsn()?; + match last_lsn.cmp(&insert_lsn) { + Ordering::Less => bail!( + "Some records were inserted after the crafted WAL: {} vs {}", + last_lsn, + insert_lsn + ), + Ordering::Equal => last_lsn, + Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), + } + } }; if !intermediate_lsns.starts_with(&[initial_lsn]) { intermediate_lsns.insert(0, initial_lsn); @@ -363,8 +370,9 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { ); ensure!( u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, - "XLOG_SWITCH message ended not on page boundary: {}", - after_xlog_switch + "XLOG_SWITCH message ended not on page boundary: {}, offset = {}", + after_xlog_switch, + u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ ); Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 809fa5fffd..47faad363f 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -959,7 +959,7 @@ mod tests { let make_params = |options| StartupMessageParams::new([("options", options)]); let params = StartupMessageParams::new([]); - assert!(matches!(params.options_escaped(), None)); + assert!(params.options_escaped().is_none()); let params = make_params(""); assert!(split_options(¶ms).is_empty()); diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index f1095ad8b8..5040183045 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -148,21 +148,55 @@ impl RemoteStorage for LocalFs { Some(folder) => folder.with_base(&self.storage_root), None => self.storage_root.clone(), }; - let mut files = vec![]; - let mut directory_queue = vec![full_path.clone()]; + // If we were given a directory, we may use it as our starting point. + // Otherwise, we must go up to the parent directory. This is because + // S3 object list prefixes can be arbitrary strings, but when reading + // the local filesystem we need a directory to start calling read_dir on. + let mut initial_dir = full_path.clone(); + match fs::metadata(full_path.clone()).await { + Ok(meta) => { + if !meta.is_dir() { + // It's not a directory: strip back to the parent + initial_dir.pop(); + } + } + Err(e) if e.kind() == ErrorKind::NotFound => { + // It's not a file that exists: strip the prefix back to the parent directory + initial_dir.pop(); + } + Err(e) => { + // Unexpected I/O error + anyhow::bail!(e) + } + } + + // Note that PathBuf starts_with only considers full path segments, but + // object prefixes are arbitrary strings, so we need the strings for doing + // starts_with later. + let prefix = full_path.to_string_lossy(); + + let mut files = vec![]; + let mut directory_queue = vec![initial_dir.clone()]; while let Some(cur_folder) = directory_queue.pop() { let mut entries = fs::read_dir(cur_folder.clone()).await?; while let Some(entry) = entries.next_entry().await? 
{ let file_name: PathBuf = entry.file_name().into(); let full_file_name = cur_folder.clone().join(&file_name); - let file_remote_path = self.local_file_to_relative_path(full_file_name.clone()); - files.push(file_remote_path.clone()); - if full_file_name.is_dir() { - directory_queue.push(full_file_name); + if full_file_name + .to_str() + .map(|s| s.starts_with(prefix.as_ref())) + .unwrap_or(false) + { + let file_remote_path = self.local_file_to_relative_path(full_file_name.clone()); + files.push(file_remote_path.clone()); + if full_file_name.is_dir() { + directory_queue.push(full_file_name); + } } } } + Ok(files) } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 9d05fa32b3..9262f1e88f 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -573,7 +573,7 @@ mod tests { #[test] fn relative_path() { - let all_paths = vec!["", "some/path", "some/path/"]; + let all_paths = ["", "some/path", "some/path/"]; let all_paths: Vec = all_paths .iter() .map(|x| RemotePath::new(Path::new(x)).expect("bad path")) diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index 92cd164b7d..316ec8ed0d 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -9,11 +9,12 @@ PORT=$4 SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-) rm -fr "$DATA_DIR" env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID" -echo port="$PORT" >> "$DATA_DIR"/postgresql.conf +echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf +echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-) declare -i WAL_SIZE=$REDO_POS+114 -"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start -"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate +"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start +"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate cp "$DATA_DIR"/pg_wal/000000010000000000000001 . cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/ for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 87c6361255..163c8c0467 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -53,6 +53,7 @@ impl Generation { matches!(self, Self::None) } + #[track_caller] pub fn get_suffix(&self) -> String { match self { Self::Valid(v) => { @@ -64,6 +65,30 @@ impl Generation { } } } + + /// `suffix` is the part after "-" in a key + /// + /// Returns None if parsing was unsuccessful + pub fn parse_suffix(suffix: &str) -> Option { + u32::from_str_radix(suffix, 16).map(Generation::new).ok() + } + + #[track_caller] + pub fn previous(&self) -> Generation { + match self { + Self::Valid(n) => { + if *n == 0 { + // Since a tenant may be upgraded from a pre-generations state, interpret the "previous" generation + // to 0 as being "no generation". 
+ Self::None + } else { + Self::Valid(n - 1) + } + } + Self::None => Self::None, + Self::Broken => panic!("Attempted to use a broken generation"), + } + } } impl Serialize for Generation { diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 4f529d16fd..3254fa4501 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -315,12 +315,8 @@ impl CgroupWatcher { where E: Stream>, { - // There are several actions might do when receiving a `memory.high`, - // such as freezing the cgroup, or increasing its `memory.high`. We don't - // want to do these things too often (because postgres needs to run, and - // we only have so much memory). These timers serve as rate limits for this. let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO)); - let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO)); + let mut last_memory_high_increase_at: Option = None; let mut events = pin!(events); // Are we waiting to be upscaled? Could be true if we request upscale due @@ -332,6 +328,8 @@ impl CgroupWatcher { upscale = upscales.recv() => { let Sequenced { seqnum, data } = upscale .context("failed to listen on upscale notification channel")?; + waiting_on_upscale = false; + last_memory_high_increase_at = None; self.last_upscale_seqnum.store(seqnum, Ordering::Release); info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale"); } @@ -396,12 +394,17 @@ impl CgroupWatcher { .send(()) .await .context("failed to request upscale")?; + waiting_on_upscale = true; continue; } // Shoot, we can't freeze or and we're still waiting on upscale, // increase memory.high to reduce throttling - if wait_to_increase_memory_high.is_elapsed() { + let can_increase_memory_high = match last_memory_high_increase_at { + None => true, + Some(t) => t.elapsed() > self.config.memory_high_increase_every, + }; + if can_increase_memory_high { info!( "received memory.high event, \ but too soon to refreeze and already requested upscale \ @@ -437,12 +440,11 @@ impl CgroupWatcher { ); self.set_high_bytes(new_high) .context("failed to set memory.high")?; - wait_to_increase_memory_high - .as_mut() - .reset(Instant::now() + self.config.memory_high_increase_every) + last_memory_high_increase_at = Some(Instant::now()); + continue; } - // we can't do anything + info!("received memory.high event, but can't do anything"); } }; } @@ -559,14 +561,7 @@ impl CgroupWatcher { /// Setting these values also affects the thresholds for receiving usage alerts. #[derive(Debug)] pub struct MemoryLimits { - high: u64, - max: u64, -} - -impl MemoryLimits { - pub fn new(high: u64, max: u64) -> Self { - Self { max, high } - } + pub high: u64, } // Methods for manipulating the actual cgroup @@ -643,12 +638,7 @@ impl CgroupWatcher { /// Set cgroup memory.high and memory.max. pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> { - info!( - limits.high, - limits.max, - path = self.path(), - "writing new memory limits", - ); + info!(limits.high, path = self.path(), "writing new memory limits",); self.memory() .context("failed to get memory subsystem while setting memory limits")? 
.set_mem(cgroups_rs::memory::SetMemory { @@ -657,7 +647,7 @@ impl CgroupWatcher { high: Some(MaxValue::Value( u64::min(limits.high, i64::MAX as u64) as i64 )), - max: Some(MaxValue::Value(u64::min(limits.max, i64::MAX as u64) as i64)), + max: None, }) .context("failed to set memory limits") } @@ -665,7 +655,7 @@ impl CgroupWatcher { /// Given some amount of available memory, set the desired cgroup memory limits pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> { let new_high = self.config.calculate_memory_high_value(available_memory); - let limits = MemoryLimits::new(new_high, available_memory); + let limits = MemoryLimits { high: new_high }; info!( path = self.path(), memory = ?limits, diff --git a/libs/vm_monitor/src/lib.rs b/libs/vm_monitor/src/lib.rs index 2eee31c965..1cbfdc6ba6 100644 --- a/libs/vm_monitor/src/lib.rs +++ b/libs/vm_monitor/src/lib.rs @@ -178,14 +178,17 @@ pub async fn ws_handler( /// Starts the monitor. If startup fails or the monitor exits, an error will /// be logged and our internal state will be reset to allow for new connections. -#[tracing::instrument(skip_all, fields(?args))] +#[tracing::instrument(skip_all)] async fn start_monitor( ws: WebSocket, args: &Args, kill: broadcast::Receiver<()>, token: CancellationToken, ) { - info!("accepted new websocket connection -> starting monitor"); + info!( + ?args, + "accepted new websocket connection -> starting monitor" + ); let timeout = Duration::from_secs(4); let monitor = tokio::time::timeout( timeout, diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index 82055fda2e..376017d784 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -5,6 +5,7 @@ //! all functionality. use std::sync::Arc; +use std::time::{Duration, Instant}; use std::{fmt::Debug, mem}; use anyhow::{bail, Context}; @@ -36,6 +37,8 @@ pub struct Runner { /// by us vs the autoscaler-agent. counter: usize, + last_upscale_request_at: Option, + /// A signal to kill the main thread produced by `self.run()`. This is triggered /// when the server receives a new connection. When the thread receives the /// signal off this channel, it will gracefully shutdown. @@ -99,6 +102,7 @@ impl Runner { cgroup: None, dispatcher, counter: 1, // NB: must be odd, see the comment about the field for more. + last_upscale_request_at: None, kill, }; @@ -253,12 +257,11 @@ impl Runner { new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory); } - let limits = MemoryLimits::new( + let limits = MemoryLimits { // new_cgroup_mem_high is initialized to 0 but it is guarancontextd to not be here // since it is properly initialized in the previous cgroup if let block - new_cgroup_mem_high, - available_memory, - ); + high: new_cgroup_mem_high, + }; cgroup .set_limits(&limits) .context("failed to set cgroup memory limits")?; @@ -324,7 +327,9 @@ impl Runner { name = cgroup.path(), "updating cgroup memory.high", ); - let limits = MemoryLimits::new(new_cgroup_mem_high, available_memory); + let limits = MemoryLimits { + high: new_cgroup_mem_high, + }; cgroup .set_limits(&limits) .context("failed to set file cache size")?; @@ -397,6 +402,20 @@ impl Runner { if request.is_none() { bail!("failed to listen for upscale event from cgroup") } + + // If it's been less than 1 second since the last time we requested upscaling, + // ignore the event, to avoid spamming the agent (otherwise, this can happen + // ~1k times per second). 
+ if let Some(t) = self.last_upscale_request_at { + let elapsed = t.elapsed(); + if elapsed < Duration::from_secs(1) { + info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring"); + continue; + } + } + + self.last_upscale_request_at = Some(Instant::now()); + info!("cgroup asking for upscale; forwarding request"); self.counter += 2; // Increment, preserving parity (i.e. keep the // counter odd). See the field comment for more. diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index bbdd8b1e99..9cb71dea09 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -80,11 +80,11 @@ enum-map.workspace = true enumset.workspace = true strum.workspace = true strum_macros.workspace = true +tempfile.workspace = true [dev-dependencies] criterion.workspace = true hex-literal.workspace = true -tempfile.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } [[bench]] diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 29bd6ce598..de7b4861cb 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -3,6 +3,9 @@ //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data. use anyhow::Result; +use pageserver::context::{DownloadBehavior, RequestContext}; +use pageserver::task_mgr::TaskKind; +use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::Range; @@ -95,9 +98,9 @@ pub(crate) fn parse_filename(name: &str) -> Option { } // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH" -async fn get_holes(path: &Path, max_holes: usize) -> Result> { - let file = FileBlockReader::new(VirtualFile::open(path)?); - let summary_blk = file.read_blk(0).await?; +async fn get_holes(path: &Path, max_holes: usize, ctx: &RequestContext) -> Result> { + let file = FileBlockReader::new(VirtualFile::open(path).await?); + let summary_blk = file.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( actual_summary.index_start_blk, @@ -124,6 +127,7 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result> { prev_key = Some(curr.next()); true }, + ctx, ) .await?; let mut holes = heap.into_vec(); @@ -134,6 +138,7 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result> { pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let storage_path = &cmd.path; let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES); + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. pageserver::virtual_file::init(10); @@ -142,12 +147,12 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let mut total_delta_layers = 0usize; let mut total_image_layers = 0usize; let mut total_excess_layers = 0usize; - for tenant in fs::read_dir(storage_path.join("tenants"))? { + for tenant in fs::read_dir(storage_path.join(TENANTS_SEGMENT_NAME))? 
{ let tenant = tenant?; if !tenant.file_type()?.is_dir() { continue; } - for timeline in fs::read_dir(tenant.path().join("timelines"))? { + for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? { let timeline = timeline?; if !timeline.file_type()?.is_dir() { continue; @@ -162,7 +167,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { parse_filename(&layer.file_name().into_string().unwrap()) { if layer_file.is_delta { - layer_file.holes = get_holes(&layer.path(), max_holes).await?; + layer_file.holes = get_holes(&layer.path(), max_holes, &ctx).await?; n_deltas += 1; } layers.push(layer_file); diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 2af54902f7..e8d16d31f1 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -2,9 +2,12 @@ use std::path::{Path, PathBuf}; use anyhow::Result; use clap::Subcommand; +use pageserver::context::{DownloadBehavior, RequestContext}; +use pageserver::task_mgr::TaskKind; use pageserver::tenant::block_io::BlockCursor; use pageserver::tenant::disk_btree::DiskBtreeReader; use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary}; +use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::{page_cache, virtual_file}; use pageserver::{ repository::{Key, KEY_SIZE}, @@ -43,12 +46,12 @@ pub(crate) enum LayerCmd { }, } -async fn read_delta_file(path: impl AsRef) -> Result<()> { +async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = path.as_ref(); virtual_file::init(10); page_cache::init(100); - let file = FileBlockReader::new(VirtualFile::open(path)?); - let summary_blk = file.read_blk(0).await?; + let file = FileBlockReader::new(VirtualFile::open(path).await?); + let summary_blk = file.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( actual_summary.index_start_blk, @@ -66,11 +69,12 @@ async fn read_delta_file(path: impl AsRef) -> Result<()> { all.push((curr, BlobRef(value_offset))); true }, + ctx, ) .await?; - let cursor = BlockCursor::new_fileblockreader_virtual(&file); + let cursor = BlockCursor::new_fileblockreader(&file); for (k, v) in all { - let value = cursor.read_blob(v.pos()).await?; + let value = cursor.read_blob(v.pos(), ctx).await?; println!("key:{} value_len:{}", k, value.len()); } // TODO(chi): special handling for last key? @@ -78,15 +82,16 @@ async fn read_delta_file(path: impl AsRef) -> Result<()> { } pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); match cmd { LayerCmd::List { path } => { - for tenant in fs::read_dir(path.join("tenants"))? { + for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? { let tenant = tenant?; if !tenant.file_type()?.is_dir() { continue; } println!("tenant {}", tenant.file_name().to_string_lossy()); - for timeline in fs::read_dir(tenant.path().join("timelines"))? { + for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? { let timeline = timeline?; if !timeline.file_type()?.is_dir() { continue; @@ -101,9 +106,9 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { timeline, } => { let timeline_path = path - .join("tenants") + .join(TENANTS_SEGMENT_NAME) .join(tenant) - .join("timelines") + .join(TIMELINES_SEGMENT_NAME) .join(timeline); let mut idx = 0; for layer in fs::read_dir(timeline_path)? 
{ @@ -152,7 +157,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { ); if layer_file.is_delta { - read_delta_file(layer.path()).await?; + read_delta_file(layer.path(), &ctx).await?; } else { anyhow::bail!("not supported yet :("); } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index d2dc759835..a959f1cddc 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -25,6 +25,7 @@ use crate::context::RequestContext; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::dispatch_pgversion; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; @@ -323,14 +324,25 @@ where .timeline .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx) .await?; - ensure!(img.len() == 512); + + ensure!( + img.len() + == dispatch_pgversion!( + self.timeline.pg_version, + pgv::bindings::SIZEOF_RELMAPFILE + ) + ); + Some(img) } else { None }; if spcnode == GLOBALTABLESPACE_OID { - let pg_version_str = self.timeline.pg_version.to_string(); + let pg_version_str = match self.timeline.pg_version { + 14 | 15 => self.timeline.pg_version.to_string(), + ver => format!("{ver}\x0A"), + }; let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; self.ar.append(&header, pg_version_str.as_bytes()).await?; @@ -374,7 +386,10 @@ where if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); - let pg_version_str = self.timeline.pg_version.to_string(); + let pg_version_str = match self.timeline.pg_version { + 14 | 15 => self.timeline.pg_version.to_string(), + ver => format!("{ver}\x0A"), + }; let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; self.ar.append(&header, pg_version_str.as_bytes()).await?; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 71e3a0ff3f..b6a2117f9c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -388,6 +388,7 @@ fn start_pageserver( remote_storage: remote_storage.clone(), }, order, + shutdown_pageserver.clone(), ))?; BACKGROUND_RUNTIME.spawn({ @@ -476,16 +477,19 @@ fn start_pageserver( { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); - let router = http::make_router( - conf, - launch_ts, - http_auth, - broker_client.clone(), - remote_storage, - disk_usage_eviction_state, - )? - .build() - .map_err(|err| anyhow!(err))?; + let router_state = Arc::new( + http::routes::State::new( + conf, + http_auth.clone(), + remote_storage, + broker_client.clone(), + disk_usage_eviction_state, + ) + .context("Failed to initialize router state")?, + ); + let router = http::make_router(router_state, launch_ts, http_auth.clone())? + .build() + .map_err(|err| anyhow!(err))?; let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? .serve(service) @@ -514,6 +518,9 @@ fn start_pageserver( // creates a child context with the right DownloadBehavior. 
DownloadBehavior::Error, ); + + let local_disk_storage = conf.workdir.join("last_consumption_metrics.json"); + task_mgr::spawn( crate::BACKGROUND_RUNTIME.handle(), TaskKind::MetricsCollection, @@ -540,6 +547,7 @@ fn start_pageserver( conf.cached_metric_collection_interval, conf.synthetic_size_calculation_interval, conf.id, + local_disk_storage, metrics_ctx, ) .instrument(info_span!("metrics_collection")) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 5394f17398..8ee7f28c11 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -32,7 +32,8 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::{ - TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, + TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, + TIMELINES_SEGMENT_NAME, }; use crate::{ IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, @@ -63,7 +64,7 @@ pub mod defaults { super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; - pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour"; + pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; @@ -72,7 +73,7 @@ pub mod defaults { /// Default built-in configuration file. /// pub const DEFAULT_CONFIG_FILE: &str = formatcp!( - r###" + r#" # Initial configuration file created by 'pageserver --init' #listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}' #listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' @@ -117,7 +118,7 @@ pub mod defaults { [remote_storage] -"### +"# ); } @@ -204,6 +205,8 @@ pub struct PageServerConf { /// has it's initial logical size calculated. Not running background tasks for some seconds is /// not terrible. 
pub background_task_maximum_delay: Duration, + + pub control_plane_api: Option, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -278,6 +281,8 @@ struct PageServerConfigBuilder { ondemand_download_behavior_treat_error_as_warn: BuilderValue, background_task_maximum_delay: BuilderValue, + + control_plane_api: BuilderValue>, } impl Default for PageServerConfigBuilder { @@ -340,6 +345,8 @@ impl Default for PageServerConfigBuilder { DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, ) .unwrap()), + + control_plane_api: Set(None), } } } @@ -468,6 +475,10 @@ impl PageServerConfigBuilder { self.background_task_maximum_delay = BuilderValue::Set(delay); } + pub fn control_plane_api(&mut self, api: Url) { + self.control_plane_api = BuilderValue::Set(Some(api)) + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_size_logical_size_queries = self .concurrent_tenant_size_logical_size_queries @@ -553,6 +564,9 @@ impl PageServerConfigBuilder { background_task_maximum_delay: self .background_task_maximum_delay .ok_or(anyhow!("missing background_task_maximum_delay"))?, + control_plane_api: self + .control_plane_api + .ok_or(anyhow!("missing control_plane_api"))?, }) } } @@ -563,7 +577,7 @@ impl PageServerConf { // pub fn tenants_path(&self) -> PathBuf { - self.workdir.join("tenants") + self.workdir.join(TENANTS_SEGMENT_NAME) } pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf { @@ -654,26 +668,18 @@ impl PageServerConf { pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); + #[allow(clippy::manual_range_patterns)] match pg_version { - 14 => Ok(path.join(format!("v{pg_version}"))), - 15 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { - match pg_version { - 14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), - 15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), - _ => bail!("Unsupported postgres version: {}", pg_version), - } + Ok(self.pg_distrib_dir(pg_version)?.join("bin")) } pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { - match pg_version { - 14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), - 15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), - _ => bail!("Unsupported postgres version: {}", pg_version), - } + Ok(self.pg_distrib_dir(pg_version)?.join("lib")) } /// Parse a configuration file (pageserver.toml) into a PageServerConf struct, @@ -741,6 +747,7 @@ impl PageServerConf { }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?), + "control_plane_api" => builder.control_plane_api(parse_toml_string(key, item)?.parse().context("failed to parse control plane URL")?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -909,6 +916,7 @@ impl PageServerConf { test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, background_task_maximum_delay: Duration::ZERO, + control_plane_api: None, } } } @@ -1132,6 +1140,7 @@ background_task_maximum_delay = '334 s' background_task_maximum_delay: humantime::parse_duration( defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY )?, + control_plane_api: None }, "Correct defaults should be used when no config values are provided" ); @@ -1187,6 
+1196,7 @@ background_task_maximum_delay = '334 s' test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, background_task_maximum_delay: Duration::from_secs(334), + control_plane_api: None }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index e4284b9e9c..5f64bb2b3b 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,188 +1,54 @@ -//! //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. -//! Cache metrics to send only the updated ones. -//! use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::{mgr, LogicalSizeCalculationCause}; -use anyhow; -use chrono::{DateTime, Utc}; -use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; +use consumption_metrics::EventType; use pageserver_api::models::TenantState; use reqwest::Url; -use serde::Serialize; -use serde_with::{serde_as, DisplayFromStr}; use std::collections::HashMap; +use std::path::PathBuf; use std::sync::Arc; use std::time::{Duration, SystemTime}; use tracing::*; -use utils::id::{NodeId, TenantId, TimelineId}; -use utils::lsn::Lsn; +use utils::id::NodeId; + +mod metrics; +use metrics::MetricsKey; +mod disk_cache; +mod upload; const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); -#[serde_as] -#[derive(Serialize, Debug, Clone, Copy)] -struct Ids { - #[serde_as(as = "DisplayFromStr")] - tenant_id: TenantId, - #[serde_as(as = "Option")] - #[serde(skip_serializing_if = "Option::is_none")] - timeline_id: Option, -} +/// Basically a key-value pair, but usually in a Vec except for [`Cache`]. +/// +/// This is as opposed to `consumption_metrics::Event` which is the externally communicated form. +/// Difference is basically the missing idempotency key, which lives only for the duration of +/// upload attempts. +type RawMetric = (MetricsKey, (EventType, u64)); -/// Key that uniquely identifies the object, this metric describes. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -struct MetricsKey { - tenant_id: TenantId, - timeline_id: Option, - metric: &'static str, -} - -impl MetricsKey { - const fn absolute_values(self) -> AbsoluteValueFactory { - AbsoluteValueFactory(self) - } - const fn incremental_values(self) -> IncrementalValueFactory { - IncrementalValueFactory(self) - } -} - -/// Helper type which each individual metric kind can return to produce only absolute values. -struct AbsoluteValueFactory(MetricsKey); - -impl AbsoluteValueFactory { - fn at(self, time: DateTime, val: u64) -> (MetricsKey, (EventType, u64)) { - let key = self.0; - (key, (EventType::Absolute { time }, val)) - } -} - -/// Helper type which each individual metric kind can return to produce only incremental values. 
-struct IncrementalValueFactory(MetricsKey); - -impl IncrementalValueFactory { - #[allow(clippy::wrong_self_convention)] - fn from_previous_up_to( - self, - prev_end: DateTime, - up_to: DateTime, - val: u64, - ) -> (MetricsKey, (EventType, u64)) { - let key = self.0; - // cannot assert prev_end < up_to because these are realtime clock based - ( - key, - ( - EventType::Incremental { - start_time: prev_end, - stop_time: up_to, - }, - val, - ), - ) - } - - fn key(&self) -> &MetricsKey { - &self.0 - } -} - -// the static part of a MetricsKey -impl MetricsKey { - /// Absolute value of [`Timeline::get_last_record_lsn`]. - /// - /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn - const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory { - MetricsKey { - tenant_id, - timeline_id: Some(timeline_id), - metric: "written_size", - } - .absolute_values() - } - - /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we - /// previously sent, starting from the previously sent incremental time range ending at the - /// latest absolute measurement. - const fn written_size_delta( - tenant_id: TenantId, - timeline_id: TimelineId, - ) -> IncrementalValueFactory { - MetricsKey { - tenant_id, - timeline_id: Some(timeline_id), - // the name here is correctly about data not size, because that is what is wanted by - // downstream pipeline - metric: "written_data_bytes_delta", - } - .incremental_values() - } - - /// Exact [`Timeline::get_current_logical_size`]. - /// - /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size - const fn timeline_logical_size( - tenant_id: TenantId, - timeline_id: TimelineId, - ) -> AbsoluteValueFactory { - MetricsKey { - tenant_id, - timeline_id: Some(timeline_id), - metric: "timeline_logical_size", - } - .absolute_values() - } - - /// [`Tenant::remote_size`] - /// - /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size - const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory { - MetricsKey { - tenant_id, - timeline_id: None, - metric: "remote_storage_size", - } - .absolute_values() - } - - /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`. - /// - /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size - const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory { - MetricsKey { - tenant_id, - timeline_id: None, - metric: "resident_size", - } - .absolute_values() - } - - /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`]. - /// - /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size - const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory { - MetricsKey { - tenant_id, - timeline_id: None, - metric: "synthetic_storage_size", - } - .absolute_values() - } -} +/// Caches the [`RawMetric`]s +/// +/// In practice, during startup, last sent values are stored here to be used in calculating new +/// ones. After successful uploading, the cached values are updated to cache. This used to be used +/// for deduplication, but that is no longer needed. 
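+///
+/// Conceptually each entry holds the last value reported for one metric. A sketch, using the
+/// [`MetricsKey`] factories from the `metrics` submodule (`tenant_id`, `timeline_id` and `cache`
+/// are placeholder bindings, not part of this module):
+///
+/// ```ignore
+/// // remember the last reported written_size for one timeline, stored as an absolute event
+/// let (key, value) = MetricsKey::written_size(tenant_id, timeline_id).at(Utc::now(), 0);
+/// cache.insert(key, value);
+/// ```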
+type Cache = HashMap; /// Main thread that serves metrics collection pub async fn collect_metrics( metric_collection_endpoint: &Url, metric_collection_interval: Duration, - cached_metric_collection_interval: Duration, + _cached_metric_collection_interval: Duration, synthetic_size_calculation_interval: Duration, node_id: NodeId, + local_disk_storage: PathBuf, ctx: RequestContext, ) -> anyhow::Result<()> { - let mut ticker = tokio::time::interval(metric_collection_interval); - info!("starting collect_metrics"); + if _cached_metric_collection_interval != Duration::ZERO { + tracing::warn!( + "cached_metric_collection_interval is no longer used, please set it to zero." + ) + } // spin up background worker that caclulates tenant sizes let worker_ctx = @@ -202,543 +68,216 @@ pub async fn collect_metrics( }, ); + let path: Arc = Arc::new(local_disk_storage); + + let cancel = task_mgr::shutdown_token(); + + let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval); + + let mut cached_metrics = tokio::select! { + _ = cancel.cancelled() => return Ok(()), + ret = restore_and_reschedule => ret, + }; + // define client here to reuse it for all requests let client = reqwest::ClientBuilder::new() .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT) .build() .expect("Failed to create http client with timeout"); - let mut cached_metrics = HashMap::new(); - let mut prev_iteration_time: std::time::Instant = std::time::Instant::now(); - - loop { - tokio::select! { - _ = task_mgr::shutdown_watcher() => { - info!("collect_metrics received cancellation request"); - return Ok(()); - }, - tick_at = ticker.tick() => { - - // send cached metrics every cached_metric_collection_interval - let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval; - - if send_cached { - prev_iteration_time = std::time::Instant::now(); - } - - collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await; - - crate::tenant::tasks::warn_when_period_overrun( - tick_at.elapsed(), - metric_collection_interval, - "consumption_metrics_collect_metrics", - ); - } - } - } -} - -/// One iteration of metrics collection -/// -/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`. -/// Cache metrics to avoid sending the same metrics multiple times. -/// -/// This function handles all errors internally -/// and doesn't break iteration if just one tenant fails. -/// -/// TODO -/// - refactor this function (chunking+sending part) to reuse it in proxy module; -async fn collect_metrics_iteration( - client: &reqwest::Client, - cached_metrics: &mut HashMap, - metric_collection_endpoint: &reqwest::Url, - node_id: NodeId, - ctx: &RequestContext, - send_cached: bool, -) { - let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new(); - trace!( - "starting collect_metrics_iteration. 
metric_collection_endpoint: {}", - metric_collection_endpoint - ); - - // get list of tenants - let tenants = match mgr::list_tenants().await { - Ok(tenants) => tenants, - Err(err) => { - error!("failed to list tenants: {:?}", err); - return; - } - }; - - // iterate through list of Active tenants and collect metrics - for (tenant_id, tenant_state) in tenants { - if tenant_state != TenantState::Active { - continue; - } - - let tenant = match mgr::get_tenant(tenant_id, true).await { - Ok(tenant) => tenant, - Err(err) => { - // It is possible that tenant was deleted between - // `list_tenants` and `get_tenant`, so just warn about it. - warn!("failed to get tenant {tenant_id:?}: {err:?}"); - continue; - } - }; - - let mut tenant_resident_size = 0; - - // iterate through list of timelines in tenant - for timeline in tenant.list_timelines() { - // collect per-timeline metrics only for active timelines - - let timeline_id = timeline.timeline_id; - - match TimelineSnapshot::collect(&timeline, ctx) { - Ok(Some(snap)) => { - snap.to_metrics( - tenant_id, - timeline_id, - Utc::now(), - &mut current_metrics, - cached_metrics, - ); - } - Ok(None) => {} - Err(e) => { - error!( - "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}", - timeline.timeline_id - ); - continue; - } - } - - tenant_resident_size += timeline.resident_physical_size(); - } - - current_metrics - .push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size())); - - current_metrics - .push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size)); - - // Note that this metric is calculated in a separate bgworker - // Here we only use cached value, which may lag behind the real latest one - let synthetic_size = tenant.cached_synthetic_size(); - - if synthetic_size != 0 { - // only send non-zeroes because otherwise these show up as errors in logs - current_metrics - .push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size)); - } - } - - // Filter metrics, unless we want to send all metrics, including cached ones. - // See: https://github.com/neondatabase/neon/issues/3485 - if !send_cached { - current_metrics.retain(|(curr_key, (kind, curr_val))| { - if kind.is_incremental() { - // incremental values (currently only written_size_delta) should not get any cache - // deduplication because they will be used by upstream for "is still alive." - true - } else { - match cached_metrics.get(curr_key) { - Some((_, val)) => val != curr_val, - None => true, - } - } - }); - } - - if current_metrics.is_empty() { - trace!("no new metrics to send"); - return; - } - - // Send metrics. - // Split into chunks of 1000 metrics to avoid exceeding the max request size - let chunks = current_metrics.chunks(CHUNK_SIZE); - - let mut chunk_to_send: Vec> = Vec::with_capacity(CHUNK_SIZE); let node_id = node_id.to_string(); - for chunk in chunks { - chunk_to_send.clear(); + // reminder: ticker is ready immediatedly + let mut ticker = tokio::time::interval(metric_collection_interval); - // enrich metrics with type,timestamp and idempotency key before sending - chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event { - kind: *when, - metric: curr_key.metric, - idempotency_key: idempotency_key(&node_id), - value: *curr_val, - extra: Ids { - tenant_id: curr_key.tenant_id, - timeline_id: curr_key.timeline_id, - }, - })); + loop { + let tick_at = tokio::select! 
{ + _ = cancel.cancelled() => return Ok(()), + tick_at = ticker.tick() => tick_at, + }; - const MAX_RETRIES: u32 = 3; + // these are point in time, with variable "now" + let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await; - for attempt in 0..MAX_RETRIES { - let res = client - .post(metric_collection_endpoint.clone()) - .json(&EventChunk { - events: (&chunk_to_send).into(), - }) - .send() - .await; + if metrics.is_empty() { + continue; + } - match res { - Ok(res) => { - if res.status().is_success() { - // update cached metrics after they were sent successfully - for (curr_key, curr_val) in chunk.iter() { - cached_metrics.insert(curr_key.clone(), *curr_val); - } - } else { - error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk_to_send - .iter() - .filter(|metric| metric.value > (1u64 << 40)) - { - // Report if the metric value is suspiciously large - error!("potentially abnormal metric value: {:?}", metric); - } - } - break; + let metrics = Arc::new(metrics); + + // why not race cancellation here? because we are one of the last tasks, and if we are + // already here, better to try to flush the new values. + + let flush = async { + match disk_cache::flush_metrics_to_disk(&metrics, &path).await { + Ok(()) => { + tracing::debug!("flushed metrics to disk"); } - Err(err) if err.is_timeout() => { - error!(attempt, "timeout sending metrics, retrying immediately"); - continue; - } - Err(err) => { - error!(attempt, ?err, "failed to send metrics"); - break; + Err(e) => { + // idea here is that if someone creates a directory as our path, then they + // might notice it from the logs before shutdown and remove it + tracing::error!("failed to persist metrics to {path:?}: {e:#}"); } } + }; + + let upload = async { + let res = upload::upload_metrics( + &client, + metric_collection_endpoint, + &cancel, + &node_id, + &metrics, + &mut cached_metrics, + ) + .await; + if let Err(e) = res { + // serialization error which should never happen + tracing::error!("failed to upload due to {e:#}"); + } + }; + + // let these run concurrently + let (_, _) = tokio::join!(flush, upload); + + crate::tenant::tasks::warn_when_period_overrun( + tick_at.elapsed(), + metric_collection_interval, + "consumption_metrics_collect_metrics", + ); + } +} + +/// Called on the first iteration in an attempt to join the metric uploading schedule from previous +/// pageserver session. Pageserver is supposed to upload at intervals regardless of restarts. +/// +/// Cancellation safe. 
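+///
+/// Rough outline of the body below: read the previously persisted metrics from `path`, take the
+/// `recorded_at` timestamp of the first entry (the earliest, as they are written out sequentially),
+/// and if less than `metric_collection_interval` has passed since then, sleep for the remainder
+/// before handing back the cached values. When nothing can be read (for example on the very first
+/// startup), an empty cache is returned without waiting.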
+async fn restore_and_reschedule( + path: &Arc, + metric_collection_interval: Duration, +) -> Cache { + let (cached, earlier_metric_at) = match disk_cache::read_metrics_from_disk(path.clone()).await { + Ok(found_some) => { + // there is no min needed because we write these sequentially in + // collect_all_metrics + let earlier_metric_at = found_some + .iter() + .map(|(_, (et, _))| et.recorded_at()) + .copied() + .next(); + + let cached = found_some.into_iter().collect::(); + + (cached, earlier_metric_at) + } + Err(e) => { + use std::io::{Error, ErrorKind}; + + let root = e.root_cause(); + let maybe_ioerr = root.downcast_ref::(); + let is_not_found = maybe_ioerr.is_some_and(|e| e.kind() == ErrorKind::NotFound); + + if !is_not_found { + tracing::info!("failed to read any previous metrics from {path:?}: {e:#}"); + } + + (HashMap::new(), None) + } + }; + + if let Some(earlier_metric_at) = earlier_metric_at { + let earlier_metric_at: SystemTime = earlier_metric_at.into(); + + let error = reschedule(earlier_metric_at, metric_collection_interval).await; + + if let Some(error) = error { + if error.as_secs() >= 60 { + tracing::info!( + error_ms = error.as_millis(), + "startup scheduling error due to restart" + ) + } } } + + cached } -/// Internal type to make timeline metric production testable. -/// -/// As this value type contains all of the information needed from a timeline to produce the -/// metrics, it can easily be created with different values in test. -struct TimelineSnapshot { - loaded_at: (Lsn, SystemTime), - last_record_lsn: Lsn, - current_exact_logical_size: Option, -} +async fn reschedule( + earlier_metric_at: SystemTime, + metric_collection_interval: Duration, +) -> Option { + let now = SystemTime::now(); + match now.duration_since(earlier_metric_at) { + Ok(from_last_send) if from_last_send < metric_collection_interval => { + let sleep_for = metric_collection_interval - from_last_send; -impl TimelineSnapshot { - /// Collect the metrics from an actual timeline. - /// - /// Fails currently only when [`Timeline::get_current_logical_size`] fails. - /// - /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size - fn collect( - t: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result> { - use anyhow::Context; + let deadline = std::time::Instant::now() + sleep_for; - if !t.is_active() { - // no collection for broken or stopping needed, we will still keep the cached values - // though at the caller. - Ok(None) - } else { - let loaded_at = t.loaded_at; - let last_record_lsn = t.get_last_record_lsn(); + tokio::time::sleep_until(deadline.into()).await; - let current_exact_logical_size = { - let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id); - let res = span - .in_scope(|| t.get_current_logical_size(ctx)) - .context("get_current_logical_size"); - match res? { - // Only send timeline logical size when it is fully calculated. - (size, is_exact) if is_exact => Some(size), - (_, _) => None, - } - }; + let now = std::time::Instant::now(); - Ok(Some(TimelineSnapshot { - loaded_at, - last_record_lsn, - current_exact_logical_size, - })) - } - } - - /// Produce the timeline consumption metrics into the `metrics` argument. 
- fn to_metrics( - &self, - tenant_id: TenantId, - timeline_id: TimelineId, - now: DateTime, - metrics: &mut Vec<(MetricsKey, (EventType, u64))>, - cache: &HashMap, - ) { - let timeline_written_size = u64::from(self.last_record_lsn); - - let (key, written_size_now) = - MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size); - - // last_record_lsn can only go up, right now at least, TODO: #2592 or related - // features might change this. - - let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id); - - // use this when available, because in a stream of incremental values, it will be - // accurate where as when last_record_lsn stops moving, we will only cache the last - // one of those. - let last_stop_time = cache - .get(written_size_delta_key.key()) - .map(|(until, _val)| { - until - .incremental_timerange() - .expect("never create EventType::Absolute for written_size_delta") - .end - }); - - // by default, use the last sent written_size as the basis for - // calculating the delta. if we don't yet have one, use the load time value. - let prev = cache - .get(&key) - .map(|(prev_at, prev)| { - // use the prev time from our last incremental update, or default to latest - // absolute update on the first round. - let prev_at = prev_at - .absolute_time() - .expect("never create EventType::Incremental for written_size"); - let prev_at = last_stop_time.unwrap_or(prev_at); - (*prev_at, *prev) - }) - .unwrap_or_else(|| { - // if we don't have a previous point of comparison, compare to the load time - // lsn. - let (disk_consistent_lsn, loaded_at) = &self.loaded_at; - (DateTime::from(*loaded_at), disk_consistent_lsn.0) - }); - - // written_size_bytes_delta - metrics.extend( - if let Some(delta) = written_size_now.1.checked_sub(prev.1) { - let up_to = written_size_now - .0 - .absolute_time() - .expect("never create EventType::Incremental for written_size"); - let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta); - Some(key_value) + // executor threads might be busy, add extra measurements + Some(if now < deadline { + deadline - now } else { - None - }, - ); - - // written_size - metrics.push((key, written_size_now)); - - if let Some(size) = self.current_exact_logical_size { - metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size)); + now - deadline + }) + } + Ok(from_last_send) => Some(from_last_send.saturating_sub(metric_collection_interval)), + Err(_) => { + tracing::warn!( + ?now, + ?earlier_metric_at, + "oldest recorded metric is in future; first values will come out with inconsistent timestamps" + ); + earlier_metric_at.duration_since(now).ok() } } } /// Caclculate synthetic size for each active tenant -pub async fn calculate_synthetic_size_worker( +async fn calculate_synthetic_size_worker( synthetic_size_calculation_interval: Duration, ctx: &RequestContext, ) -> anyhow::Result<()> { info!("starting calculate_synthetic_size_worker"); + // reminder: ticker is ready immediatedly let mut ticker = tokio::time::interval(synthetic_size_calculation_interval); + let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize; loop { - tokio::select! { - _ = task_mgr::shutdown_watcher() => { - return Ok(()); - }, - tick_at = ticker.tick() => { + let tick_at = tokio::select! 
{ + _ = task_mgr::shutdown_watcher() => return Ok(()), + tick_at = ticker.tick() => tick_at, + }; - let tenants = match mgr::list_tenants().await { - Ok(tenants) => tenants, - Err(e) => { - warn!("cannot get tenant list: {e:#}"); - continue; - } - }; - // iterate through list of Active tenants and collect metrics - for (tenant_id, tenant_state) in tenants { + let tenants = match mgr::list_tenants().await { + Ok(tenants) => tenants, + Err(e) => { + warn!("cannot get tenant list: {e:#}"); + continue; + } + }; - if tenant_state != TenantState::Active { - continue; - } - - if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await - { - if let Err(e) = tenant.calculate_synthetic_size( - LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize, - ctx).await { - error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e); - } - } + for (tenant_id, tenant_state) in tenants { + if tenant_state != TenantState::Active { + continue; + } + if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await { + if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await { + error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}"); } - - crate::tenant::tasks::warn_when_period_overrun( - tick_at.elapsed(), - synthetic_size_calculation_interval, - "consumption_metrics_synthetic_size_worker", - ); } } - } -} - -#[cfg(test)] -mod tests { - use std::collections::HashMap; - - use std::time::SystemTime; - use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, - }; - - use crate::consumption_metrics::MetricsKey; - - use super::TimelineSnapshot; - use chrono::{DateTime, Utc}; - - #[test] - fn startup_collected_timeline_metrics_before_advancing() { - let tenant_id = TenantId::generate(); - let timeline_id = TimelineId::generate(); - - let mut metrics = Vec::new(); - let cache = HashMap::new(); - - let initdb_lsn = Lsn(0x10000); - let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); - - let snap = TimelineSnapshot { - loaded_at: (disk_consistent_lsn, SystemTime::now()), - last_record_lsn: disk_consistent_lsn, - current_exact_logical_size: Some(0x42000), - }; - - let now = DateTime::::from(SystemTime::now()); - - snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); - - assert_eq!( - metrics, - &[ - MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to( - snap.loaded_at.1.into(), - now, - 0 - ), - MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), - MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) - ] - ); - } - - #[test] - fn startup_collected_timeline_metrics_second_round() { - let tenant_id = TenantId::generate(); - let timeline_id = TimelineId::generate(); - - let [now, before, init] = time_backwards(); - - let now = DateTime::::from(now); - let before = DateTime::::from(before); - - let initdb_lsn = Lsn(0x10000); - let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); - - let mut metrics = Vec::new(); - let cache = HashMap::from([ - MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0) - ]); - - let snap = TimelineSnapshot { - loaded_at: (disk_consistent_lsn, init), - last_record_lsn: disk_consistent_lsn, - current_exact_logical_size: Some(0x42000), - }; - - snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); - - assert_eq!( - metrics, - &[ - MetricsKey::written_size_delta(tenant_id, timeline_id) - .from_previous_up_to(before, now, 0), - MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), - 
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) - ] - ); - } - - #[test] - fn startup_collected_timeline_metrics_nth_round_at_same_lsn() { - let tenant_id = TenantId::generate(); - let timeline_id = TimelineId::generate(); - - let [now, just_before, before, init] = time_backwards(); - - let now = DateTime::::from(now); - let just_before = DateTime::::from(just_before); - let before = DateTime::::from(before); - - let initdb_lsn = Lsn(0x10000); - let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); - - let mut metrics = Vec::new(); - let cache = HashMap::from([ - // at t=before was the last time the last_record_lsn changed - MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0), - // end time of this event is used for the next ones - MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to( - before, - just_before, - 0, - ), - ]); - - let snap = TimelineSnapshot { - loaded_at: (disk_consistent_lsn, init), - last_record_lsn: disk_consistent_lsn, - current_exact_logical_size: Some(0x42000), - }; - - snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); - - assert_eq!( - metrics, - &[ - MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to( - just_before, - now, - 0 - ), - MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), - MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) - ] - ); - } - - fn time_backwards() -> [std::time::SystemTime; N] { - let mut times = [std::time::SystemTime::UNIX_EPOCH; N]; - times[0] = std::time::SystemTime::now(); - for behind in 1..N { - times[behind] = times[0] - std::time::Duration::from_secs(behind as u64); - } - - times + + crate::tenant::tasks::warn_when_period_overrun( + tick_at.elapsed(), + synthetic_size_calculation_interval, + "consumption_metrics_synthetic_size_worker", + ); } } diff --git a/pageserver/src/consumption_metrics/disk_cache.rs b/pageserver/src/consumption_metrics/disk_cache.rs new file mode 100644 index 0000000000..4b1cd79c6d --- /dev/null +++ b/pageserver/src/consumption_metrics/disk_cache.rs @@ -0,0 +1,117 @@ +use anyhow::Context; +use std::path::PathBuf; +use std::sync::Arc; + +use super::RawMetric; + +pub(super) async fn read_metrics_from_disk(path: Arc) -> anyhow::Result> { + // do not add context to each error, callsite will log with full path + let span = tracing::Span::current(); + tokio::task::spawn_blocking(move || { + let _e = span.entered(); + + if let Some(parent) = path.parent() { + if let Err(e) = scan_and_delete_with_same_prefix(&path) { + tracing::info!("failed to cleanup temporary files in {parent:?}: {e:#}"); + } + } + + let mut file = std::fs::File::open(&*path)?; + let reader = std::io::BufReader::new(&mut file); + anyhow::Ok(serde_json::from_reader::<_, Vec>(reader)?) 
+ }) + .await + .context("read metrics join error") + .and_then(|x| x) +} + +fn scan_and_delete_with_same_prefix(path: &std::path::Path) -> std::io::Result<()> { + let it = std::fs::read_dir(path.parent().expect("caller checked"))?; + + let prefix = path.file_name().expect("caller checked").to_string_lossy(); + + for entry in it { + let entry = entry?; + if !entry.metadata()?.is_file() { + continue; + } + let file_name = entry.file_name(); + + if path.file_name().unwrap() == file_name { + // do not remove our actual file + continue; + } + + let file_name = file_name.to_string_lossy(); + + if !file_name.starts_with(&*prefix) { + continue; + } + + let path = entry.path(); + + if let Err(e) = std::fs::remove_file(&path) { + tracing::warn!("cleaning up old tempfile {file_name:?} failed: {e:#}"); + } else { + tracing::info!("cleaned up old tempfile {file_name:?}"); + } + } + + Ok(()) +} + +pub(super) async fn flush_metrics_to_disk( + current_metrics: &Arc>, + path: &Arc, +) -> anyhow::Result<()> { + use std::io::Write; + + anyhow::ensure!(path.parent().is_some(), "path must have parent: {path:?}"); + anyhow::ensure!( + path.file_name().is_some(), + "path must have filename: {path:?}" + ); + + let span = tracing::Span::current(); + tokio::task::spawn_blocking({ + let current_metrics = current_metrics.clone(); + let path = path.clone(); + move || { + let _e = span.entered(); + + let parent = path.parent().expect("existence checked"); + let file_name = path.file_name().expect("existence checked"); + let mut tempfile = tempfile::Builder::new() + .prefix(file_name) + .suffix(".tmp") + .tempfile_in(parent)?; + + tracing::debug!("using tempfile {:?}", tempfile.path()); + + // write out all of the raw metrics, to be read out later on restart as cached values + { + let mut writer = std::io::BufWriter::new(&mut tempfile); + serde_json::to_writer(&mut writer, &*current_metrics) + .context("serialize metrics")?; + writer + .into_inner() + .map_err(|_| anyhow::anyhow!("flushing metrics failed"))?; + } + + tempfile.flush()?; + tempfile.as_file().sync_all()?; + + fail::fail_point!("before-persist-last-metrics-collected"); + + drop(tempfile.persist(&*path).map_err(|e| e.error)?); + + let f = std::fs::File::open(path.parent().unwrap())?; + f.sync_all()?; + + anyhow::Ok(()) + } + }) + .await + .with_context(|| format!("write metrics to {path:?} join error")) + .and_then(|x| x.with_context(|| format!("write metrics to {path:?}"))) +} diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs new file mode 100644 index 0000000000..652dd98683 --- /dev/null +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -0,0 +1,455 @@ +use crate::context::RequestContext; +use anyhow::Context; +use chrono::{DateTime, Utc}; +use consumption_metrics::EventType; +use futures::stream::StreamExt; +use serde_with::serde_as; +use std::{sync::Arc, time::SystemTime}; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +use super::{Cache, RawMetric}; + +/// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events` +/// instead of static str. +// Do not rename any of these without first consulting with data team and partner +// management. 
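+//
+// The `serde(rename = "...")` strings below are the externally visible metric names: they end up
+// both in the uploaded events and in the on-disk cache written by the `disk_cache` module, so
+// changing them is effectively a wire-format change.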
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] +pub(super) enum Name { + /// Timeline last_record_lsn, absolute + #[serde(rename = "written_size")] + WrittenSize, + /// Timeline last_record_lsn, incremental + #[serde(rename = "written_data_bytes_delta")] + WrittenSizeDelta, + /// Timeline logical size + #[serde(rename = "timeline_logical_size")] + LogicalSize, + /// Tenant remote size + #[serde(rename = "remote_storage_size")] + RemoteSize, + /// Tenant resident size + #[serde(rename = "resident_size")] + ResidentSize, + /// Tenant synthetic size + #[serde(rename = "synthetic_storage_size")] + SyntheticSize, +} + +/// Key that uniquely identifies the object this metric describes. +/// +/// This is a denormalization done at the MetricsKey const methods; these should not be constructed +/// elsewhere. +#[serde_with::serde_as] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] +pub(crate) struct MetricsKey { + #[serde_as(as = "serde_with::DisplayFromStr")] + pub(super) tenant_id: TenantId, + + #[serde_as(as = "Option")] + #[serde(skip_serializing_if = "Option::is_none")] + pub(super) timeline_id: Option, + + pub(super) metric: Name, +} + +impl MetricsKey { + const fn absolute_values(self) -> AbsoluteValueFactory { + AbsoluteValueFactory(self) + } + const fn incremental_values(self) -> IncrementalValueFactory { + IncrementalValueFactory(self) + } +} + +/// Helper type which each individual metric kind can return to produce only absolute values. +struct AbsoluteValueFactory(MetricsKey); + +impl AbsoluteValueFactory { + const fn at(self, time: DateTime, val: u64) -> RawMetric { + let key = self.0; + (key, (EventType::Absolute { time }, val)) + } + + fn key(&self) -> &MetricsKey { + &self.0 + } +} + +/// Helper type which each individual metric kind can return to produce only incremental values. +struct IncrementalValueFactory(MetricsKey); + +impl IncrementalValueFactory { + #[allow(clippy::wrong_self_convention)] + const fn from_until( + self, + prev_end: DateTime, + up_to: DateTime, + val: u64, + ) -> RawMetric { + let key = self.0; + // cannot assert prev_end < up_to because these are realtime clock based + let when = EventType::Incremental { + start_time: prev_end, + stop_time: up_to, + }; + (key, (when, val)) + } + + fn key(&self) -> &MetricsKey { + &self.0 + } +} + +// the static part of a MetricsKey +impl MetricsKey { + /// Absolute value of [`Timeline::get_last_record_lsn`]. + /// + /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn + const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory { + MetricsKey { + tenant_id, + timeline_id: Some(timeline_id), + metric: Name::WrittenSize, + } + .absolute_values() + } + + /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we + /// previously sent, starting from the previously sent incremental time range ending at the + /// latest absolute measurement. + const fn written_size_delta( + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> IncrementalValueFactory { + MetricsKey { + tenant_id, + timeline_id: Some(timeline_id), + metric: Name::WrittenSizeDelta, + } + .incremental_values() + } + + /// Exact [`Timeline::get_current_logical_size`]. 
+ /// + /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size + const fn timeline_logical_size( + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> AbsoluteValueFactory { + MetricsKey { + tenant_id, + timeline_id: Some(timeline_id), + metric: Name::LogicalSize, + } + .absolute_values() + } + + /// [`Tenant::remote_size`] + /// + /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size + const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory { + MetricsKey { + tenant_id, + timeline_id: None, + metric: Name::RemoteSize, + } + .absolute_values() + } + + /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`. + /// + /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size + const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory { + MetricsKey { + tenant_id, + timeline_id: None, + metric: Name::ResidentSize, + } + .absolute_values() + } + + /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`]. + /// + /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size + /// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker + const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory { + MetricsKey { + tenant_id, + timeline_id: None, + metric: Name::SyntheticSize, + } + .absolute_values() + } +} + +pub(super) async fn collect_all_metrics( + cached_metrics: &Cache, + ctx: &RequestContext, +) -> Vec { + use pageserver_api::models::TenantState; + + let started_at = std::time::Instant::now(); + + let tenants = match crate::tenant::mgr::list_tenants().await { + Ok(tenants) => tenants, + Err(err) => { + tracing::error!("failed to list tenants: {:?}", err); + return vec![]; + } + }; + + let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move { + if state != TenantState::Active { + None + } else { + crate::tenant::mgr::get_tenant(id, true) + .await + .ok() + .map(|tenant| (id, tenant)) + } + }); + + let res = collect(tenants, cached_metrics, ctx).await; + + tracing::info!( + elapsed_ms = started_at.elapsed().as_millis(), + total = res.len(), + "collected metrics" + ); + + res +} + +async fn collect(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec +where + S: futures::stream::Stream)>, +{ + let mut current_metrics: Vec = Vec::new(); + + let mut tenants = std::pin::pin!(tenants); + + while let Some((tenant_id, tenant)) = tenants.next().await { + let mut tenant_resident_size = 0; + + for timeline in tenant.list_timelines() { + let timeline_id = timeline.timeline_id; + + match TimelineSnapshot::collect(&timeline, ctx) { + Ok(Some(snap)) => { + snap.to_metrics( + tenant_id, + timeline_id, + Utc::now(), + &mut current_metrics, + cache, + ); + } + Ok(None) => {} + Err(e) => { + tracing::error!( + "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}", + timeline.timeline_id + ); + continue; + } + } + + tenant_resident_size += timeline.resident_physical_size(); + } + + let snap = TenantSnapshot::collect(&tenant, tenant_resident_size); + snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics); + } + + current_metrics +} + +/// In-between abstraction to allow testing metrics without actual Tenants. +struct TenantSnapshot { + resident_size: u64, + remote_size: u64, + synthetic_size: u64, +} + +impl TenantSnapshot { + /// Collect tenant status to have metrics created out of it. 
+ /// + /// `resident_size` is calculated of the timelines we had access to for other metrics, so we + /// cannot just list timelines here. + fn collect(t: &Arc, resident_size: u64) -> Self { + TenantSnapshot { + resident_size, + remote_size: t.remote_size(), + // Note that this metric is calculated in a separate bgworker + // Here we only use cached value, which may lag behind the real latest one + synthetic_size: t.cached_synthetic_size(), + } + } + + fn to_metrics( + &self, + tenant_id: TenantId, + now: DateTime, + cached: &Cache, + metrics: &mut Vec, + ) { + let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size); + + let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size); + + let synthetic_size = { + let factory = MetricsKey::synthetic_size(tenant_id); + let mut synthetic_size = self.synthetic_size; + + if synthetic_size == 0 { + if let Some((_, value)) = cached.get(factory.key()) { + // use the latest value from previous session + synthetic_size = *value; + } + } + + if synthetic_size != 0 { + // only send non-zeroes because otherwise these show up as errors in logs + Some(factory.at(now, synthetic_size)) + } else { + None + } + }; + + metrics.extend( + [Some(remote_size), Some(resident_size), synthetic_size] + .into_iter() + .flatten(), + ); + } +} + +/// Internal type to make timeline metric production testable. +/// +/// As this value type contains all of the information needed from a timeline to produce the +/// metrics, it can easily be created with different values in test. +struct TimelineSnapshot { + loaded_at: (Lsn, SystemTime), + last_record_lsn: Lsn, + current_exact_logical_size: Option, +} + +impl TimelineSnapshot { + /// Collect the metrics from an actual timeline. + /// + /// Fails currently only when [`Timeline::get_current_logical_size`] fails. + /// + /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size + fn collect( + t: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result> { + if !t.is_active() { + // no collection for broken or stopping needed, we will still keep the cached values + // though at the caller. + Ok(None) + } else { + let loaded_at = t.loaded_at; + let last_record_lsn = t.get_last_record_lsn(); + + let current_exact_logical_size = { + let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id); + let res = span + .in_scope(|| t.get_current_logical_size(ctx)) + .context("get_current_logical_size"); + match res? { + // Only send timeline logical size when it is fully calculated. + (size, is_exact) if is_exact => Some(size), + (_, _) => None, + } + }; + + Ok(Some(TimelineSnapshot { + loaded_at, + last_record_lsn, + current_exact_logical_size, + })) + } + } + + /// Produce the timeline consumption metrics into the `metrics` argument. 
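+    ///
+    /// Sketch of the bookkeeping below: `written_data_bytes_delta` is the non-negative difference
+    /// between the current `written_size` and the previously cached one (falling back to the value
+    /// at load time when the cache is empty). If the cached value is ahead of us, for example after
+    /// a restart rolled `last_record_lsn` back, a zero delta and the cached absolute value are
+    /// reported instead, so the emitted `written_size` never moves backwards.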
+ fn to_metrics( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + now: DateTime, + metrics: &mut Vec, + cache: &Cache, + ) { + let timeline_written_size = u64::from(self.last_record_lsn); + + let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id); + + let last_stop_time = cache + .get(written_size_delta_key.key()) + .map(|(until, _val)| { + until + .incremental_timerange() + .expect("never create EventType::Absolute for written_size_delta") + .end + }); + + let (key, written_size_now) = + MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size); + + // by default, use the last sent written_size as the basis for + // calculating the delta. if we don't yet have one, use the load time value. + let prev = cache + .get(&key) + .map(|(prev_at, prev)| { + // use the prev time from our last incremental update, or default to latest + // absolute update on the first round. + let prev_at = prev_at + .absolute_time() + .expect("never create EventType::Incremental for written_size"); + let prev_at = last_stop_time.unwrap_or(prev_at); + (*prev_at, *prev) + }) + .unwrap_or_else(|| { + // if we don't have a previous point of comparison, compare to the load time + // lsn. + let (disk_consistent_lsn, loaded_at) = &self.loaded_at; + (DateTime::from(*loaded_at), disk_consistent_lsn.0) + }); + + let up_to = now; + + if let Some(delta) = written_size_now.1.checked_sub(prev.1) { + let key_value = written_size_delta_key.from_until(prev.0, up_to, delta); + // written_size_delta + metrics.push(key_value); + // written_size + metrics.push((key, written_size_now)); + } else { + // the cached value was ahead of us, report zero until we've caught up + metrics.push(written_size_delta_key.from_until(prev.0, up_to, 0)); + // the cached value was ahead of us, report the same until we've caught up + metrics.push((key, (written_size_now.0, prev.1))); + } + + { + let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id); + let current_or_previous = self + .current_exact_logical_size + .or_else(|| cache.get(factory.key()).map(|(_, val)| *val)); + + if let Some(size) = current_or_previous { + metrics.push(factory.at(now, size)); + } + } + } +} + +#[cfg(test)] +mod tests; + +#[cfg(test)] +pub(crate) use tests::metric_examples; diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs new file mode 100644 index 0000000000..38a4c9eb5d --- /dev/null +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -0,0 +1,297 @@ +use super::*; +use std::collections::HashMap; +use std::time::SystemTime; +use utils::lsn::Lsn; + +#[test] +fn startup_collected_timeline_metrics_before_advancing() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let mut metrics = Vec::new(); + let cache = HashMap::new(); + + let initdb_lsn = Lsn(0x10000); + let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + + let snap = TimelineSnapshot { + loaded_at: (disk_consistent_lsn, SystemTime::now()), + last_record_lsn: disk_consistent_lsn, + current_exact_logical_size: Some(0x42000), + }; + + let now = DateTime::::from(SystemTime::now()); + + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( + snap.loaded_at.1.into(), + now, + 0 + ), + MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), + MetricsKey::timeline_logical_size(tenant_id, 
timeline_id).at(now, 0x42000) + ] + ); +} + +#[test] +fn startup_collected_timeline_metrics_second_round() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let [now, before, init] = time_backwards(); + + let now = DateTime::::from(now); + let before = DateTime::::from(before); + + let initdb_lsn = Lsn(0x10000); + let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + + let mut metrics = Vec::new(); + let cache = HashMap::from([ + MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0) + ]); + + let snap = TimelineSnapshot { + loaded_at: (disk_consistent_lsn, init), + last_record_lsn: disk_consistent_lsn, + current_exact_logical_size: Some(0x42000), + }; + + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), + MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) + ] + ); +} + +#[test] +fn startup_collected_timeline_metrics_nth_round_at_same_lsn() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let [now, just_before, before, init] = time_backwards(); + + let now = DateTime::::from(now); + let just_before = DateTime::::from(just_before); + let before = DateTime::::from(before); + + let initdb_lsn = Lsn(0x10000); + let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + + let mut metrics = Vec::new(); + let cache = HashMap::from([ + // at t=before was the last time the last_record_lsn changed + MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0), + // end time of this event is used for the next ones + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, just_before, 0), + ]); + + let snap = TimelineSnapshot { + loaded_at: (disk_consistent_lsn, init), + last_record_lsn: disk_consistent_lsn, + current_exact_logical_size: Some(0x42000), + }; + + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0), + MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) + ] + ); +} + +#[test] +fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { + // it can happen that we lose the inmemorylayer but have previously sent metrics and we + // should never go backwards + + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let [later, now, at_restart] = time_backwards(); + + // FIXME: tests would be so much easier if we did not need to juggle back and forth + // SystemTime and DateTime:: ... Could do the conversion only at upload time? 
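+    //
+    // Timeline of this scenario (illustrative): way_before < before_restart < at_restart < now
+    // < later. The cache still claims 100 bytes written at `before_restart`, while the restarted
+    // timeline only knows of Lsn(50), so the expected deltas stay at zero and the reported
+    // written_size stays at the cached 100.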
+ let now = DateTime::::from(now); + let later = DateTime::::from(later); + let before_restart = at_restart - std::time::Duration::from_secs(5 * 60); + let way_before = before_restart - std::time::Duration::from_secs(10 * 60); + let before_restart = DateTime::::from(before_restart); + let way_before = DateTime::::from(way_before); + + let snap = TimelineSnapshot { + loaded_at: (Lsn(50), at_restart), + last_record_lsn: Lsn(50), + current_exact_logical_size: None, + }; + + let mut cache = HashMap::from([ + MetricsKey::written_size(tenant_id, timeline_id).at(before_restart, 100), + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( + way_before, + before_restart, + // not taken into account, but the timestamps are important + 999_999_999, + ), + ]); + + let mut metrics = Vec::new(); + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( + before_restart, + now, + 0 + ), + MetricsKey::written_size(tenant_id, timeline_id).at(now, 100), + ] + ); + + // now if we cache these metrics, and re-run while "still in recovery" + cache.extend(metrics.drain(..)); + + // "still in recovery", because our snapshot did not change + snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0), + MetricsKey::written_size(tenant_id, timeline_id).at(later, 100), + ] + ); +} + +#[test] +fn post_restart_current_exact_logical_size_uses_cached() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let [now, at_restart] = time_backwards(); + + let now = DateTime::::from(now); + let before_restart = at_restart - std::time::Duration::from_secs(5 * 60); + let before_restart = DateTime::::from(before_restart); + + let snap = TimelineSnapshot { + loaded_at: (Lsn(50), at_restart), + last_record_lsn: Lsn(50), + current_exact_logical_size: None, + }; + + let cache = HashMap::from([ + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(before_restart, 100) + ]); + + let mut metrics = Vec::new(); + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + metrics.retain(|(key, _)| key.metric == Name::LogicalSize); + + assert_eq!( + metrics, + &[MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 100)] + ); +} + +#[test] +fn post_restart_synthetic_size_uses_cached_if_available() { + let tenant_id = TenantId::generate(); + + let ts = TenantSnapshot { + resident_size: 1000, + remote_size: 1000, + // not yet calculated + synthetic_size: 0, + }; + + let now = SystemTime::now(); + let before_restart = DateTime::::from(now - std::time::Duration::from_secs(5 * 60)); + let now = DateTime::::from(now); + + let cached = HashMap::from([MetricsKey::synthetic_size(tenant_id).at(before_restart, 1000)]); + + let mut metrics = Vec::new(); + ts.to_metrics(tenant_id, now, &cached, &mut metrics); + + assert_eq!( + metrics, + &[ + MetricsKey::remote_storage_size(tenant_id).at(now, 1000), + MetricsKey::resident_size(tenant_id).at(now, 1000), + MetricsKey::synthetic_size(tenant_id).at(now, 1000), + ] + ); +} + +#[test] +fn post_restart_synthetic_size_is_not_sent_when_not_cached() { + let tenant_id = TenantId::generate(); + + let ts = TenantSnapshot { + resident_size: 1000, + remote_size: 1000, + // not yet calculated + synthetic_size: 0, + }; + + let now = SystemTime::now(); + let now = DateTime::::from(now); + + let cached = 
HashMap::new(); + + let mut metrics = Vec::new(); + ts.to_metrics(tenant_id, now, &cached, &mut metrics); + + assert_eq!( + metrics, + &[ + MetricsKey::remote_storage_size(tenant_id).at(now, 1000), + MetricsKey::resident_size(tenant_id).at(now, 1000), + // no synthetic size here + ] + ); +} + +fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] { + let mut times = [std::time::SystemTime::UNIX_EPOCH; N]; + times[0] = std::time::SystemTime::now(); + for behind in 1..N { + times[behind] = times[0] - std::time::Duration::from_secs(behind as u64); + } + + times +} + +pub(crate) const fn metric_examples( + tenant_id: TenantId, + timeline_id: TimelineId, + now: DateTime<Utc>, + before: DateTime<Utc>, +) -> [RawMetric; 6] { + [ + MetricsKey::written_size(tenant_id, timeline_id).at(now, 0), + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0), + MetricsKey::remote_storage_size(tenant_id).at(now, 0), + MetricsKey::resident_size(tenant_id).at(now, 0), + MetricsKey::synthetic_size(tenant_id).at(now, 1), + ] +} diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs new file mode 100644 index 0000000000..d69d43a2a8 --- /dev/null +++ b/pageserver/src/consumption_metrics/upload.rs @@ -0,0 +1,443 @@ +use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE}; +use serde_with::serde_as; +use tokio_util::sync::CancellationToken; +use tracing::Instrument; + +use super::{metrics::Name, Cache, MetricsKey, RawMetric}; +use utils::id::{TenantId, TimelineId}; + +/// How the metrics from pageserver are identified. +#[serde_with::serde_as] +#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)] +struct Ids { + #[serde_as(as = "serde_with::DisplayFromStr")] + pub(super) tenant_id: TenantId, + #[serde_as(as = "Option<serde_with::DisplayFromStr>")] + #[serde(skip_serializing_if = "Option::is_none")] + pub(super) timeline_id: Option<TimelineId>, +} + +#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))] +pub(super) async fn upload_metrics( + client: &reqwest::Client, + metric_collection_endpoint: &reqwest::Url, + cancel: &CancellationToken, + node_id: &str, + metrics: &[RawMetric], + cached_metrics: &mut Cache, +) -> anyhow::Result<()> { + let mut uploaded = 0; + let mut failed = 0; + + let started_at = std::time::Instant::now(); + + let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id); + + while let Some(res) = iter.next() { + let (chunk, body) = res?; + + let event_bytes = body.len(); + + let is_last = iter.len() == 0; + + let res = upload(client, metric_collection_endpoint, body, cancel, is_last) + .instrument(tracing::info_span!( + "upload", + %event_bytes, + uploaded, + total = metrics.len(), + )) + .await; + + match res { + Ok(()) => { + for (curr_key, curr_val) in chunk { + cached_metrics.insert(*curr_key, *curr_val); + } + uploaded += chunk.len(); + } + Err(_) => { + // failure(s) have already been logged + // + // however this is an inconsistency: if we crash here, we will start with the + // values as uploaded. in practice, the rejections no longer happen. 
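Aside on the upload loop above: only chunks the collector acknowledged are written back into the cache, so after a crash the next run derives its deltas from the last acknowledged values. A minimal, dependency-free sketch of that bookkeeping (plain std types stand in for the module's MetricsKey/Cache aliases, and try_upload is a placeholder for the HTTP call):

use std::collections::HashMap;

fn upload_chunks<K, V>(
    chunks: &[Vec<(K, V)>],
    mut try_upload: impl FnMut(&[(K, V)]) -> Result<(), String>,
    cache: &mut HashMap<K, V>,
) -> (usize, usize)
where
    K: Copy + Eq + std::hash::Hash,
    V: Copy,
{
    let (mut uploaded, mut failed) = (0, 0);
    for chunk in chunks {
        match try_upload(chunk) {
            Ok(()) => {
                // only an acknowledged chunk updates the cache, so a later run
                // resumes from values the collector has actually seen
                for (k, v) in chunk {
                    cache.insert(*k, *v);
                }
                uploaded += chunk.len();
            }
            // the failure itself is logged by the uploader
            Err(_) => failed += chunk.len(),
        }
    }
    (uploaded, failed)
}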
+ failed += chunk.len(); + } + } + } + + let elapsed = started_at.elapsed(); + + tracing::info!( + uploaded, + failed, + elapsed_ms = elapsed.as_millis(), + "done sending metrics" + ); + + Ok(()) +} + +// The return type is quite ugly, but we gain testability in isolation +fn serialize_in_chunks<'a, F>( + chunk_size: usize, + input: &'a [RawMetric], + factory: F, +) -> impl ExactSizeIterator> + 'a +where + F: KeyGen<'a> + 'a, +{ + use bytes::BufMut; + + struct Iter<'a, F> { + inner: std::slice::Chunks<'a, RawMetric>, + chunk_size: usize, + + // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries + buffer: bytes::BytesMut, + // chunk amount of events are reused to produce the serialized document + scratch: Vec>, + factory: F, + } + + impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> { + type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>; + + fn next(&mut self) -> Option { + let chunk = self.inner.next()?; + + if self.scratch.is_empty() { + // first round: create events with N strings + self.scratch.extend( + chunk + .iter() + .map(|raw_metric| raw_metric.as_event(&self.factory.generate())), + ); + } else { + // next rounds: update_in_place to reuse allocations + assert_eq!(self.scratch.len(), self.chunk_size); + self.scratch + .iter_mut() + .zip(chunk.iter()) + .for_each(|(slot, raw_metric)| { + raw_metric.update_in_place(slot, &self.factory.generate()) + }); + } + + let res = serde_json::to_writer( + (&mut self.buffer).writer(), + &EventChunk { + events: (&self.scratch[..chunk.len()]).into(), + }, + ); + + match res { + Ok(()) => Some(Ok((chunk, self.buffer.split().freeze()))), + Err(e) => Some(Err(e)), + } + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } + } + + impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {} + + let buffer = bytes::BytesMut::new(); + let inner = input.chunks(chunk_size); + let scratch = Vec::new(); + + Iter { + inner, + chunk_size, + buffer, + scratch, + factory, + } +} + +trait RawMetricExt { + fn as_event(&self, key: &IdempotencyKey<'_>) -> Event; + fn update_in_place(&self, event: &mut Event, key: &IdempotencyKey<'_>); +} + +impl RawMetricExt for RawMetric { + fn as_event(&self, key: &IdempotencyKey<'_>) -> Event { + let MetricsKey { + metric, + tenant_id, + timeline_id, + } = self.0; + + let (kind, value) = self.1; + + Event { + kind, + metric, + idempotency_key: key.to_string(), + value, + extra: Ids { + tenant_id, + timeline_id, + }, + } + } + + fn update_in_place(&self, event: &mut Event, key: &IdempotencyKey<'_>) { + use std::fmt::Write; + + let MetricsKey { + metric, + tenant_id, + timeline_id, + } = self.0; + + let (kind, value) = self.1; + + *event = Event { + kind, + metric, + idempotency_key: { + event.idempotency_key.clear(); + write!(event.idempotency_key, "{key}").unwrap(); + std::mem::take(&mut event.idempotency_key) + }, + value, + extra: Ids { + tenant_id, + timeline_id, + }, + }; + } +} + +trait KeyGen<'a>: Copy { + fn generate(&self) -> IdempotencyKey<'a>; +} + +impl<'a> KeyGen<'a> for &'a str { + fn generate(&self) -> IdempotencyKey<'a> { + IdempotencyKey::generate(self) + } +} + +enum UploadError { + Rejected(reqwest::StatusCode), + Reqwest(reqwest::Error), + Cancelled, +} + +impl std::fmt::Debug for UploadError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // use same impl because backoff::retry will log this using both + std::fmt::Display::fmt(self, f) + } +} + +impl std::fmt::Display for UploadError { + fn fmt(&self, f: &mut 
std::fmt::Formatter<'_>) -> std::fmt::Result { + use UploadError::*; + + match self { + Rejected(code) => write!(f, "server rejected the metrics with {code}"), + Reqwest(e) => write!(f, "request failed: {e}"), + Cancelled => write!(f, "cancelled"), + } + } +} + +impl UploadError { + fn is_reject(&self) -> bool { + matches!(self, UploadError::Rejected(_)) + } +} + +// this is consumed by the test verifiers +static LAST_IN_BATCH: reqwest::header::HeaderName = + reqwest::header::HeaderName::from_static("pageserver-metrics-last-upload-in-batch"); + +async fn upload( + client: &reqwest::Client, + metric_collection_endpoint: &reqwest::Url, + body: bytes::Bytes, + cancel: &CancellationToken, + is_last: bool, +) -> Result<(), UploadError> { + let warn_after = 3; + let max_attempts = 10; + let res = utils::backoff::retry( + move || { + let body = body.clone(); + async move { + let res = client + .post(metric_collection_endpoint.clone()) + .header(reqwest::header::CONTENT_TYPE, "application/json") + .header( + LAST_IN_BATCH.clone(), + if is_last { "true" } else { "false" }, + ) + .body(body) + .send() + .await; + + let res = res.and_then(|res| res.error_for_status()); + + // 10 redirects are normally allowed, so we don't need worry about 3xx + match res { + Ok(_response) => Ok(()), + Err(e) => { + let status = e.status().filter(|s| s.is_client_error()); + if let Some(status) = status { + // rejection used to be a thing when the server could reject a + // whole batch of metrics if one metric was bad. + Err(UploadError::Rejected(status)) + } else { + Err(UploadError::Reqwest(e)) + } + } + } + } + }, + UploadError::is_reject, + warn_after, + max_attempts, + "upload consumption_metrics", + utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled), + ) + .await; + + match &res { + Ok(_) => {} + Err(e) if e.is_reject() => { + // permanent errors currently do not get logged by backoff::retry + // display alternate has no effect, but keeping it here for easier pattern matching. 
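The upload helper above hands retries to utils::backoff::retry and treats 4xx rejections as permanent. A rough stand-alone sketch of that policy follows; the error type and backoff curve here are invented for illustration, and the real helper additionally honours a CancellationToken and a warn-after threshold:

use std::time::Duration;

#[derive(Debug)]
enum FakeUploadError {
    Rejected(u16),     // a 4xx from the collector: retrying will not help
    Transient(String), // network errors, 5xx, etc.
}

async fn retry_upload<F, Fut>(mut op: F, max_attempts: u32) -> Result<(), FakeUploadError>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<(), FakeUploadError>>,
{
    let mut attempt = 0u32;
    loop {
        match op().await {
            Ok(()) => return Ok(()),
            // permanent: surface immediately instead of burning attempts
            Err(e @ FakeUploadError::Rejected(_)) => return Err(e),
            Err(e) if attempt + 1 >= max_attempts => return Err(e),
            Err(_) => {
                // capped exponential backoff between attempts
                let secs = (0.1_f64 * 2f64.powi(attempt as i32)).min(10.0);
                tokio::time::sleep(Duration::from_secs_f64(secs)).await;
                attempt += 1;
            }
        }
    }
}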
+ tracing::error!("failed to upload metrics: {e:#}"); + } + Err(_) => { + // these have been logged already + } + } + + res +} + +#[cfg(test)] +mod tests { + use super::*; + use chrono::{DateTime, Utc}; + use once_cell::sync::Lazy; + + #[test] + fn chunked_serialization() { + let examples = metric_samples(); + assert!(examples.len() > 1); + + let factory = FixedGen::new(Utc::now(), "1", 42); + + // need to use Event here because serde_json::Value uses default hashmap, not linked + // hashmap + #[derive(serde::Deserialize)] + struct EventChunk { + events: Vec>, + } + + let correct = serialize_in_chunks(examples.len(), &examples, factory) + .map(|res| res.unwrap().1) + .flat_map(|body| serde_json::from_slice::(&body).unwrap().events) + .collect::>(); + + for chunk_size in 1..examples.len() { + let actual = serialize_in_chunks(chunk_size, &examples, factory) + .map(|res| res.unwrap().1) + .flat_map(|body| serde_json::from_slice::(&body).unwrap().events) + .collect::>(); + + // if these are equal, it means that multi-chunking version works as well + assert_eq!(correct, actual); + } + } + + #[derive(Clone, Copy)] + struct FixedGen<'a>(chrono::DateTime, &'a str, u16); + + impl<'a> FixedGen<'a> { + fn new(now: chrono::DateTime, node_id: &'a str, nonce: u16) -> Self { + FixedGen(now, node_id, nonce) + } + } + + impl<'a> KeyGen<'a> for FixedGen<'a> { + fn generate(&self) -> IdempotencyKey<'a> { + IdempotencyKey::for_tests(self.0, self.1, self.2) + } + } + + static SAMPLES_NOW: Lazy> = Lazy::new(|| { + DateTime::parse_from_rfc3339("2023-09-15T00:00:00.123456789Z") + .unwrap() + .into() + }); + + #[test] + fn metric_image_stability() { + // it is important that these strings stay as they are + + let examples = [ + ( + line!(), + r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#, + ), + ( + line!(), + r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#, + ), + ( + line!(), + r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#, + ), + ( + line!(), + r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#, + ), + ( + line!(), + r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#, + ), + ( + line!(), + r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#, + ), + ]; + + let idempotency_key = consumption_metrics::IdempotencyKey::for_tests(*SAMPLES_NOW, "1", 0); + let examples = examples.into_iter().zip(metric_samples()); + + for ((line, expected), (key, (kind, 
value))) in examples { + let e = consumption_metrics::Event { + kind, + metric: key.metric, + idempotency_key: idempotency_key.to_string(), + value, + extra: Ids { + tenant_id: key.tenant_id, + timeline_id: key.timeline_id, + }, + }; + let actual = serde_json::to_string(&e).unwrap(); + assert_eq!(expected, actual, "example for {kind:?} from line {line}"); + } + } + + fn metric_samples() -> [RawMetric; 6] { + let tenant_id = TenantId::from_array([0; 16]); + let timeline_id = TimelineId::from_array([0xff; 16]); + + let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z") + .unwrap() + .into(); + let [now, before] = [*SAMPLES_NOW, before]; + + super::super::metrics::metric_examples(tenant_id, timeline_id, now, before) + } +} diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 2953208d1e..ee331ea154 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -94,6 +94,18 @@ pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, + page_content_kind: PageContentKind, +} + +/// The kind of access to the page cache. +#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)] +pub enum PageContentKind { + Unknown, + DeltaLayerBtreeNode, + DeltaLayerValue, + ImageLayerBtreeNode, + ImageLayerValue, + InMemoryLayer, } /// Desired behavior if the operation requires an on-demand download @@ -137,6 +149,7 @@ impl RequestContextBuilder { task_kind, download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, + page_content_kind: PageContentKind::Unknown, }, } } @@ -149,6 +162,7 @@ impl RequestContextBuilder { task_kind: original.task_kind, download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, + page_content_kind: original.page_content_kind, }, } } @@ -167,6 +181,11 @@ impl RequestContextBuilder { self } + pub(crate) fn page_content_kind(mut self, k: PageContentKind) -> Self { + self.inner.page_content_kind = k; + self + } + pub fn build(self) -> RequestContext { self.inner } @@ -263,4 +282,8 @@ impl RequestContext { pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior { self.access_stats_behavior } + + pub(crate) fn page_content_kind(&self) -> PageContentKind { + self.page_content_kind + } } diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs new file mode 100644 index 0000000000..192eb16789 --- /dev/null +++ b/pageserver/src/control_plane_client.rs @@ -0,0 +1,119 @@ +use std::collections::HashMap; + +use hyper::StatusCode; +use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse}; +use tokio_util::sync::CancellationToken; +use url::Url; +use utils::{ + backoff, + generation::Generation, + id::{NodeId, TenantId}, +}; + +use crate::config::PageServerConf; + +// Backoffs when control plane requests do not succeed: compromise between reducing load +// on control plane, and retrying frequently when we are blocked on a control plane +// response to make progress. 
+const BACKOFF_INCREMENT: f64 = 0.1; +const BACKOFF_MAX: f64 = 10.0; + +/// The Pageserver's client for using the control plane API: this is a small subset +/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) +pub(crate) struct ControlPlaneClient { + http_client: reqwest::Client, + base_url: Url, + node_id: NodeId, + cancel: CancellationToken, +} + +impl ControlPlaneClient { + /// A None return value indicates that the input `conf` object does not have control + /// plane API enabled. + pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> { + let mut url = match conf.control_plane_api.as_ref() { + Some(u) => u.clone(), + None => return None, + }; + + if let Ok(mut segs) = url.path_segments_mut() { + // This ensures that `url` ends with a slash if it doesn't already. + // That way, we can subsequently use join() to safely attach extra path elements. + segs.pop_if_empty().push(""); + } + + let client = reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"); + + Some(Self { + http_client: client, + base_url: url, + node_id: conf.id, + cancel: cancel.clone(), + }) + } + + async fn try_re_attach( + &self, + url: Url, + request: &ReAttachRequest, + ) -> anyhow::Result<ReAttachResponse> { + match self.http_client.post(url).json(request).send().await { + Err(e) => Err(anyhow::Error::from(e)), + Ok(r) => { + if r.status() == StatusCode::OK { + r.json::<ReAttachResponse>() + .await + .map_err(anyhow::Error::from) + } else { + Err(anyhow::anyhow!("Unexpected status {}", r.status())) + } + } + } + } + + /// Block until we get a successful response + pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> { + let re_attach_path = self + .base_url + .join("re-attach") + .expect("Failed to build re-attach path"); + let request = ReAttachRequest { + node_id: self.node_id, + }; + + let mut attempt = 0; + loop { + let result = self.try_re_attach(re_attach_path.clone(), &request).await; + match result { + Ok(res) => { + tracing::info!( + "Received re-attach response with {} tenants", + res.tenants.len() + ); + + return Ok(res + .tenants + .into_iter() + .map(|t| (t.id, Generation::new(t.generation))) + .collect::<HashMap<_, _>>()); + } + Err(e) => { + tracing::error!("Error re-attaching tenants, retrying: {e:#}"); + backoff::exponential_backoff( + attempt, + BACKOFF_INCREMENT, + BACKOFF_MAX, + &self.cancel, + ) + .await; + if self.cancel.is_cancelled() { + return Err(anyhow::anyhow!("Shutting down")); + } + attempt += 1; + } + } + } + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 38e07f172d..4988641d6a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -383,7 +383,6 @@ paths: schema: type: string format: hex - post: description: | Schedules attach operation to happen in the background for the given tenant. @@ -1020,6 +1019,9 @@ components: properties: config: $ref: '#/components/schemas/TenantConfig' + generation: + type: integer + description: Attachment generation number. 
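The re_attach loop above is essentially "retry forever with capped backoff until cancelled". A simplified sketch of that shape, with the HTTP call abstracted into a closure; the delay formula only approximates utils::backoff::exponential_backoff driven by BACKOFF_INCREMENT/BACKOFF_MAX, and is not the pageserver's exact implementation:

use std::time::Duration;
use tokio_util::sync::CancellationToken;

async fn retry_until_cancelled<T, Fut>(
    mut op: impl FnMut() -> Fut,
    cancel: &CancellationToken,
) -> anyhow::Result<T>
where
    Fut: std::future::Future<Output = anyhow::Result<T>>,
{
    let mut attempt = 0u32;
    loop {
        match op().await {
            Ok(v) => return Ok(v),
            Err(e) => {
                tracing::error!("control plane request failed, will retry: {e:#}");
                // grow the sleep from ~0.1s up to a 10s cap
                let delay = (0.1_f64 * 2f64.powi(attempt.min(16) as i32)).min(10.0);
                tokio::select! {
                    // give up promptly if the pageserver is shutting down
                    _ = cancel.cancelled() => anyhow::bail!("Shutting down"),
                    _ = tokio::time::sleep(Duration::from_secs_f64(delay)) => {}
                }
                attempt += 1;
            }
        }
    }
}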
TenantConfigRequest: allOf: - $ref: '#/components/schemas/TenantConfig' diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f86657fa77..a8e914ba08 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -8,9 +8,10 @@ use anyhow::{anyhow, Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; -use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest}; +use pageserver_api::models::{ + DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest, +}; use remote_storage::GenericRemoteStorage; -use storage_broker::BrokerClientChannel; use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -32,11 +33,13 @@ use crate::tenant::mgr::{ }; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; -use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; +use crate::tenant::timeline::Timeline; +use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use utils::{ auth::JwtAuth, + generation::Generation, http::{ endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, error::{ApiError, HttpErrorBody}, @@ -51,7 +54,7 @@ use utils::{ // Imports only used for testing APIs use super::models::ConfigureFailpointsRequest; -struct State { +pub struct State { conf: &'static PageServerConf, auth: Option>, allowlist_routes: Vec, @@ -61,7 +64,7 @@ struct State { } impl State { - fn new( + pub fn new( conf: &'static PageServerConf, auth: Option>, remote_storage: Option, @@ -282,6 +285,8 @@ async fn build_timeline_info_common( let state = timeline.current_state(); let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); + let walreceiver_status = timeline.walreceiver_status(); + let info = TimelineInfo { tenant_id: timeline.tenant_id, timeline_id: timeline.timeline_id, @@ -302,6 +307,8 @@ async fn build_timeline_info_common( pg_version: timeline.pg_version, state, + + walreceiver_status, }; Ok(info) } @@ -472,7 +479,7 @@ async fn tenant_attach_handler( check_permission(&request, Some(tenant_id))?; let maybe_body: Option = json_request_or_empty_body(&mut request).await?; - let tenant_conf = match maybe_body { + let tenant_conf = match &maybe_body { Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?, None => TenantConfOpt::default(), }; @@ -483,10 +490,13 @@ async fn tenant_attach_handler( let state = get_state(&request); + let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; + if let Some(remote_storage) = &state.remote_storage { mgr::attach_tenant( state.conf, tenant_id, + generation, tenant_conf, state.broker_client.clone(), remote_storage.clone(), @@ -538,7 +548,7 @@ async fn tenant_detach_handler( } async fn tenant_load_handler( - request: Request, + mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; @@ -546,10 +556,18 @@ async fn tenant_load_handler( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let maybe_body: Option = json_request_or_empty_body(&mut request).await?; + let state = get_state(&request); + + // The /load request is only usable when 
control_plane_api is not set. Once it is set, callers + // should always use /attach instead. + let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; + mgr::load_tenant( state.conf, tenant_id, + generation, state.broker_client.clone(), state.remote_storage.clone(), &ctx, @@ -851,6 +869,21 @@ pub fn html_response(status: StatusCode, data: String) -> Result, Ok(response) } +/// Helper for requests that may take a generation, which is mandatory +/// when control_plane_api is set, but otherwise defaults to Generation::none() +fn get_request_generation(state: &State, req_gen: Option) -> Result { + if state.conf.control_plane_api.is_some() { + req_gen + .map(Generation::new) + .ok_or(ApiError::BadRequest(anyhow!( + "generation attribute missing" + ))) + } else { + // Legacy mode: all tenants operate with no generation + Ok(Generation::none()) + } +} + async fn tenant_create_handler( mut request: Request, _cancel: CancellationToken, @@ -867,14 +900,17 @@ async fn tenant_create_handler( let tenant_conf = TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - let state = get_state(&request); + let generation = get_request_generation(state, request_data.generation)?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let new_tenant = mgr::create_tenant( state.conf, tenant_conf, target_tenant_id, + generation, state.broker_client.clone(), state.remote_storage.clone(), &ctx, @@ -1321,12 +1357,9 @@ where } pub fn make_router( - conf: &'static PageServerConf, + state: Arc, launch_ts: &'static LaunchTimestamp, auth: Option>, - broker_client: BrokerClientChannel, - remote_storage: Option, - disk_usage_eviction_state: Arc, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); @@ -1350,16 +1383,7 @@ pub fn make_router( ); Ok(router - .data(Arc::new( - State::new( - conf, - auth, - remote_storage, - broker_client, - disk_usage_eviction_state, - ) - .context("Failed to initialize router state")?, - )) + .data(state) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 5bff5337bd..5a1affdb11 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir( { pg_control = Some(control_file); } - modification.flush().await?; + modification.flush(ctx).await?; } } // We're done importing all the data files. - modification.commit().await?; + modification.commit(ctx).await?; // We expect the Postgres server to be shut down cleanly. let pg_control = pg_control.context("pg_control file not found")?; @@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar( // We found the pg_control file. 
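Threading ctx through flush/commit as in the import hunks above is what allows the page-cache metrics further down in this patch to be bucketed per task kind and page content kind. A toy version of that two-level EnumMap layout, with made-up enum names standing in for TaskKind and PageContentKind:

use enum_map::{Enum, EnumMap};

#[derive(Enum, Clone, Copy)]
enum Task {
    MgmtRequest,
    PageRequestHandler,
}

#[derive(Enum, Clone, Copy)]
enum Content {
    DeltaLayerValue,
    ImageLayerValue,
}

#[derive(Default)]
struct Counters {
    // one pre-allocated slot per (task, content) pair: the hot path is two array
    // indexes rather than a label-string lookup in the metrics registry
    map: EnumMap<Task, EnumMap<Content, u64>>,
}

impl Counters {
    fn bump(&mut self, task: Task, content: Content) {
        self.map[task][content] += 1;
    }
}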
pg_control = Some(res); } - modification.flush().await?; + modification.flush(ctx).await?; } tokio_tar::EntryType::Directory => { debug!("directory {:?}", file_path); @@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar( // sanity check: ensure that pg_control is loaded let _pg_control = pg_control.context("pg_control file not found")?; - modification.commit().await?; + modification.commit(ctx).await?; Ok(()) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index cb20caba1f..3049ad6a4e 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -3,6 +3,7 @@ pub mod basebackup; pub mod config; pub mod consumption_metrics; pub mod context; +mod control_plane_client; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 1dc039056b..98dee095a3 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,3 +1,4 @@ +use enum_map::EnumMap; use metrics::metric_vec_duration::DurationResultObserver; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, @@ -127,22 +128,24 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub struct PageCacheMetrics { +pub struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, - pub read_accesses_ephemeral: IntCounter, pub read_accesses_immutable: IntCounter, - pub read_hits_ephemeral: IntCounter, pub read_hits_immutable: IntCounter, pub read_hits_materialized_page_exact: IntCounter, pub read_hits_materialized_page_older_lsn: IntCounter, } +pub struct PageCacheMetrics { + map: EnumMap>, +} + static PAGE_CACHE_READ_HITS: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_page_cache_read_hits_total", "Number of read accesses to the page cache that hit", - &["key_kind", "hit_kind"] + &["task_kind", "key_kind", "content_kind", "hit_kind"] ) .expect("failed to define a metric") }); @@ -151,55 +154,73 @@ static PAGE_CACHE_READ_ACCESSES: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_page_cache_read_accesses_total", "Number of read accesses to the page cache", - &["key_kind"] + &["task_kind", "key_kind", "content_kind"] ) .expect("failed to define a metric") }); pub static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { - read_accesses_materialized_page: { - PAGE_CACHE_READ_ACCESSES - .get_metric_with_label_values(&["materialized_page"]) - .unwrap() - }, + map: EnumMap::from_array(std::array::from_fn(|task_kind| { + let task_kind = ::from_usize(task_kind); + let task_kind: &'static str = task_kind.into(); + EnumMap::from_array(std::array::from_fn(|content_kind| { + let content_kind = ::from_usize(content_kind); + let content_kind: &'static str = content_kind.into(); + PageCacheMetricsForTaskKind { + read_accesses_materialized_page: { + PAGE_CACHE_READ_ACCESSES + .get_metric_with_label_values(&[ + task_kind, + "materialized_page", + content_kind, + ]) + .unwrap() + }, - read_accesses_ephemeral: { - PAGE_CACHE_READ_ACCESSES - .get_metric_with_label_values(&["ephemeral"]) - .unwrap() - }, + read_accesses_immutable: { + PAGE_CACHE_READ_ACCESSES + .get_metric_with_label_values(&[task_kind, "immutable", content_kind]) + .unwrap() + }, - read_accesses_immutable: { - PAGE_CACHE_READ_ACCESSES - .get_metric_with_label_values(&["immutable"]) - .unwrap() - }, + read_hits_immutable: { + PAGE_CACHE_READ_HITS + .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"]) + 
.unwrap() + }, - read_hits_ephemeral: { - PAGE_CACHE_READ_HITS - .get_metric_with_label_values(&["ephemeral", "-"]) - .unwrap() - }, + read_hits_materialized_page_exact: { + PAGE_CACHE_READ_HITS + .get_metric_with_label_values(&[ + task_kind, + "materialized_page", + content_kind, + "exact", + ]) + .unwrap() + }, - read_hits_immutable: { - PAGE_CACHE_READ_HITS - .get_metric_with_label_values(&["immutable", "-"]) - .unwrap() - }, - - read_hits_materialized_page_exact: { - PAGE_CACHE_READ_HITS - .get_metric_with_label_values(&["materialized_page", "exact"]) - .unwrap() - }, - - read_hits_materialized_page_older_lsn: { - PAGE_CACHE_READ_HITS - .get_metric_with_label_values(&["materialized_page", "older_lsn"]) - .unwrap() - }, + read_hits_materialized_page_older_lsn: { + PAGE_CACHE_READ_HITS + .get_metric_with_label_values(&[ + task_kind, + "materialized_page", + content_kind, + "older_lsn", + ]) + .unwrap() + }, + } + })) + })), }); +impl PageCacheMetrics { + pub(crate) fn for_ctx(&self, ctx: &RequestContext) -> &PageCacheMetricsForTaskKind { + &self.map[ctx.task_kind()][ctx.page_content_kind()] + } +} + pub struct PageCacheSizeMetrics { pub max_bytes: UIntGauge, @@ -537,7 +558,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ 30.000, // 30000 ms ]; -/// Tracks time taken by fs operations near VirtualFile. +/// VirtualFile fs operation variants. /// /// Operations: /// - open ([`std::fs::OpenOptions::open`]) @@ -548,15 +569,66 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ /// - seek (modify internal position or file length query) /// - fsync ([`std::fs::File::sync_all`]) /// - metadata ([`std::fs::File::metadata`]) -pub(crate) static STORAGE_IO_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_io_operations_seconds", - "Time spent in IO operations", - &["operation"], - STORAGE_IO_TIME_BUCKETS.into() - ) - .expect("failed to define a metric") -}); +#[derive( + Debug, Clone, Copy, strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr, +)] +pub(crate) enum StorageIoOperation { + Open, + Close, + CloseByReplace, + Read, + Write, + Seek, + Fsync, + Metadata, +} + +impl StorageIoOperation { + pub fn as_str(&self) -> &'static str { + match self { + StorageIoOperation::Open => "open", + StorageIoOperation::Close => "close", + StorageIoOperation::CloseByReplace => "close-by-replace", + StorageIoOperation::Read => "read", + StorageIoOperation::Write => "write", + StorageIoOperation::Seek => "seek", + StorageIoOperation::Fsync => "fsync", + StorageIoOperation::Metadata => "metadata", + } + } +} + +/// Tracks time taken by fs operations near VirtualFile. 
+#[derive(Debug)] +pub(crate) struct StorageIoTime { + metrics: [Histogram; StorageIoOperation::COUNT], +} + +impl StorageIoTime { + fn new() -> Self { + let storage_io_histogram_vec = register_histogram_vec!( + "pageserver_io_operations_seconds", + "Time spent in IO operations", + &["operation"], + STORAGE_IO_TIME_BUCKETS.into() + ) + .expect("failed to define a metric"); + let metrics = std::array::from_fn(|i| { + let op = StorageIoOperation::from_repr(i).unwrap(); + let metric = storage_io_histogram_vec + .get_metric_with_label_values(&[op.as_str()]) + .unwrap(); + metric + }); + Self { metrics } + } + + pub(crate) fn get(&self, op: StorageIoOperation) -> &Histogram { + &self.metrics[op as usize] + } +} + +pub(crate) static STORAGE_IO_TIME_METRIC: Lazy = Lazy::new(StorageIoTime::new); const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"]; @@ -1165,6 +1237,12 @@ impl TimelineMetrics { ), } } + + pub fn record_new_file_metrics(&self, sz: u64) { + self.resident_physical_size_gauge.add(sz); + self.num_persistent_files_created.inc_by(1); + self.persistent_bytes_written.inc_by(sz); + } } impl Drop for TimelineMetrics { @@ -1223,6 +1301,9 @@ use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; +use crate::context::{PageContentKind, RequestContext}; +use crate::task_mgr::TaskKind; + pub struct RemoteTimelineClientMetrics { tenant_id: String, timeline_id: String, diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index fb1c5fc485..38b169ea85 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -85,7 +85,7 @@ use utils::{ lsn::Lsn, }; -use crate::{metrics::PageCacheSizeMetrics, repository::Key}; +use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key}; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 50; @@ -346,8 +346,10 @@ impl PageCache { timeline_id: TimelineId, key: &Key, lsn: Lsn, + ctx: &RequestContext, ) -> Option<(Lsn, PageReadGuard)> { crate::metrics::PAGE_CACHE + .for_ctx(ctx) .read_accesses_materialized_page .inc(); @@ -368,10 +370,12 @@ impl PageCache { { if available_lsn == lsn { crate::metrics::PAGE_CACHE + .for_ctx(ctx) .read_hits_materialized_page_exact .inc(); } else { crate::metrics::PAGE_CACHE + .for_ctx(ctx) .read_hits_materialized_page_older_lsn .inc(); } @@ -426,10 +430,11 @@ impl PageCache { &self, file_id: FileId, blkno: u32, + ctx: &RequestContext, ) -> anyhow::Result { let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno }; - self.lock_for_read(&mut cache_key).await + self.lock_for_read(&mut cache_key, ctx).await } // @@ -497,14 +502,20 @@ impl PageCache { /// } /// ``` /// - async fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result { + async fn lock_for_read( + &self, + cache_key: &mut CacheKey, + ctx: &RequestContext, + ) -> anyhow::Result { let (read_access, hit) = match cache_key { CacheKey::MaterializedPage { .. } => { unreachable!("Materialized pages use lookup_materialized_page") } CacheKey::ImmutableFilePage { .. 
} => ( - &crate::metrics::PAGE_CACHE.read_accesses_immutable, - &crate::metrics::PAGE_CACHE.read_hits_immutable, + &crate::metrics::PAGE_CACHE + .for_ctx(ctx) + .read_accesses_immutable, + &crate::metrics::PAGE_CACHE.for_ctx(ctx).read_hits_immutable, ), }; read_access.inc(); @@ -799,8 +810,9 @@ impl PageCache { fn new(num_pages: usize) -> Self { assert!(num_pages > 0, "page cache size must be > 0"); - // We use Box::leak here and into_boxed_slice to avoid leaking uninitialized - // memory that Vec's might contain. + // We could use Vec::leak here, but that potentially also leaks + // uninitialized reserved capacity. With into_boxed_slice and Box::leak + // this is avoided. let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice()); let size_metrics = &crate::metrics::PAGE_CACHE_SIZE; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 72a66d51a6..2a87ee0381 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -469,7 +469,9 @@ impl PageServerHandler { // Create empty timeline info!("creating new timeline"); let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; - let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?; + let timeline = tenant + .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) + .await?; // TODO mark timeline as not ready until it reaches end_lsn. // We might have some wal to import as well, and we should prevent compute diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 54b41f3e9d..9a1281a522 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1138,7 +1138,7 @@ impl<'a> DatadirModification<'a> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub async fn flush(&mut self) -> anyhow::Result<()> { + pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -1154,7 +1154,7 @@ impl<'a> DatadirModification<'a> { if is_rel_block_key(key) || is_slru_block_key(key) { // This bails out on first error without modifying pending_updates. // That's Ok, cf this function's doc comment. - writer.put(key, self.lsn, &value).await?; + writer.put(key, self.lsn, &value, ctx).await?; } else { retained_pending_updates.insert(key, value); } @@ -1174,14 +1174,14 @@ impl<'a> DatadirModification<'a> { /// underlying timeline. /// All the modifications in this atomic update are stamped by the specified LSN. /// - pub async fn commit(&mut self) -> anyhow::Result<()> { + pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let writer = self.tline.writer().await; let lsn = self.lsn; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; for (key, value) in self.pending_updates.drain() { - writer.put(key, lsn, &value).await?; + writer.put(key, lsn, &value, ctx).await?; } for key_range in self.pending_deletions.drain(..) { writer.delete(key_range, lsn).await?; diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 3c7a1115df..650bc119b6 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -187,6 +187,7 @@ task_local! 
{ Debug, // NB: enumset::EnumSetType derives PartialEq, Eq, Clone, Copy enumset::EnumSetType, + enum_map::Enum, serde::Serialize, serde::Deserialize, strum_macros::IntoStaticStr, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2168db57de..1c92c618fa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -32,9 +32,7 @@ use std::fmt::Debug; use std::fmt::Display; use std::fs; use std::fs::File; -use std::fs::OpenOptions; use std::io; -use std::io::Write; use std::ops::Bound::Included; use std::path::Path; use std::path::PathBuf; @@ -68,7 +66,7 @@ use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; use crate::tenant::metadata::load_metadata; -use crate::tenant::remote_timeline_client::index::IndexPart; +pub use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; @@ -115,7 +113,6 @@ pub mod block_io; pub mod disk_btree; pub(crate) mod ephemeral_file; pub mod layer_map; -pub mod manifest; mod span; pub mod metadata; @@ -144,6 +141,9 @@ pub use crate::tenant::metadata::save_metadata; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; +/// The "tenants" part of `tenants//timelines...` +pub const TENANTS_SEGMENT_NAME: &str = "tenants"; + /// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; @@ -195,7 +195,7 @@ pub struct Tenant { walredo_mgr: Arc, // provides access to timeline data sitting in the remote storage - remote_storage: Option, + pub(crate) remote_storage: Option, /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`]. cached_logical_sizes: tokio::sync::Mutex>, @@ -407,7 +407,6 @@ impl Tenant { remote_startup_data: Option, local_metadata: Option, ancestor: Option>, - first_save: bool, init_order: Option<&InitializationOrder>, _ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -441,14 +440,9 @@ impl Tenant { // Save the metadata file to local disk. if !picked_local { - save_metadata( - self.conf, - &tenant_id, - &timeline_id, - up_to_date_metadata, - first_save, - ) - .context("save_metadata")?; + save_metadata(self.conf, &tenant_id, &timeline_id, up_to_date_metadata) + .await + .context("save_metadata")?; } let index_part = remote_startup_data.as_ref().map(|x| &x.index_part); @@ -833,7 +827,6 @@ impl Tenant { }), local_metadata, ancestor, - true, None, ctx, ) @@ -1386,7 +1379,6 @@ impl Tenant { remote_startup_data, Some(local_metadata), ancestor, - false, init_order, ctx, ) @@ -1450,7 +1442,7 @@ impl Tenant { /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the /// minimum amount of keys required to get a writable timeline. /// (Without it, `put` might fail due to `repartition` failing.) - pub fn create_empty_timeline( + pub async fn create_empty_timeline( &self, new_timeline_id: TimelineId, initdb_lsn: Lsn, @@ -1462,10 +1454,10 @@ impl Tenant { "Cannot create empty timelines on inactive tenant" ); - let timelines = self.timelines.lock().unwrap(); - let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?; - drop(timelines); - + let timeline_uninit_mark = { + let timelines = self.timelines.lock().unwrap(); + self.create_timeline_uninit_mark(new_timeline_id, &timelines)? 
+ }; let new_metadata = TimelineMetadata::new( // Initialize disk_consistent LSN to 0, The caller must import some data to // make it valid, before calling finish_creation() @@ -1484,6 +1476,7 @@ impl Tenant { initdb_lsn, None, ) + .await } /// Helper for unit tests to create an empty timeline. @@ -1499,7 +1492,9 @@ impl Tenant { pg_version: u32, ctx: &RequestContext, ) -> anyhow::Result> { - let uninit_tl = self.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)?; + let uninit_tl = self + .create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx) + .await?; let tline = uninit_tl.raw_timeline().expect("we just created it"); assert_eq!(tline.get_last_record_lsn(), Lsn(0)); @@ -1509,7 +1504,7 @@ impl Tenant { .init_empty_test_timeline() .context("init_empty_test_timeline")?; modification - .commit() + .commit(ctx) .await .context("commit init_empty_test_timeline modification")?; @@ -1517,6 +1512,15 @@ impl Tenant { tline.maybe_spawn_flush_loop(); tline.freeze_and_flush().await.context("freeze_and_flush")?; + // Make sure the freeze_and_flush reaches remote storage. + tline + .remote_client + .as_ref() + .unwrap() + .wait_completion() + .await + .unwrap(); + let tl = uninit_tl.finish_creation()?; // The non-test code would call tl.activate() here. tl.set_state(TimelineState::Active); @@ -1693,65 +1697,6 @@ impl Tenant { Ok(()) } - /// Flush all in-memory data to disk and remote storage, if any. - /// - /// Used at graceful shutdown. - async fn freeze_and_flush_on_shutdown(&self) { - let mut js = tokio::task::JoinSet::new(); - - // execute on each timeline on the JoinSet, join after. - let per_timeline = |timeline_id: TimelineId, timeline: Arc| { - async move { - debug_assert_current_span_has_tenant_and_timeline_id(); - - match timeline.freeze_and_flush().await { - Ok(()) => {} - Err(e) => { - warn!("failed to freeze and flush: {e:#}"); - return; - } - } - - let res = if let Some(client) = timeline.remote_client.as_ref() { - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - client.wait_completion().await - } else { - Ok(()) - }; - - if let Err(e) = res { - warn!("failed to await for frozen and flushed uploads: {e:#}"); - } - } - .instrument(tracing::info_span!("freeze_and_flush_on_shutdown", %timeline_id)) - }; - - { - let timelines = self.timelines.lock().unwrap(); - timelines - .iter() - .map(|(id, tl)| (*id, Arc::clone(tl))) - .for_each(|(timeline_id, timeline)| { - js.spawn(per_timeline(timeline_id, timeline)); - }) - }; - - while let Some(res) = js.join_next().await { - match res { - Ok(()) => {} - Err(je) if je.is_cancelled() => unreachable!("no cancelling used"), - Err(je) if je.is_panic() => { /* logged already */ } - Err(je) => warn!("unexpected JoinError: {je:?}"), - } - } - } - pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } @@ -1882,19 +1827,22 @@ impl Tenant { } }; - if freeze_and_flush { - // walreceiver has already began to shutdown with TenantState::Stopping, but we need to - // await for them to stop. 
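The replacement shutdown path in this hunk fans the per-timeline work out over a tokio::task::JoinSet and then drains it, classifying join errors the same way the removed freeze_and_flush_on_shutdown did. A self-contained sketch of that fan-out/join pattern, with the job futures standing in for Timeline::shutdown:

async fn shutdown_all<F>(jobs: Vec<F>)
where
    F: std::future::Future<Output = ()> + Send + 'static,
{
    let mut js = tokio::task::JoinSet::new();
    for job in jobs {
        js.spawn(job);
    }
    // drain the set: panics were already logged, and these tasks are never
    // cancelled, so a Cancelled JoinError would indicate a bug
    while let Some(res) = js.join_next().await {
        match res {
            Ok(()) => {}
            Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
            Err(je) if je.is_panic() => { /* logged already */ }
            Err(je) => tracing::warn!("unexpected JoinError: {je:?}"),
        }
    }
}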
- task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_id), - None, - ) - .await; - - // this will wait for uploads to complete; in the past, it was done outside tenant - // shutdown in pageserver::shutdown_pageserver. - self.freeze_and_flush_on_shutdown().await; + let mut js = tokio::task::JoinSet::new(); + { + let timelines = self.timelines.lock().unwrap(); + timelines.values().for_each(|timeline| { + let timeline = Arc::clone(timeline); + let span = Span::current(); + js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await }); + }) + }; + while let Some(res) = js.join_next().await { + match res { + Ok(()) => {} + Err(je) if je.is_cancelled() => unreachable!("no cancelling used"), + Err(je) if je.is_panic() => { /* logged already */ } + Err(je) => warn!("unexpected JoinError: {je:?}"), + } } // shutdown all tenant and timeline tasks: gc, compaction, page service @@ -2421,72 +2369,37 @@ impl Tenant { Ok(tenant_conf) } - pub(super) fn persist_tenant_config( + #[tracing::instrument(skip_all, fields(%tenant_id))] + pub(super) async fn persist_tenant_config( tenant_id: &TenantId, target_config_path: &Path, tenant_conf: TenantConfOpt, - creating_tenant: bool, ) -> anyhow::Result<()> { - let _enter = info_span!("saving tenantconf").entered(); - // imitate a try-block with a closure - let do_persist = |target_config_path: &Path| -> anyhow::Result<()> { - let target_config_parent = target_config_path.parent().with_context(|| { - format!( - "Config path does not have a parent: {}", - target_config_path.display() - ) - })?; + info!("persisting tenantconf to {}", target_config_path.display()); - info!("persisting tenantconf to {}", target_config_path.display()); - - let mut conf_content = r#"# This file contains a specific per-tenant's config. + let mut conf_content = r#"# This file contains a specific per-tenant's config. # It is read in case of pageserver restart. [tenant_config] "# - .to_string(); + .to_string(); - // Convert the config to a toml file. - conf_content += &toml_edit::ser::to_string(&tenant_conf)?; + // Convert the config to a toml file. + conf_content += &toml_edit::ser::to_string(&tenant_conf)?; - let mut target_config_file = VirtualFile::open_with_options( - target_config_path, - OpenOptions::new() - .truncate(true) // This needed for overwriting with small config files - .write(true) - .create_new(creating_tenant) - // when creating a new tenant, first_save will be true and `.create(true)` will be - // ignored (per rust std docs). - // - // later when updating the config of created tenant, or persisting config for the - // first time for attached tenant, the `.create(true)` is used. - .create(true), - )?; + let conf_content = conf_content.as_bytes(); - target_config_file - .write(conf_content.as_bytes()) - .context("write toml bytes into file") - .and_then(|_| target_config_file.sync_all().context("fsync config file")) - .context("write config file")?; - - // fsync the parent directory to ensure the directory entry is durable. - // before this was done conditionally on creating_tenant, but these management actions are rare - // enough to just fsync it always. - - crashsafe::fsync(target_config_parent)?; - // XXX we're not fsyncing the parent dir, need to do that in case `creating_tenant` - Ok(()) - }; - - // this function is called from creating the tenant and updating the tenant config, which - // would otherwise share this context, so keep it here in one place. 
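The VirtualFile::crashsafe_overwrite call introduced below replaces the old truncate-and-write flow for the tenant config. For illustration only, the generic temp-file-then-rename pattern it is based on looks roughly like this (std-only, Unix directory-fsync semantics assumed; this is not the pageserver API):

use std::io::Write;
use std::path::Path;

fn atomic_overwrite(target: &Path, tmp: &Path, content: &[u8]) -> std::io::Result<()> {
    {
        // write the full new contents to a temporary file and make it durable
        let mut f = std::fs::File::create(tmp)?;
        f.write_all(content)?;
        f.sync_all()?;
    }
    // atomically replace the old file with the new one
    std::fs::rename(tmp, target)?;
    if let Some(dir) = target.parent() {
        // fsync the directory so the renamed entry itself survives a crash
        std::fs::File::open(dir)?.sync_all()?;
    }
    Ok(())
}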
- do_persist(target_config_path).with_context(|| { - format!( - "write tenant {tenant_id} config to {}", - target_config_path.display() - ) - }) + let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX); + VirtualFile::crashsafe_overwrite(target_config_path, &temp_path, conf_content) + .await + .with_context(|| { + format!( + "write tenant {tenant_id} config to {}", + target_config_path.display() + ) + })?; + Ok(()) } // @@ -2797,13 +2710,15 @@ impl Tenant { src_timeline.pg_version, ); - let uninitialized_timeline = self.prepare_new_timeline( - dst_id, - &metadata, - timeline_uninit_mark, - start_lsn + 1, - Some(Arc::clone(src_timeline)), - )?; + let uninitialized_timeline = self + .prepare_new_timeline( + dst_id, + &metadata, + timeline_uninit_mark, + start_lsn + 1, + Some(Arc::clone(src_timeline)), + ) + .await?; let new_timeline = uninitialized_timeline.finish_creation()?; @@ -2881,13 +2796,15 @@ impl Tenant { pgdata_lsn, pg_version, ); - let raw_timeline = self.prepare_new_timeline( - timeline_id, - &new_metadata, - timeline_uninit_mark, - pgdata_lsn, - None, - )?; + let raw_timeline = self + .prepare_new_timeline( + timeline_id, + &new_metadata, + timeline_uninit_mark, + pgdata_lsn, + None, + ) + .await?; let tenant_id = raw_timeline.owning_tenant.tenant_id; let unfinished_timeline = raw_timeline.raw_timeline()?; @@ -2958,7 +2875,7 @@ impl Tenant { /// at 'disk_consistent_lsn'. After any initial data has been imported, call /// `finish_creation` to insert the Timeline into the timelines map and to remove the /// uninit mark file. - fn prepare_new_timeline( + async fn prepare_new_timeline( &self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, @@ -2986,8 +2903,9 @@ impl Tenant { timeline_struct.init_empty_layer_map(start_lsn); - if let Err(e) = - self.create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata) + if let Err(e) = self + .create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata) + .await { error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}"); cleanup_timeline_directory(uninit_mark); @@ -3003,7 +2921,7 @@ impl Tenant { )) } - fn create_timeline_files( + async fn create_timeline_files( &self, timeline_path: &Path, new_timeline_id: &TimelineId, @@ -3015,14 +2933,9 @@ impl Tenant { anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); }); - save_metadata( - self.conf, - &self.tenant_id, - new_timeline_id, - new_metadata, - true, - ) - .context("Failed to create timeline metadata")?; + save_metadata(self.conf, &self.tenant_id, new_timeline_id, new_metadata) + .await + .context("Failed to create timeline metadata")?; Ok(()) } @@ -3169,7 +3082,7 @@ pub(crate) enum CreateTenantFilesMode { Attach, } -pub(crate) fn create_tenant_files( +pub(crate) async fn create_tenant_files( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: &TenantId, @@ -3205,7 +3118,8 @@ pub(crate) fn create_tenant_files( mode, &temporary_tenant_dir, &target_tenant_directory, - ); + ) + .await; if creation_result.is_err() { error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data"); @@ -3223,7 +3137,7 @@ pub(crate) fn create_tenant_files( Ok(target_tenant_directory) } -fn try_create_target_tenant_dir( +async fn try_create_target_tenant_dir( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: &TenantId, @@ -3262,7 +3176,7 @@ fn try_create_target_tenant_dir( ) .with_context(|| format!("resolve tenant 
{tenant_id} temporary config path"))?; - Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf, true)?; + Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf).await?; crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( @@ -3467,6 +3381,8 @@ pub mod harness { pub tenant_conf: TenantConf, pub tenant_id: TenantId, pub generation: Generation, + pub remote_storage: GenericRemoteStorage, + pub remote_fs_dir: PathBuf, } static LOG_HANDLE: OnceCell<()> = OnceCell::new(); @@ -3504,29 +3420,39 @@ pub mod harness { fs::create_dir_all(conf.tenant_path(&tenant_id))?; fs::create_dir_all(conf.timelines_path(&tenant_id))?; + use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; + let remote_fs_dir = conf.workdir.join("localfs"); + std::fs::create_dir_all(&remote_fs_dir).unwrap(); + let config = RemoteStorageConfig { + // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, + max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(), + // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, + max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(), + storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + }; + let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); + Ok(Self { conf, tenant_conf, tenant_id, generation: Generation::new(0xdeadbeef), + remote_storage, + remote_fs_dir, }) } pub async fn load(&self) -> (Arc, RequestContext) { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); ( - self.try_load(&ctx, None) + self.try_load(&ctx) .await .expect("failed to load test tenant"), ctx, ) } - pub async fn try_load( - &self, - ctx: &RequestContext, - remote_storage: Option, - ) -> anyhow::Result> { + pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { let walredo_mgr = Arc::new(TestRedoManager); let tenant = Arc::new(Tenant::new( @@ -3536,7 +3462,7 @@ pub mod harness { walredo_mgr, self.tenant_id, self.generation, - remote_storage, + Some(self.remote_storage.clone()), )); tenant .load(None, ctx) @@ -3612,14 +3538,24 @@ mod tests { let writer = tline.writer().await; writer - .put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10"))) + .put( + *TEST_KEY, + Lsn(0x10), + &Value::Image(TEST_IMG("foo at 0x10")), + &ctx, + ) .await?; writer.finish_write(Lsn(0x10)); drop(writer); let writer = tline.writer().await; writer - .put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20"))) + .put( + *TEST_KEY, + Lsn(0x20), + &Value::Image(TEST_IMG("foo at 0x20")), + &ctx, + ) .await?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -3649,7 +3585,10 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) { + match tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -3690,19 +3629,19 @@ mod tests { // Insert a value on the timeline writer - .put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20")) + .put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"), &ctx) .await?; writer - .put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20")) + .put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"), &ctx) .await?; writer.finish_write(Lsn(0x20)); writer - .put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30")) + 
.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"), &ctx) .await?; writer.finish_write(Lsn(0x30)); writer - .put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40")) + .put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"), &ctx) .await?; writer.finish_write(Lsn(0x40)); @@ -3717,7 +3656,7 @@ mod tests { .expect("Should have a local timeline"); let new_writer = newtline.writer().await; new_writer - .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40")) + .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx) .await?; new_writer.finish_write(Lsn(0x40)); @@ -3740,7 +3679,11 @@ mod tests { Ok(()) } - async fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> { + async fn make_some_layers( + tline: &Timeline, + start_lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let mut lsn = start_lsn; #[allow(non_snake_case)] { @@ -3751,6 +3694,7 @@ mod tests { *TEST_KEY, lsn, &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + ctx, ) .await?; writer.finish_write(lsn); @@ -3760,6 +3704,7 @@ mod tests { *TEST_KEY, lsn, &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + ctx, ) .await?; writer.finish_write(lsn); @@ -3773,6 +3718,7 @@ mod tests { *TEST_KEY, lsn, &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + ctx, ) .await?; writer.finish_write(lsn); @@ -3782,6 +3728,7 @@ mod tests { *TEST_KEY, lsn, &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + ctx, ) .await?; writer.finish_write(lsn); @@ -3798,7 +3745,7 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 // FIXME: this doesn't actually remove any layer currently, given how the flushing @@ -3872,7 +3819,7 @@ mod tests { .load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; - make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?; repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?; let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); @@ -3894,7 +3841,7 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?; tenant .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) @@ -3903,7 +3850,7 @@ mod tests { .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; + make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?; tline.set_broken("test".to_owned()); @@ -3944,7 +3891,7 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?; tenant .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) @@ -3969,7 +3916,7 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?; tenant .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) @@ -3978,7 +3925,7 @@ mod tests { 
.get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; + make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?; // run gc on parent tenant @@ -4003,7 +3950,14 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx) .await?; - make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; + make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?; + // so that all uploads finish & we can call harness.load() below again + tenant + .shutdown(Default::default(), true) + .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id)) + .await + .ok() + .unwrap(); } let (tenant, _ctx) = harness.load().await; @@ -4025,7 +3979,7 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?; let child_tline = tenant .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) @@ -4036,7 +3990,15 @@ mod tests { .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; + make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?; + + // so that all uploads finish & we can call harness.load() below again + tenant + .shutdown(Default::default(), true) + .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id)) + .await + .ok() + .unwrap(); } // check that both of them are initially unloaded @@ -4060,7 +4022,7 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?; let layer_map = tline.layers.read().await; let level0_deltas = layer_map.layer_map().get_level0_deltas()?; @@ -4089,6 +4051,13 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; drop(tline); + // so that all uploads finish & we can call harness.try_load() below again + tenant + .shutdown(Default::default(), true) + .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id)) + .await + .ok() + .unwrap(); drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -4100,11 +4069,7 @@ mod tests { metadata_bytes[8] ^= 1; std::fs::write(metadata_path, metadata_bytes)?; - let err = harness - .try_load(&ctx, None) - .await - .err() - .expect("should fail"); + let err = harness.try_load(&ctx).await.err().expect("should fail"); // get all the stack with all .context, not only the last one let message = format!("{err:#}"); let expected = "failed to load metadata"; @@ -4140,7 +4105,12 @@ mod tests { let writer = tline.writer().await; writer - .put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10"))) + .put( + *TEST_KEY, + Lsn(0x10), + &Value::Image(TEST_IMG("foo at 0x10")), + &ctx, + ) .await?; writer.finish_write(Lsn(0x10)); drop(writer); @@ -4150,7 +4120,12 @@ mod tests { let writer = tline.writer().await; writer - .put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20"))) + .put( + *TEST_KEY, + Lsn(0x20), + &Value::Image(TEST_IMG("foo at 0x20")), + &ctx, + ) .await?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -4160,7 +4135,12 @@ mod tests { let writer = tline.writer().await; writer - .put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30"))) + .put( + *TEST_KEY, + Lsn(0x30), + 
&Value::Image(TEST_IMG("foo at 0x30")), + &ctx, + ) .await?; writer.finish_write(Lsn(0x30)); drop(writer); @@ -4170,7 +4150,12 @@ mod tests { let writer = tline.writer().await; writer - .put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40"))) + .put( + *TEST_KEY, + Lsn(0x40), + &Value::Image(TEST_IMG("foo at 0x40")), + &ctx, + ) .await?; writer.finish_write(Lsn(0x40)); drop(writer); @@ -4228,6 +4213,7 @@ mod tests { test_key, lsn, &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &ctx, ) .await?; writer.finish_write(lsn); @@ -4280,6 +4266,7 @@ mod tests { test_key, lsn, &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &ctx, ) .await?; writer.finish_write(lsn); @@ -4300,6 +4287,7 @@ mod tests { test_key, lsn, &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &ctx, ) .await?; writer.finish_write(lsn); @@ -4359,6 +4347,7 @@ mod tests { test_key, lsn, &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &ctx, ) .await?; writer.finish_write(lsn); @@ -4387,6 +4376,7 @@ mod tests { test_key, lsn, &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &ctx, ) .await?; println!("updating {} at {}", blknum, lsn); @@ -4455,6 +4445,7 @@ mod tests { test_key, lsn, &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))), + &ctx, ) .await?; println!("updating [{}][{}] at {}", idx, blknum, lsn); @@ -4489,8 +4480,9 @@ mod tests { .await; let initdb_lsn = Lsn(0x20); - let utline = - tenant.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)?; + let utline = tenant + .create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx) + .await?; let tline = utline.raw_timeline().unwrap(); // Spawn flush loop now so that we can set the `expect_initdb_optimization` @@ -4526,7 +4518,7 @@ mod tests { .init_empty_test_timeline() .context("init_empty_test_timeline")?; modification - .commit() + .commit(&ctx) .await .context("commit init_empty_test_timeline modification")?; @@ -4555,9 +4547,15 @@ mod tests { let harness = TenantHarness::create(name)?; { let (tenant, ctx) = harness.load().await; - let tline = - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .await?; // Keeps uninit mark in place + let raw_tline = tline.raw_timeline().unwrap(); + raw_tline + .shutdown(false) + .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id)) + .await; std::mem::forget(tline); } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index f5ff15b50c..21327deb70 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -11,16 +11,22 @@ //! len < 128: 0XXXXXXX //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! +use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; +use crate::virtual_file::VirtualFile; use std::cmp::min; use std::io::{Error, ErrorKind}; impl<'a> BlockCursor<'a> { /// Read a blob into a new buffer. - pub async fn read_blob(&self, offset: u64) -> Result, std::io::Error> { + pub async fn read_blob( + &self, + offset: u64, + ctx: &RequestContext, + ) -> Result, std::io::Error> { let mut buf = Vec::new(); - self.read_blob_into_buf(offset, &mut buf).await?; + self.read_blob_into_buf(offset, &mut buf, ctx).await?; Ok(buf) } /// Read blob into the given buffer. 
Any previous contents in the buffer @@ -29,11 +35,12 @@ impl<'a> BlockCursor<'a> { &self, offset: u64, dstbuf: &mut Vec, + ctx: &RequestContext, ) -> Result<(), std::io::Error> { let mut blknum = (offset / PAGE_SZ as u64) as u32; let mut off = (offset % PAGE_SZ as u64) as usize; - let mut buf = self.read_blk(blknum).await?; + let mut buf = self.read_blk(blknum, ctx).await?; // peek at the first byte, to determine if it's a 1- or 4-byte length let first_len_byte = buf[off]; @@ -49,7 +56,7 @@ impl<'a> BlockCursor<'a> { // it is split across two pages len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]); blknum += 1; - buf = self.read_blk(blknum).await?; + buf = self.read_blk(blknum, ctx).await?; len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]); off = 4 - thislen; } else { @@ -70,7 +77,7 @@ impl<'a> BlockCursor<'a> { if page_remain == 0 { // continue on next page blknum += 1; - buf = self.read_blk(blknum).await?; + buf = self.read_blk(blknum, ctx).await?; off = 0; page_remain = PAGE_SZ; } @@ -83,35 +90,24 @@ impl<'a> BlockCursor<'a> { } } +/// A wrapper of `VirtualFile` that allows users to write blobs. /// -/// Abstract trait for a data sink that you can write blobs to. -/// -pub trait BlobWriter { - /// Write a blob of data. Returns the offset that it was written to, - /// which can be used to retrieve the data later. - fn write_blob(&mut self, srcbuf: &[u8]) -> Result; -} - -/// -/// An implementation of BlobWriter to write blobs to anything that -/// implements std::io::Write. -/// -pub struct WriteBlobWriter -where - W: std::io::Write, -{ - inner: W, +/// If a `BlobWriter` is dropped, the internal buffer will be +/// discarded. You need to call [`flush_buffer`](Self::flush_buffer) +/// manually before dropping. +pub struct BlobWriter { + inner: VirtualFile, offset: u64, + /// A buffer to save on write calls, only used if BUFFERED=true + buf: Vec, } -impl WriteBlobWriter -where - W: std::io::Write, -{ - pub fn new(inner: W, start_offset: u64) -> Self { - WriteBlobWriter { +impl BlobWriter { + pub fn new(inner: VirtualFile, start_offset: u64) -> Self { + Self { inner, offset: start_offset, + buf: Vec::with_capacity(Self::CAPACITY), } } @@ -119,28 +115,79 @@ where self.offset } - /// Access the underlying Write object. - /// - /// NOTE: WriteBlobWriter keeps track of the current write offset. If - /// you write something directly to the inner Write object, it makes the - /// internally tracked 'offset' to go out of sync. So don't do that. - pub fn into_inner(self) -> W { - self.inner - } -} + const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 }; -impl BlobWriter for WriteBlobWriter -where - W: std::io::Write, -{ - fn write_blob(&mut self, srcbuf: &[u8]) -> Result { + #[inline(always)] + /// Writes the given buffer directly to the underlying `VirtualFile`. + /// You need to make sure that the internal buffer is empty, otherwise + /// data will be written in wrong order. + async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> { + self.inner.write_all(src_buf).await?; + self.offset += src_buf.len() as u64; + Ok(()) + } + + #[inline(always)] + /// Flushes the internal buffer to the underlying `VirtualFile`. 
+ pub async fn flush_buffer(&mut self) -> Result<(), Error> { + self.inner.write_all(&self.buf).await?; + self.buf.clear(); + Ok(()) + } + + #[inline(always)] + /// Writes as much of `src_buf` into the internal buffer as it fits + fn write_into_buffer(&mut self, src_buf: &[u8]) -> usize { + let remaining = Self::CAPACITY - self.buf.len(); + let to_copy = src_buf.len().min(remaining); + self.buf.extend_from_slice(&src_buf[..to_copy]); + self.offset += to_copy as u64; + to_copy + } + + /// Internal, possibly buffered, write function + async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> { + if !BUFFERED { + assert!(self.buf.is_empty()); + self.write_all_unbuffered(src_buf).await?; + return Ok(()); + } + let remaining = Self::CAPACITY - self.buf.len(); + // First try to copy as much as we can into the buffer + if remaining > 0 { + let copied = self.write_into_buffer(src_buf); + src_buf = &src_buf[copied..]; + } + // Then, if the buffer is full, flush it out + if self.buf.len() == Self::CAPACITY { + self.flush_buffer().await?; + } + // Finally, write the tail of src_buf: + // If it wholly fits into the buffer without + // completely filling it, then put it there. + // If not, write it out directly. + if !src_buf.is_empty() { + assert_eq!(self.buf.len(), 0); + if src_buf.len() < Self::CAPACITY { + let copied = self.write_into_buffer(src_buf); + // We just verified above that src_buf fits into our internal buffer. + assert_eq!(copied, src_buf.len()); + } else { + self.write_all_unbuffered(src_buf).await?; + } + } + Ok(()) + } + + /// Write a blob of data. Returns the offset that it was written to, + /// which can be used to retrieve the data later. + pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result { let offset = self.offset; if srcbuf.len() < 128 { // Short blob. Write a 1-byte length header let len_buf = srcbuf.len() as u8; - self.inner.write_all(&[len_buf])?; - self.offset += 1; + self.write_all(&[len_buf]).await?; } else { // Write a 4-byte length header if srcbuf.len() > 0x7fff_ffff { @@ -151,11 +198,154 @@ where } let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes(); len_buf[0] |= 0x80; - self.inner.write_all(&len_buf)?; - self.offset += 4; + self.write_all(&len_buf).await?; } - self.inner.write_all(srcbuf)?; - self.offset += srcbuf.len() as u64; + self.write_all(srcbuf).await?; Ok(offset) } } + +impl BlobWriter { + /// Access the underlying `VirtualFile`. + /// + /// This function flushes the internal buffer before giving access + /// to the underlying `VirtualFile`. + pub async fn into_inner(mut self) -> Result { + self.flush_buffer().await?; + Ok(self.inner) + } + + /// Access the underlying `VirtualFile`. + /// + /// Unlike [`into_inner`](Self::into_inner), this doesn't flush + /// the internal buffer before giving access. + pub fn into_inner_no_flush(self) -> VirtualFile { + self.inner + } +} + +impl BlobWriter { + /// Access the underlying `VirtualFile`. 
+ pub fn into_inner(self) -> VirtualFile { + self.inner + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef}; + use rand::{Rng, SeedableRng}; + + async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { + let temp_dir = tempfile::tempdir()?; + let path = temp_dir.path().join("file"); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + // Write part (in block to drop the file) + let mut offsets = Vec::new(); + { + let file = VirtualFile::create(&path).await?; + let mut wtr = BlobWriter::::new(file, 0); + for blob in blobs.iter() { + let offs = wtr.write_blob(blob).await?; + offsets.push(offs); + } + // Write out one page worth of zeros so that we can + // read again with read_blk + let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?; + println!("Writing final blob at offs={offs}"); + wtr.flush_buffer().await?; + } + + let file = VirtualFile::open(&path).await?; + let rdr = BlockReaderRef::VirtualFile(&file); + let rdr = BlockCursor::new(rdr); + for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { + let blob_read = rdr.read_blob(*offset, &ctx).await?; + assert_eq!( + blob, &blob_read, + "mismatch for idx={idx} at offset={offset}" + ); + } + Ok(()) + } + + fn random_array(len: usize) -> Vec { + let mut rng = rand::thread_rng(); + (0..len).map(|_| rng.gen()).collect::<_>() + } + + #[tokio::test] + async fn test_one() -> Result<(), Error> { + let blobs = &[vec![12, 21, 22]]; + round_trip_test::(blobs).await?; + round_trip_test::(blobs).await?; + Ok(()) + } + + #[tokio::test] + async fn test_hello_simple() -> Result<(), Error> { + let blobs = &[ + vec![0, 1, 2, 3], + b"Hello, World!".to_vec(), + Vec::new(), + b"foobar".to_vec(), + ]; + round_trip_test::(blobs).await?; + round_trip_test::(blobs).await?; + Ok(()) + } + + #[tokio::test] + async fn test_really_big_array() -> Result<(), Error> { + let blobs = &[ + b"test".to_vec(), + random_array(10 * PAGE_SZ), + b"foobar".to_vec(), + ]; + round_trip_test::(blobs).await?; + round_trip_test::(blobs).await?; + Ok(()) + } + + #[tokio::test] + async fn test_arrays_inc() -> Result<(), Error> { + let blobs = (0..PAGE_SZ / 8) + .map(|v| random_array(v * 16)) + .collect::>(); + round_trip_test::(&blobs).await?; + round_trip_test::(&blobs).await?; + Ok(()) + } + + #[tokio::test] + async fn test_arrays_random_size() -> Result<(), Error> { + let mut rng = rand::rngs::StdRng::seed_from_u64(42); + let blobs = (0..1024) + .map(|_| { + let mut sz: u16 = rng.gen(); + // Make 50% of the arrays small + if rng.gen() { + sz |= 63; + } + random_array(sz.into()) + }) + .collect::>(); + round_trip_test::(&blobs).await?; + round_trip_test::(&blobs).await?; + Ok(()) + } + + #[tokio::test] + async fn test_arrays_page_boundary() -> Result<(), Error> { + let blobs = &[ + random_array(PAGE_SZ - 4), + random_array(PAGE_SZ - 4), + random_array(PAGE_SZ - 4), + ]; + round_trip_test::(blobs).await?; + round_trip_test::(blobs).await?; + Ok(()) + } +} diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 69d5b49c6d..d81cf1b8a0 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -4,12 +4,11 @@ use super::ephemeral_file::EphemeralFile; use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; +use crate::context::RequestContext; use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ}; use crate::virtual_file::VirtualFile; use bytes::Bytes; -use 
std::fs::File; use std::ops::{Deref, DerefMut}; -use std::os::unix::fs::FileExt; /// This is implemented by anything that can read 8 kB (PAGE_SZ) /// blocks, using the page cache @@ -73,25 +72,31 @@ impl<'a> Deref for BlockLease<'a> { /// /// Unlike traits, we also support the read function to be async though. pub(crate) enum BlockReaderRef<'a> { - FileBlockReaderVirtual(&'a FileBlockReader), - FileBlockReaderFile(&'a FileBlockReader), + FileBlockReader(&'a FileBlockReader), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), + #[cfg(test)] + VirtualFile(&'a VirtualFile), } impl<'a> BlockReaderRef<'a> { #[inline(always)] - async fn read_blk(&self, blknum: u32) -> Result { + async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result { use BlockReaderRef::*; match self { - FileBlockReaderVirtual(r) => r.read_blk(blknum).await, - FileBlockReaderFile(r) => r.read_blk(blknum).await, - EphemeralFile(r) => r.read_blk(blknum).await, - Adapter(r) => r.read_blk(blknum).await, + FileBlockReader(r) => r.read_blk(blknum, ctx).await, + EphemeralFile(r) => r.read_blk(blknum, ctx).await, + Adapter(r) => r.read_blk(blknum, ctx).await, #[cfg(test)] TestDisk(r) => r.read_blk(blknum), + #[cfg(test)] + VirtualFile(r) => r.read_blk(blknum).await, } } } @@ -105,11 +110,13 @@ impl<'a> BlockReaderRef<'a> { /// /// ```no_run /// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader}; -/// # let reader: FileBlockReader = unimplemented!("stub"); +/// # use pageserver::context::RequestContext; +/// # let reader: FileBlockReader = unimplemented!("stub"); +/// # let ctx: RequestContext = unimplemented!("stub"); /// let cursor = reader.block_cursor(); -/// let buf = cursor.read_blk(1); +/// let buf = cursor.read_blk(1, &ctx); /// // do stuff with 'buf' -/// let buf = cursor.read_blk(2); +/// let buf = cursor.read_blk(2, &ctx); /// // do stuff with 'buf' /// ``` /// @@ -122,9 +129,9 @@ impl<'a> BlockCursor<'a> { BlockCursor { reader } } // Needed by cli - pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader) -> Self { + pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self { BlockCursor { - reader: BlockReaderRef::FileBlockReaderVirtual(reader), + reader: BlockReaderRef::FileBlockReader(reader), } } @@ -134,8 +141,12 @@ impl<'a> BlockCursor<'a> { /// access to the contents of the page. (For the page cache, the /// lease object represents a lock on the buffer.) #[inline(always)] - pub async fn read_blk(&self, blknum: u32) -> Result { - self.reader.read_blk(blknum).await + pub async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result { + self.reader.read_blk(blknum, ctx).await } } @@ -143,38 +154,41 @@ impl<'a> BlockCursor<'a> { /// /// The file is assumed to be immutable. This doesn't provide any functions /// for modifying the file, nor for invalidating the cache if it is modified. -pub struct FileBlockReader { - pub file: F, +pub struct FileBlockReader { + pub file: VirtualFile, /// Unique ID of this file, used as key in the page cache. file_id: page_cache::FileId, } -impl FileBlockReader -where - F: FileExt, -{ - pub fn new(file: F) -> Self { +impl FileBlockReader { + pub fn new(file: VirtualFile) -> Self { let file_id = page_cache::next_file_id(); FileBlockReader { file_id, file } } /// Read a page from the underlying file into given buffer. 
- fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> { + async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> { assert!(buf.len() == PAGE_SZ); - self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64) + self.file + .read_exact_at(buf, blkno as u64 * PAGE_SZ as u64) + .await } /// Read a block. /// /// Returns a "lease" object that can be used to /// access to the contents of the page. (For the page cache, the /// lease object represents a lock on the buffer.) - pub async fn read_blk(&self, blknum: u32) -> Result { + pub async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result { let cache = page_cache::get(); loop { match cache - .read_immutable_buf(self.file_id, blknum) + .read_immutable_buf(self.file_id, blknum, ctx) .await .map_err(|e| { std::io::Error::new( @@ -185,7 +199,7 @@ where ReadBufResult::Found(guard) => break Ok(guard.into()), ReadBufResult::NotFound(mut write_guard) => { // Read the page from disk into the buffer - self.fill_buffer(write_guard.deref_mut(), blknum)?; + self.fill_buffer(write_guard.deref_mut(), blknum).await?; write_guard.mark_valid(); // Swap for read lock @@ -196,15 +210,9 @@ where } } -impl BlockReader for FileBlockReader { +impl BlockReader for FileBlockReader { fn block_cursor(&self) -> BlockCursor<'_> { - BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self)) - } -} - -impl BlockReader for FileBlockReader { - fn block_cursor(&self) -> BlockCursor<'_> { - BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self)) + BlockCursor::new(BlockReaderRef::FileBlockReader(self)) } } diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 44d6b4f87e..06a04bf536 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -26,7 +26,11 @@ use std::{cmp::Ordering, io, result}; use thiserror::Error; use tracing::error; -use crate::tenant::block_io::{BlockReader, BlockWriter}; +use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + tenant::block_io::{BlockReader, BlockWriter}, +}; // The maximum size of a value stored in the B-tree. 5 bytes is enough currently. pub const VALUE_SZ: usize = 5; @@ -231,14 +235,19 @@ where /// /// Read the value for given key. Returns the value, or None if it doesn't exist. /// - pub async fn get(&self, search_key: &[u8; L]) -> Result> { + pub async fn get(&self, search_key: &[u8; L], ctx: &RequestContext) -> Result> { let mut result: Option = None; - self.visit(search_key, VisitDirection::Forwards, |key, value| { - if key == search_key { - result = Some(value); - } - false - }) + self.visit( + search_key, + VisitDirection::Forwards, + |key, value| { + if key == search_key { + result = Some(value); + } + false + }, + ctx, + ) .await?; Ok(result) } @@ -253,6 +262,7 @@ where search_key: &[u8; L], dir: VisitDirection, mut visitor: V, + ctx: &RequestContext, ) -> Result where V: FnMut(&[u8], u64) -> bool, @@ -262,7 +272,9 @@ where let block_cursor = self.reader.block_cursor(); while let Some((node_blknum, opt_iter)) = stack.pop() { // Locate the node. 
- let node_buf = block_cursor.read_blk(self.start_blk + node_blknum).await?; + let node_buf = block_cursor + .read_blk(self.start_blk + node_blknum, ctx) + .await?; let node = OnDiskNode::deparse(node_buf.as_ref())?; let prefix_len = node.prefix_len as usize; @@ -351,13 +363,14 @@ where #[allow(dead_code)] pub async fn dump(&self) -> Result<()> { let mut stack = Vec::new(); + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); stack.push((self.root_blk, String::new(), 0, 0, 0)); let block_cursor = self.reader.block_cursor(); while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() { - let blk = block_cursor.read_blk(self.start_blk + blknum).await?; + let blk = block_cursor.read_blk(self.start_blk + blknum, &ctx).await?; let buf: &[u8] = blk.as_ref(); let node = OnDiskNode::::deparse(buf)?; @@ -688,6 +701,8 @@ impl BuildNode { #[cfg(test)] pub(crate) mod tests { use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; use rand::Rng; use std::collections::BTreeMap; @@ -725,6 +740,8 @@ pub(crate) mod tests { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let all_keys: Vec<&[u8; 6]> = vec![ b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb", ]; @@ -745,12 +762,12 @@ pub(crate) mod tests { // Test the `get` function on all the keys. for (key, val) in all_data.iter() { - assert_eq!(reader.get(key).await?, Some(*val)); + assert_eq!(reader.get(key, &ctx).await?, Some(*val)); } // And on some keys that don't exist - assert_eq!(reader.get(b"aaaaaa").await?, None); - assert_eq!(reader.get(b"zzzzzz").await?, None); - assert_eq!(reader.get(b"xaaabx").await?, None); + assert_eq!(reader.get(b"aaaaaa", &ctx).await?, None); + assert_eq!(reader.get(b"zzzzzz", &ctx).await?, None); + assert_eq!(reader.get(b"xaaabx", &ctx).await?, None); // Test search with `visit` function let search_key = b"xabaaa"; @@ -762,10 +779,15 @@ pub(crate) mod tests { let mut data = Vec::new(); reader - .visit(search_key, VisitDirection::Forwards, |key, value| { - data.push((key.to_vec(), value)); - true - }) + .visit( + search_key, + VisitDirection::Forwards, + |key, value| { + data.push((key.to_vec(), value)); + true + }, + &ctx, + ) .await?; assert_eq!(data, expected); @@ -778,18 +800,28 @@ pub(crate) mod tests { expected.reverse(); let mut data = Vec::new(); reader - .visit(search_key, VisitDirection::Backwards, |key, value| { - data.push((key.to_vec(), value)); - true - }) + .visit( + search_key, + VisitDirection::Backwards, + |key, value| { + data.push((key.to_vec(), value)); + true + }, + &ctx, + ) .await?; assert_eq!(data, expected); // Backward scan where nothing matches reader - .visit(b"aaaaaa", VisitDirection::Backwards, |key, value| { - panic!("found unexpected key {}: {}", hex::encode(key), value); - }) + .visit( + b"aaaaaa", + VisitDirection::Backwards, + |key, value| { + panic!("found unexpected key {}: {}", hex::encode(key), value); + }, + &ctx, + ) .await?; // Full scan @@ -799,10 +831,15 @@ pub(crate) mod tests { .collect(); let mut data = Vec::new(); reader - .visit(&[0u8; 6], VisitDirection::Forwards, |key, value| { - data.push((key.to_vec(), value)); - true - }) + .visit( + &[0u8; 6], + VisitDirection::Forwards, + |key, value| { + data.push((key.to_vec(), value)); + true + }, + &ctx, + ) .await?; assert_eq!(data, 
expected); @@ -813,6 +850,7 @@ pub(crate) mod tests { async fn lots_of_keys() -> Result<()> { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); const NUM_KEYS: u64 = 1000; @@ -851,14 +889,14 @@ pub(crate) mod tests { for search_key_int in 0..(NUM_KEYS * 2 + 10) { let search_key = u64::to_be_bytes(search_key_int); assert_eq!( - reader.get(&search_key).await?, + reader.get(&search_key, &ctx).await?, all_data.get(&search_key_int).cloned() ); // Test a forward scan starting with this key result.lock().unwrap().clear(); reader - .visit(&search_key, VisitDirection::Forwards, take_ten) + .visit(&search_key, VisitDirection::Forwards, take_ten, &ctx) .await?; let expected = all_data .range(search_key_int..) @@ -870,7 +908,7 @@ pub(crate) mod tests { // And a backwards scan result.lock().unwrap().clear(); reader - .visit(&search_key, VisitDirection::Backwards, take_ten) + .visit(&search_key, VisitDirection::Backwards, take_ten, &ctx) .await?; let expected = all_data .range(..=search_key_int) @@ -886,7 +924,7 @@ pub(crate) mod tests { limit.store(usize::MAX, Ordering::Relaxed); result.lock().unwrap().clear(); reader - .visit(&search_key, VisitDirection::Forwards, take_ten) + .visit(&search_key, VisitDirection::Forwards, take_ten, &ctx) .await?; let expected = all_data .iter() @@ -899,7 +937,7 @@ pub(crate) mod tests { limit.store(usize::MAX, Ordering::Relaxed); result.lock().unwrap().clear(); reader - .visit(&search_key, VisitDirection::Backwards, take_ten) + .visit(&search_key, VisitDirection::Backwards, take_ten, &ctx) .await?; let expected = all_data .iter() @@ -913,6 +951,8 @@ pub(crate) mod tests { #[tokio::test] async fn random_data() -> Result<()> { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + // Generate random keys with exponential distribution, to // exercise the prefix compression const NUM_KEYS: usize = 100000; @@ -939,22 +979,24 @@ pub(crate) mod tests { // Test get() operation on all the keys for (&key, &val) in all_data.iter() { let search_key = u128::to_be_bytes(key); - assert_eq!(reader.get(&search_key).await?, Some(val)); + assert_eq!(reader.get(&search_key, &ctx).await?, Some(val)); } // Test get() operations on random keys, most of which will not exist for _ in 0..100000 { let key_int = rand::thread_rng().gen::(); let search_key = u128::to_be_bytes(key_int); - assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned()); + assert!(reader.get(&search_key, &ctx).await? == all_data.get(&key_int).cloned()); } // Test boundary cases assert!( - reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned() + reader.get(&u128::to_be_bytes(u128::MIN), &ctx).await? + == all_data.get(&u128::MIN).cloned() ); assert!( - reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned() + reader.get(&u128::to_be_bytes(u128::MAX), &ctx).await? 
+ == all_data.get(&u128::MAX).cloned() ); Ok(()) @@ -985,6 +1027,7 @@ pub(crate) mod tests { // Build a tree from it let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); for (key, val) in disk_btree_test_data::TEST_DATA { writer.append(&key, val)?; @@ -997,16 +1040,21 @@ pub(crate) mod tests { // Test get() operation on all the keys for (key, val) in disk_btree_test_data::TEST_DATA { - assert_eq!(reader.get(&key).await?, Some(val)); + assert_eq!(reader.get(&key, &ctx).await?, Some(val)); } // Test full scan let mut count = 0; reader - .visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| { - count += 1; - true - }) + .visit( + &[0u8; 26], + VisitDirection::Forwards, + |_key, _value| { + count += 1; + true + }, + &ctx, + ) .await?; assert_eq!(count, disk_btree_test_data::TEST_DATA.len()); diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 31db3869d9..8785f51c06 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -2,6 +2,7 @@ //! used to keep in-memory layers spilled on disk. use crate::config::PageServerConf; +use crate::context::RequestContext; use crate::page_cache::{self, PAGE_SZ}; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; use crate::virtual_file::VirtualFile; @@ -9,7 +10,6 @@ use std::cmp::min; use std::fs::OpenOptions; use std::io::{self, ErrorKind}; use std::ops::DerefMut; -use std::os::unix::prelude::FileExt; use std::path::PathBuf; use std::sync::atomic::AtomicU64; use tracing::*; @@ -29,7 +29,7 @@ pub struct EphemeralFile { } impl EphemeralFile { - pub fn create( + pub async fn create( conf: &PageServerConf, tenant_id: TenantId, timeline_id: TimelineId, @@ -45,7 +45,8 @@ impl EphemeralFile { let file = VirtualFile::open_with_options( &filename, OpenOptions::new().read(true).write(true).create(true), - )?; + ) + .await?; Ok(EphemeralFile { page_cache_file_id: page_cache::next_file_id(), @@ -61,13 +62,17 @@ impl EphemeralFile { self.len } - pub(crate) async fn read_blk(&self, blknum: u32) -> Result { + pub(crate) async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result { let flushed_blknums = 0..self.len / PAGE_SZ as u64; if flushed_blknums.contains(&(blknum as u64)) { let cache = page_cache::get(); loop { match cache - .read_immutable_buf(self.page_cache_file_id, blknum) + .read_immutable_buf(self.page_cache_file_id, blknum, ctx) .await .map_err(|e| { std::io::Error::new( @@ -88,7 +93,8 @@ impl EphemeralFile { let buf: &mut [u8] = write_guard.deref_mut(); debug_assert_eq!(buf.len(), PAGE_SZ); self.file - .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?; + .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64) + .await?; write_guard.mark_valid(); // Swap for read lock @@ -102,7 +108,11 @@ impl EphemeralFile { } } - pub(crate) async fn write_blob(&mut self, srcbuf: &[u8]) -> Result { + pub(crate) async fn write_blob( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> Result { struct Writer<'a> { ephemeral_file: &'a mut EphemeralFile, /// The block to which the next [`push_bytes`] will write. 
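
Both `FileBlockReader::read_blk` and `EphemeralFile::read_blk` in the hunks above use the same read-through-cache loop: ask the page cache for the `(file_id, blknum)` buffer, and on a miss fill the write slot handed back by the cache from the `VirtualFile` before retrying. The condensed restatement below only rearranges code that already appears in this diff; the free-function shape and the name `read_blk_through_cache` are for illustration, not part of the patch.

use std::ops::DerefMut;

use crate::context::RequestContext;
use crate::page_cache::{self, ReadBufResult, PAGE_SZ};
use crate::tenant::block_io::BlockLease;
use crate::virtual_file::VirtualFile;

async fn read_blk_through_cache<'a>(
    file: &'a VirtualFile,
    file_id: page_cache::FileId,
    blknum: u32,
    ctx: &RequestContext,
) -> Result<BlockLease<'a>, std::io::Error> {
    let cache = page_cache::get();
    loop {
        match cache
            .read_immutable_buf(file_id, blknum, ctx)
            .await
            .map_err(|e| {
                std::io::Error::new(
                    std::io::ErrorKind::Other,
                    format!("Failed to read immutable buf: {e:#}"),
                )
            })? {
            // Page already cached: hand out a read lease.
            ReadBufResult::Found(guard) => break Ok(guard.into()),
            // Cache miss: fill the write slot from disk, mark it valid,
            // and loop around to pick it up as a read lease.
            ReadBufResult::NotFound(mut write_guard) => {
                file.read_exact_at(write_guard.deref_mut(), blknum as u64 * PAGE_SZ as u64)
                    .await?;
                write_guard.mark_valid();
                continue;
            }
        }
    }
}
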
@@ -119,7 +129,11 @@ impl EphemeralFile { }) } #[inline(always)] - async fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> { + async fn push_bytes( + &mut self, + src: &[u8], + ctx: &RequestContext, + ) -> Result<(), io::Error> { let mut src_remaining = src; while !src_remaining.is_empty() { let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..]; @@ -128,10 +142,15 @@ impl EphemeralFile { self.off += n; src_remaining = &src_remaining[n..]; if self.off == PAGE_SZ { - match self.ephemeral_file.file.write_all_at( - &self.ephemeral_file.mutable_tail, - self.blknum as u64 * PAGE_SZ as u64, - ) { + match self + .ephemeral_file + .file + .write_all_at( + &self.ephemeral_file.mutable_tail, + self.blknum as u64 * PAGE_SZ as u64, + ) + .await + { Ok(_) => { // Pre-warm the page cache with what we just wrote. // This isn't necessary for coherency/correctness, but it's how we've always done it. @@ -140,6 +159,7 @@ impl EphemeralFile { .read_immutable_buf( self.ephemeral_file.page_cache_file_id, self.blknum, + ctx, ) .await { @@ -193,15 +213,15 @@ impl EphemeralFile { if srcbuf.len() < 0x80 { // short one-byte length header let len_buf = [srcbuf.len() as u8]; - writer.push_bytes(&len_buf).await?; + writer.push_bytes(&len_buf, ctx).await?; } else { let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); len_buf[0] |= 0x80; - writer.push_bytes(&len_buf).await?; + writer.push_bytes(&len_buf, ctx).await?; } // Write the payload - writer.push_bytes(srcbuf).await?; + writer.push_bytes(srcbuf, ctx).await?; if srcbuf.len() < 0x80 { self.len += 1; @@ -255,6 +275,8 @@ impl BlockReader for EphemeralFile { #[cfg(test)] mod tests { use super::*; + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; use crate::tenant::block_io::{BlockCursor, BlockReaderRef}; use rand::{thread_rng, RngCore}; use std::fs; @@ -262,7 +284,15 @@ mod tests { fn harness( test_name: &str, - ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> { + ) -> Result< + ( + &'static PageServerConf, + TenantId, + TimelineId, + RequestContext, + ), + io::Error, + > { let repo_dir = PageServerConf::test_repo_dir(test_name); let _ = fs::remove_dir_all(&repo_dir); let conf = PageServerConf::dummy_conf(repo_dir); @@ -274,46 +304,57 @@ mod tests { let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?; - Ok((conf, tenant_id, timeline_id)) + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + Ok((conf, tenant_id, timeline_id, ctx)) } #[tokio::test] async fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; + let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?; - let pos_foo = file.write_blob(b"foo").await?; + let pos_foo = file.write_blob(b"foo", &ctx).await?; assert_eq!( b"foo", - file.block_cursor().read_blob(pos_foo).await?.as_slice() + file.block_cursor() + .read_blob(pos_foo, &ctx) + .await? + .as_slice() ); - let pos_bar = file.write_blob(b"bar").await?; + let pos_bar = file.write_blob(b"bar", &ctx).await?; assert_eq!( b"foo", - file.block_cursor().read_blob(pos_foo).await?.as_slice() + file.block_cursor() + .read_blob(pos_foo, &ctx) + .await? 
+ .as_slice() ); assert_eq!( b"bar", - file.block_cursor().read_blob(pos_bar).await?.as_slice() + file.block_cursor() + .read_blob(pos_bar, &ctx) + .await? + .as_slice() ); let mut blobs = Vec::new(); for i in 0..10000 { let data = Vec::from(format!("blob{}", i).as_bytes()); - let pos = file.write_blob(&data).await?; + let pos = file.write_blob(&data, &ctx).await?; blobs.push((pos, data)); } // also test with a large blobs for i in 0..100 { let data = format!("blob{}", i).as_bytes().repeat(100); - let pos = file.write_blob(&data).await?; + let pos = file.write_blob(&data, &ctx).await?; blobs.push((pos, data)); } let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file)); for (pos, expected) in blobs { - let actual = cursor.read_blob(pos).await?; + let actual = cursor.read_blob(pos, &ctx).await?; assert_eq!(actual, expected); } @@ -321,8 +362,8 @@ mod tests { let mut large_data = Vec::new(); large_data.resize(20000, 0); thread_rng().fill_bytes(&mut large_data); - let pos_large = file.write_blob(&large_data).await?; - let result = file.block_cursor().read_blob(pos_large).await?; + let pos_large = file.write_blob(&large_data, &ctx).await?; + let result = file.block_cursor().read_blob(pos_large, &ctx).await?; assert_eq!(result, large_data); Ok(()) diff --git a/pageserver/src/tenant/manifest.rs b/pageserver/src/tenant/manifest.rs deleted file mode 100644 index 1d2835114f..0000000000 --- a/pageserver/src/tenant/manifest.rs +++ /dev/null @@ -1,325 +0,0 @@ -//! This module contains the encoding and decoding of the local manifest file. -//! -//! MANIFEST is a write-ahead log which is stored locally to each timeline. It -//! records the state of the storage engine. It contains a snapshot of the -//! state and all operations proceeding that snapshot. The file begins with a -//! header recording MANIFEST version number. After that, it contains a snapshot. -//! The snapshot is followed by a list of operations. Each operation is a list -//! of records. Each record is either an addition or a removal of a layer. -//! -//! With MANIFEST, we can: -//! -//! 1. recover state quickly by reading the file, potentially boosting the -//! startup speed. -//! 2. ensure all operations are atomic and avoid corruption, solving issues -//! like redundant image layer and preparing us for future compaction -//! strategies. -//! -//! There is also a format for storing all layer files on S3, called -//! `index_part.json`. Compared with index_part, MANIFEST is an WAL which -//! records all operations as logs, and therefore we can easily replay the -//! operations when recovering from crash, while ensuring those operations -//! are atomic upon restart. -//! -//! Currently, this is not used in the system. Future refactors will ensure -//! the storage state will be recorded in this file, and the system can be -//! recovered from this file. This is tracked in -//! - -use std::io::{self, Read, Write}; - -use crate::virtual_file::VirtualFile; -use anyhow::Result; -use bytes::{Buf, BufMut, Bytes, BytesMut}; -use crc32c::crc32c; -use serde::{Deserialize, Serialize}; -use tracing::log::warn; -use utils::lsn::Lsn; - -use super::storage_layer::PersistentLayerDesc; - -pub struct Manifest { - file: VirtualFile, -} - -#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)] -pub struct Snapshot { - pub layers: Vec, -} - -/// serde by default encode this in tagged enum, and therefore it will be something -/// like `{ "AddLayer": { ... } }`. 
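
Stepping back to the blob format: both `BlobWriter::write_blob` and `EphemeralFile::write_blob` earlier in this diff emit the length header described in the `blob_io` module comment, a single byte for payloads shorter than 128 bytes and a big-endian `u32` with the top bit set otherwise. The standalone sketch below shows just that header encoding and its inverse; `encode_len_header` and `decode_len_header` are ad-hoc names for illustration, not functions in the codebase.

// Standalone illustration of the 1-byte / 4-byte blob length header.
fn encode_len_header(len: usize) -> Vec<u8> {
    assert!(len <= 0x7fff_ffff, "blob too large");
    if len < 0x80 {
        // Short blob: a single byte holds the length, top bit clear.
        vec![len as u8]
    } else {
        // Long blob: big-endian u32 with the top bit set as a marker.
        let mut buf = (len as u32).to_be_bytes();
        buf[0] |= 0x80;
        buf.to_vec()
    }
}

// Returns (payload length, header length in bytes).
fn decode_len_header(buf: &[u8]) -> (usize, usize) {
    if buf[0] & 0x80 == 0 {
        (buf[0] as usize, 1)
    } else {
        let mut len_buf = [0u8; 4];
        len_buf.copy_from_slice(&buf[..4]);
        len_buf[0] &= 0x7f;
        (u32::from_be_bytes(len_buf) as usize, 4)
    }
}

fn main() {
    for len in [0usize, 5, 127, 128, 4096, 0x7fff_ffff] {
        let header = encode_len_header(len);
        let (decoded, header_len) = decode_len_header(&header);
        assert_eq!((decoded, header_len), (len, header.len()));
    }
}
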
-#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)] -pub enum Record { - AddLayer(PersistentLayerDesc), - RemoveLayer(PersistentLayerDesc), -} - -/// `echo neon.manifest | sha1sum` and take the leading 8 bytes. -const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c; -const MANIFEST_VERSION: u64 = 1; - -#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)] -pub struct ManifestHeader { - magic_number: u64, - version: u64, -} - -const MANIFEST_HEADER_LEN: usize = 16; - -impl ManifestHeader { - fn encode(&self) -> BytesMut { - let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN); - buf.put_u64(self.magic_number); - buf.put_u64(self.version); - buf - } - - fn decode(mut buf: &[u8]) -> Self { - assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header"); - Self { - magic_number: buf.get_u64(), - version: buf.get_u64(), - } - } -} - -#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)] -pub enum Operation { - /// A snapshot of the current state. - /// - /// Lsn field represents the LSN that is persisted to disk for this snapshot. - Snapshot(Snapshot, Lsn), - /// An atomic operation that changes the state. - /// - /// Lsn field represents the LSN that is persisted to disk after the operation is done. - /// This will only change when new L0 is flushed to the disk. - Operation(Vec, Lsn), -} - -struct RecordHeader { - size: u32, - checksum: u32, -} - -const RECORD_HEADER_LEN: usize = 8; - -impl RecordHeader { - fn encode(&self) -> BytesMut { - let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN); - buf.put_u32(self.size); - buf.put_u32(self.checksum); - buf - } - - fn decode(mut buf: &[u8]) -> Self { - assert!(buf.len() == RECORD_HEADER_LEN, "invalid header"); - Self { - size: buf.get_u32(), - checksum: buf.get_u32(), - } - } -} - -#[derive(Debug, thiserror::Error)] -pub enum ManifestLoadError { - #[error("manifest header is corrupted")] - CorruptedManifestHeader, - #[error("unsupported manifest version: got {0}, expected {1}")] - UnsupportedVersion(u64, u64), - #[error("error when decoding record: {0}")] - DecodeRecord(serde_json::Error), - #[error("I/O error: {0}")] - Io(io::Error), -} - -#[must_use = "Should check if the manifest is partially corrupted"] -pub struct ManifestPartiallyCorrupted(bool); - -impl Manifest { - /// Create a new manifest by writing the manifest header and a snapshot record to the given file. - pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result { - let mut manifest = Self { file }; - manifest.append_manifest_header(ManifestHeader { - magic_number: MANIFEST_MAGIC_NUMBER, - version: MANIFEST_VERSION, - })?; - manifest.append_operation(Operation::Snapshot(snapshot, lsn))?; - Ok(manifest) - } - - /// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted, - /// the bool flag will be set to true and the user is responsible to reconstruct a new manifest and - /// backup the current one. 
- pub fn load( - mut file: VirtualFile, - ) -> Result<(Self, Vec, ManifestPartiallyCorrupted), ManifestLoadError> { - let mut buf = vec![]; - file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?; - - // Read manifest header - let mut buf = Bytes::from(buf); - if buf.remaining() < MANIFEST_HEADER_LEN { - return Err(ManifestLoadError::CorruptedManifestHeader); - } - let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]); - buf.advance(MANIFEST_HEADER_LEN); - if header.version != MANIFEST_VERSION { - return Err(ManifestLoadError::UnsupportedVersion( - header.version, - MANIFEST_VERSION, - )); - } - - // Read operations - let mut operations = Vec::new(); - let corrupted = loop { - if buf.remaining() == 0 { - break false; - } - if buf.remaining() < RECORD_HEADER_LEN { - warn!("incomplete header when decoding manifest, could be corrupted"); - break true; - } - let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]); - let size = size as usize; - buf.advance(RECORD_HEADER_LEN); - if buf.remaining() < size { - warn!("incomplete data when decoding manifest, could be corrupted"); - break true; - } - let data = &buf[..size]; - if crc32c(data) != checksum { - warn!("checksum mismatch when decoding manifest, could be corrupted"); - break true; - } - // if the following decode fails, we cannot use the manifest or safely ignore any record. - operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?); - buf.advance(size); - }; - Ok(( - Self { file }, - operations, - ManifestPartiallyCorrupted(corrupted), - )) - } - - fn append_data(&mut self, data: &[u8]) -> Result<()> { - if data.len() >= u32::MAX as usize { - panic!("data too large"); - } - let header = RecordHeader { - size: data.len() as u32, - checksum: crc32c(data), - }; - let header = header.encode(); - self.file.write_all(&header)?; - self.file.write_all(data)?; - self.file.sync_all()?; - Ok(()) - } - - fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> { - let encoded = header.encode(); - self.file.write_all(&encoded)?; - Ok(()) - } - - /// Add an operation to the manifest. The operation will be appended to the end of the file, - /// and the file will fsync. 
- pub fn append_operation(&mut self, operation: Operation) -> Result<()> { - let encoded = Vec::from(serde_json::to_string(&operation)?); - self.append_data(&encoded) - } -} - -#[cfg(test)] -mod tests { - use std::fs::OpenOptions; - - use crate::repository::Key; - - use super::*; - - #[test] - fn test_read_manifest() { - let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest"); - std::fs::create_dir_all(&testdir).unwrap(); - let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap(); - let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233)); - let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333)); - let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333)); - let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333)); - - // Write a manifest with a snapshot and some operations - let snapshot = Snapshot { - layers: vec![layer1, layer2], - }; - let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap(); - manifest - .append_operation(Operation::Operation( - vec![Record::AddLayer(layer3.clone())], - Lsn::from(1), - )) - .unwrap(); - drop(manifest); - - // Open the second time and write - let file = VirtualFile::open_with_options( - &testdir.join("MANIFEST"), - OpenOptions::new() - .read(true) - .write(true) - .create_new(false) - .truncate(false), - ) - .unwrap(); - let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap(); - assert!(!corrupted.0); - assert_eq!(operations.len(), 2); - assert_eq!( - &operations[0], - &Operation::Snapshot(snapshot.clone(), Lsn::from(0)) - ); - assert_eq!( - &operations[1], - &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1)) - ); - manifest - .append_operation(Operation::Operation( - vec![ - Record::RemoveLayer(layer3.clone()), - Record::AddLayer(layer4.clone()), - ], - Lsn::from(2), - )) - .unwrap(); - drop(manifest); - - // Open the third time and verify - let file = VirtualFile::open_with_options( - &testdir.join("MANIFEST"), - OpenOptions::new() - .read(true) - .write(true) - .create_new(false) - .truncate(false), - ) - .unwrap(); - let (_manifest, operations, corrupted) = Manifest::load(file).unwrap(); - assert!(!corrupted.0); - assert_eq!(operations.len(), 3); - assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0))); - assert_eq!( - &operations[1], - &Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1)) - ); - assert_eq!( - &operations[2], - &Operation::Operation( - vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)], - Lsn::from(2) - ) - ); - } -} diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index dbf2d5ac37..75ffe09696 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -8,14 +8,13 @@ //! //! 
[`remote_timeline_client`]: super::remote_timeline_client -use std::fs::{File, OpenOptions}; -use std::io::{self, Write}; +use std::io::{self}; -use anyhow::{bail, ensure, Context}; +use anyhow::{ensure, Context}; use serde::{de::Error, Deserialize, Serialize, Serializer}; use thiserror::Error; -use tracing::info_span; use utils::bin_ser::SerializeError; +use utils::crashsafe::path_with_suffix_extension; use utils::{ bin_ser::BeSer, id::{TenantId, TimelineId}, @@ -24,6 +23,7 @@ use utils::{ use crate::config::PageServerConf; use crate::virtual_file::VirtualFile; +use crate::TEMP_FILE_SUFFIX; /// Use special format number to enable backward compatibility. const METADATA_FORMAT_VERSION: u16 = 4; @@ -230,6 +230,23 @@ impl TimelineMetadata { pub fn pg_version(&self) -> u32 { self.body.pg_version } + + // Checksums make it awkward to build a valid instance by hand. This helper + // provides a TimelineMetadata with a valid checksum in its header. + #[cfg(test)] + pub fn example() -> Self { + let instance = Self::new( + "0/16960E8".parse::().unwrap(), + None, + None, + Lsn::from_hex("00000000").unwrap(), + Lsn::from_hex("00000000").unwrap(), + Lsn::from_hex("00000000").unwrap(), + 0, + ); + let bytes = instance.to_bytes().unwrap(); + Self::from_bytes(&bytes).unwrap() + } } impl<'de> Deserialize<'de> for TimelineMetadata { @@ -255,38 +272,19 @@ impl Serialize for TimelineMetadata { } /// Save timeline metadata to file -pub fn save_metadata( +#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))] +pub async fn save_metadata( conf: &'static PageServerConf, tenant_id: &TenantId, timeline_id: &TimelineId, data: &TimelineMetadata, - first_save: bool, ) -> anyhow::Result<()> { - let _enter = info_span!("saving metadata").entered(); let path = conf.metadata_path(tenant_id, timeline_id); - // use OpenOptions to ensure file presence is consistent with first_save - let mut file = VirtualFile::open_with_options( - &path, - OpenOptions::new().write(true).create_new(first_save), - ) - .context("open_with_options")?; - - let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; - - if file.write(&metadata_bytes)? != metadata_bytes.len() { - bail!("Could not write all the metadata bytes in a single call"); - } - file.sync_all()?; - - // fsync the parent directory to ensure the directory entry is durable - if first_save { - let timeline_dir = File::open( - path.parent() - .expect("Metadata should always have a parent dir"), - )?; - timeline_dir.sync_all()?; - } - + let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX); + let metadata_bytes = data.to_bytes().context("serialize metadata")?; + VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes) + .await + .context("write metadata")?; Ok(()) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 87617b544c..74faee1115 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1,9 +1,10 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. 
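
The reworked `save_metadata` above serializes the metadata and hands durability to `VirtualFile::crashsafe_overwrite` with a temporary sibling path, replacing the old open-with-`create_new`, write, `sync_all`, fsync-parent-on-first-save sequence. The general shape of such a temp-file-plus-rename overwrite, sketched here with plain `std::fs` to illustrate the idea rather than to mirror the actual `VirtualFile` implementation, is:

use std::fs::{self, File, OpenOptions};
use std::io::{self, Write};
use std::path::Path;

/// Write the new content to a temporary sibling file, fsync it, atomically
/// rename it over the final path, then fsync the parent directory so the
/// rename itself is durable. Illustration only, not the pageserver's code.
fn crashsafe_overwrite_sketch(
    final_path: &Path,
    tmp_path: &Path,
    content: &[u8],
) -> io::Result<()> {
    let mut tmp = OpenOptions::new()
        .write(true)
        .create(true)
        .truncate(true)
        .open(tmp_path)?;
    tmp.write_all(content)?;
    tmp.sync_all()?;
    drop(tmp);

    fs::rename(tmp_path, final_path)?;

    // Make the directory entry change durable as well.
    let parent = final_path
        .parent()
        .expect("final path should have a parent directory");
    File::open(parent)?.sync_all()?;
    Ok(())
}
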
+use rand::{distributions::Alphanumeric, Rng}; use std::collections::{hash_map, HashMap}; use std::ffi::OsStr; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::sync::Arc; use tokio::fs; @@ -11,6 +12,7 @@ use anyhow::Context; use once_cell::sync::Lazy; use tokio::sync::RwLock; use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; use tracing::*; use remote_storage::GenericRemoteStorage; @@ -18,12 +20,14 @@ use utils::crashsafe; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; +use crate::control_plane_client::ControlPlaneClient; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState}; -use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME}; +use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; +use utils::crashsafe::path_with_suffix_extension; use utils::fs_ext::PathExt; use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; @@ -60,6 +64,39 @@ impl TenantsMap { } } +/// This is "safe" in that that it won't leave behind a partially deleted directory +/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting +/// the contents. +/// +/// This is pageserver-specific, as it relies on future processes after a crash to check +/// for TEMP_FILE_SUFFIX when loading things. +async fn safe_remove_tenant_dir_all(path: impl AsRef) -> std::io::Result<()> { + let tmp_path = safe_rename_tenant_dir(path).await?; + fs::remove_dir_all(tmp_path).await +} + +async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result { + let parent = path + .as_ref() + .parent() + // It is invalid to call this function with a relative path. Tenant directories + // should always have a parent. + .ok_or(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "Path must be absolute", + ))?; + let rand_suffix = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(8) + .map(char::from) + .collect::() + + TEMP_FILE_SUFFIX; + let tmp_path = path_with_suffix_extension(&path, &rand_suffix); + fs::rename(&path, &tmp_path).await?; + fs::File::open(parent).await?.sync_all().await?; + Ok(tmp_path) +} + static TENANTS: Lazy> = Lazy::new(|| RwLock::new(TenantsMap::Initializing)); /// Initialize repositories with locally available timelines. @@ -70,12 +107,21 @@ pub async fn init_tenant_mgr( conf: &'static PageServerConf, resources: TenantSharedResources, init_order: InitializationOrder, + cancel: CancellationToken, ) -> anyhow::Result<()> { // Scan local filesystem for attached tenants let tenants_dir = conf.tenants_path(); let mut tenants = HashMap::new(); + // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. + let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) { + Some(client.re_attach().await?) 
+ } else { + info!("Control plane API not configured, tenant generations are disabled"); + None + }; + let mut dir_entries = fs::read_dir(&tenants_dir) .await .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; @@ -92,6 +138,8 @@ pub async fn init_tenant_mgr( "Found temporary tenant directory, removing: {}", tenant_dir_path.display() ); + // No need to use safe_remove_tenant_dir_all because this is already + // a temporary path if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await { error!( "Failed to remove temporary directory '{}': {:?}", @@ -123,9 +171,53 @@ pub async fn init_tenant_mgr( continue; } + let tenant_id = match tenant_dir_path + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + { + Ok(id) => id, + Err(_) => { + warn!( + "Invalid tenant path (garbage in our repo directory?): {}", + tenant_dir_path.display() + ); + continue; + } + }; + + let generation = if let Some(generations) = &tenant_generations { + // We have a generation map: treat it as the authority for whether + // this tenant is really attached. + if let Some(gen) = generations.get(&tenant_id) { + *gen + } else { + info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response"); + if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { + error!( + "Failed to remove detached tenant directory '{}': {:?}", + tenant_dir_path.display(), + e + ); + } + continue; + } + } else { + // Legacy mode: no generation information, any tenant present + // on local disk may activate + info!( + "Starting tenant {} in legacy mode, no generation", + tenant_dir_path.display() + ); + Generation::none() + }; + match schedule_local_tenant_processing( conf, + tenant_id, &tenant_dir_path, + generation, resources.clone(), Some(init_order.clone()), &TENANTS, @@ -159,9 +251,12 @@ pub async fn init_tenant_mgr( Ok(()) } +#[allow(clippy::too_many_arguments)] pub(crate) fn schedule_local_tenant_processing( conf: &'static PageServerConf, + tenant_id: TenantId, tenant_path: &Path, + generation: Generation, resources: TenantSharedResources, init_order: Option, tenants: &'static tokio::sync::RwLock, @@ -182,15 +277,6 @@ pub(crate) fn schedule_local_tenant_processing( "Cannot load tenant from empty directory {tenant_path:?}" ); - let tenant_id = tenant_path - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { - format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}") - })?; - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id); anyhow::ensure!( !conf.tenant_ignore_mark_file_path(&tenant_id).exists(), @@ -203,7 +289,7 @@ pub(crate) fn schedule_local_tenant_processing( match Tenant::spawn_attach( conf, tenant_id, - Generation::none(), + generation, resources.broker_client, tenants, remote_storage, @@ -227,13 +313,7 @@ pub(crate) fn schedule_local_tenant_processing( info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); // Start loading the tenant into memory. It will initially be in Loading state. 
Tenant::spawn_load( - conf, - tenant_id, - Generation::none(), - resources, - init_order, - tenants, - ctx, + conf, tenant_id, generation, resources, init_order, tenants, ctx, ) }; Ok(tenant) @@ -357,15 +437,16 @@ pub async fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: TenantId, + generation: Generation, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, ctx: &RequestContext, ) -> Result, TenantMapInsertError> { - tenant_map_insert(tenant_id, || { + tenant_map_insert(tenant_id, || async { // We're holding the tenants lock in write mode while doing local IO. // If this section ever becomes contentious, introduce a new `TenantState::Creating` // and do the work in that state. - let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create)?; + let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create).await?; // TODO: tenant directory remains on disk if we bail out from here on. // See https://github.com/neondatabase/neon/issues/4233 @@ -374,7 +455,8 @@ pub async fn create_tenant( remote_storage, }; let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, tenant_resources, None, &TENANTS, ctx)?; + schedule_local_tenant_processing(conf, tenant_id, &tenant_directory, + generation, tenant_resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 @@ -404,7 +486,8 @@ pub async fn set_new_tenant_config( let tenant = get_tenant(tenant_id, true).await?; let tenant_config_path = conf.tenant_config_path(&tenant_id); - Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf, false) + Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf) + .await .map_err(SetNewTenantConfigError::Persist)?; tenant.set_new_tenant_config(new_tenant_conf); Ok(()) @@ -420,6 +503,8 @@ pub enum GetTenantError { /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. +/// +/// This method is cancel-safe. pub async fn get_tenant( tenant_id: TenantId, active_only: bool, @@ -479,7 +564,24 @@ pub async fn detach_tenant( tenant_id: TenantId, detach_ignored: bool, ) -> Result<(), TenantStateError> { - detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await + let tmp_path = detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await?; + // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. + // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. 
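For illustration, a standalone version of the rename-before-delete pattern used by safe_remove_tenant_dir_all above. The suffix constant is a stand-in for the crate's TEMP_FILE_SUFFIX and the path handling is simplified; the real helpers use path_with_suffix_extension.

use std::path::{Path, PathBuf};
use rand::{distributions::Alphanumeric, Rng};
use tokio::fs;

// Assumed stand-in for the pageserver's temp-file suffix.
const TEMP_SUFFIX: &str = "temp";

async fn remove_dir_crash_safe(path: &Path) -> std::io::Result<()> {
    let parent = path.parent().ok_or_else(|| {
        std::io::Error::new(std::io::ErrorKind::InvalidInput, "path must be absolute")
    })?;

    let rand_suffix: String = rand::thread_rng()
        .sample_iter(&Alphanumeric)
        .take(8)
        .map(char::from)
        .collect();
    let file_name = path
        .file_name()
        .map(|n| n.to_string_lossy().into_owned())
        .unwrap_or_default();
    let tmp: PathBuf = parent.join(format!("{file_name}.{rand_suffix}.{TEMP_SUFFIX}"));

    // Rename first: if we crash while deleting, only a temp-suffixed directory
    // is left behind, and startup code already knows to clean those up.
    fs::rename(path, &tmp).await?;

    // Make the rename durable before destroying the contents, by fsyncing the
    // parent directory that records the name change.
    fs::File::open(parent).await?.sync_all().await?;

    fs::remove_dir_all(&tmp).await
}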
+ let task_tenant_id = None; + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::MgmtRequest, + task_tenant_id, + None, + "tenant_files_delete", + false, + async move { + fs::remove_dir_all(tmp_path.as_path()) + .await + .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) + }, + ); + Ok(()) } async fn detach_tenant0( @@ -487,20 +589,16 @@ async fn detach_tenant0( tenants: &tokio::sync::RwLock, tenant_id: TenantId, detach_ignored: bool, -) -> Result<(), TenantStateError> { - let local_files_cleanup_operation = |tenant_id_to_clean| async move { +) -> Result { + let tenant_dir_rename_operation = |tenant_id_to_clean| async move { let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); - fs::remove_dir_all(&local_tenant_directory) + safe_rename_tenant_dir(&local_tenant_directory) .await - .with_context(|| { - format!("local tenant directory {local_tenant_directory:?} removal") - })?; - Ok(()) + .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename")) }; let removal_result = - remove_tenant_from_memory(tenants, tenant_id, local_files_cleanup_operation(tenant_id)) - .await; + remove_tenant_from_memory(tenants, tenant_id, tenant_dir_rename_operation(tenant_id)).await; // Ignored tenants are not present in memory and will bail the removal from memory operation. // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. @@ -508,10 +606,10 @@ async fn detach_tenant0( let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id); if tenant_ignore_mark.exists() { info!("Detaching an ignored tenant"); - local_files_cleanup_operation(tenant_id) + let tmp_path = tenant_dir_rename_operation(tenant_id) .await - .with_context(|| format!("Ignored tenant {tenant_id} local files cleanup"))?; - return Ok(()); + .with_context(|| format!("Ignored tenant {tenant_id} local directory rename"))?; + return Ok(tmp_path); } } @@ -521,11 +619,12 @@ async fn detach_tenant0( pub async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, + generation: Generation, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { - tenant_map_insert(tenant_id, || { + tenant_map_insert(tenant_id, || async { let tenant_path = conf.tenant_path(&tenant_id); let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id); if tenant_ignore_mark.exists() { @@ -537,7 +636,7 @@ pub async fn load_tenant( broker_client, remote_storage, }; - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, resources, None, &TENANTS, ctx) + let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -601,13 +700,14 @@ pub async fn list_tenants() -> Result, TenantMapLis pub async fn attach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, + generation: Generation, tenant_conf: TenantConfOpt, broker_client: storage_broker::BrokerClientChannel, remote_storage: GenericRemoteStorage, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { - tenant_map_insert(tenant_id, || { - let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?; + tenant_map_insert(tenant_id, || async { + let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach).await?; // 
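The shape of the detach path above, sketched with plain tokio::spawn instead of the pageserver's task_mgr::spawn: the rename is awaited inline (it is cheap and happens while the tenant is being removed from the in-memory map), while the potentially slow recursive delete is pushed to a detached background task.

use std::path::{Path, PathBuf};

async fn detach_local_dir(tenant_dir: &Path, tmp_path: PathBuf) -> anyhow::Result<()> {
    // Quick step, awaited inline: move the directory out of the way so no new
    // attach can collide with the old path.
    tokio::fs::rename(tenant_dir, &tmp_path).await?;

    // The deletion itself can take a while for large tenants, so it runs in a
    // detached task. The real code uses task_mgr::spawn so the task is tracked;
    // plain tokio::spawn is used here only to keep the sketch standalone.
    tokio::spawn(async move {
        if let Err(e) = tokio::fs::remove_dir_all(&tmp_path).await {
            tracing::error!("deleting detached tenant dir {}: {e:#}", tmp_path.display());
        }
    });

    Ok(())
}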
TODO: tenant directory remains on disk if we bail out from here on. // See https://github.com/neondatabase/neon/issues/4233 @@ -622,7 +722,7 @@ pub async fn attach_tenant( broker_client, remote_storage: Some(remote_storage), }; - let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, resources, None, &TENANTS, ctx)?; + let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 @@ -655,12 +755,13 @@ pub enum TenantMapInsertError { /// /// NB: the closure should return quickly because the current implementation of tenants map /// serializes access through an `RwLock`. -async fn tenant_map_insert( +async fn tenant_map_insert( tenant_id: TenantId, insert_fn: F, ) -> Result, TenantMapInsertError> where - F: FnOnce() -> anyhow::Result>, + F: FnOnce() -> R, + R: std::future::Future>>, { let mut guard = TENANTS.write().await; let m = match &mut *guard { @@ -673,7 +774,7 @@ where tenant_id, e.get().current_state(), )), - hash_map::Entry::Vacant(v) => match insert_fn() { + hash_map::Entry::Vacant(v) => match insert_fn().await { Ok(tenant) => { v.insert(tenant.clone()); Ok(tenant) diff --git a/pageserver/src/tenant/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs index 3cbcfe8774..705b42aff7 100644 --- a/pageserver/src/tenant/par_fsync.rs +++ b/pageserver/src/tenant/par_fsync.rs @@ -4,10 +4,9 @@ use std::{ sync::atomic::{AtomicUsize, Ordering}, }; -use crate::virtual_file::VirtualFile; - fn fsync_path(path: &Path) -> io::Result<()> { - let file = VirtualFile::open(path)?; + // TODO use VirtualFile::fsync_all once we fully go async. + let file = std::fs::File::open(path)?; file.sync_all() } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 50bb8b43de..6f42b54ac2 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -342,7 +342,12 @@ impl RemoteTimelineClient { ) -> RemoteTimelineClient { RemoteTimelineClient { conf, - runtime: BACKGROUND_RUNTIME.handle().to_owned(), + runtime: if cfg!(test) { + // remote_timeline_client.rs tests rely on current-thread runtime + tokio::runtime::Handle::current() + } else { + BACKGROUND_RUNTIME.handle().clone() + }, tenant_id, timeline_id, generation, @@ -1425,6 +1430,30 @@ pub fn remote_index_path( .expect("Failed to construct path") } +/// Given the key of an index, parse out the generation part of the name +pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option { + let file_name = match path.get_path().file_name() { + Some(f) => f, + None => { + // Unexpected: we should be seeing index_part.json paths only + tracing::warn!("Malformed index key {}", path); + return None; + } + }; + + let file_name_str = match file_name.to_str() { + Some(s) => s, + None => { + tracing::warn!("Malformed index key {:?}", path); + return None; + } + }; + match file_name_str.split_once('-') { + Some((_, gen_suffix)) => Generation::parse_suffix(gen_suffix), + None => None, + } +} + /// Files on the remote storage are stored with paths, relative to the workdir. /// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path. 
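The tenant_map_insert change above generalizes the callback from a synchronous closure to one returning a future. A reduced sketch of that signature pattern, with the tenant-specific types replaced by placeholders:

use std::future::Future;

/// Run an async constructor for a map entry. The closure itself is synchronous
/// and returns a future, which lets callers write `|| async { ... }` blocks
/// that do file I/O before the value is inserted.
async fn insert_with<F, Fut, T>(build: F) -> anyhow::Result<T>
where
    F: FnOnce() -> Fut,
    Fut: Future<Output = anyhow::Result<T>>,
{
    // ... in the real function: take the tenants RwLock, error out if the
    // entry already exists ...
    build().await
    // ... then store the new value under the vacant entry ...
}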
/// @@ -1463,11 +1492,8 @@ mod tests { }, DEFAULT_PG_VERSION, }; - use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; - use std::{ - collections::HashSet, - path::{Path, PathBuf}, - }; + + use std::{collections::HashSet, path::Path}; use utils::lsn::Lsn; pub(super) fn dummy_contents(name: &str) -> Vec { @@ -1524,8 +1550,6 @@ mod tests { tenant: Arc, timeline: Arc, tenant_ctx: RequestContext, - remote_fs_dir: PathBuf, - client: Arc, } impl TestSetup { @@ -1535,54 +1559,44 @@ mod tests { let harness = TenantHarness::create(test_name)?; let (tenant, ctx) = harness.load().await; - // create an empty timeline directory let timeline = tenant .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) .await?; - let remote_fs_dir = harness.conf.workdir.join("remote_fs"); - std::fs::create_dir_all(remote_fs_dir)?; - let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?; - - let storage_config = RemoteStorageConfig { - max_concurrent_syncs: std::num::NonZeroUsize::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS, - ) - .unwrap(), - max_sync_errors: std::num::NonZeroU32::new( - remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS, - ) - .unwrap(), - storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), - }; - - let generation = Generation::new(0xdeadbeef); - - let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); - - let client = Arc::new(RemoteTimelineClient { - conf: harness.conf, - runtime: tokio::runtime::Handle::current(), - tenant_id: harness.tenant_id, - timeline_id: TIMELINE_ID, - generation, - storage_impl: storage, - upload_queue: Mutex::new(UploadQueue::Uninitialized), - metrics: Arc::new(RemoteTimelineClientMetrics::new( - &harness.tenant_id, - &TIMELINE_ID, - )), - }); - Ok(Self { harness, tenant, timeline, tenant_ctx: ctx, - remote_fs_dir, - client, }) } + + /// Construct a RemoteTimelineClient in an arbitrary generation + fn build_client(&self, generation: Generation) -> Arc { + Arc::new(RemoteTimelineClient { + conf: self.harness.conf, + runtime: tokio::runtime::Handle::current(), + tenant_id: self.harness.tenant_id, + timeline_id: TIMELINE_ID, + generation, + storage_impl: self.harness.remote_storage.clone(), + upload_queue: Mutex::new(UploadQueue::Uninitialized), + metrics: Arc::new(RemoteTimelineClientMetrics::new( + &self.harness.tenant_id, + &TIMELINE_ID, + )), + }) + } + + /// A tracing::Span that satisfies remote_timeline_client methods that assert tenant_id + /// and timeline_id are present. + fn span(&self) -> tracing::Span { + tracing::info_span!( + "test", + tenant_id = %self.harness.tenant_id, + timeline_id = %TIMELINE_ID + ) + } } // Test scheduling @@ -1602,29 +1616,44 @@ mod tests { // Schedule another deletion. Check that it's launched immediately. // Schedule index upload. 
Check that it's queued + let test_setup = TestSetup::new("upload_scheduling").await.unwrap(); + let span = test_setup.span(); + let _guard = span.enter(); + let TestSetup { harness, tenant: _tenant, - timeline: _timeline, + timeline, tenant_ctx: _tenant_ctx, - remote_fs_dir, - client, - } = TestSetup::new("upload_scheduling").await.unwrap(); + } = test_setup; + + let client = timeline.remote_client.as_ref().unwrap(); + + // Download back the index.json, and check that the list of files is correct + let initial_index_part = match client.download_index_file().await.unwrap() { + MaybeDeletedIndexPart::IndexPart(index_part) => index_part, + MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"), + }; + let initial_layers = initial_index_part + .layer_metadata + .keys() + .map(|f| f.to_owned()) + .collect::>(); + let initial_layer = { + assert!(initial_layers.len() == 1); + initial_layers.into_iter().next().unwrap() + }; let timeline_path = harness.timeline_path(&TIMELINE_ID); println!("workdir: {}", harness.conf.workdir.display()); - let remote_timeline_dir = - remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap()); + let remote_timeline_dir = harness + .remote_fs_dir + .join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap()); println!("remote_timeline_dir: {}", remote_timeline_dir.display()); - let metadata = dummy_metadata(Lsn(0x10)); - client - .init_upload_queue_for_empty_remote(&metadata) - .unwrap(); - - let generation = Generation::new(0xdeadbeef); + let generation = harness.generation; // Create a couple of dummy files, schedule upload for them let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); @@ -1705,6 +1734,7 @@ mod tests { .map(|f| f.to_owned()) .collect(), &[ + &initial_layer.file_name(), &layer_file_name_1.file_name(), &layer_file_name_2.file_name(), ], @@ -1734,6 +1764,7 @@ mod tests { } assert_remote_files( &[ + &initial_layer.file_name(), &layer_file_name_1.file_name(), &layer_file_name_2.file_name(), "index_part.json", @@ -1747,6 +1778,7 @@ mod tests { assert_remote_files( &[ + &initial_layer.file_name(), &layer_file_name_2.file_name(), &layer_file_name_3.file_name(), "index_part.json", @@ -1763,16 +1795,10 @@ mod tests { let TestSetup { harness, tenant: _tenant, - timeline: _timeline, - client, + timeline, .. 
} = TestSetup::new("metrics").await.unwrap(); - - let metadata = dummy_metadata(Lsn(0x10)); - client - .init_upload_queue_for_empty_remote(&metadata) - .unwrap(); - + let client = timeline.remote_client.as_ref().unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); @@ -1783,11 +1809,20 @@ mod tests { ) .unwrap(); - #[derive(Debug, PartialEq)] + #[derive(Debug, PartialEq, Clone, Copy)] struct BytesStartedFinished { started: Option, finished: Option, } + impl std::ops::Add for BytesStartedFinished { + type Output = Self; + fn add(self, rhs: Self) -> Self::Output { + Self { + started: self.started.map(|v| v + rhs.started.unwrap_or(0)), + finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)), + } + } + } let get_bytes_started_stopped = || { let started = client .metrics @@ -1804,47 +1839,140 @@ mod tests { }; // Test + tracing::info!("now doing actual test"); - let generation = Generation::new(0xdeadbeef); - - let init = get_bytes_started_stopped(); + let actual_a = get_bytes_started_stopped(); client .schedule_layer_file_upload( &layer_file_name_1, - &LayerFileMetadata::new(content_1.len() as u64, generation), + &LayerFileMetadata::new(content_1.len() as u64, harness.generation), ) .unwrap(); - let pre = get_bytes_started_stopped(); + let actual_b = get_bytes_started_stopped(); client.wait_completion().await.unwrap(); - let post = get_bytes_started_stopped(); + let actual_c = get_bytes_started_stopped(); // Validate - assert_eq!( - init, - BytesStartedFinished { - started: None, - finished: None - } - ); - assert_eq!( - pre, - BytesStartedFinished { + let expected_b = actual_a + + BytesStartedFinished { started: Some(content_1.len()), // assert that the _finished metric is created eagerly so that subtractions work on first sample finished: Some(0), - } - ); - assert_eq!( - post, - BytesStartedFinished { + }; + assert_eq!(actual_b, expected_b); + + let expected_c = actual_a + + BytesStartedFinished { started: Some(content_1.len()), - finished: Some(content_1.len()) - } + finished: Some(content_1.len()), + }; + assert_eq!(actual_c, expected_c); + } + + async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart { + // An empty IndexPart, just sufficient to ensure deserialization will succeed + let example_metadata = TimelineMetadata::example(); + let example_index_part = IndexPart::new( + HashMap::new(), + example_metadata.disk_consistent_lsn(), + example_metadata, ); + + let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); + + let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID); + let remote_timeline_dir = test_state.harness.remote_fs_dir.join( + timeline_path + .strip_prefix(&test_state.harness.conf.workdir) + .unwrap(), + ); + + std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work"); + + let index_path = test_state.harness.remote_fs_dir.join( + remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(), + ); + eprintln!("Writing {}", index_path.display()); + std::fs::write(&index_path, index_part_bytes).unwrap(); + example_index_part + } + + /// Assert that when a RemoteTimelineclient in generation `get_generation` fetches its + /// index, the IndexPart returned is equal to `expected` + async fn assert_got_index_part( + test_state: &TestSetup, + get_generation: Generation, + expected: 
&IndexPart, + ) { + let client = test_state.build_client(get_generation); + + let download_r = client + .download_index_file() + .await + .expect("download should always succeed"); + assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_))); + match download_r { + MaybeDeletedIndexPart::IndexPart(index_part) => { + assert_eq!(&index_part, expected); + } + MaybeDeletedIndexPart::Deleted(_index_part) => panic!("Test doesn't set deleted_at"), + } + } + + #[tokio::test] + async fn index_part_download_simple() -> anyhow::Result<()> { + let test_state = TestSetup::new("index_part_download_simple").await.unwrap(); + let span = test_state.span(); + let _guard = span.enter(); + + // Simple case: we are in generation N, load the index from generation N - 1 + let generation_n = 5; + let injected = inject_index_part(&test_state, Generation::new(generation_n - 1)).await; + + assert_got_index_part(&test_state, Generation::new(generation_n), &injected).await; + + Ok(()) + } + + #[tokio::test] + async fn index_part_download_ordering() -> anyhow::Result<()> { + let test_state = TestSetup::new("index_part_download_ordering") + .await + .unwrap(); + + let span = test_state.span(); + let _guard = span.enter(); + + // A generation-less IndexPart exists in the bucket, we should find it + let generation_n = 5; + let injected_none = inject_index_part(&test_state, Generation::none()).await; + assert_got_index_part(&test_state, Generation::new(generation_n), &injected_none).await; + + // If a more recent-than-none generation exists, we should prefer to load that + let injected_1 = inject_index_part(&test_state, Generation::new(1)).await; + assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await; + + // If a more-recent-than-me generation exists, we should ignore it. + let _injected_10 = inject_index_part(&test_state, Generation::new(10)).await; + assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await; + + // If a directly previous generation exists, _and_ an index exists in my own + // generation, I should prefer my own generation. 
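The ordering rule these tests exercise can be stated as a small pure function: among the index_part generations visible in the bucket, pick the newest one that is not from the future relative to our own generation. A sketch with a plain integer standing in for Generation:

/// Given the generations of all index_part.json objects found under a
/// timeline prefix, pick the one a pageserver in `my_gen` should load.
/// Returns None if every index in the bucket is from a newer generation
/// (or the bucket is empty), in which case the caller falls back to the
/// legacy, generation-less index path.
fn pick_index_generation(found: &[u32], my_gen: u32) -> Option<u32> {
    found.iter().copied().filter(|g| *g <= my_gen).max()
}

#[cfg(test)]
mod tests {
    use super::pick_index_generation;

    #[test]
    fn prefers_own_generation_over_predecessor() {
        assert_eq!(pick_index_generation(&[1, 4, 5, 10], 5), Some(5));
        assert_eq!(pick_index_generation(&[1, 4, 10], 5), Some(4));
        assert_eq!(pick_index_generation(&[10], 5), None);
    }
}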
+ let _injected_prev = + inject_index_part(&test_state, Generation::new(generation_n - 1)).await; + let injected_current = inject_index_part(&test_state, Generation::new(generation_n)).await; + assert_got_index_part( + &test_state, + Generation::new(generation_n), + &injected_current, + ) + .await; + + Ok(()) } } diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index dc8d87b9e1..9863215529 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -24,7 +24,10 @@ use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use super::index::{IndexPart, LayerFileMetadata}; -use super::{remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES}; +use super::{ + parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, +}; static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120); @@ -219,13 +222,13 @@ pub async fn list_remote_timelines( Ok(timeline_ids) } -pub(super) async fn download_index_part( +async fn do_download_index_part( storage: &GenericRemoteStorage, tenant_id: &TenantId, timeline_id: &TimelineId, - generation: Generation, + index_generation: Generation, ) -> Result { - let remote_path = remote_index_path(tenant_id, timeline_id, generation); + let remote_path = remote_index_path(tenant_id, timeline_id, index_generation); let index_part_bytes = download_retry( || async { @@ -252,6 +255,105 @@ pub(super) async fn download_index_part( Ok(index_part) } +/// index_part.json objects are suffixed with a generation number, so we cannot +/// directly GET the latest index part without doing some probing. +/// +/// In this function we probe for the most recent index in a generation <= our current generation. +/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md +#[tracing::instrument(skip_all, fields(generation=?my_generation))] +pub(super) async fn download_index_part( + storage: &GenericRemoteStorage, + tenant_id: &TenantId, + timeline_id: &TimelineId, + my_generation: Generation, +) -> Result { + debug_assert_current_span_has_tenant_and_timeline_id(); + + if my_generation.is_none() { + // Operating without generations: just fetch the generation-less path + return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await; + } + + // Stale case: If we were intentionally attached in a stale generation, there may already be a remote + // index in our generation. + // + // This is an optimization to avoid doing the listing for the general case below. + let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await; + match res { + Ok(index_part) => { + tracing::debug!( + "Found index_part from current generation (this is a stale attachment)" + ); + return Ok(index_part); + } + Err(DownloadError::NotFound) => {} + Err(e) => return Err(e), + }; + + // Typical case: the previous generation of this tenant was running healthily, and had uploaded + // and index part. We may safely start from this index without doing a listing, because: + // - We checked for current generation case above + // - generations > my_generation are to be ignored + // - any other indices that exist would have an older generation than `previous_gen`, and + // we want to find the most recent index from a previous generation. 
+ // + // This is an optimization to avoid doing the listing for the general case below. + let res = + do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await; + match res { + Ok(index_part) => { + tracing::debug!("Found index_part from previous generation"); + return Ok(index_part); + } + Err(DownloadError::NotFound) => { + tracing::debug!( + "No index_part found from previous generation, falling back to listing" + ); + } + Err(e) => { + return Err(e); + } + } + + // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json + // objects, and select the highest one with a generation <= my_generation. + let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none()); + let indices = backoff::retry( + || async { storage.list_files(Some(&index_prefix)).await }, + |_| false, + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "listing index_part files", + // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066) + backoff::Cancel::new(CancellationToken::new(), || -> anyhow::Error { + unreachable!() + }), + ) + .await + .map_err(DownloadError::Other)?; + + // General case logic for which index to use: the latest index whose generation + // is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md + let max_previous_generation = indices + .into_iter() + .filter_map(parse_remote_index_path) + .filter(|g| g <= &my_generation) + .max(); + + match max_previous_generation { + Some(g) => { + tracing::debug!("Found index_part in generation {g:?}"); + do_download_index_part(storage, tenant_id, timeline_id, g).await + } + None => { + // Migration from legacy pre-generation state: we have a generation but no prior + // attached pageservers did. Try to load from a no-generation path. + tracing::info!("No index_part.json* found"); + do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await + } + } +} + /// Helper function to handle retries for a download operation. /// /// Remote operations can fail due to rate limits (IAM, S3), spurious network diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 9cc5256568..05f9f5dcd2 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -96,6 +96,10 @@ impl IndexPart { /// is always generated from the keys of `layer_metadata`) /// - 4: timeline_layers is fully removed. const LATEST_VERSION: usize = 4; + + // Versions we may see when reading from a bucket. + pub const KNOWN_VERSIONS: &[usize] = &[1, 2, 3, 4]; + pub const FILE_NAME: &'static str = "index_part.json"; pub fn new( @@ -117,6 +121,16 @@ impl IndexPart { deleted_at: None, } } + + pub fn get_version(&self) -> usize { + self.version + } + + /// If you want this under normal operations, read it from self.metadata: + /// this method is just for the scrubber to use when validating an index. + pub fn get_disk_consistent_lsn(&self) -> Lsn { + self.disk_consistent_lsn + } } impl TryFrom<&UploadQueueInitialized> for IndexPart { @@ -137,7 +151,7 @@ impl TryFrom<&UploadQueueInitialized> for IndexPart { /// Serialized form of [`LayerFileMetadata`]. 
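KNOWN_VERSIONS and the new accessors are aimed at external validation (the doc comment mentions the scrubber). A hedged sketch of what such a check might look like; the surrounding report type and wiring are assumptions, only IndexPart and its accessors come from the diff above.

// Hypothetical validation helper; `IndexPart` is
// pageserver::tenant::remote_timeline_client::index::IndexPart.
fn check_index_version(index_part: &IndexPart) -> Result<(), String> {
    let version = index_part.get_version();
    if !IndexPart::KNOWN_VERSIONS.contains(&version) {
        return Err(format!(
            "index_part.json version {version} is not one of the known versions {:?}",
            IndexPart::KNOWN_VERSIONS
        ));
    }
    Ok(())
}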
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub struct IndexLayerMetadata { - pub(super) file_size: u64, + pub file_size: u64, #[serde(default = "Generation::none")] #[serde(skip_serializing_if = "Generation::is_none")] diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index d9df346a14..fbc5ecc9c0 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -28,10 +28,10 @@ //! "values" part. //! use crate::config::PageServerConf; -use crate::context::RequestContext; +use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value, KEY_SIZE}; -use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter}; +use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::storage_layer::{ @@ -45,8 +45,7 @@ use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs::{self, File}; -use std::io::{BufWriter, Write}; -use std::io::{Seek, SeekFrom}; +use std::io::SeekFrom; use std::ops::Range; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; @@ -219,7 +218,7 @@ pub struct DeltaLayerInner { index_root_blk: u32, /// Reader object for reading blocks from the file. - file: FileBlockReader, + file: FileBlockReader, } impl AsRef for DeltaLayerInner { @@ -318,11 +317,11 @@ impl DeltaLayer { tree_reader.dump().await?; - let keys = DeltaLayerInner::load_keys(&inner).await?; + let keys = DeltaLayerInner::load_keys(&inner, ctx).await?; // A subroutine to dump a single blob - async fn dump_blob(val: ValueRef<'_>) -> Result { - let buf = val.reader.read_blob(val.blob_ref.pos()).await?; + async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> Result { + let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; let val = Value::des(&buf)?; let desc = match val { Value::Image(img) => { @@ -343,7 +342,7 @@ impl DeltaLayer { for entry in keys { let DeltaEntry { key, lsn, val, .. 
} = entry; - let desc = match dump_blob(val).await { + let desc = match dump_blob(val, ctx).await { Ok(desc) => desc, Err(err) => { let err: anyhow::Error = err; @@ -371,7 +370,7 @@ impl DeltaLayer { .load(LayerAccessKind::GetValueReconstructData, ctx) .await?; inner - .get_value_reconstruct_data(key, lsn_range, reconstruct_state) + .get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx) .await } @@ -454,12 +453,12 @@ impl DeltaLayer { self.access_stats.record_access(access_kind, ctx); // Quick exit if already loaded self.inner - .get_or_try_init(|| self.load_inner()) + .get_or_try_init(|| self.load_inner(ctx)) .await .with_context(|| format!("Failed to load delta layer {}", self.path().display())) } - async fn load_inner(&self) -> Result> { + async fn load_inner(&self, ctx: &RequestContext) -> Result> { let path = self.path(); let summary = match &self.path_or_conf { @@ -467,7 +466,7 @@ impl DeltaLayer { PathOrConf::Path(_) => None, }; - let loaded = DeltaLayerInner::load(&path, summary).await?; + let loaded = DeltaLayerInner::load(&path, summary, ctx).await?; if let PathOrConf::Path(ref path) = self.path_or_conf { // not production code @@ -555,7 +554,7 @@ impl DeltaLayer { .load(LayerAccessKind::KeyIter, ctx) .await .context("load delta layer keys")?; - DeltaLayerInner::load_keys(inner) + DeltaLayerInner::load_keys(inner, ctx) .await .context("Layer index is corrupted") } @@ -583,14 +582,14 @@ struct DeltaLayerWriterInner { tree: DiskBtreeBuilder, - blob_writer: WriteBlobWriter>, + blob_writer: BlobWriter, } impl DeltaLayerWriterInner { /// /// Start building a new delta layer. /// - fn new( + async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_id: TenantId, @@ -605,11 +604,10 @@ impl DeltaLayerWriterInner { // FIXME: throw an error instead? let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range); - let mut file = VirtualFile::create(&path)?; + let mut file = VirtualFile::create(&path).await?; // make room for the header block - file.seek(SeekFrom::Start(PAGE_SZ as u64))?; - let buf_writer = BufWriter::new(file); - let blob_writer = WriteBlobWriter::new(buf_writer, PAGE_SZ as u64); + file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; + let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); // Initialize the b-tree index builder let block_buf = BlockBuf::new(); @@ -632,11 +630,12 @@ impl DeltaLayerWriterInner { /// /// The values must be appended in key, lsn order. /// - fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { + async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init()) + .await } - fn put_value_bytes( + async fn put_value_bytes( &mut self, key: Key, lsn: Lsn, @@ -645,7 +644,7 @@ impl DeltaLayerWriterInner { ) -> anyhow::Result<()> { assert!(self.lsn_range.start <= lsn); - let off = self.blob_writer.write_blob(val)?; + let off = self.blob_writer.write_blob(val).await?; let blob_ref = BlobRef::new(off, will_init); @@ -662,18 +661,18 @@ impl DeltaLayerWriterInner { /// /// Finish writing the delta layer. 
/// - fn finish(self, key_end: Key) -> anyhow::Result { + async fn finish(self, key_end: Key) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; - let buf_writer = self.blob_writer.into_inner(); - let mut file = buf_writer.into_inner()?; + let mut file = self.blob_writer.into_inner().await?; // Write out the index let (index_root_blk, block_buf) = self.tree.finish()?; - file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?; + file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) + .await?; for buf in block_buf.blocks { - file.write_all(buf.as_ref())?; + file.write_all(buf.as_ref()).await?; } assert!(self.lsn_range.start < self.lsn_range.end); // Fill in the summary on blk 0 @@ -687,11 +686,22 @@ impl DeltaLayerWriterInner { index_start_blk, index_root_blk, }; - file.seek(SeekFrom::Start(0))?; - Summary::ser_into(&summary, &mut file)?; + + let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + Summary::ser_into(&summary, &mut buf)?; + if buf.spilled() { + // This is bad as we only have one free block for the summary + warn!( + "Used more than one page size for summary buffer: {}", + buf.len() + ); + } + file.seek(SeekFrom::Start(0)).await?; + file.write_all(&buf).await?; let metadata = file .metadata() + .await .context("get file metadata to determine size")?; // 5GB limit for objects without multipart upload (which we don't want to use) @@ -722,7 +732,7 @@ impl DeltaLayerWriterInner { }; // fsync the file - file.sync_all()?; + file.sync_all().await?; // Rename the file to its final name // // Note: This overwrites any existing file. There shouldn't be any. @@ -774,7 +784,7 @@ impl DeltaLayerWriter { /// /// Start building a new delta layer. /// - pub fn new( + pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_id: TenantId, @@ -782,13 +792,10 @@ impl DeltaLayerWriter { lsn_range: Range, ) -> anyhow::Result { Ok(Self { - inner: Some(DeltaLayerWriterInner::new( - conf, - timeline_id, - tenant_id, - key_start, - lsn_range, - )?), + inner: Some( + DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range) + .await?, + ), }) } @@ -797,11 +804,11 @@ impl DeltaLayerWriter { /// /// The values must be appended in key, lsn order. /// - pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_value(key, lsn, val) + pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_value(key, lsn, val).await } - pub fn put_value_bytes( + pub async fn put_value_bytes( &mut self, key: Key, lsn: Lsn, @@ -812,6 +819,7 @@ impl DeltaLayerWriter { .as_mut() .unwrap() .put_value_bytes(key, lsn, val, will_init) + .await } pub fn size(&self) -> u64 { @@ -821,21 +829,18 @@ impl DeltaLayerWriter { /// /// Finish writing the delta layer. 
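Both layer writers now serialize the summary into an in-memory buffer sized to one page and then write that buffer at block 0, instead of serializing straight into the file. A simplified, self-contained sketch of the buffering step; the real code serializes a Summary with ser_into and writes through VirtualFile.

use smallvec::SmallVec;

const PAGE_SZ: usize = 8192;

/// Stage already-serialized header bytes into a single-page buffer.
/// SmallVec keeps the common case on the stack; if the header ever grows past
/// one page we currently only warn, because exactly one block is reserved for
/// the summary at the start of the file.
fn stage_summary_page(serialized: &[u8]) -> SmallVec<[u8; PAGE_SZ]> {
    let mut buf: SmallVec<[u8; PAGE_SZ]> = SmallVec::new();
    buf.extend_from_slice(serialized);
    if buf.spilled() {
        eprintln!("summary is {} bytes, larger than one page", buf.len());
    }
    buf
}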
/// - pub fn finish(mut self, key_end: Key) -> anyhow::Result { - self.inner.take().unwrap().finish(key_end) + pub async fn finish(mut self, key_end: Key) -> anyhow::Result { + self.inner.take().unwrap().finish(key_end).await } } impl Drop for DeltaLayerWriter { fn drop(&mut self) { if let Some(inner) = self.inner.take() { - match inner.blob_writer.into_inner().into_inner() { - Ok(vfile) => vfile.remove(), - Err(err) => warn!( - "error while flushing buffer of image layer temporary file: {}", - err - ), - } + // We want to remove the virtual file here, so it's fine to not + // having completely flushed unwritten data. + let vfile = inner.blob_writer.into_inner_no_flush(); + vfile.remove(); } } } @@ -844,12 +849,14 @@ impl DeltaLayerInner { pub(super) async fn load( path: &std::path::Path, summary: Option, + ctx: &RequestContext, ) -> anyhow::Result { let file = VirtualFile::open(path) + .await .with_context(|| format!("Failed to open file '{}'", path.display()))?; let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0).await?; + let summary_blk = file.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; if let Some(mut expected_summary) = summary { @@ -877,6 +884,7 @@ impl DeltaLayerInner { key: Key, lsn_range: Range, reconstruct_state: &mut ValueReconstructState, + ctx: &RequestContext, ) -> anyhow::Result { let mut need_image = true; // Scan the page versions backwards, starting from `lsn`. @@ -891,27 +899,38 @@ impl DeltaLayerInner { let mut offsets: Vec<(Lsn, u64)> = Vec::new(); tree_reader - .visit(&search_key.0, VisitDirection::Backwards, |key, value| { - let blob_ref = BlobRef(value); - if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { - return false; - } - let entry_lsn = DeltaKey::extract_lsn_from_buf(key); - if entry_lsn < lsn_range.start { - return false; - } - offsets.push((entry_lsn, blob_ref.pos())); + .visit( + &search_key.0, + VisitDirection::Backwards, + |key, value| { + let blob_ref = BlobRef(value); + if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { + return false; + } + let entry_lsn = DeltaKey::extract_lsn_from_buf(key); + if entry_lsn < lsn_range.start { + return false; + } + offsets.push((entry_lsn, blob_ref.pos())); - !blob_ref.will_init() - }) + !blob_ref.will_init() + }, + &RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::DeltaLayerBtreeNode) + .build(), + ) .await?; + let ctx = &RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::DeltaLayerValue) + .build(); + // Ok, 'offsets' now contains the offsets of all the entries we need to read let cursor = file.block_cursor(); let mut buf = Vec::new(); for (entry_lsn, pos) in offsets { cursor - .read_blob_into_buf(pos, &mut buf) + .read_blob_into_buf(pos, &mut buf, ctx) .await .with_context(|| { format!( @@ -952,9 +971,10 @@ impl DeltaLayerInner { } } - pub(super) async fn load_keys + Clone>( - this: &T, - ) -> Result>> { + pub(super) async fn load_keys<'a, 'b, T: AsRef + Clone>( + this: &'a T, + ctx: &'b RequestContext, + ) -> Result>> { let dl = this.as_ref(); let file = &dl.file; @@ -991,6 +1011,9 @@ impl DeltaLayerInner { all_keys.push(entry); true }, + &RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::DeltaLayerBtreeNode) + .build(), ) .await?; if let Some(last) = all_keys.last_mut() { @@ -1020,9 +1043,9 @@ pub struct ValueRef<'a> { impl<'a> ValueRef<'a> { /// Loads the value from disk - pub async fn load(&self) -> Result { + pub async fn load(&self, ctx: &RequestContext) -> Result { // 
theoretically we *could* record an access time for each, but it does not really matter - let buf = self.reader.read_blob(self.blob_ref.pos()).await?; + let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?; let val = Value::des(&buf)?; Ok(val) } @@ -1031,7 +1054,11 @@ impl<'a> ValueRef<'a> { pub(crate) struct Adapter(T); impl> Adapter { - pub(crate) async fn read_blk(&self, blknum: u32) -> Result { - self.0.as_ref().file.read_blk(blknum).await + pub(crate) async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result { + self.0.as_ref().file.read_blk(blknum, ctx).await } } diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs index b52c20a7c6..9fb0c23dd7 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/filename.rs @@ -212,7 +212,7 @@ pub enum LayerFileName { } impl LayerFileName { - pub(crate) fn file_name(&self) -> String { + pub fn file_name(&self) -> String { self.to_string() } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index b1fc257092..a5470a9f9d 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -24,10 +24,10 @@ //! mapping from Key to an offset in the "values" part. The //! actual page images are stored in the "values" part. use crate::config::PageServerConf; -use crate::context::RequestContext; +use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::PAGE_SZ; use crate::repository::{Key, KEY_SIZE}; -use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter}; +use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::storage_layer::{ @@ -42,8 +42,7 @@ use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs::{self, File}; -use std::io::Write; -use std::io::{Seek, SeekFrom}; +use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; use std::path::{Path, PathBuf}; @@ -155,7 +154,7 @@ pub struct ImageLayerInner { lsn: Lsn, /// Reader object for reading blocks from the file. 
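Threading a RequestContext through the read path lets each I/O be labeled with the kind of page it touched. Illustrative only: the builder calls are the ones used in this diff, but the surrounding function, the exact FileBlockReader type, and its error type are assumptions.

use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::tenant::block_io::FileBlockReader;

async fn read_btree_node(
    file: &FileBlockReader,
    blknum: u32,
    ctx: &RequestContext,
) -> anyhow::Result<()> {
    // Derive a child context tagged with what this read is for, so page cache
    // hit/miss metrics can be broken down by content kind instead of being
    // lumped together.
    let node_ctx = RequestContextBuilder::extend(ctx)
        .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
        .build();
    let _block = file.read_blk(blknum, &node_ctx).await?;
    Ok(())
}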
- file: FileBlockReader, + file: FileBlockReader, } impl std::fmt::Debug for ImageLayerInner { @@ -238,10 +237,15 @@ impl ImageLayer { tree_reader.dump().await?; tree_reader - .visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| { - println!("key: {} offset {}", hex::encode(key), value); - true - }) + .visit( + &[0u8; KEY_SIZE], + VisitDirection::Forwards, + |key, value| { + println!("key: {} offset {}", hex::encode(key), value); + true + }, + ctx, + ) .await?; Ok(()) @@ -262,7 +266,7 @@ impl ImageLayer { .load(LayerAccessKind::GetValueReconstructData, ctx) .await?; inner - .get_value_reconstruct_data(key, reconstruct_state) + .get_value_reconstruct_data(key, reconstruct_state, ctx) .await // FIXME: makes no sense to dump paths .with_context(|| format!("read {}", self.path().display())) @@ -336,12 +340,12 @@ impl ImageLayer { ) -> Result<&ImageLayerInner> { self.access_stats.record_access(access_kind, ctx); self.inner - .get_or_try_init(|| self.load_inner()) + .get_or_try_init(|| self.load_inner(ctx)) .await .with_context(|| format!("Failed to load image layer {}", self.path().display())) } - async fn load_inner(&self) -> Result { + async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); let expected_summary = match &self.path_or_conf { @@ -350,7 +354,8 @@ impl ImageLayer { }; let loaded = - ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary).await?; + ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx) + .await?; if let PathOrConf::Path(ref path) = self.path_or_conf { // not production code @@ -437,11 +442,13 @@ impl ImageLayerInner { path: &std::path::Path, lsn: Lsn, summary: Option, + ctx: &RequestContext, ) -> anyhow::Result { let file = VirtualFile::open(path) + .await .with_context(|| format!("Failed to open file '{}'", path.display()))?; let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0).await?; + let summary_blk = file.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; if let Some(mut expected_summary) = summary { @@ -470,16 +477,30 @@ impl ImageLayerInner { &self, key: Key, reconstruct_state: &mut ValueReconstructState, + ctx: &RequestContext, ) -> anyhow::Result { let file = &self.file; let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file); let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); - if let Some(offset) = tree_reader.get(&keybuf).await? { + if let Some(offset) = tree_reader + .get( + &keybuf, + &RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::ImageLayerBtreeNode) + .build(), + ) + .await? + { let blob = file .block_cursor() - .read_blob(offset) + .read_blob( + offset, + &RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::ImageLayerValue) + .build(), + ) .await .with_context(|| format!("failed to read value from offset {}", offset))?; let value = Bytes::from(blob); @@ -511,7 +532,7 @@ struct ImageLayerWriterInner { key_range: Range, lsn: Lsn, - blob_writer: WriteBlobWriter, + blob_writer: BlobWriter, tree: DiskBtreeBuilder, } @@ -519,7 +540,7 @@ impl ImageLayerWriterInner { /// /// Start building a new image layer. 
/// - fn new( + async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_id: TenantId, @@ -541,10 +562,11 @@ impl ImageLayerWriterInner { let mut file = VirtualFile::open_with_options( &path, std::fs::OpenOptions::new().write(true).create_new(true), - )?; + ) + .await?; // make room for the header block - file.seek(SeekFrom::Start(PAGE_SZ as u64))?; - let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64); + file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; + let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); // Initialize the b-tree index builder let block_buf = BlockBuf::new(); @@ -569,9 +591,9 @@ impl ImageLayerWriterInner { /// /// The page versions must be appended in blknum order. /// - fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { + async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let off = self.blob_writer.write_blob(img)?; + let off = self.blob_writer.write_blob(img).await?; let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); @@ -583,17 +605,18 @@ impl ImageLayerWriterInner { /// /// Finish writing the image layer. /// - fn finish(self) -> anyhow::Result { + async fn finish(self) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; let mut file = self.blob_writer.into_inner(); // Write out the index - file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?; + file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) + .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - file.write_all(buf.as_ref())?; + file.write_all(buf.as_ref()).await?; } // Fill in the summary on blk 0 @@ -607,11 +630,22 @@ impl ImageLayerWriterInner { index_start_blk, index_root_blk, }; - file.seek(SeekFrom::Start(0))?; - Summary::ser_into(&summary, &mut file)?; + + let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + Summary::ser_into(&summary, &mut buf)?; + if buf.spilled() { + // This is bad as we only have one free block for the summary + warn!( + "Used more than one page size for summary buffer: {}", + buf.len() + ); + } + file.seek(SeekFrom::Start(0)).await?; + file.write_all(&buf).await?; let metadata = file .metadata() + .await .context("get metadata to determine file size")?; let desc = PersistentLayerDesc::new_img( @@ -634,7 +668,7 @@ impl ImageLayerWriterInner { }; // fsync the file - file.sync_all()?; + file.sync_all().await?; // Rename the file to its final name // @@ -687,7 +721,7 @@ impl ImageLayerWriter { /// /// Start building a new image layer. /// - pub fn new( + pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_id: TenantId, @@ -695,13 +729,9 @@ impl ImageLayerWriter { lsn: Lsn, ) -> anyhow::Result { Ok(Self { - inner: Some(ImageLayerWriterInner::new( - conf, - timeline_id, - tenant_id, - key_range, - lsn, - )?), + inner: Some( + ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?, + ), }) } @@ -710,15 +740,15 @@ impl ImageLayerWriter { /// /// The page versions must be appended in blknum order. /// - pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_image(key, img) + pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_image(key, img).await } /// /// Finish writing the image layer. 
/// - pub fn finish(mut self) -> anyhow::Result { - self.inner.take().unwrap().finish() + pub async fn finish(mut self) -> anyhow::Result { + self.inner.take().unwrap().finish().await } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 35a77a7331..764dc2c64e 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -5,7 +5,7 @@ //! its position in the file, is kept in memory, though. //! use crate::config::PageServerConf; -use crate::context::RequestContext; +use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::repository::{Key, Value}; use crate::tenant::block_io::BlockReader; use crate::tenant::ephemeral_file::EphemeralFile; @@ -106,7 +106,7 @@ impl InMemoryLayer { /// debugging function to print out the contents of the layer /// /// this is likely completly unused - pub async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> { + pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { let inner = self.inner.read().await; let end_str = self.end_lsn_or_max(); @@ -125,7 +125,7 @@ impl InMemoryLayer { for (key, vec_map) in inner.index.iter() { for (lsn, pos) in vec_map.as_slice() { let mut desc = String::new(); - cursor.read_blob_into_buf(*pos, &mut buf).await?; + cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?; let val = Value::des(&buf); match val { Ok(Value::Image(img)) => { @@ -158,11 +158,15 @@ impl InMemoryLayer { key: Key, lsn_range: Range, reconstruct_state: &mut ValueReconstructState, - _ctx: &RequestContext, + ctx: &RequestContext, ) -> anyhow::Result { ensure!(lsn_range.start >= self.start_lsn); let mut need_image = true; + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::InMemoryLayer) + .build(); + let inner = self.inner.read().await; let reader = inner.file.block_cursor(); @@ -171,7 +175,7 @@ impl InMemoryLayer { if let Some(vec_map) = inner.index.get(&key) { let slice = vec_map.slice_range(lsn_range); for (entry_lsn, pos) in slice.iter().rev() { - let buf = reader.read_blob(*pos).await?; + let buf = reader.read_blob(*pos, &ctx).await?; let value = Value::des(&buf)?; match value { Value::Image(img) => { @@ -236,7 +240,7 @@ impl InMemoryLayer { /// /// Create a new, empty, in-memory layer /// - pub fn create( + pub async fn create( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_id: TenantId, @@ -244,7 +248,7 @@ impl InMemoryLayer { ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_id, timeline_id)?; + let file = EphemeralFile::create(conf, tenant_id, timeline_id).await?; Ok(InMemoryLayer { conf, @@ -263,7 +267,13 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { + pub async fn put_value( + &self, + key: Key, + lsn: Lsn, + val: &Value, + ctx: &RequestContext, + ) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let inner: &mut _ = &mut *self.inner.write().await; self.assert_writable(); @@ -275,7 +285,15 @@ impl InMemoryLayer { let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); buf.clear(); val.ser_into(&mut buf)?; - inner.file.write_blob(&buf).await? 
+ inner + .file + .write_blob( + &buf, + &RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::InMemoryLayer) + .build(), + ) + .await? }; let vec_map = inner.index.entry(key).or_default(); @@ -313,7 +331,7 @@ impl InMemoryLayer { /// Write this frozen in-memory layer to disk. /// /// Returns a new delta layer with all the same data as this in-memory layer - pub(crate) async fn write_to_disk(&self) -> Result { + pub(crate) async fn write_to_disk(&self, ctx: &RequestContext) -> Result { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -333,7 +351,8 @@ impl InMemoryLayer { self.tenant_id, Key::MIN, self.start_lsn..end_lsn, - )?; + ) + .await?; let mut buf = Vec::new(); @@ -342,17 +361,22 @@ impl InMemoryLayer { let mut keys: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); keys.sort_by_key(|k| k.0); + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::InMemoryLayer) + .build(); for (key, vec_map) in keys.iter() { let key = **key; // Write all page versions for (lsn, pos) in vec_map.as_slice() { - cursor.read_blob_into_buf(*pos, &mut buf).await?; + cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; let will_init = Value::des(&buf)?.will_init(); - delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?; + delta_layer_writer + .put_value_bytes(key, *lsn, &buf, will_init) + .await?; } } - let delta_layer = delta_layer_writer.finish(Key::MAX)?; + let delta_layer = delta_layer_writer.finish(Key::MAX).await?; Ok(delta_layer) } } diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 3f8d700863..3968c16c31 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -74,7 +74,7 @@ impl Layer for RemoteLayer { _reconstruct_state: &mut ValueReconstructState, _ctx: &RequestContext, ) -> Result { - bail!("layer {self} needs to be downloaded"); + Err(anyhow::anyhow!("layer {self} needs to be downloaded")) } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index c067a84471..df3ffd08d3 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -102,6 +102,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { let started_at = Instant::now(); let sleep_duration = if period == Duration::ZERO { + #[cfg(not(feature = "testing"))] info!("automatic compaction is disabled"); // check again in 10 seconds, in case it's been enabled again. Duration::from_secs(10) @@ -166,6 +167,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { let gc_horizon = tenant.get_gc_horizon(); let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 { + #[cfg(not(feature = "testing"))] info!("automatic GC is disabled"); // check again in 10 seconds, in case it's been enabled again. 
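The tasks.rs change uses a less common placement for #[cfg]: on a single statement, so the "disabled" log line disappears from builds with the testing feature while the rest of the branch is unchanged. A self-contained example of the same attribute placement (the feature name matches the diff; the function is made up):

fn report_period(period_secs: u64) -> u64 {
    if period_secs == 0 {
        // Compiled out entirely when built with `--features testing`;
        // the surrounding logic is unaffected.
        #[cfg(not(feature = "testing"))]
        println!("automatic background job is disabled");

        10 // re-check in 10 seconds
    } else {
        period_secs
    }
}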
Duration::from_secs(10) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f0ae385806..78ac1338db 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -90,6 +90,7 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::config::TenantConf; +use super::debug_assert_current_span_has_tenant_and_timeline_id; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; use super::storage_layer::{ @@ -470,7 +471,7 @@ impl Timeline { // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed // for redo. - let cached_page_img = match self.lookup_cached_page(&key, lsn).await { + let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await { Some((cached_lsn, cached_img)) => { match cached_lsn.cmp(&lsn) { Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check @@ -584,15 +585,7 @@ impl Timeline { Err(e) => { // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo drop(_timer); - let walreceiver_status = { - match &*self.walreceiver.lock().unwrap() { - None => "stopping or stopped".to_string(), - Some(walreceiver) => match walreceiver.status() { - Some(status) => status.to_human_readable_string(), - None => "Not active".to_string(), - }, - } - }; + let walreceiver_status = self.walreceiver_status(); Err(anyhow::Error::new(e).context({ format!( "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", @@ -606,6 +599,16 @@ impl Timeline { } } + pub(crate) fn walreceiver_status(&self) -> String { + match &*self.walreceiver.lock().unwrap() { + None => "stopping or stopped".to_string(), + Some(walreceiver) => match walreceiver.status() { + Some(status) => status.to_human_readable_string(), + None => "Not active".to_string(), + }, + } + } + /// Check that it is valid to request operations with that lsn. pub fn check_lsn_is_in_scope( &self, @@ -933,6 +936,48 @@ impl Timeline { self.launch_eviction_task(background_jobs_can_start); } + #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] + pub async fn shutdown(self: &Arc, freeze_and_flush: bool) { + debug_assert_current_span_has_tenant_and_timeline_id(); + + // prevent writes to the InMemoryLayer + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(self.tenant_id), + Some(self.timeline_id), + ) + .await; + + // now all writers to InMemory layer are gone, do the final flush if requested + if freeze_and_flush { + match self.freeze_and_flush().await { + Ok(()) => {} + Err(e) => { + warn!("failed to freeze and flush: {e:#}"); + return; // TODO: should probably drain remote timeline client anyways? + } + } + + // drain the upload queue + let res = if let Some(client) = self.remote_client.as_ref() { + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? 
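The ordering in Timeline::shutdown above matters: stop the tasks that write into the in-memory layer first, then freeze and flush, then drain the remote upload queue. A compressed outline of that sequence, not a drop-in implementation; names follow the diff and error handling is trimmed.

async fn shutdown_outline(timeline: &Timeline, freeze_and_flush: bool) {
    // 1. Stop WAL ingestion so nothing keeps writing into the open in-memory
    //    layer (task_mgr::shutdown_tasks for TaskKind::WalReceiverManager in
    //    the real code).

    // 2. Optionally freeze the in-memory layer and flush it to disk.
    if freeze_and_flush {
        if timeline.freeze_and_flush().await.is_err() {
            return;
        }

        // 3. Only after the flush, drain the upload queue so the new layer and
        //    index_part are durable in object storage before shutdown returns.
        if let Some(client) = timeline.remote_client.as_ref() {
            let _ = client.wait_completion().await;
        }
    }
}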
+ client.wait_completion().await + } else { + Ok(()) + }; + + if let Err(e) = res { + warn!("failed to await for frozen and flushed uploads: {e:#}"); + } + } + } + pub fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { @@ -1681,11 +1726,18 @@ impl Timeline { for (name, decision) in decided { let decision = match decision { Ok(UseRemote { local, remote }) => { - path.push(name.file_name()); - init::cleanup_local_file_for_remote(&path, &local, &remote)?; - path.pop(); - - UseRemote { local, remote } + // Remote is authoritative, but we may still choose to retain + // the local file if the contents appear to match + if local.file_size() == remote.file_size() { + // Use the local file, but take the remote metadata so that we pick up + // the correct generation. + UseLocal(remote) + } else { + path.push(name.file_name()); + init::cleanup_local_file_for_remote(&path, &local, &remote)?; + path.pop(); + UseRemote { local, remote } + } } Ok(decision) => decision, Err(FutureLayer { local }) => { @@ -2466,13 +2518,18 @@ impl Timeline { } } - async fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> { + async fn lookup_cached_page( + &self, + key: &Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Option<(Lsn, Bytes)> { let cache = page_cache::get(); // FIXME: It's pointless to check the cache for things that are not 8kB pages. // We should look at the key to determine if it's a cacheable object let (lsn, read_guard) = cache - .lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn) + .lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn, ctx) .await?; let img = Bytes::from(read_guard.to_vec()); Some((lsn, img)) @@ -2494,20 +2551,28 @@ impl Timeline { /// async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { let mut guard = self.layers.write().await; - let layer = guard.get_layer_for_write( - lsn, - self.get_last_record_lsn(), - self.conf, - self.timeline_id, - self.tenant_id, - )?; + let layer = guard + .get_layer_for_write( + lsn, + self.get_last_record_lsn(), + self.conf, + self.timeline_id, + self.tenant_id, + ) + .await?; Ok(layer) } - async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> { + async fn put_value( + &self, + key: Key, + lsn: Lsn, + val: &Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { //info!("PUT: key {} at {}", key, lsn); let layer = self.get_layer_for_write(lsn).await?; - layer.put_value(key, lsn, val).await?; + layer.put_value(key, lsn, val, ctx).await?; Ok(()) } @@ -2679,7 +2744,7 @@ impl Timeline { // Normal case, write out a L0 delta layer file. // `create_delta_layer` will not modify the layer map. // We will remove frozen layer and add delta layer in one atomic operation later. 
- let layer = self.create_delta_layer(&frozen_layer).await?; + let layer = self.create_delta_layer(&frozen_layer, ctx).await?; ( HashMap::from([( layer.filename(), @@ -2704,9 +2769,7 @@ impl Timeline { // update metrics let sz = l.layer_desc().file_size; - self.metrics.resident_physical_size_gauge.add(sz); - self.metrics.num_persistent_files_created.inc_by(1); - self.metrics.persistent_bytes_written.inc_by(sz); + self.metrics.record_new_file_metrics(sz); } guard.finish_flush_l0_layer(delta_layer_to_add, &frozen_layer); @@ -2735,6 +2798,7 @@ impl Timeline { if disk_consistent_lsn != old_disk_consistent_lsn { assert!(disk_consistent_lsn > old_disk_consistent_lsn); self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload) + .await .context("update_metadata_file")?; // Also update the in-memory copy self.disk_consistent_lsn.store(disk_consistent_lsn); @@ -2743,7 +2807,7 @@ impl Timeline { } /// Update metadata file - fn update_metadata_file( + async fn update_metadata_file( &self, disk_consistent_lsn: Lsn, layer_paths_to_upload: HashMap, @@ -2784,14 +2848,9 @@ impl Timeline { x.unwrap() )); - save_metadata( - self.conf, - &self.tenant_id, - &self.timeline_id, - &metadata, - false, - ) - .context("save_metadata")?; + save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata) + .await + .context("save_metadata")?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { @@ -2808,19 +2867,21 @@ impl Timeline { async fn create_delta_layer( self: &Arc, frozen_layer: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { let span = tracing::info_span!("blocking"); let new_delta: DeltaLayer = tokio::task::spawn_blocking({ let _g = span.entered(); let self_clone = Arc::clone(self); let frozen_layer = Arc::clone(frozen_layer); + let ctx = ctx.attached_child(); move || { // Write it out // Keep this inside `spawn_blocking` and `Handle::current` // as long as the write path is still sync and the read impl // is still not fully async. Otherwise executor threads would // be blocked. - let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?; + let new_delta = Handle::current().block_on(frozen_layer.write_to_disk(&ctx))?; let new_delta_path = new_delta.path(); // Sync it to disk. @@ -2994,7 +3055,8 @@ impl Timeline { self.tenant_id, &img_range, lsn, - )?; + ) + .await?; fail_point!("image-layer-writer-fail-before-finish", |_| { Err(PageReconstructError::Other(anyhow::anyhow!( @@ -3030,11 +3092,11 @@ impl Timeline { } } }; - image_layer_writer.put_image(key, &img)?; + image_layer_writer.put_image(key, &img).await?; key = key.next(); } } - let image_layer = image_layer_writer.finish()?; + let image_layer = image_layer_writer.finish().await?; image_layers.push(image_layer); } } @@ -3084,9 +3146,8 @@ impl Timeline { LayerFileMetadata::new(metadata.len(), self.generation), ); - self.metrics - .resident_physical_size_gauge - .add(metadata.len()); + // update metrics + self.metrics.record_new_file_metrics(metadata.len()); let l = Arc::new(l); l.access_stats().record_residence_event( LayerResidenceStatus::Resident, @@ -3526,7 +3587,7 @@ impl Timeline { key, lsn, ref val, .. } in all_values_iter { - let value = val.load().await?; + let value = val.load(ctx).await?; let same_key = prev_key.map_or(false, |prev_key| prev_key == key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { @@ -3579,7 +3640,11 @@ impl Timeline { { // ... 
if so, flush previous layer and prepare to write new one new_layers.push(Arc::new( - writer.take().unwrap().finish(prev_key.unwrap().next())?, + writer + .take() + .unwrap() + .finish(prev_key.unwrap().next()) + .await?, )); writer = None; @@ -3594,20 +3659,23 @@ impl Timeline { } if writer.is_none() { // Create writer if not initiaized yet - writer = Some(DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_id, - key, - if dup_end_lsn.is_valid() { - // this is a layer containing slice of values of the same key - debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); - dup_start_lsn..dup_end_lsn - } else { - debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); - lsn_range.clone() - }, - )?); + writer = Some( + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_id, + key, + if dup_end_lsn.is_valid() { + // this is a layer containing slice of values of the same key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, + ) + .await?, + ); } fail_point!("delta-layer-writer-fail-before-finish", |_| { @@ -3616,11 +3684,11 @@ impl Timeline { ))) }); - writer.as_mut().unwrap().put_value(key, lsn, value)?; + writer.as_mut().unwrap().put_value(key, lsn, value).await?; prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?)); + new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next()).await?)); } // Sync layers @@ -3761,10 +3829,8 @@ impl Timeline { )?; } - // update the timeline's physical size - self.metrics - .resident_physical_size_gauge - .add(metadata.len()); + // update metrics, including the timeline's physical size + self.metrics.record_new_file_metrics(metadata.len()); new_layer_paths.insert( new_delta_path, @@ -4122,7 +4188,8 @@ impl Timeline { if !layers_to_remove.is_empty() { // Persist the new GC cutoff value in the metadata file, before // we actually remove anything. - self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; + self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new()) + .await?; // Actually delete the layers from disk and remove them from the map. // (couldn't do this in the loop above, because you cannot modify a collection @@ -4645,8 +4712,14 @@ impl<'a> TimelineWriter<'a> { /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. - pub async fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> { - self.tl.put_value(key, lsn, value).await + pub async fn put( + &self, + key: Key, + lsn: Lsn, + value: &Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.tl.put_value(key, lsn, value, ctx).await } pub async fn delete(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { @@ -4742,22 +4815,8 @@ mod tests { let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); - let remote_storage = { - // this is never used for anything, because of how the create_test_timeline works, but - // it is with us in spirit and a Some. 
- use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind}; - let path = harness.conf.workdir.join("localfs"); - std::fs::create_dir_all(&path).unwrap(); - let config = RemoteStorageConfig { - max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(), - max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(), - storage: RemoteStorageKind::LocalFs(path), - }; - GenericRemoteStorage::from_config(&config).unwrap() - }; - let ctx = any_context(); - let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap(); + let tenant = harness.try_load(&ctx).await.unwrap(); let timeline = tenant .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await @@ -4807,22 +4866,8 @@ mod tests { async fn layer_eviction_aba_fails() { let harness = TenantHarness::create("layer_eviction_aba_fails").unwrap(); - let remote_storage = { - // this is never used for anything, because of how the create_test_timeline works, but - // it is with us in spirit and a Some. - use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind}; - let path = harness.conf.workdir.join("localfs"); - std::fs::create_dir_all(&path).unwrap(); - let config = RemoteStorageConfig { - max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(), - max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(), - storage: RemoteStorageKind::LocalFs(path), - }; - GenericRemoteStorage::from_config(&config).unwrap() - }; - let ctx = any_context(); - let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap(); + let tenant = harness.try_load(&ctx).await.unwrap(); let timeline = tenant .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 3e407dda57..39f0d03a01 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -328,9 +328,24 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. - let Ok(tenant) = crate::tenant::mgr::get_tenant(self.tenant_id, true).await else { - // likely, we're shutting down - return ControlFlow::Break(()); + // + // It is critical we are responsive to cancellation here. Otherwise, we deadlock with + // tenant deletion (holds TENANTS in read mode) any other task that attempts to + // acquire TENANTS in write mode before we here call get_tenant. + // See https://github.com/neondatabase/neon/issues/5284. + let res = tokio::select! 
{ + _ = cancel.cancelled() => { + return ControlFlow::Break(()); + } + res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => { + res + } + }; + let tenant = match res { + Ok(t) => t, + Err(_) => { + return ControlFlow::Break(()); + } }; let mut state = tenant.eviction_task_tenant_state.lock().await; match state.last_layer_access_imitation { diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 33effb4318..22976a514d 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -147,11 +147,7 @@ pub(super) fn reconcile( Err(FutureLayer { local }) } else { Ok(match (local, remote) { - (Some(local), Some(remote)) if local != remote => { - assert_eq!(local.generation, remote.generation); - - UseRemote { local, remote } - } + (Some(local), Some(remote)) if local != remote => UseRemote { local, remote }, (Some(x), Some(_)) => UseLocal(x), (None, Some(x)) => Evicted(x), (Some(x), None) => NeedsUpload(x), diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 5522ea1788..3c88d31f24 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -87,7 +87,7 @@ impl LayerManager { /// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer, /// called within `get_layer_for_write`. - pub(crate) fn get_layer_for_write( + pub(crate) async fn get_layer_for_write( &mut self, lsn: Lsn, last_record_lsn: Lsn, @@ -129,7 +129,7 @@ impl LayerManager { lsn ); - let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn)?; + let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn).await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index ccff735c3c..842bc3675c 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -135,7 +135,7 @@ impl WalReceiver { .await; } - pub(super) fn status(&self) -> Option { + pub(crate) fn status(&self) -> Option { self.manager_status.read().unwrap().clone() } } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index a86b8fa2a6..dfb8d397b4 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,10 +10,11 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! -use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME}; +use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; +use crate::tenant::TENANTS_SEGMENT_NAME; use once_cell::sync::OnceCell; use std::fs::{self, File, OpenOptions}; -use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write}; +use std::io::{Error, ErrorKind, Seek, SeekFrom}; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; @@ -154,8 +155,8 @@ impl OpenFiles { if let Some(old_file) = slot_guard.file.take() { // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to // distinguish the two. 
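// The ad-hoc string labels ("open", "close", "close-by-replace", ...) previously
// passed to STORAGE_IO_TIME are being replaced by an enum-keyed lookup, so a typo in
// a label becomes a compile error instead of a silently separate metric series. A
// minimal sketch of the pattern, assuming the variant list visible in this diff; the
// label strings and `as_str` helper are illustrative, not the actual metrics.rs API:
#[derive(Clone, Copy, Debug)]
pub enum StorageIoOperation {
    Open,
    Close,
    CloseByReplace,
    Read,
    Write,
    Seek,
    Fsync,
    Metadata,
}

impl StorageIoOperation {
    pub fn as_str(&self) -> &'static str {
        match self {
            StorageIoOperation::Open => "open",
            StorageIoOperation::Close => "close",
            StorageIoOperation::CloseByReplace => "close-by-replace",
            StorageIoOperation::Read => "read",
            StorageIoOperation::Write => "write",
            StorageIoOperation::Seek => "seek",
            StorageIoOperation::Fsync => "fsync",
            StorageIoOperation::Metadata => "metadata",
        }
    }
}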
- STORAGE_IO_TIME - .with_label_values(&["close-by-replace"]) + STORAGE_IO_TIME_METRIC + .get(StorageIoOperation::CloseByReplace) .observe_closure_duration(|| drop(old_file)); } @@ -172,19 +173,55 @@ impl OpenFiles { } } +#[derive(Debug, thiserror::Error)] +pub enum CrashsafeOverwriteError { + #[error("final path has no parent dir")] + FinalPathHasNoParentDir, + #[error("remove tempfile: {0}")] + RemovePreviousTempfile(#[source] std::io::Error), + #[error("create tempfile: {0}")] + CreateTempfile(#[source] std::io::Error), + #[error("write tempfile: {0}")] + WriteContents(#[source] std::io::Error), + #[error("sync tempfile: {0}")] + SyncTempfile(#[source] std::io::Error), + #[error("rename tempfile to final path: {0}")] + RenameTempfileToFinalPath(#[source] std::io::Error), + #[error("open final path parent dir: {0}")] + OpenFinalPathParentDir(#[source] std::io::Error), + #[error("sync final path parent dir: {0}")] + SyncFinalPathParentDir(#[source] std::io::Error), +} +impl CrashsafeOverwriteError { + /// Returns true iff the new contents are durably stored. + pub fn are_new_contents_durable(&self) -> bool { + match self { + Self::FinalPathHasNoParentDir => false, + Self::RemovePreviousTempfile(_) => false, + Self::CreateTempfile(_) => false, + Self::WriteContents(_) => false, + Self::SyncTempfile(_) => false, + Self::RenameTempfileToFinalPath(_) => false, + Self::OpenFinalPathParentDir(_) => false, + Self::SyncFinalPathParentDir(_) => true, + } + } +} + impl VirtualFile { /// Open a file in read-only mode. Like File::open. - pub fn open(path: &Path) -> Result { - Self::open_with_options(path, OpenOptions::new().read(true)) + pub async fn open(path: &Path) -> Result { + Self::open_with_options(path, OpenOptions::new().read(true)).await } /// Create a new file for writing. If the file exists, it will be truncated. /// Like File::create. - pub fn create(path: &Path) -> Result { + pub async fn create(path: &Path) -> Result { Self::open_with_options( path, OpenOptions::new().write(true).create(true).truncate(true), ) + .await } /// Open a file with given options. @@ -192,7 +229,7 @@ impl VirtualFile { /// Note: If any custom flags were set in 'open_options' through OpenOptionsExt, /// they will be applied also when the file is subsequently re-opened, not only /// on the first time. Make sure that's sane! - pub fn open_with_options( + pub async fn open_with_options( path: &Path, open_options: &OpenOptions, ) -> Result { @@ -200,7 +237,7 @@ impl VirtualFile { let parts = path_str.split('/').collect::>(); let tenant_id; let timeline_id; - if parts.len() > 5 && parts[parts.len() - 5] == "tenants" { + if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME { tenant_id = parts[parts.len() - 4].to_string(); timeline_id = parts[parts.len() - 2].to_string(); } else { @@ -208,8 +245,9 @@ impl VirtualFile { timeline_id = "*".to_string(); } let (handle, mut slot_guard) = get_open_files().find_victim_slot(); - let file = STORAGE_IO_TIME - .with_label_values(&["open"]) + + let file = STORAGE_IO_TIME_METRIC + .get(StorageIoOperation::Open) .observe_closure_duration(|| open_options.open(path))?; // Strip all options other than read and write. @@ -236,19 +274,76 @@ impl VirtualFile { Ok(vfile) } - /// Call File::sync_all() on the underlying File. - pub fn sync_all(&self) -> Result<(), Error> { - self.with_file("fsync", |file| file.sync_all())? 
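// The `crashsafe_overwrite` added below follows the classic temp-file protocol:
// write the new contents to a temporary file, fsync it, rename() it over the final
// path, then fsync the parent directory so the rename itself is durable. A minimal
// std-only, Unix-oriented sketch of the same protocol (error handling and the
// function name are simplified stand-ins, not the VirtualFile implementation):
fn crashsafe_overwrite_sketch(
    final_path: &std::path::Path,
    tmp_path: &std::path::Path,
    content: &[u8],
) -> std::io::Result<()> {
    use std::io::Write;
    let dir = final_path.parent().expect("final path must have a parent directory");
    let mut tmp = std::fs::File::create(tmp_path)?; // truncates any stale temp file
    tmp.write_all(content)?;
    tmp.sync_all()?; // temp file contents are durable before the rename
    drop(tmp); // close before renaming
    std::fs::rename(tmp_path, final_path)?; // atomically replaces the final path
    std::fs::File::open(dir)?.sync_all()?; // fsync the parent dir so the rename is durable
    Ok(())
}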
+ /// Writes a file to the specified `final_path` in a crash safe fasion + /// + /// The file is first written to the specified tmp_path, and in a second + /// step, the tmp path is renamed to the final path. As renames are + /// atomic, a crash during the write operation will never leave behind a + /// partially written file. + pub async fn crashsafe_overwrite( + final_path: &Path, + tmp_path: &Path, + content: &[u8], + ) -> Result<(), CrashsafeOverwriteError> { + let Some(final_path_parent) = final_path.parent() else { + return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir); + }; + match std::fs::remove_file(tmp_path) { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)), + } + let mut file = Self::open_with_options( + tmp_path, + OpenOptions::new() + .write(true) + // Use `create_new` so that, if we race with ourselves or something else, + // we bail out instead of causing damage. + .create_new(true), + ) + .await + .map_err(CrashsafeOverwriteError::CreateTempfile)?; + file.write_all(content) + .await + .map_err(CrashsafeOverwriteError::WriteContents)?; + file.sync_all() + .await + .map_err(CrashsafeOverwriteError::SyncTempfile)?; + drop(file); // before the rename, that's important! + // renames are atomic + std::fs::rename(tmp_path, final_path) + .map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?; + // Only open final path parent dirfd now, so that this operation only + // ever holds one VirtualFile fd at a time. That's important because + // the current `find_victim_slot` impl might pick the same slot for both + // VirtualFile., and it eventually does a blocking write lock instead of + // try_lock. + let final_parent_dirfd = + Self::open_with_options(final_path_parent, OpenOptions::new().read(true)) + .await + .map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?; + final_parent_dirfd + .sync_all() + .await + .map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?; + Ok(()) } - pub fn metadata(&self) -> Result { - self.with_file("metadata", |file| file.metadata())? + /// Call File::sync_all() on the underlying File. + pub async fn sync_all(&self) -> Result<(), Error> { + self.with_file(StorageIoOperation::Fsync, |file| file.sync_all()) + .await? + } + + pub async fn metadata(&self) -> Result { + self.with_file(StorageIoOperation::Metadata, |file| file.metadata()) + .await? } /// Helper function that looks up the underlying File for this VirtualFile, /// opening it and evicting some other File if necessary. It calls 'func' /// with the physical File. - fn with_file(&self, op: &str, mut func: F) -> Result + async fn with_file(&self, op: StorageIoOperation, mut func: F) -> Result where F: FnMut(&File) -> R, { @@ -271,8 +366,8 @@ impl VirtualFile { if let Some(file) = &slot_guard.file { // Found a cached file descriptor. 
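// Shape of the fd cache that `with_file` implements, as a summary of the surrounding
// code (this is descriptive only, not new behaviour):
//   1. Fast path: if the slot this handle points at still carries our tag, the cached
//      File is still ours and `func` runs against it directly (the branch just below).
//   2. Slow path: the descriptor was evicted, so pick a victim slot, re-open the file
//      with the OpenOptions remembered at construction time, run `func`, and store the
//      fresh File and tag back into `self.handle` for next time.
// In both paths the operation itself is timed under the caller-supplied
// StorageIoOperation; the re-open in the slow path is timed separately as
// StorageIoOperation::Open.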
slot.recently_used.store(true, Ordering::Relaxed); - return Ok(STORAGE_IO_TIME - .with_label_values(&[op]) + return Ok(STORAGE_IO_TIME_METRIC + .get(op) .observe_closure_duration(|| func(file))); } } @@ -298,13 +393,13 @@ impl VirtualFile { let (handle, mut slot_guard) = open_files.find_victim_slot(); // Open the physical file - let file = STORAGE_IO_TIME - .with_label_values(&["open"]) + let file = STORAGE_IO_TIME_METRIC + .get(StorageIoOperation::Open) .observe_closure_duration(|| self.open_options.open(&self.path))?; // Perform the requested operation on it - let result = STORAGE_IO_TIME - .with_label_values(&[op]) + let result = STORAGE_IO_TIME_METRIC + .get(op) .observe_closure_duration(|| func(&file)); // Store the File in the slot and update the handle in the VirtualFile @@ -321,60 +416,18 @@ impl VirtualFile { drop(self); std::fs::remove_file(path).expect("failed to remove the virtual file"); } -} -impl Drop for VirtualFile { - /// If a VirtualFile is dropped, close the underlying file if it was open. - fn drop(&mut self) { - let handle = self.handle.get_mut().unwrap(); - - // We could check with a read-lock first, to avoid waiting on an - // unrelated I/O. - let slot = &get_open_files().slots[handle.index]; - let mut slot_guard = slot.inner.write().unwrap(); - if slot_guard.tag == handle.tag { - slot.recently_used.store(false, Ordering::Relaxed); - // there is also operation "close-by-replace" for closes done on eviction for - // comparison. - STORAGE_IO_TIME - .with_label_values(&["close"]) - .observe_closure_duration(|| drop(slot_guard.file.take())); - } - } -} - -impl Read for VirtualFile { - fn read(&mut self, buf: &mut [u8]) -> Result { - let pos = self.pos; - let n = self.read_at(buf, pos)?; - self.pos += n as u64; - Ok(n) - } -} - -impl Write for VirtualFile { - fn write(&mut self, buf: &[u8]) -> Result { - let pos = self.pos; - let n = self.write_at(buf, pos)?; - self.pos += n as u64; - Ok(n) - } - - fn flush(&mut self) -> Result<(), std::io::Error> { - // flush is no-op for File (at least on unix), so we don't need to do - // anything here either. - Ok(()) - } -} - -impl Seek for VirtualFile { - fn seek(&mut self, pos: SeekFrom) -> Result { + pub async fn seek(&mut self, pos: SeekFrom) -> Result { match pos { SeekFrom::Start(offset) => { self.pos = offset; } SeekFrom::End(offset) => { - self.pos = self.with_file("seek", |mut file| file.seek(SeekFrom::End(offset)))?? + self.pos = self + .with_file(StorageIoOperation::Seek, |mut file| { + file.seek(SeekFrom::End(offset)) + }) + .await?? 
} SeekFrom::Current(offset) => { let pos = self.pos as i128 + offset as i128; @@ -392,11 +445,79 @@ impl Seek for VirtualFile { } Ok(self.pos) } -} -impl FileExt for VirtualFile { - fn read_at(&self, buf: &mut [u8], offset: u64) -> Result { - let result = self.with_file("read", |file| file.read_at(buf, offset))?; + // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 + pub async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> { + while !buf.is_empty() { + match self.read_at(buf, offset).await { + Ok(0) => { + return Err(Error::new( + std::io::ErrorKind::UnexpectedEof, + "failed to fill whole buffer", + )) + } + Ok(n) => { + buf = &mut buf[n..]; + offset += n as u64; + } + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + } + Ok(()) + } + + // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 + pub async fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> { + while !buf.is_empty() { + match self.write_at(buf, offset).await { + Ok(0) => { + return Err(Error::new( + std::io::ErrorKind::WriteZero, + "failed to write whole buffer", + )); + } + Ok(n) => { + buf = &buf[n..]; + offset += n as u64; + } + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + } + Ok(()) + } + + pub async fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> { + while !buf.is_empty() { + match self.write(buf).await { + Ok(0) => { + return Err(Error::new( + std::io::ErrorKind::WriteZero, + "failed to write whole buffer", + )); + } + Ok(n) => { + buf = &buf[n..]; + } + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + } + Ok(()) + } + + async fn write(&mut self, buf: &[u8]) -> Result { + let pos = self.pos; + let n = self.write_at(buf, pos).await?; + self.pos += n as u64; + Ok(n) + } + + pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result { + let result = self + .with_file(StorageIoOperation::Read, |file| file.read_at(buf, offset)) + .await?; if let Ok(size) = result { STORAGE_IO_SIZE .with_label_values(&["read", &self.tenant_id, &self.timeline_id]) @@ -405,8 +526,10 @@ impl FileExt for VirtualFile { result } - fn write_at(&self, buf: &[u8], offset: u64) -> Result { - let result = self.with_file("write", |file| file.write_at(buf, offset))?; + async fn write_at(&self, buf: &[u8], offset: u64) -> Result { + let result = self + .with_file(StorageIoOperation::Write, |file| file.write_at(buf, offset)) + .await?; if let Ok(size) = result { STORAGE_IO_SIZE .with_label_values(&["write", &self.tenant_id, &self.timeline_id]) @@ -416,6 +539,55 @@ impl FileExt for VirtualFile { } } +#[cfg(test)] +impl VirtualFile { + pub(crate) async fn read_blk( + &self, + blknum: u32, + ) -> Result, std::io::Error> { + use crate::page_cache::PAGE_SZ; + let mut buf = [0; PAGE_SZ]; + self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64)) + .await?; + Ok(std::sync::Arc::new(buf).into()) + } + + async fn read_to_end(&mut self, buf: &mut Vec) -> Result<(), Error> { + loop { + let mut tmp = [0; 128]; + match self.read_at(&mut tmp, self.pos).await { + Ok(0) => return Ok(()), + Ok(n) => { + self.pos += n as u64; + buf.extend_from_slice(&tmp[..n]); + } + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + } + } +} + +impl Drop for VirtualFile { + /// If a VirtualFile is dropped, close the underlying file if it was open. 
+ fn drop(&mut self) { + let handle = self.handle.get_mut().unwrap(); + + // We could check with a read-lock first, to avoid waiting on an + // unrelated I/O. + let slot = &get_open_files().slots[handle.index]; + let mut slot_guard = slot.inner.write().unwrap(); + if slot_guard.tag == handle.tag { + slot.recently_used.store(false, Ordering::Relaxed); + // there is also operation "close-by-replace" for closes done on eviction for + // comparison. + STORAGE_IO_TIME_METRIC + .get(StorageIoOperation::Close) + .observe_closure_duration(|| drop(slot_guard.file.take())); + } + } +} + impl OpenFiles { fn new(num_slots: usize) -> OpenFiles { let mut slots = Box::new(Vec::with_capacity(num_slots)); @@ -469,33 +641,75 @@ mod tests { use rand::seq::SliceRandom; use rand::thread_rng; use rand::Rng; + use std::future::Future; + use std::io::Write; use std::sync::Arc; - use std::thread; - // Helper function to slurp contents of a file, starting at the current position, - // into a string - fn read_string(vfile: &mut FD) -> Result - where - FD: Read, - { - let mut buf = String::new(); - vfile.read_to_string(&mut buf)?; - Ok(buf) + enum MaybeVirtualFile { + VirtualFile(VirtualFile), + File(File), } - // Helper function to slurp a portion of a file into a string - fn read_string_at(vfile: &mut FD, pos: u64, len: usize) -> Result - where - FD: FileExt, - { - let mut buf = Vec::new(); - buf.resize(len, 0); - vfile.read_exact_at(&mut buf, pos)?; - Ok(String::from_utf8(buf).unwrap()) + impl From for MaybeVirtualFile { + fn from(vf: VirtualFile) -> Self { + MaybeVirtualFile::VirtualFile(vf) + } } - #[test] - fn test_virtual_files() -> Result<(), Error> { + impl MaybeVirtualFile { + async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> { + match self { + MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await, + MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset), + } + } + async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> { + match self { + MaybeVirtualFile::VirtualFile(file) => file.write_all_at(buf, offset).await, + MaybeVirtualFile::File(file) => file.write_all_at(buf, offset), + } + } + async fn seek(&mut self, pos: SeekFrom) -> Result { + match self { + MaybeVirtualFile::VirtualFile(file) => file.seek(pos).await, + MaybeVirtualFile::File(file) => file.seek(pos), + } + } + async fn write_all(&mut self, buf: &[u8]) -> Result<(), Error> { + match self { + MaybeVirtualFile::VirtualFile(file) => file.write_all(buf).await, + MaybeVirtualFile::File(file) => file.write_all(buf), + } + } + + // Helper function to slurp contents of a file, starting at the current position, + // into a string + async fn read_string(&mut self) -> Result { + use std::io::Read; + let mut buf = String::new(); + match self { + MaybeVirtualFile::VirtualFile(file) => { + let mut buf = Vec::new(); + file.read_to_end(&mut buf).await?; + return Ok(String::from_utf8(buf).unwrap()); + } + MaybeVirtualFile::File(file) => { + file.read_to_string(&mut buf)?; + } + } + Ok(buf) + } + + // Helper function to slurp a portion of a file into a string + async fn read_string_at(&mut self, pos: u64, len: usize) -> Result { + let mut buf = vec![0; len]; + self.read_exact_at(&mut buf, pos).await?; + Ok(String::from_utf8(buf).unwrap()) + } + } + + #[tokio::test] + async fn test_virtual_files() -> Result<(), Error> { // The real work is done in the test_files() helper function. This // allows us to run the same set of tests against a native File, and // VirtualFile. 
We trust the native Files and wouldn't need to test them, @@ -503,95 +717,106 @@ mod tests { // results with VirtualFiles as with native Files. (Except that with // native files, you will run out of file descriptors if the ulimit // is low enough.) - test_files("virtual_files", |path, open_options| { - VirtualFile::open_with_options(path, open_options) + test_files("virtual_files", |path, open_options| async move { + let vf = VirtualFile::open_with_options(&path, &open_options).await?; + Ok(MaybeVirtualFile::VirtualFile(vf)) }) + .await } - #[test] - fn test_physical_files() -> Result<(), Error> { - test_files("physical_files", |path, open_options| { - open_options.open(path) + #[tokio::test] + async fn test_physical_files() -> Result<(), Error> { + test_files("physical_files", |path, open_options| async move { + Ok(MaybeVirtualFile::File(open_options.open(path)?)) }) + .await } - fn test_files(testname: &str, openfunc: OF) -> Result<(), Error> + async fn test_files(testname: &str, openfunc: OF) -> Result<(), Error> where - FD: Read + Write + Seek + FileExt, - OF: Fn(&Path, &OpenOptions) -> Result, + OF: Fn(PathBuf, OpenOptions) -> FT, + FT: Future>, { let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; let path_a = testdir.join("file_a"); let mut file_a = openfunc( - &path_a, - OpenOptions::new().write(true).create(true).truncate(true), - )?; - file_a.write_all(b"foobar")?; + path_a.clone(), + OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .to_owned(), + ) + .await?; + file_a.write_all(b"foobar").await?; // cannot read from a file opened in write-only mode - assert!(read_string(&mut file_a).is_err()); + let _ = file_a.read_string().await.unwrap_err(); // Close the file and re-open for reading - let mut file_a = openfunc(&path_a, OpenOptions::new().read(true))?; + let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; // cannot write to a file opened in read-only mode - assert!(file_a.write(b"bar").is_err()); + let _ = file_a.write_all(b"bar").await.unwrap_err(); // Try simple read - assert_eq!("foobar", read_string(&mut file_a)?); + assert_eq!("foobar", file_a.read_string().await?); // It's positioned at the EOF now. - assert_eq!("", read_string(&mut file_a)?); + assert_eq!("", file_a.read_string().await?); // Test seeks. 
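// For reference, the positions the seek assertions below expect, in std::io::SeekFrom
// terms (the file holds b"foobar", length 6, and reading to EOF leaves the position
// at 6):
//   SeekFrom::Start(1)                   -> pos 1, remaining contents "oobar"
//   SeekFrom::End(-2)                    -> pos 6 - 2 = 4, remaining "ar"
//   SeekFrom::Start(1) then Current(2)   -> pos 1 + 2 = 3, remaining "bar"
//   SeekFrom::Current(-5) (from pos 6)   -> pos 6 - 5 = 1, remaining "oobar"
// Seeks that would land before byte 0 must fail and leave the position unchanged.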
- assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1); - assert_eq!("oobar", read_string(&mut file_a)?); + assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); + assert_eq!("oobar", file_a.read_string().await?); - assert_eq!(file_a.seek(SeekFrom::End(-2))?, 4); - assert_eq!("ar", read_string(&mut file_a)?); + assert_eq!(file_a.seek(SeekFrom::End(-2)).await?, 4); + assert_eq!("ar", file_a.read_string().await?); - assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1); - assert_eq!(file_a.seek(SeekFrom::Current(2))?, 3); - assert_eq!("bar", read_string(&mut file_a)?); + assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); + assert_eq!(file_a.seek(SeekFrom::Current(2)).await?, 3); + assert_eq!("bar", file_a.read_string().await?); - assert_eq!(file_a.seek(SeekFrom::Current(-5))?, 1); - assert_eq!("oobar", read_string(&mut file_a)?); + assert_eq!(file_a.seek(SeekFrom::Current(-5)).await?, 1); + assert_eq!("oobar", file_a.read_string().await?); // Test erroneous seeks to before byte 0 - assert!(file_a.seek(SeekFrom::End(-7)).is_err()); - assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1); - assert!(file_a.seek(SeekFrom::Current(-2)).is_err()); + file_a.seek(SeekFrom::End(-7)).await.unwrap_err(); + assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); + file_a.seek(SeekFrom::Current(-2)).await.unwrap_err(); // the erroneous seek should have left the position unchanged - assert_eq!("oobar", read_string(&mut file_a)?); + assert_eq!("oobar", file_a.read_string().await?); // Create another test file, and try FileExt functions on it. let path_b = testdir.join("file_b"); let mut file_b = openfunc( - &path_b, + path_b.clone(), OpenOptions::new() .read(true) .write(true) .create(true) - .truncate(true), - )?; - file_b.write_all_at(b"BAR", 3)?; - file_b.write_all_at(b"FOO", 0)?; + .truncate(true) + .to_owned(), + ) + .await?; + file_b.write_all_at(b"BAR", 3).await?; + file_b.write_all_at(b"FOO", 0).await?; - assert_eq!(read_string_at(&mut file_b, 2, 3)?, "OBA"); + assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); // Open a lot of files, enough to cause some evictions. (Or to be precise, // open the same file many times. The effect is the same.) // // leave file_a positioned at offset 1 before we start - assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1); + assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); let mut vfiles = Vec::new(); for _ in 0..100 { - let mut vfile = openfunc(&path_b, OpenOptions::new().read(true))?; - assert_eq!("FOOBAR", read_string(&mut vfile)?); + let mut vfile = + openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?; + assert_eq!("FOOBAR", vfile.read_string().await?); vfiles.push(vfile); } @@ -600,13 +825,13 @@ mod tests { // The underlying file descriptor for 'file_a' should be closed now. Try to read // from it again. We left the file positioned at offset 1 above. - assert_eq!("oobar", read_string(&mut file_a)?); + assert_eq!("oobar", file_a.read_string().await?); // Check that all the other FDs still work too. Use them in random order for // good measure. vfiles.as_mut_slice().shuffle(&mut thread_rng()); for vfile in vfiles.iter_mut() { - assert_eq!("OOBAR", read_string_at(vfile, 1, 5)?); + assert_eq!("OOBAR", vfile.read_string_at(1, 5).await?); } Ok(()) @@ -615,8 +840,8 @@ mod tests { /// Test using VirtualFiles from many threads concurrently. This tests both using /// a lot of VirtualFiles concurrently, causing evictions, and also using the same /// VirtualFile from multiple threads concurrently. 
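// A note on the rewritten concurrency test below: it builds its own multi-threaded
// tokio runtime inside a #[tokio::test] and finishes with `std::mem::forget(rt)`.
// Dropping a tokio Runtime from inside an async context panics (the drop blocks on
// shutdown, which tokio forbids there), which is presumably what the forget works
// around, at the cost of leaking the runtime. A hypothetical alternative that avoids
// the leak is to drop it from a blocking section instead, e.g.:
//
//     tokio::task::spawn_blocking(move || drop(rt)).await.unwrap();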
- #[test] - fn test_vfile_concurrency() -> Result<(), Error> { + #[tokio::test] + async fn test_vfile_concurrency() -> Result<(), Error> { const SIZE: usize = 8 * 1024; const VIRTUAL_FILES: usize = 100; const THREADS: usize = 100; @@ -635,36 +860,87 @@ mod tests { // Open the file many times. let mut files = Vec::new(); for _ in 0..VIRTUAL_FILES { - let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))?; + let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true)) + .await?; files.push(f); } let files = Arc::new(files); // Launch many threads, and use the virtual files concurrently in random order. - let mut threads = Vec::new(); - for threadno in 0..THREADS { - let builder = - thread::Builder::new().name(format!("test_vfile_concurrency thread {}", threadno)); - + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(THREADS) + .thread_name("test_vfile_concurrency thread") + .build() + .unwrap(); + let mut hdls = Vec::new(); + for _threadno in 0..THREADS { let files = files.clone(); - let thread = builder - .spawn(move || { - let mut buf = [0u8; SIZE]; - let mut rng = rand::thread_rng(); - for _ in 1..1000 { - let f = &files[rng.gen_range(0..files.len())]; - f.read_exact_at(&mut buf, 0).unwrap(); - assert!(buf == SAMPLE); - } - }) - .unwrap(); - threads.push(thread); + let hdl = rt.spawn(async move { + let mut buf = [0u8; SIZE]; + let mut rng = rand::rngs::OsRng; + for _ in 1..1000 { + let f = &files[rng.gen_range(0..files.len())]; + f.read_exact_at(&mut buf, 0).await.unwrap(); + assert!(buf == SAMPLE); + } + }); + hdls.push(hdl); } - - for thread in threads { - thread.join().unwrap(); + for hdl in hdls { + hdl.await?; } + std::mem::forget(rt); Ok(()) } + + #[tokio::test] + async fn test_atomic_overwrite_basic() { + let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic"); + std::fs::create_dir_all(&testdir).unwrap(); + + let path = testdir.join("myfile"); + let tmp_path = testdir.join("myfile.tmp"); + + VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo") + .await + .unwrap(); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); + let post = file.read_string().await.unwrap(); + assert_eq!(post, "foo"); + assert!(!tmp_path.exists()); + drop(file); + + VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar") + .await + .unwrap(); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); + let post = file.read_string().await.unwrap(); + assert_eq!(post, "bar"); + assert!(!tmp_path.exists()); + drop(file); + } + + #[tokio::test] + async fn test_atomic_overwrite_preexisting_tmp() { + let testdir = + crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp"); + std::fs::create_dir_all(&testdir).unwrap(); + + let path = testdir.join("myfile"); + let tmp_path = testdir.join("myfile.tmp"); + + std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap(); + assert!(tmp_path.exists()); + + VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo") + .await + .unwrap(); + + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); + let post = file.read_string().await.unwrap(); + assert_eq!(post, "foo"); + assert!(!tmp_path.exists()); + drop(file); + } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 340b75877d..d290715938 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -25,7 +25,7 @@ use 
postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; -use anyhow::{Context, Result}; +use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; @@ -106,6 +106,10 @@ impl<'a> WalIngest<'a> { self.ingest_heapam_record(&mut buf, modification, decoded, ctx) .await?; } + if decoded.xl_rmid == pg_constants::RM_NEON_ID { + self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx) + .await?; + } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) @@ -172,6 +176,32 @@ impl<'a> WalIngest<'a> { .await?; } } + } else if self.timeline.pg_version == 16 { + if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG + { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY + { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb, ctx) + .await?; + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v16::bindings::XLOG_DBASE_DROP + { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; + } + } } } else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID { trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet"); @@ -333,7 +363,7 @@ impl<'a> WalIngest<'a> { // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - modification.commit().await?; + modification.commit(ctx).await?; Ok(()) } @@ -414,57 +444,215 @@ impl<'a> WalIngest<'a> { // need to clear the corresponding bits in the visibility map. let mut new_heap_blkno: Option = None; let mut old_heap_blkno: Option = None; - if decoded.xl_rmid == pg_constants::RM_HEAP_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - if info == pg_constants::XLOG_HEAP_INSERT { - let xlrec = XlHeapInsert::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_DELETE { - let xlrec = XlHeapDelete::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_UPDATE - || info == pg_constants::XLOG_HEAP_HOT_UPDATE - { - let xlrec = XlHeapUpdate::decode(buf); - // the size of tuple data is inferred from the size of the record. - // we can't validate the remaining number of bytes without parsing - // the tuple data. 
- if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - } - if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { - // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a - // non-HOT update where the new tuple goes to different page than - // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is - // set. - new_heap_blkno = Some(decoded.blocks[1].blkno); - } - } - } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { - let xlrec = XlHeapMultiInsert::decode(buf); + let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; - let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { - // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set - 0 + match self.timeline.pg_version { + 14 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v14::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v14::XlHeapDelete::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v14::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v14::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v14::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + std::mem::size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v14::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } } else { - std::mem::size_of::() * xlrec.ntuples as usize - }; - assert_eq!(offset_array_len, buf.remaining()); - - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); + bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } + 15 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v15::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v15::XlHeapDelete::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v15::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v15::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v15::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + std::mem::size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v15::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } + 16 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v16::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v16::XlHeapDelete::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v16::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v16::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v16::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + std::mem::size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v16::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } + _ => {} } - // FIXME: What about XLOG_HEAP_LOCK and XLOG_HEAP2_LOCK_UPDATED? // Clear the VM bits if required. if new_heap_blkno.is_some() || old_heap_blkno.is_some() { @@ -508,7 +696,7 @@ impl<'a> WalIngest<'a> { NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno, - flags: pg_constants::VISIBILITYMAP_VALID_BITS, + flags, }, ctx, ) @@ -524,7 +712,7 @@ impl<'a> WalIngest<'a> { NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno: None, - flags: pg_constants::VISIBILITYMAP_VALID_BITS, + flags, }, ctx, ) @@ -538,7 +726,178 @@ impl<'a> WalIngest<'a> { NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno: None, old_heap_blkno, - flags: pg_constants::VISIBILITYMAP_VALID_BITS, + flags, + }, + ctx, + ) + .await?; + } + } + } + } + + Ok(()) + } + + async fn ingest_neonrmgr_record( + &mut self, + buf: &mut Bytes, + modification: &mut DatadirModification<'_>, + decoded: &mut DecodedWALRecord, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // Handle VM bit updates that are implicitly part of heap records. + + // First, look at the record to determine which VM bits need + // to be cleared. If either of these variables is set, we + // need to clear the corresponding bits in the visibility map. 
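// Summary of the VM-bit bookkeeping (the Neon RMGR records handled below follow the
// same rules as the per-version heap records above):
//   INSERT / MULTI_INSERT with ALL_VISIBLE_CLEARED    -> clear bits for the new page (blocks[0])
//   DELETE with ALL_VISIBLE_CLEARED                   -> clear bits for the new page (blocks[0])
//   UPDATE / HOT_UPDATE with OLD_ALL_VISIBLE_CLEARED  -> clear bits for the old page (last block)
//                       with NEW_ALL_VISIBLE_CLEARED  -> clear bits for the new page (blocks[0])
//   LOCK / LOCK_UPDATED with ALL_FROZEN_CLEARED       -> clear only ALL_FROZEN for blocks[0]
// When neither block number is recorded, nothing is cleared; when one is, `flags`
// stays at the default VISIBILITYMAP_VALID_BITS unless a LOCK record narrowed it to
// VISIBILITYMAP_ALL_FROZEN.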
+ let mut new_heap_blkno: Option = None; + let mut old_heap_blkno: Option = None; + let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; + + assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); + + match self.timeline.pg_version { + 16 => { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + match info { + pg_constants::XLOG_NEON_HEAP_INSERT => { + let xlrec = v16::rm_neon::XlNeonHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_DELETE => { + let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_UPDATE + | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => { + let xlrec = v16::rm_neon::XlNeonHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { + let xlrec = v16::rm_neon::XlNeonHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + std::mem::size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_LOCK => { + let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + info => bail!("Unknown WAL record type for Neon RMGR: {}", info), + } + } + _ => bail!( + "Neon RMGR has no known compatibility with PostgreSQL version {}", + self.timeline.pg_version + ), + } + + // Clear the VM bits if required. + if new_heap_blkno.is_some() || old_heap_blkno.is_some() { + let vm_rel = RelTag { + forknum: VISIBILITYMAP_FORKNUM, + spcnode: decoded.blocks[0].rnode_spcnode, + dbnode: decoded.blocks[0].rnode_dbnode, + relnode: decoded.blocks[0].rnode_relnode, + }; + + let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); + let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); + + // Sometimes, Postgres seems to create heap WAL records with the + // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is + // not set. In fact, it's possible that the VM page does not exist at all. + // In that case, we don't want to store a record to clear the VM bit; + // replaying it would fail to find the previous image of the page, because + // it doesn't exist. 
So check if the VM page(s) exist, and skip the WAL + // record if it doesn't. + let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; + if let Some(blknum) = new_vm_blk { + if blknum >= vm_size { + new_vm_blk = None; + } + } + if let Some(blknum) = old_vm_blk { + if blknum >= vm_size { + old_vm_blk = None; + } + } + + if new_vm_blk.is_some() || old_vm_blk.is_some() { + if new_vm_blk == old_vm_blk { + // An UPDATE record that needs to clear the bits for both old and the + // new page, both of which reside on the same VM page. + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk.unwrap(), + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno, + flags, + }, + ctx, + ) + .await?; + } else { + // Clear VM bits for one heap page, or for two pages that reside on + // different VM pages. + if let Some(new_vm_blk) = new_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk, + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno: None, + flags, + }, + ctx, + ) + .await?; + } + if let Some(old_vm_blk) = old_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + old_vm_blk, + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: None, + old_heap_blkno, + flags, }, ctx, ) @@ -1202,7 +1561,7 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file - m.commit().await?; + m.commit(ctx).await?; let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?; Ok(walingest) @@ -1221,22 +1580,22 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x30)); walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x40)); walingest .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x50)); walingest .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; assert_current_logical_size(&tline, Lsn(0x50)); @@ -1322,7 +1681,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation @@ -1364,7 +1723,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; assert_eq!( tline .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx) @@ -1377,7 +1736,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; assert_eq!( tline .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx) @@ -1402,7 +1761,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; assert_eq!( tline .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) @@ -1441,7 +1800,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; // Check that rel exists 
and size is correct assert_eq!( @@ -1460,7 +1819,7 @@ mod tests { // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?; - m.commit().await?; + m.commit(&ctx).await?; // Check that rel is not visible anymore assert_eq!( @@ -1478,7 +1837,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; // Check that rel exists and size is correct assert_eq!( @@ -1517,7 +1876,7 @@ mod tests { .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } - m.commit().await?; + m.commit(&ctx).await?; // The relation was created at LSN 20, not visible at LSN 1 yet. assert_eq!( @@ -1562,7 +1921,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; // Check reported size and contents after truncation assert_eq!( @@ -1611,7 +1970,7 @@ mod tests { .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } - m.commit().await?; + m.commit(&ctx).await?; assert_eq!( tline @@ -1658,7 +2017,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; } assert_current_logical_size(&tline, Lsn(lsn)); @@ -1674,7 +2033,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE @@ -1687,7 +2046,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE - 1 @@ -1703,7 +2062,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx) .await?; - m.commit().await?; + m.commit(&ctx).await?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, size as BlockNumber diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 1a34168fed..9c2e522f17 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -4,9 +4,10 @@ use anyhow::Result; use bytes::{Buf, Bytes}; +use postgres_ffi::dispatch_pgversion; use postgres_ffi::pg_constants; use postgres_ffi::BLCKSZ; -use postgres_ffi::{BlockNumber, OffsetNumber, TimestampTz}; +use postgres_ffi::{BlockNumber, TimestampTz}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; use serde::{Deserialize, Serialize}; @@ -76,9 +77,12 @@ pub struct DecodedBkpBlock { pub flags: u8, /* Information on full-page image, if any */ - pub has_image: bool, /* has image, even for consistency checking */ - pub apply_image: bool, /* has image that should be restored */ - pub will_init: bool, /* record doesn't need previous page version to apply */ + pub has_image: bool, + /* has image, even for consistency checking */ + pub apply_image: bool, + /* has image that should be restored */ + pub will_init: bool, + /* record doesn't need previous page version to apply */ //char *bkp_image; pub hole_offset: u16, pub hole_length: u16, @@ -134,6 +138,325 @@ impl XlRelmapUpdate { } } +pub mod v14 { + use bytes::{Buf, Bytes}; + use postgres_ffi::{OffsetNumber, TransactionId}; + + #[repr(C)] + #[derive(Debug)] + pub struct XlHeapInsert { + pub 
offnum: OffsetNumber, + pub flags: u8, + } + + impl XlHeapInsert { + pub fn decode(buf: &mut Bytes) -> XlHeapInsert { + XlHeapInsert { + offnum: buf.get_u16_le(), + flags: buf.get_u8(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlHeapMultiInsert { + pub flags: u8, + pub _padding: u8, + pub ntuples: u16, + } + + impl XlHeapMultiInsert { + pub fn decode(buf: &mut Bytes) -> XlHeapMultiInsert { + XlHeapMultiInsert { + flags: buf.get_u8(), + _padding: buf.get_u8(), + ntuples: buf.get_u16_le(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlHeapDelete { + pub xmax: TransactionId, + pub offnum: OffsetNumber, + pub _padding: u16, + pub t_cid: u32, + pub infobits_set: u8, + pub flags: u8, + } + + impl XlHeapDelete { + pub fn decode(buf: &mut Bytes) -> XlHeapDelete { + XlHeapDelete { + xmax: buf.get_u32_le(), + offnum: buf.get_u16_le(), + _padding: buf.get_u16_le(), + t_cid: buf.get_u32_le(), + infobits_set: buf.get_u8(), + flags: buf.get_u8(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlHeapUpdate { + pub old_xmax: TransactionId, + pub old_offnum: OffsetNumber, + pub old_infobits_set: u8, + pub flags: u8, + pub t_cid: u32, + pub new_xmax: TransactionId, + pub new_offnum: OffsetNumber, + } + + impl XlHeapUpdate { + pub fn decode(buf: &mut Bytes) -> XlHeapUpdate { + XlHeapUpdate { + old_xmax: buf.get_u32_le(), + old_offnum: buf.get_u16_le(), + old_infobits_set: buf.get_u8(), + flags: buf.get_u8(), + t_cid: buf.get_u32_le(), + new_xmax: buf.get_u32_le(), + new_offnum: buf.get_u16_le(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlHeapLock { + pub locking_xid: TransactionId, + pub offnum: OffsetNumber, + pub _padding: u16, + pub t_cid: u32, + pub infobits_set: u8, + pub flags: u8, + } + + impl XlHeapLock { + pub fn decode(buf: &mut Bytes) -> XlHeapLock { + XlHeapLock { + locking_xid: buf.get_u32_le(), + offnum: buf.get_u16_le(), + _padding: buf.get_u16_le(), + t_cid: buf.get_u32_le(), + infobits_set: buf.get_u8(), + flags: buf.get_u8(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlHeapLockUpdated { + pub xmax: TransactionId, + pub offnum: OffsetNumber, + pub infobits_set: u8, + pub flags: u8, + } + + impl XlHeapLockUpdated { + pub fn decode(buf: &mut Bytes) -> XlHeapLockUpdated { + XlHeapLockUpdated { + xmax: buf.get_u32_le(), + offnum: buf.get_u16_le(), + infobits_set: buf.get_u8(), + flags: buf.get_u8(), + } + } + } +} + +pub mod v15 { + pub use super::v14::{ + XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate, + }; +} + +pub mod v16 { + pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert}; + use bytes::{Buf, Bytes}; + use postgres_ffi::{OffsetNumber, TransactionId}; + + pub struct XlHeapDelete { + pub xmax: TransactionId, + pub offnum: OffsetNumber, + pub infobits_set: u8, + pub flags: u8, + } + + impl XlHeapDelete { + pub fn decode(buf: &mut Bytes) -> XlHeapDelete { + XlHeapDelete { + xmax: buf.get_u32_le(), + offnum: buf.get_u16_le(), + infobits_set: buf.get_u8(), + flags: buf.get_u8(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlHeapUpdate { + pub old_xmax: TransactionId, + pub old_offnum: OffsetNumber, + pub old_infobits_set: u8, + pub flags: u8, + pub new_xmax: TransactionId, + pub new_offnum: OffsetNumber, + } + + impl XlHeapUpdate { + pub fn decode(buf: &mut Bytes) -> XlHeapUpdate { + XlHeapUpdate { + old_xmax: buf.get_u32_le(), + old_offnum: buf.get_u16_le(), + old_infobits_set: buf.get_u8(), + flags: buf.get_u8(), + new_xmax: 
buf.get_u32_le(), + new_offnum: buf.get_u16_le(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlHeapLock { + pub locking_xid: TransactionId, + pub offnum: OffsetNumber, + pub infobits_set: u8, + pub flags: u8, + } + + impl XlHeapLock { + pub fn decode(buf: &mut Bytes) -> XlHeapLock { + XlHeapLock { + locking_xid: buf.get_u32_le(), + offnum: buf.get_u16_le(), + infobits_set: buf.get_u8(), + flags: buf.get_u8(), + } + } + } + + /* Since PG16, we have the Neon RMGR (RM_NEON_ID) to manage Neon-flavored WAL. */ + pub mod rm_neon { + use bytes::{Buf, Bytes}; + use postgres_ffi::{OffsetNumber, TransactionId}; + + #[repr(C)] + #[derive(Debug)] + pub struct XlNeonHeapInsert { + pub offnum: OffsetNumber, + pub flags: u8, + } + + impl XlNeonHeapInsert { + pub fn decode(buf: &mut Bytes) -> XlNeonHeapInsert { + XlNeonHeapInsert { + offnum: buf.get_u16_le(), + flags: buf.get_u8(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlNeonHeapMultiInsert { + pub flags: u8, + pub _padding: u8, + pub ntuples: u16, + pub t_cid: u32, + } + + impl XlNeonHeapMultiInsert { + pub fn decode(buf: &mut Bytes) -> XlNeonHeapMultiInsert { + XlNeonHeapMultiInsert { + flags: buf.get_u8(), + _padding: buf.get_u8(), + ntuples: buf.get_u16_le(), + t_cid: buf.get_u32_le(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlNeonHeapDelete { + pub xmax: TransactionId, + pub offnum: OffsetNumber, + pub infobits_set: u8, + pub flags: u8, + pub t_cid: u32, + } + + impl XlNeonHeapDelete { + pub fn decode(buf: &mut Bytes) -> XlNeonHeapDelete { + XlNeonHeapDelete { + xmax: buf.get_u32_le(), + offnum: buf.get_u16_le(), + infobits_set: buf.get_u8(), + flags: buf.get_u8(), + t_cid: buf.get_u32_le(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlNeonHeapUpdate { + pub old_xmax: TransactionId, + pub old_offnum: OffsetNumber, + pub old_infobits_set: u8, + pub flags: u8, + pub t_cid: u32, + pub new_xmax: TransactionId, + pub new_offnum: OffsetNumber, + } + + impl XlNeonHeapUpdate { + pub fn decode(buf: &mut Bytes) -> XlNeonHeapUpdate { + XlNeonHeapUpdate { + old_xmax: buf.get_u32_le(), + old_offnum: buf.get_u16_le(), + old_infobits_set: buf.get_u8(), + flags: buf.get_u8(), + t_cid: buf.get_u32(), + new_xmax: buf.get_u32_le(), + new_offnum: buf.get_u16_le(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlNeonHeapLock { + pub locking_xid: TransactionId, + pub t_cid: u32, + pub offnum: OffsetNumber, + pub infobits_set: u8, + pub flags: u8, + } + + impl XlNeonHeapLock { + pub fn decode(buf: &mut Bytes) -> XlNeonHeapLock { + XlNeonHeapLock { + locking_xid: buf.get_u32_le(), + t_cid: buf.get_u32_le(), + offnum: buf.get_u16_le(), + infobits_set: buf.get_u8(), + flags: buf.get_u8(), + } + } + } + } +} + #[repr(C)] #[derive(Debug)] pub struct XlSmgrCreate { @@ -223,90 +546,6 @@ impl XlDropDatabase { } } -#[repr(C)] -#[derive(Debug)] -pub struct XlHeapInsert { - pub offnum: OffsetNumber, - pub flags: u8, -} - -impl XlHeapInsert { - pub fn decode(buf: &mut Bytes) -> XlHeapInsert { - XlHeapInsert { - offnum: buf.get_u16_le(), - flags: buf.get_u8(), - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlHeapMultiInsert { - pub flags: u8, - pub _padding: u8, - pub ntuples: u16, -} - -impl XlHeapMultiInsert { - pub fn decode(buf: &mut Bytes) -> XlHeapMultiInsert { - XlHeapMultiInsert { - flags: buf.get_u8(), - _padding: buf.get_u8(), - ntuples: buf.get_u16_le(), - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlHeapDelete { - pub xmax: TransactionId, - pub offnum: OffsetNumber, - pub 
_padding: u16, - pub t_cid: u32, - pub infobits_set: u8, - pub flags: u8, -} - -impl XlHeapDelete { - pub fn decode(buf: &mut Bytes) -> XlHeapDelete { - XlHeapDelete { - xmax: buf.get_u32_le(), - offnum: buf.get_u16_le(), - _padding: buf.get_u16_le(), - t_cid: buf.get_u32_le(), - infobits_set: buf.get_u8(), - flags: buf.get_u8(), - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlHeapUpdate { - pub old_xmax: TransactionId, - pub old_offnum: OffsetNumber, - pub old_infobits_set: u8, - pub flags: u8, - pub t_cid: u32, - pub new_xmax: TransactionId, - pub new_offnum: OffsetNumber, -} - -impl XlHeapUpdate { - pub fn decode(buf: &mut Bytes) -> XlHeapUpdate { - XlHeapUpdate { - old_xmax: buf.get_u32_le(), - old_offnum: buf.get_u16_le(), - old_infobits_set: buf.get_u8(), - flags: buf.get_u8(), - t_cid: buf.get_u32(), - new_xmax: buf.get_u32_le(), - new_offnum: buf.get_u16_le(), - } - } -} - /// /// Note: Parsing some fields is missing, because they're not needed. /// @@ -321,9 +560,10 @@ pub struct XlXactParsedRecord { pub xact_time: TimestampTz, pub xinfo: u32, - pub db_id: Oid, /* MyDatabaseId */ - pub ts_id: Oid, /* MyDatabaseTableSpace */ - + pub db_id: Oid, + /* MyDatabaseId */ + pub ts_id: Oid, + /* MyDatabaseTableSpace */ pub subxacts: Vec, pub xnodes: Vec, @@ -455,9 +695,12 @@ impl MultiXactMember { #[repr(C)] #[derive(Debug)] pub struct XlMultiXactCreate { - pub mid: MultiXactId, /* new MultiXact's ID */ - pub moff: MultiXactOffset, /* its starting offset in members file */ - pub nmembers: u32, /* number of member XIDs */ + pub mid: MultiXactId, + /* new MultiXact's ID */ + pub moff: MultiXactOffset, + /* its starting offset in members file */ + pub nmembers: u32, + /* number of member XIDs */ pub members: Vec, } @@ -484,7 +727,8 @@ impl XlMultiXactCreate { pub struct XlMultiXactTruncate { pub oldest_multi_db: Oid, /* to-be-truncated range of multixact offsets */ - pub start_trunc_off: MultiXactId, /* just for completeness' sake */ + pub start_trunc_off: MultiXactId, + /* just for completeness' sake */ pub end_trunc_off: MultiXactId, /* to-be-truncated range of multixact members */ @@ -626,12 +870,10 @@ pub fn decode_wal_record( blk.hole_offset = buf.get_u16_le(); blk.bimg_info = buf.get_u8(); - blk.apply_image = if pg_version == 14 { - (blk.bimg_info & postgres_ffi::v14::bindings::BKPIMAGE_APPLY) != 0 - } else { - assert_eq!(pg_version, 15); - (blk.bimg_info & postgres_ffi::v15::bindings::BKPIMAGE_APPLY) != 0 - }; + blk.apply_image = dispatch_pgversion!( + pg_version, + (blk.bimg_info & pgv::bindings::BKPIMAGE_APPLY) != 0 + ); let blk_img_is_compressed = postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?; diff --git a/pgxn/hnsw/hnsw.control b/pgxn/hnsw/hnsw.control index 24510f6766..fbfa1a5b47 100644 --- a/pgxn/hnsw/hnsw.control +++ b/pgxn/hnsw/hnsw.control @@ -2,4 +2,3 @@ comment = '** Deprecated ** Please use pg_embedding instead' default_version = '0.1.0' module_pathname = '$libdir/hnsw' relocatable = true -trusted = true diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 262814c818..4be75e1dad 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -19,13 +19,16 @@ #include #include "postgres.h" + +#include "neon_pgversioncompat.h" + #include "funcapi.h" #include "miscadmin.h" #include "pgstat.h" #include "pagestore_client.h" #include "access/parallel.h" #include "postmaster/bgworker.h" -#include "storage/relfilenode.h" +#include RELFILEINFO_HDR #include "storage/buf_internals.h" #include "storage/latch.h" #include "storage/ipc.h" @@ 
-77,13 +80,14 @@ typedef struct FileCacheEntry typedef struct FileCacheControl { + uint64 generation; /* generation is needed to handle correct hash reenabling */ uint32 size; /* size of cache file in chunks */ uint32 used; /* number of used chunks */ dlist_head lru; /* double linked list for LRU replacement algorithm */ } FileCacheControl; static HTAB* lfc_hash; -static int lfc_desc; +static int lfc_desc = 0; static LWLockId lfc_lock; static int lfc_max_size; static int lfc_size_limit; @@ -98,6 +102,60 @@ static int lfc_shrinking_factor; /* power of two by which local cache size wil void FileCacheMonitorMain(Datum main_arg); +/* + * Local file cache is mandatory and Neon can work without it. + * In case of any any errors with this cache, we should disable it but to not throw error. + * Also we should allow re-enable it if source of failure (lack of disk space, permissions,...) is fixed. + * All cache content should be invalidated to avoid reading of stale or corrupted data + */ +static void +lfc_disable(char const* op) +{ + HASH_SEQ_STATUS status; + FileCacheEntry* entry; + + elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path); + + if (lfc_desc > 0) + close(lfc_desc); + + lfc_desc = -1; + lfc_size_limit = 0; + + /* Invalidate hash */ + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + hash_seq_init(&status, lfc_hash); + while ((entry = hash_seq_search(&status)) != NULL) + { + hash_search(lfc_hash, &entry->key, HASH_REMOVE, NULL); + memset(entry->bitmap, 0, sizeof entry->bitmap); + } + hash_seq_term(&status); + lfc_ctl->generation += 1; + lfc_ctl->size = 0; + lfc_ctl->used = 0; + dlist_init(&lfc_ctl->lru); + + LWLockRelease(lfc_lock); +} + +static bool +lfc_ensure_opened(void) +{ + /* Open cache file if not done yet */ + if (lfc_desc <= 0) + { + lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT); + + if (lfc_desc < 0) { + lfc_disable("open"); + return false; + } + } + return true; +} + static void lfc_shmem_startup(void) { @@ -123,6 +181,7 @@ lfc_shmem_startup(void) lfc_size+1, lfc_size+1, &info, HASH_ELEM | HASH_BLOBS); + lfc_ctl->generation = 0; lfc_ctl->size = 0; lfc_ctl->used = 0; dlist_init(&lfc_ctl->lru); @@ -163,12 +222,13 @@ lfc_change_limit_hook(int newval, void *extra) /* * Stats collector detach shared memory, so we should not try to access shared memory here. * Parallel workers first assign default value (0), so not perform truncation in parallel workers. + * The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC. */ - if (!lfc_ctl || !UsedShmemSegAddr || IsParallelWorker()) + if (!lfc_ctl || !MyProc || !UsedShmemSegAddr || IsParallelWorker()) return; /* Open cache file if not done yet */ - if (lfc_desc == 0) + if (lfc_desc <= 0) { lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT); if (lfc_desc < 0) { @@ -349,7 +409,7 @@ lfc_init(void) * Returns true if page is found in local cache. 
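The new generation counter is what makes it safe for lfc_read and lfc_write (below) to drop the lock around the pread/pwrite: lfc_disable wipes the hash and bumps the generation, and the reader only unpins its entry and trusts the data if the generation it saw under the lock is still current afterwards. A rough Rust model of that protocol (a toy single-entry cache; the real code is C with LWLocks and per-entry access counts):

    use std::sync::Mutex;

    // Toy single-entry cache; models only the generation handshake.
    struct CacheCtl {
        generation: u64,
        access_count: u32,
    }

    static CTL: Mutex<CacheCtl> = Mutex::new(CacheCtl { generation: 0, access_count: 0 });

    // lfc_disable analogue: wipe the cache and invalidate every pinned entry.
    fn disable_cache() {
        let mut ctl = CTL.lock().unwrap();
        ctl.generation += 1;
        ctl.access_count = 0;
    }

    // Returns true only if the unlocked "I/O" ran against a still-valid generation.
    fn read_with_generation_check(concurrent_disable: bool) -> bool {
        let generation = {
            let mut ctl = CTL.lock().unwrap();
            ctl.access_count += 1; // pin the entry while holding the lock
            ctl.generation
        };

        // Lock released here; the real code issues pread() at this point.
        if concurrent_disable {
            disable_cache();
        }

        let mut ctl = CTL.lock().unwrap();
        if ctl.generation == generation {
            ctl.access_count -= 1; // still the entry we pinned: unpin and trust it
            true
        } else {
            false // cache was wiped underneath us: discard the result
        }
    }

    fn main() {
        assert!(read_with_generation_check(false));
        assert!(!read_with_generation_check(true));
    }
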
*/ bool -lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) +lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) { BufferTag tag; FileCacheEntry* entry; @@ -360,7 +420,7 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ return false; - tag.rnode = rnode; + CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); hash = get_hash_value(lfc_hash, &tag); @@ -376,7 +436,7 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) * Evict a page (if present) from the local file cache */ void -lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) +lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) { BufferTag tag; FileCacheEntry* entry; @@ -387,7 +447,9 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ return; - INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1))); + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; + tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1)); hash = get_hash_value(lfc_hash, &tag); @@ -445,7 +507,7 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) * In case of error lfc_size_limit is set to zero to disable any further opera-tins with cache. */ bool -lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, +lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer) { BufferTag tag; @@ -454,11 +516,16 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1); bool result = true; uint32 hash; + uint64 generation; + uint32 entry_offset; if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ return false; - tag.rnode = rnode; + if (!lfc_ensure_opened()) + return false; + + CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); hash = get_hash_value(lfc_hash, &tag); @@ -474,35 +541,29 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, /* Unlink entry from LRU list to pin it for the duration of IO operation */ if (entry->access_count++ == 0) dlist_delete(&entry->lru_node); + generation = lfc_ctl->generation; + entry_offset = entry->offset; + LWLockRelease(lfc_lock); - /* Open cache file if not done yet */ - if (lfc_desc == 0) + rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ); + if (rc != BLCKSZ) { - lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT); - if (lfc_desc < 0) { - elog(LOG, "Failed to open file cache %s: %m", lfc_path); - lfc_size_limit = 0; /* disable file cache */ - result = false; - } - } - - if (lfc_desc > 0) - { - rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ); - if (rc != BLCKSZ) - { - elog(INFO, "Failed to read file cache: %m"); - lfc_size_limit = 0; /* disable file cache */ - result = false; - } + lfc_disable("read"); + return false; } /* Place entry to the head of LRU list */ LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - Assert(entry->access_count > 0); - if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + if (lfc_ctl->generation == generation) + { + Assert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + } + else + 
result = false; + LWLockRelease(lfc_lock); return result; @@ -513,8 +574,12 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, * If cache is full then evict some other page. */ void -lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, +lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, +#if PG_MAJORVERSION_NUM < 16 char *buffer) +#else + const void *buffer) +#endif { BufferTag tag; FileCacheEntry* entry; @@ -526,9 +591,14 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ return; - tag.rnode = rnode; + if (!lfc_ensure_opened()) + return; + tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); + + CopyNRelFileInfoToBufTag(tag, rinfo); + hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -567,34 +637,23 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, entry->access_count = 1; memset(entry->bitmap, 0, sizeof entry->bitmap); } - LWLockRelease(lfc_lock); - /* Open cache file if not done yet */ - if (lfc_desc == 0) + rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ); + if (rc != BLCKSZ) { - lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT); - if (lfc_desc < 0) { - elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path); - lfc_size_limit = 0; /* disable file cache */ - } + LWLockRelease(lfc_lock); + lfc_disable("write"); } - if (lfc_desc > 0) + else { - rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ); - if (rc != BLCKSZ) - { - elog(WARNING, "Failed to write file cache: %m, disabling file cache"); - lfc_size_limit = 0; /* disable file cache */ - } - } - /* Place entry to the head of LRU list */ - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - Assert(entry->access_count > 0); - if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); - if (lfc_size_limit != 0) + /* Place entry to the head of LRU list */ + Assert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31)); - LWLockRelease(lfc_lock); + LWLockRelease(lfc_lock); + } } /* @@ -671,8 +730,13 @@ local_cache_pages(PG_FUNCTION_ARGS) tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); TupleDescInitEntry(tupledesc, (AttrNumber) 1, "pageoffs", INT8OID, -1, 0); +#if PG_MAJORVERSION_NUM < 16 TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", OIDOID, -1, 0); +#else + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenumber", + OIDOID, -1, 0); +#endif TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", OIDOID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", @@ -694,6 +758,7 @@ local_cache_pages(PG_FUNCTION_ARGS) for (int i = 0; i < BLOCKS_PER_CHUNK; i++) n_pages += (entry->bitmap[i >> 5] & (1 << (i & 31))) != 0; } + hash_seq_term(&status); fctx->record = (LocalCachePagesRec *) MemoryContextAllocHuge(CurrentMemoryContext, sizeof(LocalCachePagesRec) * n_pages); @@ -722,9 +787,9 @@ local_cache_pages(PG_FUNCTION_ARGS) if (entry->bitmap[i >> 5] & (1 << (i & 31))) { fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i; - fctx->record[n_pages].relfilenode = entry->key.rnode.relNode; - fctx->record[n_pages].reltablespace = entry->key.rnode.spcNode; - fctx->record[n_pages].reldatabase = entry->key.rnode.dbNode; + 
fctx->record[n_pages].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); + fctx->record[n_pages].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key)); + fctx->record[n_pages].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key)); fctx->record[n_pages].forknum = entry->key.forkNum; fctx->record[n_pages].blocknum = entry->key.blockNum + i; fctx->record[n_pages].accesscount = entry->access_count; @@ -732,6 +797,7 @@ local_cache_pages(PG_FUNCTION_ARGS) } } } + hash_seq_term(&status); Assert(n_pages == funcctx->max_calls); LWLockRelease(lfc_lock); } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 4fdc7f8c82..c89de11594 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -442,7 +442,7 @@ pg_init_libpagestore(void) "Maximal attempts to reconnect to pages server (with 1 second timeout)", NULL, &max_reconnect_attempts, - 10, 0, INT_MAX, + 60, 0, INT_MAX, PGC_USERSET, 0, NULL, NULL, NULL); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index c7211ea05a..4850b0d6a1 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -33,6 +33,14 @@ void _PG_init(void); void _PG_init(void) { + /* + * Also load 'neon_rmgr'. This makes it unnecessary to list both 'neon' + * and 'neon_rmgr' in shared_preload_libraries. + */ +#if PG_VERSION_NUM >= 160000 + load_file("$libdir/neon_rmgr", false); +#endif + pg_init_libpagestore(); pg_init_walproposer(); @@ -40,9 +48,9 @@ _PG_init(void) pg_init_extension_server(); - // Important: This must happen after other parts of the extension - // are loaded, otherwise any settings to GUCs that were set before - // the extension was loaded will be removed. + // Important: This must happen after other parts of the extension + // are loaded, otherwise any settings to GUCs that were set before + // the extension was loaded will be removed. EmitWarningsOnPlaceholders("neon"); } diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h new file mode 100644 index 0000000000..8db0d5341e --- /dev/null +++ b/pgxn/neon/neon_pgversioncompat.h @@ -0,0 +1,112 @@ +/* + * Compatibility macros to cover up differences between supported PostgreSQL versions, + * to help with compiling the same sources for all of them. 
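This header solves at compile time, for the C extension, the same problem the Rust side now handles at run time with dispatch_pgversion!: select the per-version definition (RelFileNode vs RelFileLocator, differing field names) behind one set of names. A hedged Rust sketch of that kind of per-version dispatch; the module layout and constant values are placeholders, not the real bindings:

    // Placeholder per-version modules; values are not the real flag bits.
    mod v14 { pub const BKPIMAGE_APPLY: u8 = 0x04; }
    mod v15 { pub const BKPIMAGE_APPLY: u8 = 0x02; }
    mod v16 { pub const BKPIMAGE_APPLY: u8 = 0x02; }

    // Run-time selection of the per-version constant, one arm per supported major.
    fn bkpimage_apply_mask(pg_version: u32) -> u8 {
        match pg_version {
            14 => v14::BKPIMAGE_APPLY,
            15 => v15::BKPIMAGE_APPLY,
            16 => v16::BKPIMAGE_APPLY,
            v => panic!("unsupported PostgreSQL version {v}"),
        }
    }

    fn main() {
        let bimg_info: u8 = 0x02;
        assert!((bimg_info & bkpimage_apply_mask(16)) != 0);
        assert!((bimg_info & bkpimage_apply_mask(14)) == 0);
    }
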
+ */ + +#ifndef NEON_PGVERSIONCOMPAT_H +#define NEON_PGVERSIONCOMPAT_H + +#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId) + +#define RelFileInfoEquals(a, b) ( \ + NInfoGetSpcOid(a) == NInfoGetSpcOid(b) && \ + NInfoGetDbOid(a) == NInfoGetDbOid(b) && \ + NInfoGetRelNumber(a) == NInfoGetRelNumber(b) \ +) + +/* buftag population & RelFileNode/RelFileLocator rework */ +#if PG_MAJORVERSION_NUM < 16 + +#define InitBufferTag(tag, rfn, fn, bn) INIT_BUFFERTAG(*tag, *rfn, fn, bn) + +#define USE_RELFILENODE + +#define RELFILEINFO_HDR "storage/relfilenode.h" + +#define NRelFileInfo RelFileNode +#define NRelFileInfoBackend RelFileNodeBackend +#define NRelFileNumber Oid + +#define InfoFromRelation(rel) (rel)->rd_node +#define InfoFromSMgrRel(srel) (srel)->smgr_rnode.node +#define InfoBFromSMgrRel(srel) (srel)->smgr_rnode +#define InfoFromNInfoB(ninfob) ninfob.node + +#define RelFileInfoFmt(rinfo) \ + (rinfo).spcNode, \ + (rinfo).dbNode, \ + (rinfo).relNode + +#define RelFileInfoBackendFmt(ninfob) \ + (ninfob).backend, \ + (ninfob).node.spcNode, \ + (ninfob).node.dbNode, \ + (ninfob).node.relNode + +#define NInfoGetSpcOid(ninfo) (ninfo).spcNode +#define NInfoGetDbOid(ninfo) (ninfo).dbNode +#define NInfoGetRelNumber(ninfo) (ninfo).relNode + +#define CopyNRelFileInfoToBufTag(tag, rinfo) \ + do { \ + (tag).rnode = (rinfo); \ + } while (false); + +#define BufTagGetNRelFileInfo(tag) tag.rnode + +#define SMgrRelGetRelInfo(reln) \ + (reln->smgr_rnode.node) + +#define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers + +#else /* major version >= 16 */ + +#define USE_RELFILELOCATOR + +#define BUFFERTAGS_EQUAL(a, b) BufferTagsEqual(&(a), &(b)) + +#define RELFILEINFO_HDR "storage/relfilelocator.h" + +#define NRelFileInfo RelFileLocator +#define NRelFileInfoBackend RelFileLocatorBackend + +#define InfoFromRelation(rel) (rel)->rd_locator +#define InfoFromSMgrRel(srel) (srel)->smgr_rlocator.locator +#define InfoBFromSMgrRel(srel) (srel)->smgr_rlocator +#define InfoFromNInfoB(ninfob) (ninfob).locator + +#define RelFileInfoFmt(rinfo) \ + (rinfo).spcOid, \ + (rinfo).dbOid, \ + (rinfo).relNumber +#define RelFileInfoBackendFmt(ninfob) \ + (ninfob).backend, \ + (ninfob).locator.spcOid, \ + (ninfob).locator.dbOid, \ + (ninfob).locator.relNumber + +#define NInfoGetSpcOid(ninfo) (ninfo).spcOid +#define NInfoGetDbOid(ninfo) (ninfo).dbOid +#define NInfoGetRelNumber(ninfo) (ninfo).relNumber + +#define CopyNRelFileInfoToBufTag(tag, rinfo) \ + do { \ + (tag).spcOid = (rinfo).spcOid; \ + (tag).dbOid = (rinfo).dbOid; \ + (tag).relNumber = (rinfo).relNumber; \ + } while (false); + +#define BufTagGetNRelFileInfo(tag) \ + ((RelFileLocator) { \ + .spcOid = (tag).spcOid, \ + .dbOid = (tag).dbOid, \ + .relNumber = (tag).relNumber, \ + }) + +#define SMgrRelGetRelInfo(reln) \ + ((reln)->smgr_rlocator) + +#define DropRelationAllLocalBuffers DropRelationAllLocalBuffers +#endif + +#endif //NEON_PGVERSIONCOMPAT_H diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 2889db49bc..d61f74b5c8 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -14,9 +14,10 @@ #define pageserver_h #include "postgres.h" +#include "neon_pgversioncompat.h" #include "access/xlogdefs.h" -#include "storage/relfilenode.h" +#include RELFILEINFO_HDR #include "storage/block.h" #include "storage/smgr.h" #include "lib/stringinfo.h" @@ -71,14 +72,14 @@ typedef struct typedef struct { NeonRequest req; - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; } NeonExistsRequest; typedef 
struct { NeonRequest req; - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; } NeonNblocksRequest; @@ -91,7 +92,7 @@ typedef struct typedef struct { NeonRequest req; - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blkno; } NeonGetPageRequest; @@ -164,7 +165,7 @@ extern char *neon_tenant; extern bool wal_redo; extern int32 max_cluster_size; -extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode); +extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo); extern void smgr_init_neon(void); extern void readahead_buffer_resize(int newsize, void *extra); @@ -175,19 +176,35 @@ extern void neon_open(SMgrRelation reln); extern void neon_close(SMgrRelation reln, ForkNumber forknum); extern void neon_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern bool neon_exists(SMgrRelation reln, ForkNumber forknum); -extern void neon_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void neon_unlink(NRelFileInfoBackend rnode, ForkNumber forknum, bool isRedo); +#if PG_MAJORVERSION_NUM < 16 extern void neon_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); +#else +extern void neon_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); +extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nbuffers, bool skipFsync); +#endif + extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); + +#if PG_MAJORVERSION_NUM < 16 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); - -extern void neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, +extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr request_lsn, bool request_latest, char *buffer); - extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); +#else +extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void *buffer); +extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, void *buffer); +extern void neon_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); +#endif extern void neon_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); @@ -198,16 +215,22 @@ extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum); /* utils for neon relsize cache */ extern void relsize_hash_init(void); -extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size); -extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); -extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); -extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); +extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size); +extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size); +extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size); +extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum); /* functions for local file cache */ -extern void 
lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); -extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); -extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno); -extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno); +#if PG_MAJORVERSION_NUM < 16 +extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + char *buffer); +#else +extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + const void *buffer); +#endif +extern bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer); +extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); +extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 76d71dd94b..919bca03e9 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -58,7 +58,6 @@ #include "postmaster/autovacuum.h" #include "replication/walsender.h" #include "storage/bufmgr.h" -#include "storage/relfilenode.h" #include "storage/buf_internals.h" #include "storage/smgr.h" #include "storage/md.h" @@ -86,7 +85,10 @@ static char *hexdump_page(char *page); #endif -#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) +#define IS_LOCAL_REL(reln) (\ + NInfoGetDbOid(InfoFromSMgrRel(reln)) != 0 && \ + NInfoGetRelNumber(InfoFromSMgrRel(reln)) > FirstNormalObjectId \ +) const int SmgrTrace = DEBUG5; @@ -160,6 +162,7 @@ typedef enum PrefetchStatus { typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ XLogRecPtr effective_request_lsn; + XLogRecPtr actual_request_lsn; NeonResponse *response; /* may be null */ PrefetchStatus status; uint64 my_ring_index; @@ -255,7 +258,7 @@ static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); -static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode, +static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno); static bool @@ -314,6 +317,7 @@ compact_prefetch_buffers(void) target_slot->status = source_slot->status; target_slot->response = source_slot->response; target_slot->effective_request_lsn = source_slot->effective_request_lsn; + target_slot->actual_request_lsn = source_slot->actual_request_lsn; target_slot->my_ring_index = empty_ring_index; prfh_delete(MyPState->prf_hash, source_slot); @@ -634,7 +638,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force .req.tag = T_NeonGetPageRequest, .req.latest = false, .req.lsn = 0, - .rnode = slot->buftag.rnode, + .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, .blkno = slot->buftag.blockNum, }; @@ -643,13 +647,13 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force { request.req.lsn = *force_lsn; request.req.latest = *force_latest; - slot->effective_request_lsn = *force_lsn; + slot->actual_request_lsn = slot->effective_request_lsn = *force_lsn; } else { XLogRecPtr lsn = neon_get_request_lsn( &request.req.latest, - slot->buftag.rnode, + BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum ); @@ -671,7 +675,7 @@ prefetch_do_request(PrefetchRequest 
*slot, bool *force_latest, XLogRecPtr *force * The best LSN to use for effective_request_lsn would be * XLogCtl->Insert.RedoRecPtr, but that's expensive to access. */ - request.req.lsn = lsn; + slot->actual_request_lsn = request.req.lsn = lsn; prefetch_lsn = Max(prefetch_lsn, lsn); slot->effective_request_lsn = prefetch_lsn; } @@ -893,9 +897,9 @@ nm_pack_request(NeonRequest * msg) pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); pq_sendbyte(&s, msg_req->forknum); break; @@ -906,9 +910,9 @@ nm_pack_request(NeonRequest * msg) pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); pq_sendbyte(&s, msg_req->forknum); break; @@ -929,9 +933,9 @@ nm_pack_request(NeonRequest * msg) pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); pq_sendbyte(&s, msg_req->forknum); pq_sendint32(&s, msg_req->blkno); @@ -1063,10 +1067,7 @@ nm_to_string(NeonMessage * msg) NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); @@ -1079,10 +1080,7 @@ nm_to_string(NeonMessage * msg) NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); @@ -1095,10 +1093,7 @@ nm_to_string(NeonMessage * msg) NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); @@ -1187,13 +1182,13 @@ 
nm_to_string(NeonMessage * msg) * directly because it skips the logging if the LSN is new enough. */ static XLogRecPtr -log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, +log_newpage_copy(NRelFileInfo *rinfo, ForkNumber forkNum, BlockNumber blkno, Page page, bool page_std) { PGAlignedBlock copied_buffer; memcpy(copied_buffer.data, page, BLCKSZ); - return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); + return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std); } /* @@ -1210,9 +1205,14 @@ PageIsEmptyHeapPage(char *buffer) } static void -neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) +neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +#if PG_MAJORVERSION_NUM < 16 + char *buffer, bool force) +#else + const char *buffer, bool force) +#endif { - XLogRecPtr lsn = PageGetLSN(buffer); + XLogRecPtr lsn = PageGetLSN((Page) buffer); if (ShutdownRequestPending) return; @@ -1232,15 +1232,14 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch /* FSM is never WAL-logged and we don't care. */ XLogRecPtr recptr; - recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum, + (Page) buffer, false); XLogFlush(recptr); lsn = recptr; ereport(SmgrTrace, (errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); } else if (lsn == InvalidXLogRecPtr) @@ -1263,24 +1262,20 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch * sign: it implies that the page was not WAL-logged, and its contents * will be lost when it's evicted. */ - if (PageIsNew(buffer)) + if (PageIsNew((Page) buffer)) { ereport(SmgrTrace, (errmsg("Page %u of relation %u/%u/%u.%u is all-zeros", blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); } - else if (PageIsEmptyHeapPage(buffer)) + else if (PageIsEmptyHeapPage((Page) buffer)) { ereport(SmgrTrace, (errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); } else @@ -1288,9 +1283,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch ereport(PANIC, (errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN", blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); } } @@ -1299,9 +1292,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch ereport(SmgrTrace, (errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); } @@ -1309,7 +1300,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it. 
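The comment above is the core consistency rule for a storage-less compute: each write or eviction records the page LSN, and a later GetPage request must ask the pageserver for at least that LSN so it can never observe an older page version. A toy model of that bookkeeping (SetLastWrittenLSNForBlock and GetLastWrittenLSN are the real entry points; the structure and the global fallback below are simplified assumptions):

    use std::collections::HashMap;

    type Lsn = u64;
    // (relfilenode, forknum, blkno); a real key also carries spc/db OIDs.
    type PageKey = (u32, u8, u32);

    #[derive(Default)]
    struct LastWrittenLsn {
        per_page: HashMap<PageKey, Lsn>,
        global: Lsn, // fallback for pages with no per-page entry
    }

    impl LastWrittenLsn {
        // Called on write/eviction: remember the LSN this page was valid at.
        fn set_for_block(&mut self, key: PageKey, lsn: Lsn) {
            let entry = self.per_page.entry(key).or_insert(0);
            *entry = (*entry).max(lsn);
            self.global = self.global.max(lsn);
        }

        // LSN to put into a GetPage request so the pageserver never returns
        // a version of the page older than the one last written here.
        fn request_lsn(&self, key: PageKey) -> Lsn {
            *self.per_page.get(&key).unwrap_or(&self.global)
        }
    }

    fn main() {
        let mut lw = LastWrittenLsn::default();
        lw.set_for_block((16384, 0, 7), 0x1000);
        assert_eq!(lw.request_lsn((16384, 0, 7)), 0x1000);
        assert_eq!(lw.request_lsn((16384, 0, 8)), 0x1000); // global fallback
    }
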
*/ - SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forknum, blocknum); + SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forknum, blocknum); } /* @@ -1379,7 +1370,7 @@ nm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) +neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) { XLogRecPtr lsn; @@ -1394,7 +1385,7 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN /* * Get the last written LSN of this page. */ - lsn = GetLastWrittenLSN(rnode, forknum, blkno); + lsn = GetLastWrittenLSN(rinfo, forknum, blkno); lsn = nm_adjust_lsn(lsn); elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", @@ -1416,7 +1407,7 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN * so our request cannot concern those. */ *latest = true; - lsn = GetLastWrittenLSN(rnode, forknum, blkno); + lsn = GetLastWrittenLSN(rinfo, forknum, blkno); Assert(lsn != InvalidXLogRecPtr); elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); @@ -1485,7 +1476,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) + if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks)) { return true; } @@ -1500,20 +1491,26 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) * * For now, handle that special case here. */ +#if PG_MAJORVERSION_NUM >= 16 + if (reln->smgr_rlocator.locator.spcOid == 0 && + reln->smgr_rlocator.locator.dbOid == 0 && + reln->smgr_rlocator.locator.relNumber == 0) +#else if (reln->smgr_rnode.node.spcNode == 0 && reln->smgr_rnode.node.dbNode == 0 && reln->smgr_rnode.node.relNode == 0) +#endif { return false; } - request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, .req.latest = latest, .req.lsn = request_lsn, - .rnode = reln->smgr_rnode.node, + .rinfo = InfoFromSMgrRel(reln), .forknum = forkNum}; resp = page_server_request(&request); @@ -1529,9 +1526,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", @@ -1571,9 +1566,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) } elog(SmgrTrace, "Create relation %u/%u/%u.%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum); /* @@ -1597,12 +1590,12 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) */ if (isRedo) { - update_cached_relsize(reln->smgr_rnode.node, forkNum, 0); - get_cached_relsize(reln->smgr_rnode.node, forkNum, + update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0); + get_cached_relsize(InfoFromSMgrRel(reln), forkNum, 
&reln->smgr_cached_nblocks[forkNum]); } else - set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1629,17 +1622,17 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * we are usually not in a transaction anymore when this is called. */ void -neon_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) +neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo) { /* * Might or might not exist locally, depending on whether it's an unlogged * or permanent relation (or if DEBUG_COMPARE_LOCAL is set). Try to * unlink, it won't do any harm if the file doesn't exist. */ - mdunlink(rnode, forkNum, isRedo); - if (!RelFileNodeBackendIsTemp(rnode)) + mdunlink(rinfo, forkNum, isRedo); + if (!NRelFileInfoBackendIsTemp(rinfo)) { - forget_cached_relsize(rnode.node, forkNum); + forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum); } } @@ -1653,8 +1646,13 @@ neon_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) * causes intervening file space to become filled with zeroes. */ void +#if PG_MAJORVERSION_NUM < 16 neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer, bool skipFsync) +#else +neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + const void *buffer, bool skipFsync) +#endif { XLogRecPtr lsn; BlockNumber n_blocks = 0; @@ -1707,17 +1705,15 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, neon_wallog_page(reln, forkNum, n_blocks++, buffer, true); neon_wallog_page(reln, forkNum, blkno, buffer, false); - set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); + set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1); - lsn = PageGetLSN(buffer); + lsn = PageGetLSN((Page) buffer); elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, blkno, (uint32) (lsn >> 32), (uint32) lsn); - lfc_write(reln->smgr_rnode.node, forkNum, blkno, buffer); + lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1732,11 +1728,98 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (lsn == InvalidXLogRecPtr) { lsn = GetXLogInsertRecPtr(); - SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forkNum, blkno); + SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forkNum, blkno); } - SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum); + SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forkNum); } +#if PG_MAJORVERSION_NUM >= 16 +void +neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, + int nblocks, bool skipFsync) +{ + const PGAlignedBlock buffer = {0}; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; + XLogRecPtr lsn = 0; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (max_cluster_size > 0 && + reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && + !IsAutoVacuumWorkerProcess()) + { + uint64 current_size = 
GetZenithCurrentClusterSize(); + + if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", + max_cluster_size), + errhint("This limit is defined by neon.max_cluster_size GUC"))); + } + + /* + * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any + * more --- we mustn't create a block whose number actually is + * InvalidBlockNumber or larger. + */ + if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend file \"%s\" beyond %u blocks", + relpath(reln->smgr_rlocator, forkNum), + InvalidBlockNumber))); + + /* Don't log any pages if we're not allowed to do so. */ + if (!XLogInsertAllowed()) + return; + + while (remblocks > 0) + { + int count = Min(remblocks, XLR_MAX_BLOCK_ID); + + XLogBeginInsert(); + + for (int i = 0; i < count; i++) + XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i, + (char *) buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + + lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI); + + for (int i = 0; i < count; i++) + { + lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data); + SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forkNum, + blocknum + i); + } + + blocknum += count; + remblocks -= count; + } + + Assert(lsn != 0); + + SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forkNum); + set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum); +} +#endif + /* * neon_open() -- Initialize newly-opened relation. */ @@ -1792,14 +1875,14 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - if (lfc_cache_contains(reln->smgr_rnode.node, forknum, blocknum)) + if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum)) return false; tag = (BufferTag) { - .rnode = reln->smgr_rnode.node, .forkNum = forknum, .blockNum = blocknum }; + CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); ring_index = prefetch_register_buffer(tag, NULL, NULL); @@ -1851,9 +1934,15 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, * While function is defined in the neon extension it's used within neon_test_utils directly. * To avoid breaking tests in the runtime please keep function signature in sync. 
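On the neon_zeroextend hunk above: the loop registers at most XLR_MAX_BLOCK_ID full-page images per XLOG_FPI record and repeats until every new block is covered, recording the resulting LSN for each block. A small sketch of just that batching arithmetic (the constant is assumed to be 32, as in PostgreSQL's xlogrecord.h; the helper is invented for illustration):

    const XLR_MAX_BLOCK_ID: u32 = 32; // assumed, as in PostgreSQL's xlogrecord.h

    // (start_block, count) pairs that one zero-extend call turns into
    // individual XLOG_FPI records.
    fn zeroextend_batches(mut blocknum: u32, mut remblocks: u32) -> Vec<(u32, u32)> {
        let mut batches = Vec::new();
        while remblocks > 0 {
            let count = remblocks.min(XLR_MAX_BLOCK_ID);
            batches.push((blocknum, count));
            blocknum += count;
            remblocks -= count;
        }
        batches
    }

    fn main() {
        // Extending by 70 blocks starting at block 100 needs three WAL records.
        assert_eq!(
            zeroextend_batches(100, 70),
            vec![(100, 32), (132, 32), (164, 6)]
        );
    }
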
*/ -void -neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, +#if PG_MAJORVERSION_NUM < 16 +void PGDLLEXPORT +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr request_lsn, bool request_latest, char *buffer) +#else +void PGDLLEXPORT +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, void *buffer) +#endif { NeonResponse *resp; BufferTag buftag; @@ -1862,11 +1951,12 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, PrefetchRequest *slot; buftag = (BufferTag) { - .rnode = rnode, .forkNum = forkNum, .blockNum = blkno, }; + CopyNRelFileInfoToBufTag(buftag, rinfo); + /* * The redo process does not lock pages that it needs to replay but are * not in the shared buffers, so a concurrent process may request the @@ -1957,7 +2047,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, { case T_NeonGetPageResponse: memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); - lfc_write(rnode, forkNum, blkno, buffer); + lfc_write(rinfo, forkNum, blkno, buffer); break; case T_NeonErrorResponse: @@ -1965,9 +2055,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, (errcode(ERRCODE_IO_ERROR), errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", blkno, - rnode.spcNode, - rnode.dbNode, - rnode.relNode, + RelFileInfoFmt(rinfo), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", @@ -1987,7 +2075,11 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, */ void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, +#if PG_MAJORVERSION_NUM < 16 char *buffer) +#else + void *buffer) +#endif { bool latest; XLogRecPtr request_lsn; @@ -2010,13 +2102,13 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, } /* Try to read from local file cache */ - if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer)) + if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) { return; } - request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); - neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); + request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, blkno); + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -2030,27 +2122,23 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, memcpy(pageserver_masked, buffer, BLCKSZ); memcpy(mdbuf_masked, mdbuf, BLCKSZ); - if (PageIsNew(mdbuf)) + if (PageIsNew((Page) mdbuf)) { - if (!PageIsNew(pageserver_masked)) + if (!PageIsNew((Page) pageserver_masked)) { elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, hexdump_page(buffer)); } } - else if (PageIsNew(buffer)) + else if (PageIsNew((Page) buffer)) { elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + 
RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, hexdump_page(mdbuf)); @@ -2065,9 +2153,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, hexdump_page(mdbuf_masked), @@ -2086,9 +2172,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, hexdump_page(mdbuf_masked), @@ -2130,7 +2214,11 @@ hexdump_page(char *page) */ void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +#if PG_MAJORVERSION_NUM < 16 char *buffer, bool skipFsync) +#else + const void *buffer, bool skipFsync) +#endif { XLogRecPtr lsn; @@ -2168,15 +2256,13 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, neon_wallog_page(reln, forknum, blocknum, buffer, false); - lsn = PageGetLSN(buffer); + lsn = PageGetLSN((Page) buffer); elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, blocknum, (uint32) (lsn >> 32), (uint32) lsn); - lfc_write(reln->smgr_rnode.node, forknum, blocknum, buffer); + lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -2212,23 +2298,21 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) + if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks)) { elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, n_blocks); return n_blocks; } - request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, .req.latest = latest, .req.lsn = request_lsn, - .rnode = reln->smgr_rnode.node, + .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; @@ -2245,9 +2329,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", @@ -2257,12 +2339,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) default: elog(ERROR, "unexpected response from page server with tag 0x%02x", 
resp->tag); } - update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks); + update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, n_blocks); @@ -2281,7 +2361,7 @@ neon_dbsize(Oid dbNode) int64 db_size; XLogRecPtr request_lsn; bool latest; - RelFileNode dummy_node = {InvalidOid, InvalidOid, InvalidOid}; + NRelFileInfo dummy_node = {0}; request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); { @@ -2350,7 +2430,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks); + set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks); /* * Truncating a relation drops all its buffers from the buffer cache @@ -2378,7 +2458,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * for the extended pages, so there's no harm in leaving behind obsolete * entries for the truncated chunks. */ - SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forknum); + SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forknum); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -2448,9 +2528,7 @@ neon_start_unlogged_build(SMgrRelation reln) ereport(SmgrTrace, (errmsg("starting unlogged build of relation %u/%u/%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode))); + RelFileInfoFmt(InfoFromSMgrRel(reln))))); switch (reln->smgr_relpersistence) { @@ -2500,9 +2578,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) ereport(SmgrTrace, (errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode))); + RelFileInfoFmt(InfoFromSMgrRel(reln))))); if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) return; @@ -2525,18 +2601,16 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) static void neon_end_unlogged_build(SMgrRelation reln) { + NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln); + Assert(unlogged_build_rel == reln); ereport(SmgrTrace, (errmsg("ending unlogged build of relation %u/%u/%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode))); + RelFileInfoFmt(InfoFromNInfoB(rinfob))))); if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) { - RelFileNodeBackend rnode; - Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); @@ -2544,19 +2618,17 @@ neon_end_unlogged_build(SMgrRelation reln) reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; /* Remove local copy */ - rnode = reln->smgr_rnode; + rinfob = InfoBFromSMgrRel(reln); for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) { elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u", - rnode.node.spcNode, - rnode.node.dbNode, - rnode.node.relNode, + RelFileInfoFmt(InfoFromNInfoB(rinfob)), forknum); - forget_cached_relsize(rnode.node, forknum); + forget_cached_relsize(InfoFromNInfoB(rinfob), forknum); mdclose(reln, forknum); /* use isRedo == true, so that we drop it immediately */ - mdunlink(rnode, forknum, true); + mdunlink(rinfob, forknum, 
true); } } @@ -2608,6 +2680,9 @@ static const struct f_smgr neon_smgr = .smgr_exists = neon_exists, .smgr_unlink = neon_unlink, .smgr_extend = neon_extend, +#if PG_MAJORVERSION_NUM >= 16 + .smgr_zeroextend = neon_zeroextend, +#endif .smgr_prefetch = neon_prefetch, .smgr_read = neon_read, .smgr_write = neon_write, @@ -2622,12 +2697,12 @@ static const struct f_smgr neon_smgr = }; const f_smgr * -smgr_neon(BackendId backend, RelFileNode rnode) +smgr_neon(BackendId backend, NRelFileInfo rinfo) { /* Don't use page server for temp relations */ if (backend != InvalidBackendId) - return smgr_standard(backend, rnode); + return smgr_standard(backend, rinfo); else return &neon_smgr; } @@ -2681,7 +2756,7 @@ bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) { XLogRecPtr end_recptr = record->EndRecPtr; - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blkno; BufferTag tag; @@ -2695,10 +2770,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) return true; #if PG_VERSION_NUM < 150000 - if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) + if (!XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno)) elog(PANIC, "failed to locate backup block with ID %d", block_id); #else - XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno); + XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno); #endif /* @@ -2706,10 +2781,13 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) * regardless of whether the block is stored in shared buffers. * See also this function's top comment. */ - if (!OidIsValid(rnode.dbNode)) + if (!OidIsValid(NInfoGetDbOid(rinfo))) return false; - INIT_BUFFERTAG(tag, rnode, forknum, blkno); + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forknum; + tag.blockNum = blkno; + hash = BufTableHashCode(&tag); partitionLock = BufMappingPartitionLock(hash); @@ -2725,24 +2803,24 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) no_redo_needed = buffer < 0; /* In both cases st lwlsn past this WAL record */ - SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno); + SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); /* we don't have the buffer in memory, update lwLsn past this record, * also evict page fro file cache */ if (no_redo_needed) - lfc_evict(rnode, forknum, blkno); + lfc_evict(rinfo, forknum, blkno); LWLockRelease(partitionLock); /* Extend the relation if we know its size */ - if (get_cached_relsize(rnode, forknum, &relsize)) + if (get_cached_relsize(rinfo, forknum, &relsize)) { if (relsize < blkno + 1) { - update_cached_relsize(rnode, forknum, blkno + 1); - SetLastWrittenLSNForRelation(end_recptr, rnode, forknum); + update_cached_relsize(rinfo, forknum, blkno + 1); + SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum); } } else @@ -2763,7 +2841,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) .latest = false, .tag = T_NeonNblocksRequest, }, - .rnode = rnode, + .rinfo = rinfo, .forknum = forknum, }; @@ -2774,8 +2852,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) Assert(nbresponse->n_blocks > blkno); - set_cached_relsize(rnode, forknum, nbresponse->n_blocks); - SetLastWrittenLSNForRelation(end_recptr, rnode, forknum); + set_cached_relsize(rinfo, forknum, nbresponse->n_blocks); + SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum); elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks); } diff --git a/pgxn/neon/relsize_cache.c 
b/pgxn/neon/relsize_cache.c index d4262c730a..b13134b5c3 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -14,8 +14,10 @@ */ #include "postgres.h" +#include "neon_pgversioncompat.h" + #include "pagestore_client.h" -#include "storage/relfilenode.h" +#include RELFILEINFO_HDR #include "storage/smgr.h" #include "storage/lwlock.h" #include "storage/ipc.h" @@ -30,7 +32,7 @@ typedef struct { - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; } RelTag; @@ -75,7 +77,7 @@ neon_smgr_shmem_startup(void) } bool -get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size) +get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size) { bool found = false; @@ -84,7 +86,7 @@ get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size) RelTag tag; RelSizeEntry *entry; - tag.rnode = rnode; + tag.rinfo = rinfo; tag.forknum = forknum; LWLockAcquire(relsize_lock, LW_SHARED); entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); @@ -99,14 +101,14 @@ get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size) } void -set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) { if (relsize_hash_size > 0) { RelTag tag; RelSizeEntry *entry; - tag.rnode = rnode; + tag.rinfo = rinfo; tag.forknum = forknum; LWLockAcquire(relsize_lock, LW_EXCLUSIVE); entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL); @@ -116,7 +118,7 @@ set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) } void -update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) { if (relsize_hash_size > 0) { @@ -124,7 +126,7 @@ update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) RelSizeEntry *entry; bool found; - tag.rnode = rnode; + tag.rinfo = rinfo; tag.forknum = forknum; LWLockAcquire(relsize_lock, LW_EXCLUSIVE); entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found); @@ -135,13 +137,13 @@ update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) } void -forget_cached_relsize(RelFileNode rnode, ForkNumber forknum) +forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum) { if (relsize_hash_size > 0) { RelTag tag; - tag.rnode = rnode; + tag.rinfo = rinfo; tag.forknum = forknum; LWLockAcquire(relsize_lock, LW_EXCLUSIVE); hash_search(relsize_hash, &tag, HASH_REMOVE, NULL); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index d9999ef2b1..a9342bd984 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -51,6 +51,9 @@ #include "libpq/pqformat.h" #include "replication/slot.h" #include "replication/walreceiver.h" +#if PG_VERSION_NUM >= 160000 +#include "replication/walsender_private.h" +#endif #include "postmaster/bgworker.h" #include "postmaster/interrupt.h" #include "postmaster/postmaster.h" @@ -73,10 +76,10 @@ static bool syncSafekeepers = false; -char *wal_acceptors_list; -int wal_acceptor_reconnect_timeout; -int wal_acceptor_connection_timeout; -bool am_wal_proposer; +char *wal_acceptors_list = ""; +int wal_acceptor_reconnect_timeout = 1000; +int wal_acceptor_connection_timeout = 10000; +bool am_wal_proposer = false; #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" @@ -191,7 +194,7 @@ pg_init_walproposer(void) /* * Entry point for `postgres --sync-safekeepers`. 
*/ -void +PGDLLEXPORT void WalProposerSync(int argc, char *argv[]) { struct stat stat_buf; @@ -315,7 +318,7 @@ nwp_shmem_startup_hook(void) /* * WAL proposer bgworker entry point. */ -void +PGDLLEXPORT void WalProposerMain(Datum main_arg) { #if PG_VERSION_NUM >= 150000 @@ -383,21 +386,55 @@ WalProposerPoll(void) { while (true) { - Safekeeper *sk; - int rc; - WaitEvent event; + Safekeeper *sk = NULL; + bool wait_timeout = false; + bool late_cv_trigger = false; + WaitEvent event = {0}; + int rc = 0; TimestampTz now = GetCurrentTimestamp(); + long timeout = TimeToReconnect(now); - rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), +#if PG_MAJORVERSION_NUM >= 16 + if (WalSndCtl != NULL) + ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv); +#endif + + /* + * Wait for a wait event to happen, or timeout: + * - Safekeeper socket can become available for READ or WRITE + * - Our latch got set, because + * * PG15-: We got woken up by a process triggering the WalSender + * * PG16+: WalSndCtl->wal_flush_cv was triggered + */ + rc = WaitEventSetWait(waitEvents, timeout, &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - sk = (Safekeeper *) event.user_data; +#if PG_MAJORVERSION_NUM >= 16 + if (WalSndCtl != NULL) + late_cv_trigger = ConditionVariableCancelSleep(); +#endif + /* + * If wait is terminated by latch set (walsenders' latch is set on + * each wal flush), then exit loop. (no need for pm death check due to + * WL_EXIT_ON_PM_DEATH) + */ + if ((rc == 1 && event.events & WL_LATCH_SET) || late_cv_trigger) + { + /* Reset our latch */ + ResetLatch(MyLatch); + + break; + } + /* * If the event contains something that one of our safekeeper states * was waiting for, we'll advance its state. */ - if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) + if (rc == 1 && (event.events & (WL_SOCKET_MASK))) + { + sk = (Safekeeper *) event.user_data; AdvancePollState(sk, event.events); + } /* * If the timeout expired, attempt to reconnect to any safekeepers @@ -405,15 +442,26 @@ WalProposerPoll(void) */ ReconnectSafekeepers(); - /* - * If wait is terminated by latch set (walsenders' latch is set on - * each wal flush), then exit loop. (no need for pm death check due to - * WL_EXIT_ON_PM_DEATH) - */ - if (rc != 0 && (event.events & WL_LATCH_SET)) + if (rc == 0) /* timeout expired */ { - ResetLatch(MyLatch); - break; + wait_timeout = true; + + /* + * Ensure flushrecptr is set to a recent value. This fixes a case + * where we've not been notified of new WAL records when we were + * planning on consuming them. + */ + if (!syncSafekeepers) { + XLogRecPtr flushed; + +#if PG_MAJORVERSION_NUM < 15 + flushed = GetFlushRecPtr(); +#else + flushed = GetFlushRecPtr(NULL); +#endif + if (flushed > availableLsn) + break; + } } now = GetCurrentTimestamp(); @@ -611,7 +659,8 @@ UpdateEventSet(Safekeeper *sk, uint32 events) ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); } -/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. +/* + * Hack: provides a way to remove the event corresponding to an individual walproposer from the set. * * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. 
*/ @@ -1408,7 +1457,12 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec elog(FATAL, "could not append password to the safekeeper connection string"); } +#if PG_MAJORVERSION_NUM < 16 wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); +#else + wrconn = walrcv_connect(conninfo, false, false, "wal_proposer_recovery", &err); +#endif + if (!wrconn) { ereport(WARNING, @@ -2242,9 +2296,10 @@ HandleSafekeeperResponse(void) if (synced) n_synced++; } + if (n_synced >= quorum) { - /* All safekeepers synced! */ + /* A quorum of safekeepers has been synced! */ /* * Send empty message to broadcast latest truncateLsn to all safekeepers. @@ -2539,8 +2594,15 @@ backpressure_throttling_impl(void) ? PrevProcessInterruptsCallback() : false; - /* Don't throttle read only transactions and wal sender. */ - if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + /* + * Don't throttle read only transactions or wal sender. + * Do throttle CREATE INDEX CONCURRENTLY, however. It performs some + * stages outside a transaction, even though it writes a lot of WAL. + * Check PROC_IN_SAFE_IC flag to cover that case. + */ + if (am_walsender + || (!(MyProc->statusFlags & PROC_IN_SAFE_IC) + && !TransactionIdIsValid(GetCurrentTransactionIdIfAny()))) return retry; /* Calculate replicas lag */ diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 615fbf9399..fa1ba30a8f 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -379,8 +379,8 @@ typedef struct Safekeeper AppendResponse appendResponse; /* feedback for master */ } Safekeeper; -extern void WalProposerSync(int argc, char *argv[]); -extern void WalProposerMain(Datum main_arg); +extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); +extern void PGDLLEXPORT WalProposerMain(Datum main_arg); extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); extern void WalProposerPoll(void); extern void ParsePageserverFeedbackMessage(StringInfo reply_message, diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c index 9e1fc11756..05030360f6 100644 --- a/pgxn/neon/walproposer_utils.c +++ b/pgxn/neon/walproposer_utils.c @@ -25,6 +25,9 @@ #include "access/xlogutils.h" #include "access/xlogrecovery.h" #endif +#if PG_MAJORVERSION_NUM >= 16 +#include "utils/guc.h" +#endif /* * These variables are used similarly to openLogFile/SegNo, @@ -558,11 +561,11 @@ StartProposerReplication(StartReplicationCmd *cmd) static void WalSndLoop(void) { + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + for (;;) { - /* Clear any already-pending wakeups */ - ResetLatch(MyLatch); - CHECK_FOR_INTERRUPTS(); XLogBroadcastWalProposer(); diff --git a/pgxn/neon_rmgr/Makefile b/pgxn/neon_rmgr/Makefile new file mode 100644 index 0000000000..20f0a78d79 --- /dev/null +++ b/pgxn/neon_rmgr/Makefile @@ -0,0 +1,19 @@ +# pgxs/neon/Makefile + + +MODULE_big = neon_rmgr +OBJS = \ + $(WIN32RES) \ + neon_rmgr.o \ + neon_rmgr_decode.o \ + neon_rmgr_desc.o + + +EXTENSION = neon_rmgr +DATA = +PGFILEDESC = "Neon WAL Resource Manager - custom WAL records used to make Neon work (since PG 16)" + + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/pgxn/neon_rmgr/neon_rmgr.c b/pgxn/neon_rmgr/neon_rmgr.c new file mode 100644 index 0000000000..496ca08c08 --- /dev/null +++ b/pgxn/neon_rmgr/neon_rmgr.c @@ -0,0 +1,886 @@ +#include "postgres.h" +#include "fmgr.h" + +#if PG_MAJORVERSION_NUM >= 16 +#include "access/bufmask.h" +#include 
"access/heapam_xlog.h" +#include "access/htup_details.h" +#include "access/neon_xlog.h" +#include "access/rmgr.h" +#include "access/visibilitymap.h" +#include "access/xlog_internal.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "storage/bufpage.h" +#include "storage/freespace.h" +#include "neon_rmgr.h" + +PG_MODULE_MAGIC; +void _PG_init(void); + +static void neon_rm_redo(XLogReaderState *record); +static void neon_rm_startup(void); +static void neon_rm_cleanup(void); +static void neon_rm_mask(char *pagedata, BlockNumber blkno); + +static void redo_neon_heap_insert(XLogReaderState *record); +static void redo_neon_heap_delete(XLogReaderState *record); +static void redo_neon_heap_update(XLogReaderState *record, bool hot_update); +static void redo_neon_heap_lock(XLogReaderState *record); +static void redo_neon_heap_multi_insert(XLogReaderState *record); + +const static RmgrData NeonRmgr = { + .rm_name = "neon", + .rm_redo = neon_rm_redo, + .rm_desc = neon_rm_desc, + .rm_identify = neon_rm_identify, + .rm_startup = neon_rm_startup, + .rm_cleanup = neon_rm_cleanup, + .rm_mask = neon_rm_mask, + .rm_decode = neon_rm_decode, +}; + +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + RegisterCustomRmgr(RM_NEON_ID, &NeonRmgr); +} + +static void +neon_rm_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info & XLOG_NEON_OPMASK) + { + case XLOG_NEON_HEAP_INSERT: + redo_neon_heap_insert(record); + break; + case XLOG_NEON_HEAP_DELETE: + redo_neon_heap_delete(record); + break; + case XLOG_NEON_HEAP_UPDATE: + redo_neon_heap_update(record, false); + break; + case XLOG_NEON_HEAP_HOT_UPDATE: + redo_neon_heap_update(record, true); + break; + case XLOG_NEON_HEAP_LOCK: + redo_neon_heap_lock(record); + break; + case XLOG_NEON_HEAP_MULTI_INSERT: + redo_neon_heap_multi_insert(record); + break; + default: + elog(PANIC, "neon_rm_redo: unknown op code %u", info); + } +} + +static void +neon_rm_startup(void) +{ + /* nothing to do here */ +} + +static void +neon_rm_cleanup(void) +{ + /* nothing to do here */ +} + +static void +neon_rm_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + OffsetNumber off; + + mask_page_lsn_and_checksum(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) + { + ItemId iid = PageGetItemId(page, off); + char *page_item; + + page_item = (char *) (page + ItemIdGetOffset(iid)); + + if (ItemIdIsNormal(iid)) + { + HeapTupleHeader page_htup = (HeapTupleHeader) page_item; + + /* + * If xmin of a tuple is not yet frozen, we should ignore + * differences in hint bits, since they can be set without + * emitting WAL. + */ + if (!HeapTupleHeaderXminFrozen(page_htup)) + page_htup->t_infomask &= ~HEAP_XACT_MASK; + else + { + /* Still we need to mask xmax hint bits. */ + page_htup->t_infomask &= ~HEAP_XMAX_INVALID; + page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED; + } + + /* + * During replay, we set Command Id to FirstCommandId. Hence, mask + * it. See heap_xlog_insert() for details. + */ + page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER; + + /* + * For a speculative tuple, heap_insert() does not set ctid in the + * caller-passed heap tuple itself, leaving the ctid field to + * contain a speculative token value - a per-backend monotonically + * increasing identifier. Besides, it does not WAL-log ctid under + * any circumstances. 
+ * + * During redo, heap_xlog_insert() sets t_ctid to current block + * number and self offset number. It doesn't care about any + * speculative insertions on the primary. Hence, we set t_ctid to + * current block number and self offset number to ignore any + * inconsistency. + */ + if (HeapTupleHeaderIsSpeculative(page_htup)) + ItemPointerSet(&page_htup->t_ctid, blkno, off); + + /* + * NB: Not ignoring ctid changes due to the tuple having moved + * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's + * important information that needs to be in-sync between primary + * and standby, and thus is WAL logged. + */ + } + + /* + * Ignore any padding bytes after the tuple, when the length of the + * item is not MAXALIGNed. + */ + if (ItemIdHasStorage(iid)) + { + int len = ItemIdGetLength(iid); + int padlen = MAXALIGN(len) - len; + + if (padlen > 0) + memset(page_item + len, MASK_MARKER, padlen); + } + } +} + + +/* + * COPIED FROM heapam.c + * Given an "infobits" field from an XLog record, set the correct bits in the + * given infomask and infomask2 for the tuple touched by the record. + * + * (This is the reverse of compute_infobits). + */ +static void +fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) +{ + *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | + HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK); + *infomask2 &= ~HEAP_KEYS_UPDATED; + + if (infobits & XLHL_XMAX_IS_MULTI) + *infomask |= HEAP_XMAX_IS_MULTI; + if (infobits & XLHL_XMAX_LOCK_ONLY) + *infomask |= HEAP_XMAX_LOCK_ONLY; + if (infobits & XLHL_XMAX_EXCL_LOCK) + *infomask |= HEAP_XMAX_EXCL_LOCK; + /* note HEAP_XMAX_SHR_LOCK isn't considered here */ + if (infobits & XLHL_XMAX_KEYSHR_LOCK) + *infomask |= HEAP_XMAX_KEYSHR_LOCK; + + if (infobits & XLHL_KEYS_UPDATED) + *infomask2 |= HEAP_KEYS_UPDATED; +} + +static void +redo_neon_heap_insert(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_neon_heap_insert *xlrec = (xl_neon_heap_insert *) XLogRecGetData(record); + Buffer buffer; + Page page; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSize]; + } tbuf; + HeapTupleHeader htup; + xl_neon_heap_header xlhdr; + uint32 newlen; + Size freespace = 0; + RelFileLocator target_locator; + BlockNumber blkno; + ItemPointerData target_tid; + XLogRedoAction action; + + XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno); + ItemPointerSetBlockNumber(&target_tid, blkno); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(target_locator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, blkno, &vmbuffer); + visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* + * If we inserted the first and only tuple on the page, re-initialize the + * page from scratch. 
+ */ + if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) + { + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + PageInit(page, BufferGetPageSize(buffer), 0); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + Size datalen; + char *data; + + page = BufferGetPage(buffer); + + if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum) + elog(PANIC, "neon_rm_redo: invalid max offset number"); + + data = XLogRecGetBlockData(record, 0, &datalen); + + newlen = datalen - SizeOfNeonHeapHeader; + Assert(datalen > SizeOfNeonHeapHeader && newlen <= MaxHeapTupleSize); + memcpy((char *) &xlhdr, data, SizeOfNeonHeapHeader); + data += SizeOfNeonHeapHeader; + + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ + memcpy((char *) htup + SizeofHeapTupleHeader, + data, + newlen); + newlen += SizeofHeapTupleHeader; + htup->t_infomask2 = xlhdr.t_infomask2; + htup->t_infomask = xlhdr.t_infomask; + htup->t_hoff = xlhdr.t_hoff; + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + htup->t_ctid = target_tid; + + if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, + true, true) == InvalidOffsetNumber) + elog(PANIC, "neon_rm_redo: failed to add tuple"); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ + if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) + PageSetAllVisible(page); + + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* + * If the page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. We can't do much + * better than that without knowing the fill-factor for the table. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. + */ + if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(target_locator, blkno, freespace); +} + +static void +redo_neon_heap_delete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_neon_heap_delete *xlrec = (xl_neon_heap_delete *) XLogRecGetData(record); + Buffer buffer; + Page page; + ItemId lp = NULL; + HeapTupleHeader htup; + BlockNumber blkno; + RelFileLocator target_locator; + ItemPointerData target_tid; + + XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno); + ItemPointerSetBlockNumber(&target_tid, blkno); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. 
+ */ + if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(target_locator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, blkno, &vmbuffer); + visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + if (PageGetMaxOffsetNumber(page) >= xlrec->offnum) + lp = PageGetItemId(page, xlrec->offnum); + + if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "neon_rm_redo: invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + HeapTupleHeaderClearHotUpdated(htup); + fix_infomask_from_infobits(xlrec->infobits_set, + &htup->t_infomask, &htup->t_infomask2); + if (!(xlrec->flags & XLH_DELETE_IS_SUPER)) + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + else + HeapTupleHeaderSetXmin(htup, InvalidTransactionId); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page, XLogRecGetXid(record)); + + if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* Make sure t_ctid is set correctly */ + if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE) + HeapTupleHeaderSetMovedPartitions(htup); + else + htup->t_ctid = target_tid; + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +redo_neon_heap_update(XLogReaderState *record, bool hot_update) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_neon_heap_update *xlrec = (xl_neon_heap_update *) XLogRecGetData(record); + RelFileLocator rlocator; + BlockNumber oldblk; + BlockNumber newblk; + ItemPointerData newtid; + Buffer obuffer, + nbuffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleData oldtup; + HeapTupleHeader htup; + uint16 prefixlen = 0, + suffixlen = 0; + char *newp; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSize]; + } tbuf; + xl_neon_heap_header xlhdr; + uint32 newlen; + Size freespace = 0; + XLogRedoAction oldaction; + XLogRedoAction newaction; + + /* initialize to keep the compiler quiet */ + oldtup.t_data = NULL; + oldtup.t_len = 0; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &newblk); + if (XLogRecGetBlockTagExtended(record, 1, NULL, NULL, &oldblk, NULL)) + { + /* HOT updates are never done across pages */ + Assert(!hot_update); + } + else + oldblk = newblk; + + ItemPointerSet(&newtid, newblk, xlrec->new_offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rlocator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, oldblk, &vmbuffer); + visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* + * In normal operation, it is important to lock the two pages in + * page-number order, to avoid possible deadlocks against other update + * operations going the other way. However, during WAL replay there can + * be no other update happening, so we don't need to worry about that. 
But + * we *do* need to worry that we don't expose an inconsistent state to Hot + * Standby queries --- so the original page can't be unlocked before we've + * added the new tuple to the new page. + */ + + /* Deal with old tuple version */ + oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1, + &obuffer); + if (oldaction == BLK_NEEDS_REDO) + { + page = BufferGetPage(obuffer); + offnum = xlrec->old_offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "neon_rm_redo: invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + oldtup.t_data = htup; + oldtup.t_len = ItemIdGetLength(lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + if (hot_update) + HeapTupleHeaderSetHotUpdated(htup); + else + HeapTupleHeaderClearHotUpdated(htup); + fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, + &htup->t_infomask2); + HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + /* Set forward chain link in t_ctid */ + htup->t_ctid = newtid; + + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page, XLogRecGetXid(record)); + + if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(obuffer); + } + + /* + * Read the page the new tuple goes into, if different from old. + */ + if (oldblk == newblk) + { + nbuffer = obuffer; + newaction = oldaction; + } + else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) + { + nbuffer = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(nbuffer); + PageInit(page, BufferGetPageSize(nbuffer), 0); + newaction = BLK_NEEDS_REDO; + } + else + newaction = XLogReadBufferForRedo(record, 0, &nbuffer); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rlocator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, newblk, &vmbuffer); + visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* Deal with new tuple */ + if (newaction == BLK_NEEDS_REDO) + { + char *recdata; + char *recdata_end; + Size datalen; + Size tuplen; + + recdata = XLogRecGetBlockData(record, 0, &datalen); + recdata_end = recdata + datalen; + + page = BufferGetPage(nbuffer); + + offnum = xlrec->new_offnum; + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "neon_rm_redo: invalid max offset number"); + + if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD) + { + Assert(newblk == oldblk); + memcpy(&prefixlen, recdata, sizeof(uint16)); + recdata += sizeof(uint16); + } + if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD) + { + Assert(newblk == oldblk); + memcpy(&suffixlen, recdata, sizeof(uint16)); + recdata += sizeof(uint16); + } + + memcpy((char *) &xlhdr, recdata, SizeOfNeonHeapHeader); + recdata += SizeOfNeonHeapHeader; + + tuplen = recdata_end - recdata; + Assert(tuplen <= MaxHeapTupleSize); + + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + + /* + * Reconstruct the new tuple using the prefix and/or suffix from the + * old tuple, and the data stored in the WAL record. 
+ */ + newp = (char *) htup + SizeofHeapTupleHeader; + if (prefixlen > 0) + { + int len; + + /* copy bitmap [+ padding] [+ oid] from WAL record */ + len = xlhdr.t_hoff - SizeofHeapTupleHeader; + memcpy(newp, recdata, len); + recdata += len; + newp += len; + + /* copy prefix from old tuple */ + memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen); + newp += prefixlen; + + /* copy new tuple data from WAL record */ + len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader); + memcpy(newp, recdata, len); + recdata += len; + newp += len; + } + else + { + /* + * copy bitmap [+ padding] [+ oid] + data from record, all in one + * go + */ + memcpy(newp, recdata, tuplen); + recdata += tuplen; + newp += tuplen; + } + Assert(recdata == recdata_end); + + /* copy suffix from old tuple */ + if (suffixlen > 0) + memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen); + + newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen; + htup->t_infomask2 = xlhdr.t_infomask2; + htup->t_infomask = xlhdr.t_infomask; + htup->t_hoff = xlhdr.t_hoff; + + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); + /* Make sure there is no forward chain link in t_ctid */ + htup->t_ctid = newtid; + + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "neon_rm_redo: failed to add tuple"); + + if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + MarkBufferDirty(nbuffer); + } + + if (BufferIsValid(nbuffer) && nbuffer != obuffer) + UnlockReleaseBuffer(nbuffer); + if (BufferIsValid(obuffer)) + UnlockReleaseBuffer(obuffer); + + /* + * If the new page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. We can't do much + * better than that without knowing the fill-factor for the table. + * + * However, don't update the FSM on HOT updates, because after crash + * recovery, either the old or the new tuple will certainly be dead and + * prunable. After pruning, the page will have roughly as much free space + * as it did before the update, assuming the new tuple is about the same + * size as the old one. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. + */ + if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(rlocator, newblk, freespace); +} + +static void +redo_neon_heap_lock(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_neon_heap_lock *xlrec = (xl_neon_heap_lock *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. 
+ */ + if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED) + { + RelFileLocator rlocator; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + Relation reln; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block); + reln = CreateFakeRelcacheEntry(rlocator); + + visibilitymap_pin(reln, block, &vmbuffer); + visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN); + + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "neon_rm_redo: invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, + &htup->t_infomask2); + + /* + * Clear relevant update flags, but only if the modified infomask says + * there's no update. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask)) + { + HeapTupleHeaderClearHotUpdated(htup); + /* Make sure there is no forward chain link in t_ctid */ + ItemPointerSet(&htup->t_ctid, + BufferGetBlockNumber(buffer), + offnum); + } + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +redo_neon_heap_multi_insert(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_neon_heap_multi_insert *xlrec; + RelFileLocator rlocator; + BlockNumber blkno; + Buffer buffer; + Page page; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSize]; + } tbuf; + HeapTupleHeader htup; + uint32 newlen; + Size freespace = 0; + int i; + bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0; + XLogRedoAction action; + + /* + * Insertion doesn't overwrite MVCC data, so no conflict processing is + * required. + */ + xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(record); + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); + + /* check that the mutually exclusive flags are not both set */ + Assert(!((xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) && + (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET))); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. 
+ */ + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rlocator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, blkno, &vmbuffer); + visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (isinit) + { + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + PageInit(page, BufferGetPageSize(buffer), 0); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + char *tupdata; + char *endptr; + Size len; + + /* Tuples are stored as block data */ + tupdata = XLogRecGetBlockData(record, 0, &len); + endptr = tupdata + len; + + page = (Page) BufferGetPage(buffer); + + for (i = 0; i < xlrec->ntuples; i++) + { + OffsetNumber offnum; + xl_neon_multi_insert_tuple *xlhdr; + + /* + * If we're reinitializing the page, the tuples are stored in + * order from FirstOffsetNumber. Otherwise there's an array of + * offsets in the WAL record, and the tuples come after that. + */ + if (isinit) + offnum = FirstOffsetNumber + i; + else + offnum = xlrec->offsets[i]; + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "neon_rm_redo: invalid max offset number"); + + xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(tupdata); + tupdata = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple; + + newlen = xlhdr->datalen; + Assert(newlen <= MaxHeapTupleSize); + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ + memcpy((char *) htup + SizeofHeapTupleHeader, + (char *) tupdata, + newlen); + tupdata += newlen; + + newlen += SizeofHeapTupleHeader; + htup->t_infomask2 = xlhdr->t_infomask2; + htup->t_infomask = xlhdr->t_infomask; + htup->t_hoff = xlhdr->t_hoff; + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, xlrec->t_cid); + ItemPointerSetBlockNumber(&htup->t_ctid, blkno); + ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); + + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "neon_rm_redo: failed to add tuple"); + } + if (tupdata != endptr) + elog(PANIC, "neon_rm_redo: total tuple length mismatch"); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ + if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) + PageSetAllVisible(page); + + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* + * If the page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. We can't do much + * better than that without knowing the fill-factor for the table. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. 
+ */ + if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); +} + +#else +/* safeguard for older PostgreSQL versions */ +PG_MODULE_MAGIC; +#endif diff --git a/pgxn/neon_rmgr/neon_rmgr.control b/pgxn/neon_rmgr/neon_rmgr.control new file mode 100644 index 0000000000..d2bbb1b323 --- /dev/null +++ b/pgxn/neon_rmgr/neon_rmgr.control @@ -0,0 +1,4 @@ +# neon_rmgr extension +comment = 'Neon WAL Resource Manager - custom WAL records used to make Neon work (since PG 16)' +default_version = '1.0' +module_pathname = '$libdir/neon_rmgr' diff --git a/pgxn/neon_rmgr/neon_rmgr.h b/pgxn/neon_rmgr/neon_rmgr.h new file mode 100644 index 0000000000..2c26a928ad --- /dev/null +++ b/pgxn/neon_rmgr/neon_rmgr.h @@ -0,0 +1,13 @@ +#ifndef NEON_RMGR_H +#define NEON_RMGR_H +#if PG_MAJORVERSION_NUM >= 16 +#include "access/xlog_internal.h" +#include "replication/decode.h" +#include "replication/logical.h" + +extern void neon_rm_desc(StringInfo buf, XLogReaderState *record); +extern void neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +extern const char *neon_rm_identify(uint8 info); + +#endif +#endif //NEON_RMGR_H diff --git a/pgxn/neon_rmgr/neon_rmgr_decode.c b/pgxn/neon_rmgr/neon_rmgr_decode.c new file mode 100644 index 0000000000..f327e132e9 --- /dev/null +++ b/pgxn/neon_rmgr/neon_rmgr_decode.c @@ -0,0 +1,404 @@ +#include "postgres.h" + +#if PG_MAJORVERSION_NUM >= 16 +#include "access/heapam_xlog.h" +#include "access/neon_xlog.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/snapbuild.h" + +#include "neon_rmgr.h" + +/* individual record(group)'s handlers */ +static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); + +/* common function to decode tuples */ +static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple); + + +void +neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & XLOG_NEON_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); + SnapBuild *builder = ctx->snapshot_builder; + + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding data changes. 
+ */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + switch (info) + { + case XLOG_NEON_HEAP_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonInsert(ctx, buf); + break; + case XLOG_NEON_HEAP_DELETE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonDelete(ctx, buf); + break; + case XLOG_NEON_HEAP_UPDATE: + case XLOG_NEON_HEAP_HOT_UPDATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonUpdate(ctx, buf); + break; + case XLOG_NEON_HEAP_LOCK: + break; + case XLOG_NEON_HEAP_MULTI_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonMultiInsert(ctx, buf); + break; + default: + elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info); + break; + } +} + +static inline bool +FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id) +{ + if (ctx->callbacks.filter_by_origin_cb == NULL) + return false; + + return filter_by_origin_cb_wrapper(ctx, origin_id); +} + +/* + * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. + * + * Inserts can contain the new tuple. + */ +static void +DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + Size datalen; + char *tupledata; + Size tuplelen; + XLogReaderState *r = buf->record; + xl_neon_heap_insert *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples (this does happen when + * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL). + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) + change->action = REORDER_BUFFER_CHANGE_INSERT; + else + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + tupledata = XLogRecGetBlockData(r, 0, &datalen); + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple); + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, + xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); +} + +/* + * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs. + * + * Deletes can possibly contain the old primary key. 
+ */ +static void +DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_delete *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_delete *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + + if (xlrec->flags & XLH_DELETE_IS_SUPER) + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT; + else + change->action = REORDER_BUFFER_CHANGE_DELETE; + + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + /* old primary key stored */ + if (xlrec->flags & XLH_DELETE_CONTAINS_OLD) + { + Size datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapHeader; + Size tuplelen = datalen - SizeOfNeonHeapHeader; + + Assert(XLogRecGetDataLen(r) > (SizeOfNeonHeapDelete + SizeOfNeonHeapHeader)); + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple((char *) xlrec + SizeOfNeonHeapDelete, + datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout + * in the record, from wal into proper tuplebufs. + * + * Updates can possibly contain a new tuple and the old primary key. + */ +static void +DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_update *xlrec; + ReorderBufferChange *change; + char *data; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_update *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_UPDATE; + change->origin_id = XLogRecGetOrigin(r); + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) + { + Size datalen; + Size tuplelen; + + data = XLogRecGetBlockData(r, 0, &datalen); + + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.newtuple); + } + + if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD) + { + Size datalen; + Size tuplelen; + + /* caution, remaining data in record is not aligned */ + data = XLogRecGetData(r) + SizeOfNeonHeapUpdate; + datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapUpdate; + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Decode XLOG_NEON_HEAP_MULTI_INSERT record into multiple tuplebufs. 
+ * + * Currently MULTI_INSERT will always contain the full tuples. + */ +static void +DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_multi_insert *xlrec; + int i; + char *data; + char *tupledata; + Size tuplelen; + RelFileLocator rlocator; + + xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples. This happens when a + * multi_insert is done on a catalog or on a non-persistent relation. + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL); + if (rlocator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + /* + * We know that this multi_insert isn't for a catalog, so the block should + * always have data even if a full-page write of it is taken. + */ + tupledata = XLogRecGetBlockData(r, 0, &tuplelen); + Assert(tupledata != NULL); + + data = tupledata; + for (i = 0; i < xlrec->ntuples; i++) + { + ReorderBufferChange *change; + xl_neon_multi_insert_tuple *xlhdr; + int datalen; + ReorderBufferTupleBuf *tuple; + HeapTupleHeader header; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator)); + + xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(data); + data = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple; + datalen = xlhdr->datalen; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, datalen); + + tuple = change->data.tp.newtuple; + header = tuple->tuple.t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->tuple.t_self); + + /* + * We can only figure this out after reassembling the transactions. + */ + tuple->tuple.t_tableOid = InvalidOid; + + tuple->tuple.t_len = datalen + SizeofHeapTupleHeader; + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy((char *) tuple->tuple.t_data + SizeofHeapTupleHeader, + (char *) data, + datalen); + header->t_infomask = xlhdr->t_infomask; + header->t_infomask2 = xlhdr->t_infomask2; + header->t_hoff = xlhdr->t_hoff; + + /* + * Reset toast reassembly state only after the last row in the last + * xl_multi_insert_tuple record emitted by one heap_multi_insert() + * call. + */ + if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI && + (i + 1) == xlrec->ntuples) + change->data.tp.clear_toast_afterwards = true; + else + change->data.tp.clear_toast_afterwards = false; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), + buf->origptr, change, false); + + /* move to the next xl_neon_multi_insert_tuple entry */ + data += datalen; + } + Assert(data == tupledata + tuplelen); +} + +/* + * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete + * (but not by heap_multi_insert) into a tuplebuf. + * + * The size 'len' and the pointer 'data' in the record need to be + * computed outside as they are record specific. 
+ */ +static void +DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple) +{ + xl_neon_heap_header xlhdr; + int datalen = len - SizeOfNeonHeapHeader; + HeapTupleHeader header; + + Assert(datalen >= 0); + + tuple->tuple.t_len = datalen + SizeofHeapTupleHeader; + header = tuple->tuple.t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->tuple.t_self); + + /* we can only figure this out after reassembling the transactions */ + tuple->tuple.t_tableOid = InvalidOid; + + /* data is not stored aligned, copy to aligned storage */ + memcpy((char *) &xlhdr, + data, + SizeOfNeonHeapHeader); + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy(((char *) tuple->tuple.t_data) + SizeofHeapTupleHeader, + data + SizeOfNeonHeapHeader, + datalen); + + header->t_infomask = xlhdr.t_infomask; + header->t_infomask2 = xlhdr.t_infomask2; + header->t_hoff = xlhdr.t_hoff; +} + + +#endif \ No newline at end of file diff --git a/pgxn/neon_rmgr/neon_rmgr_desc.c b/pgxn/neon_rmgr/neon_rmgr_desc.c new file mode 100644 index 0000000000..8901c85ba2 --- /dev/null +++ b/pgxn/neon_rmgr/neon_rmgr_desc.c @@ -0,0 +1,181 @@ +#include "postgres.h" +#if PG_MAJORVERSION_NUM >= 16 +#include "access/heapam_xlog.h" +#include "access/neon_xlog.h" +#include "access/rmgr.h" +#include "access/rmgrdesc_utils.h" +#include "access/xlog_internal.h" +#include "miscadmin.h" +#include "storage/buf.h" +#include "storage/bufpage.h" + +#include "neon_rmgr.h" + +/* + * NOTE: "keyname" argument cannot have trailing spaces or punctuation + * characters + */ +static void +infobits_desc(StringInfo buf, uint8 infobits, const char *keyname) +{ + appendStringInfo(buf, "%s: [", keyname); + + Assert(buf->data[buf->len - 1] != ' '); + + if (infobits & XLHL_XMAX_IS_MULTI) + appendStringInfoString(buf, "IS_MULTI, "); + if (infobits & XLHL_XMAX_LOCK_ONLY) + appendStringInfoString(buf, "LOCK_ONLY, "); + if (infobits & XLHL_XMAX_EXCL_LOCK) + appendStringInfoString(buf, "EXCL_LOCK, "); + if (infobits & XLHL_XMAX_KEYSHR_LOCK) + appendStringInfoString(buf, "KEYSHR_LOCK, "); + if (infobits & XLHL_KEYS_UPDATED) + appendStringInfoString(buf, "KEYS_UPDATED, "); + + if (buf->data[buf->len - 1] == ' ') + { + /* Truncate-away final unneeded ", " */ + Assert(buf->data[buf->len - 2] == ','); + buf->len -= 2; + buf->data[buf->len] = '\0'; + } + + appendStringInfoString(buf, "]"); +} + +static void +truncate_flags_desc(StringInfo buf, uint8 flags) +{ + appendStringInfoString(buf, "flags: ["); + + if (flags & XLH_TRUNCATE_CASCADE) + appendStringInfoString(buf, "CASCADE, "); + if (flags & XLH_TRUNCATE_RESTART_SEQS) + appendStringInfoString(buf, "RESTART_SEQS, "); + + if (buf->data[buf->len - 1] == ' ') + { + /* Truncate-away final unneeded ", " */ + Assert(buf->data[buf->len - 2] == ','); + buf->len -= 2; + buf->data[buf->len] = '\0'; + } + + appendStringInfoString(buf, "]"); +} + +void +neon_rm_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + info &= XLOG_NEON_OPMASK; + + if (info == XLOG_NEON_HEAP_INSERT) + { + xl_neon_heap_insert *xlrec = (xl_neon_heap_insert *) rec; + + appendStringInfo(buf, "off: %u, flags: 0x%02X", + xlrec->offnum, + xlrec->flags); + } + else if (info == XLOG_NEON_HEAP_DELETE) + { + xl_neon_heap_delete *xlrec = (xl_neon_heap_delete *) rec; + + appendStringInfo(buf, "xmax: %u, off: %u, ", + xlrec->xmax, xlrec->offnum); + infobits_desc(buf, xlrec->infobits_set, "infobits"); + appendStringInfo(buf, ", flags: 0x%02X", 
xlrec->flags); + } + else if (info == XLOG_NEON_HEAP_UPDATE) + { + xl_neon_heap_update *xlrec = (xl_neon_heap_update *) rec; + + appendStringInfo(buf, "old_xmax: %u, old_off: %u, ", + xlrec->old_xmax, xlrec->old_offnum); + infobits_desc(buf, xlrec->old_infobits_set, "old_infobits"); + appendStringInfo(buf, ", flags: 0x%02X, new_xmax: %u, new_off: %u", + xlrec->flags, xlrec->new_xmax, xlrec->new_offnum); + } + else if (info == XLOG_NEON_HEAP_HOT_UPDATE) + { + xl_neon_heap_update *xlrec = (xl_neon_heap_update *) rec; + + appendStringInfo(buf, "old_xmax: %u, old_off: %u, ", + xlrec->old_xmax, xlrec->old_offnum); + infobits_desc(buf, xlrec->old_infobits_set, "old_infobits"); + appendStringInfo(buf, ", flags: 0x%02X, new_xmax: %u, new_off: %u", + xlrec->flags, xlrec->new_xmax, xlrec->new_offnum); + } + else if (info == XLOG_NEON_HEAP_LOCK) + { + xl_neon_heap_lock *xlrec = (xl_neon_heap_lock *) rec; + + appendStringInfo(buf, "xmax: %u, off: %u, ", + xlrec->xmax, xlrec->offnum); + infobits_desc(buf, xlrec->infobits_set, "infobits"); + appendStringInfo(buf, ", flags: 0x%02X", xlrec->flags); + } + else if (info == XLOG_NEON_HEAP_MULTI_INSERT) + { + xl_neon_heap_multi_insert *xlrec = (xl_neon_heap_multi_insert *) rec; + bool isinit = (XLogRecGetInfo(record) & XLOG_NEON_INIT_PAGE) != 0; + + appendStringInfo(buf, "ntuples: %d, flags: 0x%02X", xlrec->ntuples, + xlrec->flags); + + if (XLogRecHasBlockData(record, 0) && !isinit) + { + appendStringInfoString(buf, ", offsets:"); + array_desc(buf, xlrec->offsets, sizeof(OffsetNumber), + xlrec->ntuples, &offset_elem_desc, NULL); + } + } +} + +const char * +neon_rm_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_NEON_HEAP_INSERT: + id = "INSERT"; + break; + case XLOG_NEON_HEAP_INSERT | XLOG_NEON_INIT_PAGE: + id = "INSERT+INIT"; + break; + case XLOG_NEON_HEAP_DELETE: + id = "DELETE"; + break; + case XLOG_NEON_HEAP_UPDATE: + id = "UPDATE"; + break; + case XLOG_NEON_HEAP_UPDATE | XLOG_NEON_INIT_PAGE: + id = "UPDATE+INIT"; + break; + case XLOG_NEON_HEAP_HOT_UPDATE: + id = "HOT_UPDATE"; + break; + case XLOG_NEON_HEAP_HOT_UPDATE | XLOG_HEAP_INIT_PAGE: + id = "HOT_UPDATE+INIT"; + break; + case XLOG_NEON_HEAP_LOCK: + id = "LOCK"; + break; + case XLOG_NEON_HEAP_MULTI_INSERT: + id = "MULTI_INSERT"; + break; + case XLOG_NEON_HEAP_MULTI_INSERT | XLOG_NEON_INIT_PAGE: + id = "MULTI_INSERT+INIT"; + break; + } + + return id; +} + +#endif diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index e0cea4177b..aa644efd40 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -10,6 +10,8 @@ */ #include "postgres.h" +#include "../neon/neon_pgversioncompat.h" + #include "access/relation.h" #include "access/xact.h" #include "access/xlog.h" @@ -39,8 +41,13 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); * Linkage to functions in neon module. 
* The signature here would need to be updated whenever function parameters change in pagestore_smgr.c */ -typedef void (*neon_read_at_lsn_type) (RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, +#if PG_MAJORVERSION_NUM < 16 +typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr request_lsn, bool request_latest, char *buffer); +#else +typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, void *buffer); +#endif static neon_read_at_lsn_type neon_read_at_lsn_ptr; @@ -115,7 +122,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS) uint32 buf_state; Buffer bufferid; bool isvalid; - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blocknum; @@ -128,7 +135,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS) else isvalid = false; bufferid = BufferDescriptorGetBuffer(bufHdr); - rnode = bufHdr->tag.rnode; + rinfo = BufTagGetNRelFileInfo(bufHdr->tag); forknum = bufHdr->tag.forkNum; blocknum = bufHdr->tag.blockNum; @@ -141,7 +148,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS) */ if (isvalid) { - if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid)) + if (ReadRecentBuffer(rinfo, forknum, blocknum, bufferid)) ReleaseBuffer(bufferid); } } @@ -238,7 +245,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, read_lsn, request_latest, raw_page_data); relation_close(rel, AccessShareLock); @@ -267,10 +274,17 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) PG_RETURN_NULL(); { - RelFileNode rnode = { + NRelFileInfo rinfo = { +#if PG_MAJORVERSION_NUM < 16 .spcNode = PG_GETARG_OID(0), .dbNode = PG_GETARG_OID(1), - .relNode = PG_GETARG_OID(2)}; + .relNode = PG_GETARG_OID(2) +#else + .spcOid = PG_GETARG_OID(0), + .dbOid = PG_GETARG_OID(1), + .relNumber = PG_GETARG_OID(2) +#endif + }; ForkNumber forknum = PG_GETARG_UINT32(3); @@ -284,7 +298,7 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rinfo, forknum, blkno, read_lsn, request_latest, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index 2219543628..4e604a710c 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -18,10 +18,12 @@ */ #include "postgres.h" +#include "../neon/neon_pgversioncompat.h" + #include "access/xlog.h" #include "storage/block.h" #include "storage/buf_internals.h" -#include "storage/relfilenode.h" +#include RELFILEINFO_HDR #include "storage/smgr.h" #if PG_VERSION_NUM >= 150000 @@ -43,10 +45,12 @@ static int used_pages; static int locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno) { + NRelFileInfo rinfo = InfoFromSMgrRel(reln); + /* We only hold a small number of pages, so linear search */ for (int i = 0; i < used_pages; i++) { - if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + if (RelFileInfoEquals(rinfo, BufTagGetNRelFileInfo(page_tag[i])) && forknum == page_tag[i].forkNum && blkno == page_tag[i].blockNum) { @@ -63,15 +67,26 @@ static void inmem_open(SMgrRelation reln); static void inmem_close(SMgrRelation reln, ForkNumber forknum); static void 
inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); static bool inmem_exists(SMgrRelation reln, ForkNumber forknum); -static void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); -static void inmem_extend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); +static void inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo); static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +#if PG_MAJORVERSION_NUM < 16 +static void inmem_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); static void inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); +#else +static void inmem_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); +static void inmem_zeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); +static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void *buffer); +static void inmem_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); +#endif static void inmem_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); @@ -95,9 +110,11 @@ inmem_init(void) static bool inmem_exists(SMgrRelation reln, ForkNumber forknum) { + NRelFileInfo rinfo = InfoFromSMgrRel(reln); + for (int i = 0; i < used_pages; i++) { - if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + if (RelFileInfoEquals(rinfo, BufTagGetNRelFileInfo(page_tag[i])) && forknum == page_tag[i].forkNum) { return true; @@ -120,7 +137,7 @@ inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) * inmem_unlink() -- Unlink a relation. */ static void -inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo) { } @@ -135,12 +152,28 @@ inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) */ static void inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, +#if PG_MAJORVERSION_NUM < 16 char *buffer, bool skipFsync) +#else + const void *buffer, bool skipFsync) +#endif { /* same as smgwrite() for us */ inmem_write(reln, forknum, blkno, buffer, skipFsync); } +#if PG_MAJORVERSION_NUM >= 16 +static void +inmem_zeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync) +{ + char buffer[BLCKSZ] = {0}; + + for (int i = 0; i < nblocks; i++) + inmem_extend(reln, forknum, blocknum + i, buffer, skipFsync); +} +#endif + /* * inmem_open() -- Initialize newly-opened relation. 
*/ @@ -180,7 +213,11 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum, */ static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, +#if PG_MAJORVERSION_NUM < 16 char *buffer) +#else + void *buffer) +#endif { int pg; @@ -200,7 +237,11 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, */ static void inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +#if PG_MAJORVERSION_NUM < 16 char *buffer, bool skipFsync) +#else + const void *buffer, bool skipFsync) +#endif { int pg; @@ -216,9 +257,7 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, */ elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, blocknum, used_pages); @@ -227,14 +266,13 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, pg = used_pages; used_pages++; - INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); + + InitBufferTag(&page_tag[pg], &InfoFromSMgrRel(reln), forknum, blocknum); } else { elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, blocknum, used_pages); @@ -287,6 +325,9 @@ static const struct f_smgr inmem_smgr = .smgr_exists = inmem_exists, .smgr_unlink = inmem_unlink, .smgr_extend = inmem_extend, +#if PG_MAJORVERSION_NUM >= 16 + .smgr_zeroextend = inmem_zeroextend, +#endif .smgr_prefetch = inmem_prefetch, .smgr_read = inmem_read, .smgr_write = inmem_write, @@ -297,11 +338,11 @@ static const struct f_smgr inmem_smgr = }; const f_smgr * -smgr_inmem(BackendId backend, RelFileNode rnode) +smgr_inmem(BackendId backend, NRelFileInfo rinfo) { Assert(InRecovery); if (backend != InvalidBackendId) - return smgr_standard(backend, rnode); + return smgr_standard(backend, rinfo); else return &inmem_smgr; } diff --git a/pgxn/neon_walredo/inmem_smgr.h b/pgxn/neon_walredo/inmem_smgr.h index af7c3fe6cc..58b98b8e6a 100644 --- a/pgxn/neon_walredo/inmem_smgr.h +++ b/pgxn/neon_walredo/inmem_smgr.h @@ -11,7 +11,7 @@ #ifndef INMEM_SMGR_H #define INMEM_SMGR_H -extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); +extern const f_smgr *smgr_inmem(BackendId backend, NRelFileInfo rinfo); extern void smgr_init_inmem(void); #endif /* INMEM_SMGR_H */ diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index 9cce9b2a67..01e12983a6 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -43,6 +43,8 @@ #include "postgres.h" +#include "../neon/neon_pgversioncompat.h" + #include #include #include @@ -61,9 +63,11 @@ #include #endif +#if PG_MAJORVERSION_NUM < 16 #ifndef HAVE_GETRUSAGE #include "rusagestub.h" #endif +#endif #include "access/clog.h" #include "access/commit_ts.h" @@ -187,7 +191,7 @@ enter_seccomp_mode(void) * backend processes. Some initialization was done in CallExtMain * already. */ -void +PGDLLEXPORT void WalRedoMain(int argc, char *argv[]) { int firstchar; @@ -200,7 +204,7 @@ WalRedoMain(int argc, char *argv[]) /* * WAL redo does not need a large number of buffers. And speed of - * DropRelFileNodeAllLocalBuffers() is proportional to the number of + * DropRelationAllLocalBuffers() is proportional to the number of * buffers. 
So let's keep it small (default value is 1024) */ num_temp_buffers = 4; @@ -212,6 +216,12 @@ WalRedoMain(int argc, char *argv[]) smgr_hook = smgr_inmem; smgr_init_hook = smgr_init_inmem; +#if PG_VERSION_NUM >= 160000 + /* make rmgr registry believe we can register the resource manager */ + process_shared_preload_libraries_in_progress = true; + load_file("$libdir/neon_rmgr", false); + process_shared_preload_libraries_in_progress = false; +#endif /* Initialize MaxBackends (if under postmaster, was done already) */ MaxConnections = 1; @@ -300,6 +310,9 @@ WalRedoMain(int argc, char *argv[]) */ MemoryContextSwitchTo(MessageContext); initStringInfo(&input_message); +#if PG_MAJORVERSION_NUM >= 16 + MyBackendType = B_BACKEND; +#endif for (;;) { @@ -534,16 +547,16 @@ CreateFakeSharedMemoryAndSemaphores() /* Version compatility wrapper for ReadBufferWithoutRelcache */ static inline Buffer -NeonRedoReadBuffer(RelFileNode rnode, +NeonRedoReadBuffer(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode) { #if PG_VERSION_NUM >= 150000 - return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode, + return ReadBufferWithoutRelcache(rinfo, forkNum, blockNum, mode, NULL, /* no strategy */ true); /* WAL redo is only performed on permanent rels */ #else - return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode, + return ReadBufferWithoutRelcache(rinfo, forkNum, blockNum, mode, NULL); /* no strategy */ #endif } @@ -647,7 +660,7 @@ ReadRedoCommand(StringInfo inBuf) static void BeginRedoForBlock(StringInfo input_message) { - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blknum; SMgrRelation reln; @@ -662,22 +675,26 @@ BeginRedoForBlock(StringInfo input_message) * BlockNumber */ forknum = pq_getmsgbyte(input_message); - rnode.spcNode = pq_getmsgint(input_message, 4); - rnode.dbNode = pq_getmsgint(input_message, 4); - rnode.relNode = pq_getmsgint(input_message, 4); +#if PG_MAJORVERSION_NUM < 16 + rinfo.spcNode = pq_getmsgint(input_message, 4); + rinfo.dbNode = pq_getmsgint(input_message, 4); + rinfo.relNode = pq_getmsgint(input_message, 4); +#else + rinfo.spcOid = pq_getmsgint(input_message, 4); + rinfo.dbOid = pq_getmsgint(input_message, 4); + rinfo.relNumber = pq_getmsgint(input_message, 4); +#endif blknum = pq_getmsgint(input_message, 4); wal_redo_buffer = InvalidBuffer; - INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum); + InitBufferTag(&target_redo_tag, &rinfo, forknum, blknum); elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u", - target_redo_tag.rnode.spcNode, - target_redo_tag.rnode.dbNode, - target_redo_tag.rnode.relNode, + RelFileInfoFmt(rinfo), target_redo_tag.forkNum, target_redo_tag.blockNum); - reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); + reln = smgropen(rinfo, InvalidBackendId, RELPERSISTENCE_PERMANENT); if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || reln->smgr_cached_nblocks[forknum] < blknum + 1) { @@ -691,7 +708,7 @@ BeginRedoForBlock(StringInfo input_message) static void PushPage(StringInfo input_message) { - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blknum; const char *content; @@ -709,13 +726,19 @@ PushPage(StringInfo input_message) * 8k page content */ forknum = pq_getmsgbyte(input_message); - rnode.spcNode = pq_getmsgint(input_message, 4); - rnode.dbNode = pq_getmsgint(input_message, 4); - rnode.relNode = pq_getmsgint(input_message, 4); +#if PG_MAJORVERSION_NUM < 16 + rinfo.spcNode = pq_getmsgint(input_message, 4); + rinfo.dbNode = 
pq_getmsgint(input_message, 4); + rinfo.relNode = pq_getmsgint(input_message, 4); +#else + rinfo.spcOid = pq_getmsgint(input_message, 4); + rinfo.dbOid = pq_getmsgint(input_message, 4); + rinfo.relNumber = pq_getmsgint(input_message, 4); +#endif blknum = pq_getmsgint(input_message, 4); content = pq_getmsgbytes(input_message, BLCKSZ); - buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_ZERO_AND_LOCK); + buf = NeonRedoReadBuffer(rinfo, forknum, blknum, RBM_ZERO_AND_LOCK); wal_redo_buffer = buf; page = BufferGetPage(buf); memcpy(page, content, BLCKSZ); @@ -831,7 +854,7 @@ ApplyRecord(StringInfo input_message) */ if (BufferIsInvalid(wal_redo_buffer)) { - wal_redo_buffer = NeonRedoReadBuffer(target_redo_tag.rnode, + wal_redo_buffer = NeonRedoReadBuffer(BufTagGetNRelFileInfo(target_redo_tag), target_redo_tag.forkNum, target_redo_tag.blockNum, RBM_NORMAL); @@ -878,26 +901,29 @@ static bool redo_block_filter(XLogReaderState *record, uint8 block_id) { BufferTag target_tag; + NRelFileInfo rinfo; #if PG_VERSION_NUM >= 150000 XLogRecGetBlockTag(record, block_id, - &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum); + &rinfo, &target_tag.forkNum, &target_tag.blockNum); #else if (!XLogRecGetBlockTag(record, block_id, - &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum)) + &rinfo, &target_tag.forkNum, &target_tag.blockNum)) { /* Caller specified a bogus block_id */ elog(PANIC, "failed to locate backup block with ID %d", block_id); } #endif + CopyNRelFileInfoToBufTag(target_tag, rinfo); /* * Can a WAL redo function ever access a relation other than the one that * it modifies? I don't see why it would. + * Custom RMGRs may be affected by this. */ - if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode)) + if (!RelFileInfoEquals(rinfo, BufTagGetNRelFileInfo(target_redo_tag))) elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u", - target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum); + RelFileInfoFmt(rinfo), target_tag.forkNum, target_tag.blockNum); /* * If this block isn't one we are currently restoring, then return 'true' @@ -914,7 +940,7 @@ redo_block_filter(XLogReaderState *record, uint8 block_id) static void GetPage(StringInfo input_message) { - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blknum; Buffer buf; @@ -931,14 +957,20 @@ GetPage(StringInfo input_message) * BlockNumber */ forknum = pq_getmsgbyte(input_message); - rnode.spcNode = pq_getmsgint(input_message, 4); - rnode.dbNode = pq_getmsgint(input_message, 4); - rnode.relNode = pq_getmsgint(input_message, 4); +#if PG_MAJORVERSION_NUM < 16 + rinfo.spcNode = pq_getmsgint(input_message, 4); + rinfo.dbNode = pq_getmsgint(input_message, 4); + rinfo.relNode = pq_getmsgint(input_message, 4); +#else + rinfo.spcOid = pq_getmsgint(input_message, 4); + rinfo.dbOid = pq_getmsgint(input_message, 4); + rinfo.relNumber = pq_getmsgint(input_message, 4); +#endif blknum = pq_getmsgint(input_message, 4); /* FIXME: check that we got a BeginRedoForBlock message or this earlier */ - buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_NORMAL); + buf = NeonRedoReadBuffer(rinfo, forknum, blknum, RBM_NORMAL); Assert(buf == wal_redo_buffer); page = BufferGetPage(buf); /* single thread, so don't bother locking the page */ @@ -961,7 +993,7 @@ GetPage(StringInfo input_message) } while (tot_written < BLCKSZ); ReleaseBuffer(buf); - DropRelFileNodeAllLocalBuffers(rnode); + DropRelationAllLocalBuffers(rinfo); wal_redo_buffer = 
InvalidBuffer; elog(TRACE, "Page sent back for block %u", blknum); diff --git a/poetry.lock b/poetry.lock index 63e756a4c0..70961dc797 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -887,34 +887,34 @@ files = [ [[package]] name = "cryptography" -version = "41.0.3" +version = "41.0.4" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"}, - {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"}, - {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"}, - {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"}, - {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"}, - {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"}, - {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"}, - {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"}, - {file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"}, - {file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"}, - {file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"}, - {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"}, - {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"}, - {file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"}, - {file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"}, - {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"}, - {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"}, - {file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"}, - {file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"}, - {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"}, - {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"}, - {file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"}, - {file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"}, + {file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:80907d3faa55dc5434a16579952ac6da800935cd98d14dbd62f6f042c7f5e839"}, + {file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:35c00f637cd0b9d5b6c6bd11b6c3359194a8eba9c46d4e875a3660e3b400005f"}, + {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cecfefa17042941f94ab54f769c8ce0fe14beff2694e9ac684176a2535bf9714"}, + {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e40211b4923ba5a6dc9769eab704bdb3fbb58d56c5b336d30996c24fcf12aadb"}, + {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:23a25c09dfd0d9f28da2352503b23e086f8e78096b9fd585d1d14eca01613e13"}, + {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2ed09183922d66c4ec5fdaa59b4d14e105c084dd0febd27452de8f6f74704143"}, + {file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5a0f09cefded00e648a127048119f77bc2b2ec61e736660b5789e638f43cc397"}, + {file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:9eeb77214afae972a00dee47382d2591abe77bdae166bda672fb1e24702a3860"}, + {file = "cryptography-41.0.4-cp37-abi3-win32.whl", hash = "sha256:3b224890962a2d7b57cf5eeb16ccaafba6083f7b811829f00476309bce2fe0fd"}, + {file = "cryptography-41.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c880eba5175f4307129784eca96f4e70b88e57aa3f680aeba3bab0e980b0f37d"}, + {file = "cryptography-41.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:004b6ccc95943f6a9ad3142cfabcc769d7ee38a3f60fb0dddbfb431f818c3a67"}, + {file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:86defa8d248c3fa029da68ce61fe735432b047e32179883bdb1e79ed9bb8195e"}, + {file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:37480760ae08065437e6573d14be973112c9e6dcaf5f11d00147ee74f37a3829"}, + {file = "cryptography-41.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b5f4dfe950ff0479f1f00eda09c18798d4f49b98f4e2006d644b3301682ebdca"}, + {file = "cryptography-41.0.4-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7e53db173370dea832190870e975a1e09c86a879b613948f09eb49324218c14d"}, + {file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5b72205a360f3b6176485a333256b9bcd48700fc755fef51c8e7e67c4b63e3ac"}, + {file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:93530900d14c37a46ce3d6c9e6fd35dbe5f5601bf6b3a5c325c7bffc030344d9"}, + {file = "cryptography-41.0.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:efc8ad4e6fc4f1752ebfb58aefece8b4e3c4cae940b0994d43649bdfce8d0d4f"}, + {file = "cryptography-41.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:c3391bd8e6de35f6f1140e50aaeb3e2b3d6a9012536ca23ab0d9c35ec18c8a91"}, + {file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:0d9409894f495d465fe6fda92cb70e8323e9648af912d5b9141d616df40a87b8"}, + {file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8ac4f9ead4bbd0bc8ab2d318f97d85147167a488be0e08814a37eb2f439d5cf6"}, + {file = "cryptography-41.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:047c4603aeb4bbd8db2756e38f5b8bd7e94318c047cfe4efeb5d715e08b49311"}, + {file = "cryptography-41.0.4.tar.gz", hash = "sha256:7febc3094125fc126a7f6fb1f420d0da639f3f32cb15c8ff0dc3997c4549f51a"}, ] [package.dependencies] diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 37190c76b8..7d587ff1ec 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -63,8 +63,8 @@ pub mod errors { format!("{REQUEST_FAILED}: endpoint is disabled") } http::StatusCode::LOCKED => { - // Status 423: project might be in maintenance mode (or bad state). - format!("{REQUEST_FAILED}: endpoint is temporary unavailable") + // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. + format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support") } _ => REQUEST_FAILED.to_owned(), }, @@ -81,9 +81,15 @@ pub mod errors { // retry some temporary failures because the compute was in a bad state // (bad request can be returned when the endpoint was in transition) Self::Console { - status: http::StatusCode::BAD_REQUEST | http::StatusCode::LOCKED, + status: http::StatusCode::BAD_REQUEST, .. } => true, + // locked can be returned when the endpoint was in transition + // or when quotas are exceeded. don't retry when quotas are exceeded + Self::Console { + status: http::StatusCode::LOCKED, + ref text, + } => !text.contains("quota"), // retry server errors Self::Console { status, .. } if status.is_server_error() => true, _ => false, diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 3322d5a5be..163cdfffc0 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,6 +8,7 @@ use super::{ use crate::{auth::ClientCredentials, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; +use std::net::SocketAddr; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; @@ -117,7 +118,7 @@ impl Api { // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). let mut config = compute::ConnCfg::new(); - config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + config.host(&host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. 
let node = NodeInfo { config, @@ -194,9 +195,9 @@ async fn parse_body serde::Deserialize<'a>>( Err(ApiError::Console { status, text }) } -fn parse_host_port(input: &str) -> Option<(&str, u16)> { - let (host, port) = input.split_once(':')?; - Some((host, port.parse().ok()?)) +fn parse_host_port(input: &str) -> Option<(String, u16)> { + let parsed: SocketAddr = input.parse().ok()?; + Some((parsed.ip().to_string(), parsed.port())) } #[cfg(test)] diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index 72ae3dc26f..fa66df0469 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -2,6 +2,7 @@ use crate::{ cancellation::CancelMap, config::ProxyConfig, error::io_error, + protocol2::{ProxyProtocolAccept, WithClientIp}, proxy::{handle_client, ClientMode}, }; use bytes::{Buf, Bytes}; @@ -292,6 +293,9 @@ pub async fn task_main( let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; let _ = addr_incoming.set_nodelay(true); + let addr_incoming = ProxyProtocolAccept { + incoming: addr_incoming, + }; let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { if let Err(err) = conn { @@ -302,9 +306,11 @@ pub async fn task_main( } }); - let make_svc = - hyper::service::make_service_fn(|stream: &tokio_rustls::server::TlsStream| { - let sni_name = stream.get_ref().1.server_name().map(|s| s.to_string()); + let make_svc = hyper::service::make_service_fn( + |stream: &tokio_rustls::server::TlsStream>| { + let (io, tls) = stream.get_ref(); + let peer_addr = io.client_addr().unwrap_or(io.inner.remote_addr()); + let sni_name = tls.server_name().map(|s| s.to_string()); let conn_pool = conn_pool.clone(); async move { @@ -319,13 +325,15 @@ pub async fn task_main( ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name) .instrument(info_span!( "ws-client", - session = %session_id + session = %session_id, + %peer_addr, )) .await } })) } - }); + }, + ); hyper::Server::builder(accept::from_stream(tls_listener)) .serve(make_svc) diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 1e1e216bb7..a3d1cdd3c8 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -16,6 +16,7 @@ pub mod http; pub mod logging; pub mod metrics; pub mod parse; +pub mod protocol2; pub mod proxy; pub mod sasl; pub mod scram; diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index d63f902ac1..9279002eb3 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -121,7 +121,7 @@ async fn collect_metrics_iteration( let current_metrics = gather_proxy_io_bytes_per_client(); - let metrics_to_send: Vec> = current_metrics + let metrics_to_send: Vec> = current_metrics .iter() .filter_map(|(curr_key, (curr_val, curr_time))| { let mut start_time = *curr_time; diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs new file mode 100644 index 0000000000..1d8931be85 --- /dev/null +++ b/proxy/src/protocol2.rs @@ -0,0 +1,479 @@ +//! Proxy Protocol V2 implementation + +use std::{ + future::poll_fn, + future::Future, + io, + net::SocketAddr, + pin::{pin, Pin}, + task::{ready, Context, Poll}, +}; + +use bytes::{Buf, BytesMut}; +use hyper::server::conn::{AddrIncoming, AddrStream}; +use pin_project_lite::pin_project; +use tls_listener::AsyncAccept; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; + +pub struct ProxyProtocolAccept { + pub incoming: AddrIncoming, +} + +pin_project! 
{ + pub struct WithClientIp { + #[pin] + pub inner: T, + buf: BytesMut, + tlv_bytes: u16, + state: ProxyParse, + } +} + +#[derive(Clone, PartialEq, Debug)] +enum ProxyParse { + NotStarted, + + Finished(SocketAddr), + None, +} + +impl AsyncWrite for WithClientIp { + #[inline] + fn poll_write( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + self.project().inner.poll_write(cx, buf) + } + + #[inline] + fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_flush(cx) + } + + #[inline] + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_shutdown(cx) + } + + #[inline] + fn poll_write_vectored( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + bufs: &[io::IoSlice<'_>], + ) -> Poll> { + self.project().inner.poll_write_vectored(cx, bufs) + } + + #[inline] + fn is_write_vectored(&self) -> bool { + self.inner.is_write_vectored() + } +} + +impl WithClientIp { + pub fn new(inner: T) -> Self { + WithClientIp { + inner, + buf: BytesMut::with_capacity(128), + tlv_bytes: 0, + state: ProxyParse::NotStarted, + } + } + + pub fn client_addr(&self) -> Option { + match self.state { + ProxyParse::Finished(socket) => Some(socket), + _ => None, + } + } +} + +impl WithClientIp { + pub async fn wait_for_addr(&mut self) -> io::Result> { + match self.state { + ProxyParse::NotStarted => { + let mut pin = Pin::new(&mut *self); + let addr = poll_fn(|cx| pin.as_mut().poll_client_ip(cx)).await?; + match addr { + Some(addr) => self.state = ProxyParse::Finished(addr), + None => self.state = ProxyParse::None, + } + Ok(addr) + } + ProxyParse::Finished(addr) => Ok(Some(addr)), + ProxyParse::None => Ok(None), + } + } +} + +/// Proxy Protocol Version 2 Header +const HEADER: [u8; 12] = [ + 0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, +]; + +impl WithClientIp { + /// implementation of + /// Version 2 (Binary Format) + fn poll_client_ip( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + // The binary header format starts with a constant 12 bytes block containing the protocol signature : + // \x0D \x0A \x0D \x0A \x00 \x0D \x0A \x51 \x55 \x49 \x54 \x0A + while self.buf.len() < 16 { + let mut this = self.as_mut().project(); + let bytes_read = pin!(this.inner.read_buf(this.buf)).poll(cx)?; + + // exit for bad header + let len = usize::min(self.buf.len(), HEADER.len()); + if self.buf[..len] != HEADER[..len] { + return Poll::Ready(Ok(None)); + } + + // if no more bytes available then exit + if ready!(bytes_read) == 0 { + return Poll::Ready(Ok(None)); + }; + } + + // The next byte (the 13th one) is the protocol version and command. + // The highest four bits contains the version. As of this specification, it must + // always be sent as \x2 and the receiver must only accept this value. + let vc = self.buf[12]; + let version = vc >> 4; + let command = vc & 0b1111; + if version != 2 { + return Poll::Ready(Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol version. expected version 2", + ))); + } + match command { + // the connection was established on purpose by the proxy + // without being relayed. The connection endpoints are the sender and the + // receiver. Such connections exist when the proxy sends health-checks to the + // server. The receiver must accept this connection as valid and must use the + // real connection endpoints and discard the protocol block including the + // family which is ignored. 
+ 0 => {} + // the connection was established on behalf of another node, + // and reflects the original connection endpoints. The receiver must then use + // the information provided in the protocol block to get original the address. + 1 => {} + // other values are unassigned and must not be emitted by senders. Receivers + // must drop connections presenting unexpected values here. + _ => { + return Poll::Ready(Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol command. expected local (0) or proxy (1)", + ))) + } + }; + + // The 14th byte contains the transport protocol and address family. The highest 4 + // bits contain the address family, the lowest 4 bits contain the protocol. + let ft = self.buf[13]; + let address_length = match ft { + // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + 0x11 | 0x12 => 12, + // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. + // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. + 0x21 | 0x22 => 36, + // unspecified or unix stream. ignore the addresses + _ => 0, + }; + + // The 15th and 16th bytes is the address length in bytes in network endian order. + // It is used so that the receiver knows how many address bytes to skip even when + // it does not implement the presented protocol. Thus the length of the protocol + // header in bytes is always exactly 16 + this value. When a sender presents a + // LOCAL connection, it should not present any address so it sets this field to + // zero. Receivers MUST always consider this field to skip the appropriate number + // of bytes and must not assume zero is presented for LOCAL connections. When a + // receiver accepts an incoming connection showing an UNSPEC address family or + // protocol, it may or may not decide to log the address information if present. + let remaining_length = u16::from_be_bytes(self.buf[14..16].try_into().unwrap()); + if remaining_length < address_length { + return Poll::Ready(Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol length. not enough to fit requested IP addresses", + ))); + } + + while self.buf.len() < 16 + address_length as usize { + let mut this = self.as_mut().project(); + if ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?) == 0 { + return Poll::Ready(Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "stream closed while waiting for proxy protocol addresses", + ))); + } + } + + let this = self.as_mut().project(); + + // we are sure this is a proxy protocol v2 entry and we have read all the bytes we need + // discard the header we have parsed + this.buf.advance(16); + + // Starting from the 17th byte, addresses are presented in network byte order. 
+ // The address order is always the same : + // - source layer 3 address in network byte order + // - destination layer 3 address in network byte order + // - source layer 4 address if any, in network byte order (port) + // - destination layer 4 address if any, in network byte order (port) + let addresses = this.buf.split_to(address_length as usize); + let socket = match address_length { + 12 => { + let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) + } + 36 => { + let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) + } + _ => None, + }; + + *this.tlv_bytes = remaining_length - address_length; + self.as_mut().skip_tlv_inner(); + + Poll::Ready(Ok(socket)) + } + + #[cold] + fn read_ip(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let ip = ready!(self.as_mut().poll_client_ip(cx)?); + match ip { + Some(x) => *self.as_mut().project().state = ProxyParse::Finished(x), + None => *self.as_mut().project().state = ProxyParse::None, + } + Poll::Ready(Ok(())) + } + + #[cold] + fn skip_tlv(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let mut this = self.as_mut().project(); + // we know that this.buf is empty + debug_assert_eq!(this.buf.len(), 0); + + this.buf.reserve((*this.tlv_bytes).clamp(0, 1024) as usize); + ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?); + self.skip_tlv_inner(); + + Poll::Ready(Ok(())) + } + + fn skip_tlv_inner(self: Pin<&mut Self>) { + let tlv_bytes_read = match u16::try_from(self.buf.len()) { + // we read more than u16::MAX therefore we must have read the full tlv_bytes + Err(_) => self.tlv_bytes, + // we might not have read the full tlv bytes yet + Ok(n) => u16::min(n, self.tlv_bytes), + }; + let this = self.project(); + *this.tlv_bytes -= tlv_bytes_read; + this.buf.advance(tlv_bytes_read as usize); + } +} + +impl AsyncRead for WithClientIp { + #[inline] + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + // I'm assuming these 3 comparisons will be easy to branch predict. + // especially with the cold attributes + // which should make this read wrapper almost invisible + + if let ProxyParse::NotStarted = self.state { + ready!(self.as_mut().read_ip(cx)?); + } + + while self.tlv_bytes > 0 { + ready!(self.as_mut().skip_tlv(cx)?) 
+ } + + let this = self.project(); + if this.buf.is_empty() { + this.inner.poll_read(cx, buf) + } else { + // we know that tlv_bytes is 0 + debug_assert_eq!(*this.tlv_bytes, 0); + + let write = usize::min(this.buf.len(), buf.remaining()); + let slice = this.buf.split_to(write).freeze(); + buf.put_slice(&slice); + + // reset the allocation so it can be freed + if this.buf.is_empty() { + *this.buf = BytesMut::new(); + } + + Poll::Ready(Ok(())) + } + } +} + +impl AsyncAccept for ProxyProtocolAccept { + type Connection = WithClientIp; + + type Error = io::Error; + + fn poll_accept( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); + let Some(conn) = conn else { + return Poll::Ready(None); + }; + + Poll::Ready(Some(Ok(WithClientIp::new(conn)))) + } +} + +#[cfg(test)] +mod tests { + use std::pin::pin; + + use tokio::io::AsyncReadExt; + + use crate::protocol2::{ProxyParse, WithClientIp}; + + #[tokio::test] + async fn test_ipv4() { + let header = super::HEADER + // Proxy command, IPV4 | TCP + .chain([(2 << 4) | 1, (1 << 4) | 1].as_slice()) + // 12 + 3 bytes + .chain([0, 15].as_slice()) + // src ip + .chain([127, 0, 0, 1].as_slice()) + // dst ip + .chain([192, 168, 0, 1].as_slice()) + // src port + .chain([255, 255].as_slice()) + // dst port + .chain([1, 1].as_slice()) + // TLV + .chain([1, 2, 3].as_slice()); + + let extra_data = [0x55; 256]; + + let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + + let mut bytes = vec![]; + read.read_to_end(&mut bytes).await.unwrap(); + + assert_eq!(bytes, extra_data); + assert_eq!( + read.state, + ProxyParse::Finished(([127, 0, 0, 1], 65535).into()) + ); + } + + #[tokio::test] + async fn test_ipv6() { + let header = super::HEADER + // Proxy command, IPV6 | UDP + .chain([(2 << 4) | 1, (2 << 4) | 2].as_slice()) + // 36 + 3 bytes + .chain([0, 39].as_slice()) + // src ip + .chain([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0].as_slice()) + // dst ip + .chain([0, 15, 1, 14, 2, 13, 3, 12, 4, 11, 5, 10, 6, 9, 7, 8].as_slice()) + // src port + .chain([1, 1].as_slice()) + // dst port + .chain([255, 255].as_slice()) + // TLV + .chain([1, 2, 3].as_slice()); + + let extra_data = [0x55; 256]; + + let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + + let mut bytes = vec![]; + read.read_to_end(&mut bytes).await.unwrap(); + + assert_eq!(bytes, extra_data); + assert_eq!( + read.state, + ProxyParse::Finished( + ([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into() + ) + ); + } + + #[tokio::test] + async fn test_invalid() { + let data = [0x55; 256]; + + let mut read = pin!(WithClientIp::new(data.as_slice())); + + let mut bytes = vec![]; + read.read_to_end(&mut bytes).await.unwrap(); + assert_eq!(bytes, data); + assert_eq!(read.state, ProxyParse::None); + } + + #[tokio::test] + async fn test_short() { + let data = [0x55; 10]; + + let mut read = pin!(WithClientIp::new(data.as_slice())); + + let mut bytes = vec![]; + read.read_to_end(&mut bytes).await.unwrap(); + assert_eq!(bytes, data); + assert_eq!(read.state, ProxyParse::None); + } + + #[tokio::test] + async fn test_large_tlv() { + let tlv = vec![0x55; 32768]; + let len = (12 + tlv.len() as u16).to_be_bytes(); + + let header = super::HEADER + // Proxy command, Inet << 4 | Stream + .chain([(2 << 4) | 1, (1 << 4) | 1].as_slice()) + // 12 + 3 bytes + .chain(len.as_slice()) + // src ip + .chain([55, 56, 57, 58].as_slice()) + // dst ip + .chain([192, 168, 0, 1].as_slice()) + 
// src port + .chain([255, 255].as_slice()) + // dst port + .chain([1, 1].as_slice()) + // TLV + .chain(tlv.as_slice()); + + let extra_data = [0xaa; 256]; + + let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + + let mut bytes = vec![]; + read.read_to_end(&mut bytes).await.unwrap(); + + assert_eq!(bytes, extra_data); + assert_eq!( + read.state, + ProxyParse::Finished(([55, 56, 57, 58], 65535).into()) + ); + } +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 0267d767ee..f9da145859 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -7,6 +7,7 @@ use crate::{ compute::{self, PostgresConnection}, config::{ProxyConfig, TlsConfig}, console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api}, + protocol2::WithClientIp, stream::{PqStream, Stream}, }; use anyhow::{bail, Context}; @@ -27,10 +28,11 @@ use tracing::{error, info, info_span, warn, Instrument}; use utils::measured_stream::MeasuredStream; /// Number of times we should retry the `/proxy_wake_compute` http request. -/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n -pub const NUM_RETRIES_CONNECT: u32 = 10; +/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0 +pub const NUM_RETRIES_CONNECT: u32 = 16; const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); -const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); +const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25); +const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; @@ -100,7 +102,7 @@ pub async fn task_main( loop { tokio::select! { accept_result = listener.accept() => { - let (socket, peer_addr) = accept_result?; + let (socket, _) = accept_result?; let session_id = uuid::Uuid::new_v4(); let cancel_map = Arc::clone(&cancel_map); @@ -108,13 +110,19 @@ pub async fn task_main( async move { info!("accepted postgres client connection"); + let mut socket = WithClientIp::new(socket); + if let Some(ip) = socket.wait_for_addr().await? { + tracing::Span::current().record("peer_addr", &tracing::field::display(ip)); + } + socket + .inner .set_nodelay(true) .context("failed to set socket option")?; handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp).await } - .instrument(info_span!("handle_client", ?session_id, %peer_addr)) + .instrument(info_span!("handle_client", ?session_id, peer_addr = tracing::field::Empty)) .unwrap_or_else(move |e| { // Acknowledge that the task has finished with an error. error!(?session_id, "per-client task finished with an error: {e:#}"); @@ -546,8 +554,7 @@ impl ShouldRetry for compute::ConnectionError { } pub fn retry_after(num_retries: u32) -> time::Duration { - // 1.5 seems to be an ok growth factor heuristic - BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32)) + BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1)) } /// Finish client connection initialization: confirm auth success, send params, etc. 
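For context on the new backoff constants above (the 25 ms base, the √2 growth factor, and the 16-retry limit all come from this hunk), here is a minimal standalone sketch — not part of the proxy itself — that reproduces the `retry_after` schedule and checks that the cumulative wait over retries 1..16 stays below the 12-second bound asserted by `connect_compute_total_wait` in the test changes below.

```rust
use std::time::Duration;

// Constants taken from the proxy.rs hunk above.
const NUM_RETRIES_CONNECT: u32 = 16;
const BASE_RETRY_WAIT_DURATION: Duration = Duration::from_millis(25);
const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;

// Same formula as retry_after() above: 25 ms * sqrt(2)^(n - 1), where n starts at 1.
fn retry_after(num_retries: u32) -> Duration {
    BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi(num_retries as i32 - 1))
}

fn main() {
    // The test sums retries 1..NUM_RETRIES_CONNECT (i.e. 1..=15).
    let total: Duration = (1..NUM_RETRIES_CONNECT).map(retry_after).sum();
    // 25 ms * (sqrt(2)^15 - 1) / (sqrt(2) - 1) ≈ 10.9 s, below the 12 s the test asserts.
    println!("total wait across {} retries: {:?}", NUM_RETRIES_CONNECT - 1, total);
    assert!(total < Duration::from_secs(12));
}
```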
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 5653ec94dc..9615017883 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -137,6 +137,7 @@ async fn dummy_proxy( auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let cancel_map = CancelMap::default(); + let client = WithClientIp::new(client); let (mut stream, _params) = handshake(client, tls.as_ref(), &cancel_map) .await? .context("handshake failed")?; @@ -302,7 +303,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { #[test] fn connect_compute_total_wait() { let mut total_wait = tokio::time::Duration::ZERO; - for num_retries in 1..10 { + for num_retries in 1..NUM_RETRIES_CONNECT { total_wait += retry_after(num_retries); } assert!(total_wait < tokio::time::Duration::from_secs(12)); @@ -493,11 +494,11 @@ async fn connect_to_compute_non_retry_2() { /// Retry for at most `NUM_RETRIES_CONNECT` times. #[tokio::test] async fn connect_to_compute_non_retry_3() { - assert_eq!(NUM_RETRIES_CONNECT, 10); + assert_eq!(NUM_RETRIES_CONNECT, 16); use ConnectAction::*; let mechanism = TestConnectMechanism::new(vec![ - Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, - /* the 11th time */ Retry, + Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, + Retry, Retry, Retry, Retry, /* the 17th time */ Retry, ]); let (cache, extra, creds) = helper_create_connect_info(&mechanism); connect_to_compute(&mechanism, cache, &extra, &creds) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 0ce368ff9d..9cc47ec039 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.71.0" +channel = "1.72.1" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html
diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml
new file mode 100644
index 0000000000..f3ea6e222c
--- /dev/null
+++ b/s3_scrubber/Cargo.toml
@@ -0,0 +1,41 @@
+[package]
+name = "s3_scrubber"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+aws-sdk-s3.workspace = true
+aws-smithy-http.workspace = true
+aws-types.workspace = true
+either.workspace = true
+tokio-rustls.workspace = true
+anyhow.workspace = true
+hex.workspace = true
+thiserror.workspace = true
+rand.workspace = true
+bytes.workspace = true
+bincode.workspace = true
+crc32c.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+serde_with.workspace = true
+workspace_hack.workspace = true
+utils.workspace = true
+async-stream.workspace = true
+tokio-stream.workspace = true
+futures-util.workspace = true
+itertools.workspace = true
+
+tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
+chrono = { workspace = true, default-features = false, features = ["clock", "serde"] }
+reqwest = { workspace = true, default-features = false, features = ["rustls-tls", "json"] }
+aws-config = { workspace = true, default-features = false, features = ["rustls", "credentials-sso"] }
+
+pageserver = { path = "../pageserver" }
+
+tracing.workspace = true
+tracing-subscriber.workspace = true
+clap.workspace = true
+tracing-appender = "0.2"
+histogram = "0.7"
diff --git a/s3_scrubber/README.md b/s3_scrubber/README.md
new file mode 100644
index 0000000000..48be3512b4
--- /dev/null
+++ b/s3_scrubber/README.md
@@ -0,0 +1,93 @@
+# Neon S3 scrubber
+
+This tool directly accesses the S3 buckets used by the Neon `pageserver`
+and `safekeeper`, and does housekeeping such as cleaning up objects for tenants and timelines that no longer exist.
+
+## Usage
+
+### Generic Parameters
+
+#### S3
+
+Run `aws sso login --profile dev` to get SSO access to the bucket you want to clean, then look up the `SSO_ACCOUNT_ID` for your profile (`cat ~/.aws/config` may help).
+
+- `SSO_ACCOUNT_ID`: Credentials ID to use for accessing S3 buckets
+- `REGION`: The region where the bucket is located
+- `BUCKET`: Bucket name
+
+#### Console API
+
+_This section is only relevant if using a command that requires access to Neon's internal control plane_
+
+- `CLOUD_ADMIN_API_URL`: The base URL used for checking tenant/timeline existence via the Cloud API, e.g. `https:///admin`
+
+- `CLOUD_ADMIN_API_TOKEN`: The token to provide when querying the admin API. Get one on the corresponding console page, e.g. `https:///app/settings/api-keys`
+
+### Commands
+
+#### `tidy`
+
+Iterates over the S3 buckets of storage nodes, checking their contents and removing data that is not present in the console. S3 data that is not removed is then further checked for discrepancies and, optionally, validated.
+
+Unless the global `--delete` argument is provided, this command only dry-runs and logs
+what it would have deleted.
+
+```
+tidy --node-kind=<pageserver|safekeeper> [--depth=<tenant|timeline>] [--skip-validation]
+```
+
+- `--node-kind`: whether to inspect the safekeeper or pageserver bucket prefix
+- `--depth`: whether to only search for deletable tenants, or also search for
+  deletable timelines within active tenants. Default: `tenant`
+- `--skip-validation`: skip additional post-deletion checks. Default: `false`
+
+For the selected S3 path, the tool lists the given bucket for either tenants only or for both tenants and timelines. For every entry found, the console API is queried: any entity that is deleted or missing in the API is scheduled for deletion from S3.
+
+If validation is enabled, only the timelines of non-deleted tenants are checked.
+For pageserver, each timeline's `index_part.json` on S3 is also checked for various discrepancies. No files are removed in this step, even if there are "extra" S3 files not present in `index_part.json`: due to the way the pageserver updates remote storage, it is better to do such removals manually, stopping the corresponding tenant first.
+
+Command examples:
+
+`env SSO_ACCOUNT_ID=369495373322 REGION=eu-west-1 BUCKET=neon-dev-storage-eu-west-1 CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- tidy --node-kind=safekeeper`
+
+`env SSO_ACCOUNT_ID=369495373322 REGION=us-east-2 BUCKET=neon-staging-storage-us-east-2 CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- tidy --node-kind=pageserver --depth=timeline`
+
+When the dry-run stats look satisfying, pass the global `--delete` argument (before the `tidy` command) to
+disable the dry run and run the binary with deletion enabled.
+
+See these lines (and the lines around them) in the logs for the final stats:
+
+- `Finished listing the bucket for tenants`
+- `Finished active tenant and timeline validation`
+- `Total tenant deletion stats`
+- `Total timeline deletion stats`
+
+## Current implementation details
+
+- The tool currently has no persistent state: instead, it creates very verbose logs, with every S3 delete request, every tenant/timeline id check, etc. logged.
+  Worse, a panic or an early-errored task might force the tool to exit without printing the final summary (all affected ids will still be in the logs, though). The tool retries internally, so it is error-resistant to some extent, and recent runs showed no traces of errors/panics.
+
+- Instead of checking non-deleted tenants' timelines immediately, the tool spawns separate tasks (futures) for that,
+  which complicates the logic and slows down the process; this should be fixed and done in one "task".
+
+- The tool uses only publicly available remote resources (S3, console) and does not access the pageserver/safekeeper nodes themselves.
+  Still, its S3 setup is prepared for running on any pageserver/safekeeper node with the node's S3 credentials, so node API access logic could be implemented relatively simply on top.
+
+## Cleanup procedure
+
+### Pageserver preparations
+
+If the S3 state is altered manually first, the pageserver's in-memory state will contain stale data about S3, and tenants/timelines may get recreated on S3 (by any layer upload triggered by compaction, a pageserver restart, etc.). So before proceeding, tenants/timelines that are already deleted in the console must first be removed from the pageservers.
+
+First, we need to group pageservers by bucket: `https:///admin/pageservers` can be used to list all nodes of an environment, and `cat /storage/pageserver/data/pageserver.toml` on every node shows the bucket name and region needed.
+ +Per bucket, for every pageserver id related, find deleted tenants: + +`curl -X POST "https:///admin/check_pageserver/{id}" -H "Accept: application/json" -H "Authorization: Bearer ${NEON_CLOUD_ADMIN_API_STAGING_KEY}" | jq` + +use `?check_timelines=true` to find deleted timelines, but the check runs a separate query on every alive tenant, so that could be long and time out for big pageservers. + +Note that some tenants/timelines could be marked as deleted in console, but console might continue querying the node later to fully remove the tenant/timeline: wait for some time before ensuring that the "extra" tenant/timeline is not going away by itself. + +When all IDs are collected, manually go to every pageserver and detach/delete the tenant/timeline. +In future, the cleanup tool may access pageservers directly, but now it's only console and S3 it has access to. diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs new file mode 100644 index 0000000000..914681d707 --- /dev/null +++ b/s3_scrubber/src/checks.rs @@ -0,0 +1,440 @@ +use std::collections::{hash_map, HashMap, HashSet}; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::Context; +use aws_sdk_s3::Client; +use tokio::task::JoinSet; +use tracing::{error, info, info_span, warn, Instrument}; + +use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectId}; +use crate::delete_batch_producer::DeleteProducerStats; +use crate::{download_object_with_retries, list_objects_with_retries, RootTarget, MAX_RETRIES}; +use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::IndexPart; +use utils::id::TenantTimelineId; + +pub async fn validate_pageserver_active_tenant_and_timelines( + s3_client: Arc, + s3_root: RootTarget, + admin_client: Arc, + batch_producer_stats: DeleteProducerStats, +) -> anyhow::Result { + let Some(timeline_stats) = batch_producer_stats.timeline_stats else { + info!("No tenant-only checks, exiting"); + return Ok(BranchCheckStats::default()); + }; + + let s3_active_projects = batch_producer_stats + .tenant_stats + .active_entries + .into_iter() + .map(|project| (project.id.clone(), project)) + .collect::>(); + info!("Validating {} active tenants", s3_active_projects.len()); + + let mut s3_active_branches_per_project = HashMap::>::new(); + let mut s3_blob_data = HashMap::::new(); + for active_branch in timeline_stats.active_entries { + let active_project_id = active_branch.project_id.clone(); + let active_branch_id = active_branch.id.clone(); + let active_timeline_id = active_branch.timeline_id; + + s3_active_branches_per_project + .entry(active_project_id.clone()) + .or_default() + .push(active_branch); + + let Some(active_project) = s3_active_projects.get(&active_project_id) else { + error!( + "Branch {:?} for project {:?} has no such project in the active projects", + active_branch_id, active_project_id + ); + continue; + }; + + let id = TenantTimelineId::new(active_project.tenant, active_timeline_id); + s3_blob_data.insert( + id, + list_timeline_blobs(&s3_client, id, &s3_root) + .await + .with_context(|| format!("List timeline {id} blobs"))?, + ); + } + + let mut branch_checks = JoinSet::new(); + for (_, s3_active_project) in s3_active_projects { + let project_id = &s3_active_project.id; + let tenant_id = s3_active_project.tenant; + + let mut console_active_branches = + branches_for_project_with_retries(&admin_client, project_id) + .await + .with_context(|| { + format!("Client API branches for project {project_id:?} retrieval") + })? 
+ .into_iter() + .map(|branch| (branch.id.clone(), branch)) + .collect::>(); + + let active_branches = s3_active_branches_per_project + .remove(project_id) + .unwrap_or_default(); + info!( + "Spawning tasks for {} tenant {} active timelines", + active_branches.len(), + tenant_id + ); + for s3_active_branch in active_branches { + let console_branch = console_active_branches.remove(&s3_active_branch.id); + let timeline_id = s3_active_branch.timeline_id; + let id = TenantTimelineId::new(tenant_id, timeline_id); + let s3_data = s3_blob_data.remove(&id); + let s3_root = s3_root.clone(); + branch_checks.spawn( + async move { + let check_errors = branch_cleanup_and_check_errors( + &id, + &s3_root, + Some(&s3_active_branch), + console_branch, + s3_data, + ) + .await; + (id, check_errors) + } + .instrument(info_span!("check_timeline", id = %id)), + ); + } + } + + let mut total_stats = BranchCheckStats::default(); + while let Some((id, analysis)) = branch_checks + .join_next() + .await + .transpose() + .context("branch check task join")? + { + total_stats.add(id, analysis.errors); + } + Ok(total_stats) +} + +async fn branches_for_project_with_retries( + admin_client: &CloudAdminApiClient, + project_id: &ProjectId, +) -> anyhow::Result> { + for _ in 0..MAX_RETRIES { + match admin_client.branches_for_project(project_id, false).await { + Ok(branches) => return Ok(branches), + Err(e) => { + error!("admin list branches for project {project_id:?} query failed: {e}"); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + + anyhow::bail!("Failed to list branches for project {project_id:?} {MAX_RETRIES} times") +} + +#[derive(Debug, Default)] +pub struct BranchCheckStats { + pub timelines_with_errors: HashMap>, + pub normal_timelines: HashSet, +} + +impl BranchCheckStats { + pub fn add(&mut self, id: TenantTimelineId, check_errors: Vec) { + if check_errors.is_empty() { + if !self.normal_timelines.insert(id) { + panic!("Checking branch with timeline {id} more than once") + } + } else { + match self.timelines_with_errors.entry(id) { + hash_map::Entry::Occupied(_) => { + panic!("Checking branch with timeline {id} more than once") + } + hash_map::Entry::Vacant(v) => { + v.insert(check_errors); + } + } + } + } +} + +pub struct TimelineAnalysis { + /// Anomalies detected + pub errors: Vec, + + /// Healthy-but-noteworthy, like old-versioned structures that are readable but + /// worth reporting for awareness that we must not remove that old version decoding + /// yet. 
+ pub warnings: Vec, + + /// Keys not referenced in metadata: candidates for removal + pub garbage_keys: Vec, +} + +impl TimelineAnalysis { + fn new() -> Self { + Self { + errors: Vec::new(), + warnings: Vec::new(), + garbage_keys: Vec::new(), + } + } +} + +pub async fn branch_cleanup_and_check_errors( + id: &TenantTimelineId, + s3_root: &RootTarget, + s3_active_branch: Option<&BranchData>, + console_branch: Option, + s3_data: Option, +) -> TimelineAnalysis { + let mut result = TimelineAnalysis::new(); + + info!("Checking timeline {id}"); + + if let Some(s3_active_branch) = s3_active_branch { + info!( + "Checking console status for timeline for branch {:?}/{:?}", + s3_active_branch.project_id, s3_active_branch.id + ); + match console_branch { + Some(_) => {result.errors.push(format!("Timeline has deleted branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check", + s3_active_branch.id, s3_active_branch.project_id)) + }, + None => { + result.errors.push(format!("Timeline has no branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check", + s3_active_branch.id, s3_active_branch.project_id)) + } + }; + } + + match s3_data { + Some(s3_data) => { + result.garbage_keys.extend(s3_data.keys_to_remove); + + match s3_data.blob_data { + BlobDataParseResult::Parsed { + index_part, + mut s3_layers, + } => { + if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) { + result.errors.push(format!( + "index_part.json version: {}", + index_part.get_version() + )) + } + + if &index_part.get_version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { + result.warnings.push(format!( + "index_part.json version is not latest: {}", + index_part.get_version() + )) + } + + if index_part.metadata.disk_consistent_lsn() + != index_part.get_disk_consistent_lsn() + { + result.errors.push(format!( + "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})", + index_part.metadata.disk_consistent_lsn(), + index_part.get_disk_consistent_lsn(), + + )) + } + + if index_part.layer_metadata.is_empty() { + // not an error, can happen for branches with zero writes, but notice that + info!("index_part.json has no layers"); + } + + for (layer, metadata) in index_part.layer_metadata { + if metadata.file_size == 0 { + result.errors.push(format!( + "index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(), + )) + } + + if !s3_layers.remove(&layer) { + result.errors.push(format!( + "index_part.json contains a layer {} that is not present in S3", + layer.file_name(), + )) + } + } + + if !s3_layers.is_empty() { + result.errors.push(format!( + "index_part.json does not contain layers from S3: {:?}", + s3_layers + .iter() + .map(|layer_name| layer_name.file_name()) + .collect::>(), + )); + result + .garbage_keys + .extend(s3_layers.iter().map(|layer_name| { + let mut key = s3_root.timeline_root(id).prefix_in_bucket; + let delimiter = s3_root.delimiter(); + if !key.ends_with(delimiter) { + key.push_str(delimiter); + } + key.push_str(&layer_name.file_name()); + key + })); + } + } + BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( + parse_errors + .into_iter() + .map(|error| format!("parse error: {error}")), + ), + } + } + None => result + .errors + .push("Timeline has no data on S3 at all".to_string()), + } + + if result.errors.is_empty() { + info!("No check errors found"); + } else { + warn!("Timeline metadata errors: {0:?}", result.errors); + } + + if 
!result.warnings.is_empty() { + warn!("Timeline metadata warnings: {0:?}", result.warnings); + } + + if !result.garbage_keys.is_empty() { + error!( + "The following keys should be removed from S3: {0:?}", + result.garbage_keys + ) + } + + result +} + +#[derive(Debug)] +pub struct S3TimelineBlobData { + pub blob_data: BlobDataParseResult, + pub keys_to_remove: Vec, +} + +#[derive(Debug)] +pub enum BlobDataParseResult { + Parsed { + index_part: IndexPart, + s3_layers: HashSet, + }, + Incorrect(Vec), +} + +pub async fn list_timeline_blobs( + s3_client: &Client, + id: TenantTimelineId, + s3_root: &RootTarget, +) -> anyhow::Result { + let mut s3_layers = HashSet::new(); + let mut index_part_object = None; + + let timeline_dir_target = s3_root.timeline_root(&id); + let mut continuation_token = None; + + let mut errors = Vec::new(); + let mut keys_to_remove = Vec::new(); + + loop { + let fetch_response = + list_objects_with_retries(s3_client, &timeline_dir_target, continuation_token.clone()) + .await?; + + let subdirectories = fetch_response.common_prefixes().unwrap_or_default(); + if !subdirectories.is_empty() { + errors.push(format!( + "S3 list response should not contain any subdirectories, but got {subdirectories:?}" + )); + } + + for (object, key) in fetch_response + .contents() + .unwrap_or_default() + .iter() + .filter_map(|object| Some((object, object.key()?))) + { + let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket); + match blob_name { + Some("index_part.json") => index_part_object = Some(object.clone()), + Some(maybe_layer_name) => match maybe_layer_name.parse::() { + Ok(new_layer) => { + s3_layers.insert(new_layer); + } + Err(e) => { + errors.push( + format!("S3 list response got an object with key {key} that is not a layer name: {e}"), + ); + keys_to_remove.push(key.to_string()); + } + }, + None => { + errors.push(format!("S3 list response got an object with odd key {key}")); + keys_to_remove.push(key.to_string()); + } + } + } + + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + if index_part_object.is_none() { + errors.push("S3 list response got no index_part.json file".to_string()); + } + + if let Some(index_part_object_key) = index_part_object.as_ref().and_then(|object| object.key()) + { + let index_part_bytes = download_object_with_retries( + s3_client, + &timeline_dir_target.bucket_name, + index_part_object_key, + ) + .await + .context("index_part.json download")?; + + match serde_json::from_slice(&index_part_bytes) { + Ok(index_part) => { + return Ok(S3TimelineBlobData { + blob_data: BlobDataParseResult::Parsed { + index_part, + s3_layers, + }, + keys_to_remove, + }) + } + Err(index_parse_error) => errors.push(format!( + "index_part.json body parsing error: {index_parse_error}" + )), + } + } else { + errors.push(format!( + "Index part object {index_part_object:?} has no key" + )); + } + + if errors.is_empty() { + errors.push( + "Unexpected: no errors did not lead to a successfully parsed blob return".to_string(), + ); + } + + Ok(S3TimelineBlobData { + blob_data: BlobDataParseResult::Incorrect(errors), + keys_to_remove, + }) +} diff --git a/s3_scrubber/src/cloud_admin_api.rs b/s3_scrubber/src/cloud_admin_api.rs new file mode 100644 index 0000000000..3c21b70385 --- /dev/null +++ b/s3_scrubber/src/cloud_admin_api.rs @@ -0,0 +1,418 @@ +#![allow(unused)] + +use chrono::{DateTime, Utc}; +use reqwest::{header, Client, Url}; +use tokio::sync::Semaphore; + +use 
utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +#[derive(Debug)] +pub struct Error { + context: String, + kind: ErrorKind, +} + +impl Error { + fn new(context: String, kind: ErrorKind) -> Self { + Self { context, kind } + } +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self.kind { + ErrorKind::RequestSend(e) => write!( + f, + "Failed to send a request. Context: {}, error: {}", + self.context, e + ), + ErrorKind::BodyRead(e) => { + write!( + f, + "Failed to read a request body. Context: {}, error: {}", + self.context, e + ) + } + ErrorKind::UnexpectedState => write!(f, "Unexpected state: {}", self.context), + } + } +} + +#[derive(Debug, Clone, serde::Deserialize, Hash, PartialEq, Eq)] +#[serde(transparent)] +pub struct ProjectId(pub String); + +#[derive(Clone, Debug, serde::Deserialize, Hash, PartialEq, Eq)] +#[serde(transparent)] +pub struct BranchId(pub String); + +impl std::error::Error for Error {} + +#[derive(Debug)] +pub enum ErrorKind { + RequestSend(reqwest::Error), + BodyRead(reqwest::Error), + UnexpectedState, +} + +pub struct CloudAdminApiClient { + request_limiter: Semaphore, + token: String, + base_url: Url, + http_client: Client, +} + +#[derive(Debug, serde::Deserialize)] +struct AdminApiResponse { + data: T, + total: Option, +} + +#[derive(Debug, serde::Deserialize)] +pub struct PageserverData { + pub id: u64, + pub created_at: DateTime, + pub updated_at: DateTime, + pub region_id: String, + pub version: i64, + pub instance_id: String, + pub port: u16, + pub http_host: String, + pub http_port: u16, + pub active: bool, + pub projects_count: usize, + pub availability_zone_id: String, +} + +#[derive(Debug, Clone, serde::Deserialize)] +pub struct SafekeeperData { + pub id: u64, + pub created_at: DateTime, + pub updated_at: DateTime, + pub region_id: String, + pub version: i64, + pub instance_id: String, + pub active: bool, + pub host: String, + pub port: u16, + pub projects_count: usize, + pub availability_zone_id: String, +} + +#[serde_with::serde_as] +#[derive(Debug, Clone, serde::Deserialize)] +pub struct ProjectData { + pub id: ProjectId, + pub name: String, + pub region_id: String, + pub platform_id: String, + pub user_id: String, + pub pageserver_id: u64, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub tenant: TenantId, + pub safekeepers: Vec, + pub deleted: bool, + pub created_at: DateTime, + pub updated_at: DateTime, + pub pg_version: u32, + pub max_project_size: u64, + pub remote_storage_size: u64, + pub resident_size: u64, + pub synthetic_storage_size: u64, + pub compute_time: u64, + pub data_transfer: u64, + pub data_storage: u64, + pub maintenance_set: Option, +} + +#[serde_with::serde_as] +#[derive(Debug, serde::Deserialize)] +pub struct BranchData { + pub id: BranchId, + pub created_at: DateTime, + pub updated_at: DateTime, + pub name: String, + pub project_id: ProjectId, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub timeline_id: TimelineId, + #[serde(default)] + pub parent_id: Option, + #[serde(default)] + #[serde_as(as = "Option")] + pub parent_lsn: Option, + pub default: bool, + pub deleted: bool, + pub logical_size: Option, + pub physical_size: Option, + pub written_size: Option, +} + +impl CloudAdminApiClient { + pub fn new(token: String, base_url: Url) -> Self { + Self { + token, + base_url, + request_limiter: Semaphore::new(200), + http_client: Client::new(), // TODO timeout configs at least + } + } + + pub async fn find_tenant_project( + &self, + tenant_id: 
TenantId, + ) -> Result, Error> { + let _permit = self + .request_limiter + .acquire() + .await + .expect("Semaphore is not closed"); + + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("tenant_id", tenant_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::RequestSend(e), + ) + })?; + + let response: AdminApiResponse> = response.json().await.map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + match response.data.len() { + 0 => Ok(None), + 1 => Ok(Some( + response + .data + .into_iter() + .next() + .expect("Should have exactly one element"), + )), + too_many => Err(Error::new( + format!("Find project for tenant returned {too_many} projects instead of 0 or 1"), + ErrorKind::UnexpectedState, + )), + } + } + + pub async fn find_timeline_branch( + &self, + timeline_id: TimelineId, + ) -> Result, Error> { + let _permit = self + .request_limiter + .acquire() + .await + .expect("Semaphore is not closed"); + + let response = self + .http_client + .get(self.append_url("/branches")) + .query(&[ + ("timeline_id", timeline_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::RequestSend(e), + ) + })?; + + let response: AdminApiResponse> = response.json().await.map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + match response.data.len() { + 0 => Ok(None), + 1 => Ok(Some( + response + .data + .into_iter() + .next() + .expect("Should have exactly one element"), + )), + too_many => Err(Error::new( + format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"), + ErrorKind::UnexpectedState, + )), + } + } + + pub async fn list_pageservers(&self) -> Result, Error> { + let _permit = self + .request_limiter + .acquire() + .await + .expect("Semaphore is not closed"); + + let response = self + .http_client + .get(self.append_url("/pageservers")) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| Error::new("List pageservers".to_string(), ErrorKind::RequestSend(e)))?; + + let response: AdminApiResponse> = response + .json() + .await + .map_err(|e| Error::new("List pageservers".to_string(), ErrorKind::BodyRead(e)))?; + + Ok(response.data) + } + + pub async fn list_safekeepers(&self) -> Result, Error> { + let _permit = self + .request_limiter + .acquire() + .await + .expect("Semaphore is not closed"); + + let response = self + .http_client + .get(self.append_url("/safekeepers")) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| Error::new("List safekeepers".to_string(), ErrorKind::RequestSend(e)))?; + + let response: AdminApiResponse> = response + .json() + .await + .map_err(|e| Error::new("List safekeepers".to_string(), ErrorKind::BodyRead(e)))?; + + Ok(response.data) + } + + pub async fn projects_for_pageserver( + &self, + pageserver_id: u64, + show_deleted: bool, + ) -> Result, Error> { + let _permit = self + .request_limiter + .acquire() + .await + .expect("Semaphore is not closed"); + + let response = self + .http_client + .get(self.append_url("/projects")) 
+ .query(&[ + ("pageserver_id", &pageserver_id.to_string()), + ("show_deleted", &show_deleted.to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| Error::new("Project for tenant".to_string(), ErrorKind::RequestSend(e)))?; + + let response: AdminApiResponse> = response + .json() + .await + .map_err(|e| Error::new("Project for tenant".to_string(), ErrorKind::BodyRead(e)))?; + + Ok(response.data) + } + + pub async fn project_for_tenant( + &self, + tenant_id: TenantId, + show_deleted: bool, + ) -> Result, Error> { + let _permit = self + .request_limiter + .acquire() + .await + .expect("Semaphore is not closed"); + + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("search", &tenant_id.to_string()), + ("show_deleted", &show_deleted.to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| Error::new("Project for tenant".to_string(), ErrorKind::RequestSend(e)))?; + + let response: AdminApiResponse> = response + .json() + .await + .map_err(|e| Error::new("Project for tenant".to_string(), ErrorKind::BodyRead(e)))?; + + match response.data.as_slice() { + [] => Ok(None), + [_single] => Ok(Some(response.data.into_iter().next().unwrap())), + multiple => Err(Error::new( + format!("Got more than one project for tenant {tenant_id} : {multiple:?}"), + ErrorKind::UnexpectedState, + )), + } + } + + pub async fn branches_for_project( + &self, + project_id: &ProjectId, + show_deleted: bool, + ) -> Result, Error> { + let _permit = self + .request_limiter + .acquire() + .await + .expect("Semaphore is not closed"); + + let response = self + .http_client + .get(self.append_url("/branches")) + .query(&[ + ("project_id", &project_id.0), + ("show_deleted", &show_deleted.to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| Error::new("Project for tenant".to_string(), ErrorKind::RequestSend(e)))?; + + let response: AdminApiResponse> = response + .json() + .await + .map_err(|e| Error::new("Project for tenant".to_string(), ErrorKind::BodyRead(e)))?; + + Ok(response.data) + } + + fn append_url(&self, subpath: &str) -> Url { + // TODO fugly, but `.join` does not work when called + (self.base_url.to_string() + subpath) + .parse() + .unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}")) + } +} diff --git a/s3_scrubber/src/delete_batch_producer.rs b/s3_scrubber/src/delete_batch_producer.rs new file mode 100644 index 0000000000..99ab5c4198 --- /dev/null +++ b/s3_scrubber/src/delete_batch_producer.rs @@ -0,0 +1,354 @@ +mod tenant_batch; +mod timeline_batch; + +use std::future::Future; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::Context; +use aws_sdk_s3::Client; +use either::Either; +use tokio::sync::mpsc::UnboundedReceiver; +use tokio::sync::Mutex; +use tokio::task::{JoinHandle, JoinSet}; +use tracing::{error, info, info_span, Instrument}; + +use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectData}; +use crate::{list_objects_with_retries, RootTarget, S3Target, TraversingDepth, MAX_RETRIES}; +use utils::id::{TenantId, TenantTimelineId}; + +/// Typical tenant to remove contains 1 layer and 1 index_part.json blobs +/// Also, there are some non-standard tenants to remove, having more layers. 
+/// delete_objects request allows up to 1000 keys, so be on a safe side and allow most +/// batch processing tasks to do 1 delete objects request only. +/// +/// Every batch item will be additionally S3 LS'ed later, so keep the batch size +/// even lower to allow multiple concurrent tasks do the LS requests. +const BATCH_SIZE: usize = 100; + +pub struct DeleteBatchProducer { + delete_tenants_sender_task: JoinHandle>>, + delete_timelines_sender_task: + JoinHandle>>, + delete_batch_creator_task: JoinHandle<()>, + delete_batch_receiver: Arc>>, +} + +pub struct DeleteProducerStats { + pub tenant_stats: ProcessedS3List, + pub timeline_stats: Option>, +} + +impl DeleteProducerStats { + pub fn tenants_checked(&self) -> usize { + self.tenant_stats.entries_total + } + + pub fn active_tenants(&self) -> usize { + self.tenant_stats.active_entries.len() + } + + pub fn timelines_checked(&self) -> usize { + self.timeline_stats + .as_ref() + .map(|stats| stats.entries_total) + .unwrap_or(0) + } +} + +#[derive(Debug, Default, Clone)] +pub struct DeleteBatch { + pub tenants: Vec, + pub timelines: Vec, +} + +impl DeleteBatch { + pub fn merge(&mut self, other: Self) { + self.tenants.extend(other.tenants); + self.timelines.extend(other.timelines); + } + + pub fn len(&self) -> usize { + self.tenants.len() + self.timelines.len() + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +impl DeleteBatchProducer { + pub fn start( + admin_client: Arc, + s3_client: Arc, + s3_root_target: RootTarget, + traversing_depth: TraversingDepth, + ) -> Self { + let (delete_elements_sender, mut delete_elements_receiver) = + tokio::sync::mpsc::unbounded_channel(); + let delete_elements_sender = Arc::new(delete_elements_sender); + let admin_client = Arc::new(admin_client); + + let (projects_to_check_sender, mut projects_to_check_receiver) = + tokio::sync::mpsc::unbounded_channel(); + let delete_tenants_root_target = s3_root_target.clone(); + let delete_tenants_client = Arc::clone(&s3_client); + let delete_tenants_admin_client = Arc::clone(&admin_client); + let delete_sender = Arc::clone(&delete_elements_sender); + let delete_tenants_sender_task = tokio::spawn( + async move { + tenant_batch::schedule_cleanup_deleted_tenants( + &delete_tenants_root_target, + &delete_tenants_client, + &delete_tenants_admin_client, + projects_to_check_sender, + delete_sender, + traversing_depth, + ) + .await + } + .instrument(info_span!("delete_tenants_sender")), + ); + let delete_timelines_sender_task = tokio::spawn(async move { + timeline_batch::schedule_cleanup_deleted_timelines( + &s3_root_target, + &s3_client, + &admin_client, + &mut projects_to_check_receiver, + delete_elements_sender, + ) + .in_current_span() + .await + }); + + let (delete_batch_sender, delete_batch_receiver) = tokio::sync::mpsc::unbounded_channel(); + let delete_batch_creator_task = tokio::spawn( + async move { + 'outer: loop { + let mut delete_batch = DeleteBatch::default(); + while delete_batch.len() < BATCH_SIZE { + match delete_elements_receiver.recv().await { + Some(new_task) => match new_task { + Either::Left(tenant_id) => delete_batch.tenants.push(tenant_id), + Either::Right(timeline_id) => { + delete_batch.timelines.push(timeline_id) + } + }, + None => { + info!("Task finished: sender dropped"); + delete_batch_sender.send(delete_batch).ok(); + break 'outer; + } + } + } + + if !delete_batch.is_empty() { + delete_batch_sender.send(delete_batch).ok(); + } + } + } + .instrument(info_span!("delete batch creator")), + ); + + Self { + delete_tenants_sender_task, 
+ delete_timelines_sender_task, + delete_batch_creator_task, + delete_batch_receiver: Arc::new(Mutex::new(delete_batch_receiver)), + } + } + + pub fn subscribe(&self) -> Arc>> { + self.delete_batch_receiver.clone() + } + + pub async fn join(self) -> anyhow::Result { + let (delete_tenants_task_result, delete_timelines_task_result, batch_task_result) = tokio::join!( + self.delete_tenants_sender_task, + self.delete_timelines_sender_task, + self.delete_batch_creator_task, + ); + + let tenant_stats = match delete_tenants_task_result { + Ok(Ok(stats)) => stats, + Ok(Err(tenant_deletion_error)) => return Err(tenant_deletion_error), + Err(join_error) => { + anyhow::bail!("Failed to join the delete tenant producing task: {join_error}") + } + }; + + let timeline_stats = match delete_timelines_task_result { + Ok(Ok(stats)) => Some(stats), + Ok(Err(timeline_deletion_error)) => return Err(timeline_deletion_error), + Err(join_error) => { + anyhow::bail!("Failed to join the delete timeline producing task: {join_error}") + } + }; + + match batch_task_result { + Ok(()) => (), + Err(join_error) => anyhow::bail!("Failed to join the batch forming task: {join_error}"), + }; + + Ok(DeleteProducerStats { + tenant_stats, + timeline_stats, + }) + } +} + +pub struct ProcessedS3List { + pub entries_total: usize, + pub entries_to_delete: Vec, + pub active_entries: Vec, +} + +impl Default for ProcessedS3List { + fn default() -> Self { + Self { + entries_total: 0, + entries_to_delete: Vec::new(), + active_entries: Vec::new(), + } + } +} + +impl ProcessedS3List { + fn merge(&mut self, other: Self) { + self.entries_total += other.entries_total; + self.entries_to_delete.extend(other.entries_to_delete); + self.active_entries.extend(other.active_entries); + } + + fn change_ids(self, transform: impl Fn(I) -> NewI) -> ProcessedS3List { + ProcessedS3List { + entries_total: self.entries_total, + entries_to_delete: self.entries_to_delete.into_iter().map(transform).collect(), + active_entries: self.active_entries, + } + } +} + +async fn process_s3_target_recursively( + s3_client: &Client, + target: &S3Target, + find_active_and_deleted_entries: F, +) -> anyhow::Result> +where + I: FromStr + Send + Sync, + E: Send + Sync + std::error::Error + 'static, + F: FnOnce(Vec) -> Fut + Clone, + Fut: Future>>, +{ + let mut continuation_token = None; + let mut total_entries = ProcessedS3List::default(); + + loop { + let fetch_response = + list_objects_with_retries(s3_client, target, continuation_token.clone()).await?; + + let new_entry_ids = fetch_response + .common_prefixes() + .unwrap_or_default() + .iter() + .filter_map(|prefix| prefix.prefix()) + .filter_map(|prefix| -> Option<&str> { + prefix + .strip_prefix(&target.prefix_in_bucket)? 
+ .strip_suffix('/') + }) + .map(|entry_id_str| { + entry_id_str + .parse() + .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) + }) + .collect::>>() + .context("list and parse bucket's entry ids")?; + + total_entries.merge( + (find_active_and_deleted_entries.clone())(new_entry_ids) + .await + .context("filter active and deleted entry ids")?, + ); + + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok(total_entries) +} + +enum FetchResult { + Found(A), + Deleted, + Absent, +} + +async fn split_to_active_and_deleted_entries( + new_entry_ids: Vec, + find_active_entry: F, +) -> anyhow::Result> +where + I: std::fmt::Display + Send + Sync + 'static + Copy, + A: Send + 'static, + F: FnOnce(I) -> Fut + Send + Sync + 'static + Clone, + Fut: Future>> + Send, +{ + let entries_total = new_entry_ids.len(); + let mut check_tasks = JoinSet::new(); + let mut active_entries = Vec::with_capacity(entries_total); + let mut entries_to_delete = Vec::with_capacity(entries_total); + + for new_entry_id in new_entry_ids { + let check_closure = find_active_entry.clone(); + check_tasks.spawn( + async move { + ( + new_entry_id, + async { + for _ in 0..MAX_RETRIES { + let closure_clone = check_closure.clone(); + match closure_clone(new_entry_id).await { + Ok(active_entry) => return Ok(active_entry), + Err(e) => { + error!("find active entry admin API call failed: {e}"); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + + anyhow::bail!("Failed to check entry {new_entry_id} {MAX_RETRIES} times") + } + .await, + ) + } + .instrument(info_span!("filter_active_entries")), + ); + } + + while let Some(task_result) = check_tasks.join_next().await { + let (entry_id, entry_data_fetch_result) = task_result.context("task join")?; + match entry_data_fetch_result.context("entry data fetch")? 
{ + FetchResult::Found(active_entry) => { + info!("Entry {entry_id} is alive, cannot delete"); + active_entries.push(active_entry); + } + FetchResult::Deleted => { + info!("Entry {entry_id} deleted in the admin data, can safely delete"); + entries_to_delete.push(entry_id); + } + FetchResult::Absent => { + info!("Entry {entry_id} absent in the admin data, can safely delete"); + entries_to_delete.push(entry_id); + } + } + } + Ok(ProcessedS3List { + entries_total, + entries_to_delete, + active_entries, + }) +} diff --git a/s3_scrubber/src/delete_batch_producer/tenant_batch.rs b/s3_scrubber/src/delete_batch_producer/tenant_batch.rs new file mode 100644 index 0000000000..59fd638645 --- /dev/null +++ b/s3_scrubber/src/delete_batch_producer/tenant_batch.rs @@ -0,0 +1,87 @@ +use std::sync::Arc; + +use anyhow::Context; +use aws_sdk_s3::Client; +use either::Either; +use tokio::sync::mpsc::UnboundedSender; +use tracing::info; + +use crate::cloud_admin_api::{CloudAdminApiClient, ProjectData}; +use crate::delete_batch_producer::FetchResult; +use crate::{RootTarget, TraversingDepth}; +use utils::id::{TenantId, TenantTimelineId}; + +use super::ProcessedS3List; + +pub async fn schedule_cleanup_deleted_tenants( + s3_root_target: &RootTarget, + s3_client: &Arc, + admin_client: &Arc, + projects_to_check_sender: UnboundedSender, + delete_sender: Arc>>, + traversing_depth: TraversingDepth, +) -> anyhow::Result> { + info!( + "Starting to list the bucket from root {}", + s3_root_target.bucket_name() + ); + s3_client + .head_bucket() + .bucket(s3_root_target.bucket_name()) + .send() + .await + .with_context(|| format!("bucket {} was not found", s3_root_target.bucket_name()))?; + + let check_client = Arc::clone(admin_client); + let tenant_stats = super::process_s3_target_recursively( + s3_client, + s3_root_target.tenants_root(), + |s3_tenants| async move { + let another_client = Arc::clone(&check_client); + super::split_to_active_and_deleted_entries(s3_tenants, move |tenant_id| async move { + let project_data = another_client + .find_tenant_project(tenant_id) + .await + .with_context(|| format!("Tenant {tenant_id} project admin check"))?; + + Ok(if let Some(console_project) = project_data { + if console_project.deleted { + delete_sender.send(Either::Left(tenant_id)).ok(); + FetchResult::Deleted + } else { + if traversing_depth == TraversingDepth::Timeline { + projects_to_check_sender.send(console_project.clone()).ok(); + } + FetchResult::Found(console_project) + } + } else { + delete_sender.send(Either::Left(tenant_id)).ok(); + FetchResult::Absent + }) + }) + .await + }, + ) + .await + .context("tenant batch processing")?; + + info!( + "Among {} tenants, found {} tenants to delete and {} active ones", + tenant_stats.entries_total, + tenant_stats.entries_to_delete.len(), + tenant_stats.active_entries.len(), + ); + + let tenant_stats = match traversing_depth { + TraversingDepth::Tenant => { + info!("Finished listing the bucket for tenants only"); + tenant_stats + } + TraversingDepth::Timeline => { + info!("Finished listing the bucket for tenants and sent {} active tenants to check for timelines", tenant_stats.active_entries.len()); + tenant_stats + } + }; + + Ok(tenant_stats) +} diff --git a/s3_scrubber/src/delete_batch_producer/timeline_batch.rs b/s3_scrubber/src/delete_batch_producer/timeline_batch.rs new file mode 100644 index 0000000000..2ad522d3fb --- /dev/null +++ b/s3_scrubber/src/delete_batch_producer/timeline_batch.rs @@ -0,0 +1,102 @@ +use std::sync::Arc; + +use anyhow::Context; +use aws_sdk_s3::Client; 
+use either::Either; +use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender}; +use tracing::{info, info_span, Instrument}; + +use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectData}; +use crate::delete_batch_producer::{FetchResult, ProcessedS3List}; +use crate::RootTarget; +use utils::id::{TenantId, TenantTimelineId}; + +pub async fn schedule_cleanup_deleted_timelines( + s3_root_target: &RootTarget, + s3_client: &Arc, + admin_client: &Arc, + projects_to_check_receiver: &mut UnboundedReceiver, + delete_elements_sender: Arc>>, +) -> anyhow::Result> { + info!( + "Starting to list the bucket from root {}", + s3_root_target.bucket_name() + ); + s3_client + .head_bucket() + .bucket(s3_root_target.bucket_name()) + .send() + .await + .with_context(|| format!("bucket {} was not found", s3_root_target.bucket_name()))?; + + let mut timeline_stats = ProcessedS3List::default(); + while let Some(project_to_check) = projects_to_check_receiver.recv().await { + let check_client = Arc::clone(admin_client); + + let check_s3_client = Arc::clone(s3_client); + + let check_delete_sender = Arc::clone(&delete_elements_sender); + + let check_root = s3_root_target.clone(); + + let new_stats = async move { + let tenant_id_to_check = project_to_check.tenant; + let check_target = check_root.timelines_root(&tenant_id_to_check); + let stats = super::process_s3_target_recursively( + &check_s3_client, + &check_target, + |s3_timelines| async move { + let another_client = check_client.clone(); + super::split_to_active_and_deleted_entries( + s3_timelines, + move |timeline_id| async move { + let console_branch = another_client + .find_timeline_branch(timeline_id) + .await + .map_err(|e| { + anyhow::anyhow!( + "Timeline {timeline_id} branch admin check: {e}" + ) + })?; + + let id = TenantTimelineId::new(tenant_id_to_check, timeline_id); + Ok(match console_branch { + Some(console_branch) => { + if console_branch.deleted { + check_delete_sender.send(Either::Right(id)).ok(); + FetchResult::Deleted + } else { + FetchResult::Found(console_branch) + } + } + None => { + check_delete_sender.send(Either::Right(id)).ok(); + FetchResult::Absent + } + }) + }, + ) + .await + }, + ) + .await + .with_context(|| format!("tenant {tenant_id_to_check} timeline batch processing"))? 
+ .change_ids(|timeline_id| TenantTimelineId::new(tenant_id_to_check, timeline_id)); + + Ok::<_, anyhow::Error>(stats) + } + .instrument(info_span!("delete_timelines_sender", tenant = %project_to_check.tenant)) + .await?; + + timeline_stats.merge(new_stats); + } + + info!( + "Among {} timelines, found {} timelines to delete and {} active ones", + timeline_stats.entries_total, + timeline_stats.entries_to_delete.len(), + timeline_stats.active_entries.len(), + ); + + Ok(timeline_stats) +} diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs new file mode 100644 index 0000000000..072cdf1d93 --- /dev/null +++ b/s3_scrubber/src/lib.rs @@ -0,0 +1,300 @@ +pub mod checks; +pub mod cloud_admin_api; +pub mod delete_batch_producer; +pub mod metadata_stream; +mod s3_deletion; +pub mod scan_metadata; + +use std::env; +use std::fmt::Display; +use std::time::Duration; + +use anyhow::Context; +use aws_config::environment::EnvironmentVariableCredentialsProvider; +use aws_config::imds::credentials::ImdsCredentialsProvider; +use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::sso::SsoCredentialsProvider; +use aws_sdk_s3::config::Region; +use aws_sdk_s3::{Client, Config}; + +use reqwest::Url; +pub use s3_deletion::S3Deleter; +use std::io::IsTerminal; +use tokio::io::AsyncReadExt; +use tracing::error; +use tracing_appender::non_blocking::WorkerGuard; +use tracing_subscriber::{fmt, prelude::*, EnvFilter}; +use utils::id::{TenantId, TenantTimelineId}; + +const MAX_RETRIES: usize = 20; +const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; + +pub const CLI_NAME: &str = "s3-scrubber"; + +#[derive(Debug, Clone)] +pub struct S3Target { + pub bucket_name: String, + pub prefix_in_bucket: String, + pub delimiter: String, +} + +#[derive(clap::ValueEnum, Debug, Clone, Copy, PartialEq, Eq)] +pub enum TraversingDepth { + Tenant, + Timeline, +} + +impl Display for TraversingDepth { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + Self::Tenant => "tenant", + Self::Timeline => "timeline", + }) + } +} + +impl S3Target { + pub fn with_sub_segment(&self, new_segment: &str) -> Self { + let mut new_self = self.clone(); + let _ = new_self.prefix_in_bucket.pop(); + new_self.prefix_in_bucket = + [&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter); + new_self + } +} + +#[derive(Clone)] +pub enum RootTarget { + Pageserver(S3Target), + Safekeeper(S3Target), +} + +impl RootTarget { + pub fn tenants_root(&self) -> &S3Target { + match self { + Self::Pageserver(root) => root, + Self::Safekeeper(root) => root, + } + } + + pub fn tenant_root(&self, tenant_id: &TenantId) -> S3Target { + self.tenants_root().with_sub_segment(&tenant_id.to_string()) + } + + pub fn timelines_root(&self, tenant_id: &TenantId) -> S3Target { + match self { + Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"), + Self::Safekeeper(_) => self.tenant_root(tenant_id), + } + } + + pub fn timeline_root(&self, id: &TenantTimelineId) -> S3Target { + self.timelines_root(&id.tenant_id) + .with_sub_segment(&id.timeline_id.to_string()) + } + + pub fn bucket_name(&self) -> &str { + match self { + Self::Pageserver(root) => &root.bucket_name, + Self::Safekeeper(root) => &root.bucket_name, + } + } + + pub fn delimiter(&self) -> &str { + match self { + Self::Pageserver(root) => &root.delimiter, + Self::Safekeeper(root) => &root.delimiter, + } + } +} + +pub struct BucketConfig { + pub region: String, + pub bucket: String, + + /// Use SSO if 
this is set, else rely on AWS_* environment vars + pub sso_account_id: Option, +} + +impl Display for BucketConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}/{}/{}", + self.sso_account_id.as_deref().unwrap_or(""), + self.region, + self.bucket + ) + } +} + +impl BucketConfig { + pub fn from_env() -> anyhow::Result { + let sso_account_id = env::var("SSO_ACCOUNT_ID").ok(); + let region = env::var("REGION").context("'REGION' param retrieval")?; + let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?; + + Ok(Self { + region, + bucket, + sso_account_id, + }) + } +} + +pub struct ConsoleConfig { + pub admin_api_url: Url, +} + +impl ConsoleConfig { + pub fn from_env() -> anyhow::Result { + let admin_api_url: Url = env::var("CLOUD_ADMIN_API_URL") + .context("'CLOUD_ADMIN_API_URL' param retrieval")? + .parse() + .context("'CLOUD_ADMIN_API_URL' param parsing")?; + + Ok(Self { admin_api_url }) + } +} + +pub fn get_cloud_admin_api_token_or_exit() -> String { + match env::var(CLOUD_ADMIN_API_TOKEN_ENV_VAR) { + Ok(token) => token, + Err(env::VarError::NotPresent) => { + error!("{CLOUD_ADMIN_API_TOKEN_ENV_VAR} env variable is not present"); + std::process::exit(1); + } + Err(env::VarError::NotUnicode(not_unicode_string)) => { + error!("{CLOUD_ADMIN_API_TOKEN_ENV_VAR} env variable's value is not a valid unicode string: {not_unicode_string:?}"); + std::process::exit(1); + } + } +} + +pub fn init_logging(file_name: &str) -> WorkerGuard { + let (file_writer, guard) = + tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); + + let file_logs = fmt::Layer::new() + .with_target(false) + .with_ansi(false) + .with_writer(file_writer); + let stdout_logs = fmt::Layer::new() + .with_ansi(std::io::stdout().is_terminal()) + .with_target(false) + .with_writer(std::io::stdout); + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with(file_logs) + .with(stdout_logs) + .init(); + + guard +} + +pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Client { + let credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + let chain = CredentialsProviderChain::first_try( + "env", + EnvironmentVariableCredentialsProvider::new(), + ); + + // Use SSO if we were given an account ID + match account_id { + Some(sso_account) => chain.or_else( + "sso", + SsoCredentialsProvider::builder() + .account_id(sso_account) + .role_name("PowerUserAccess") + .start_url("https://neondb.awsapps.com/start") + .region(Region::from_static("eu-central-1")) + .build(), + ), + None => chain, + } + .or_else( + // Finally try IMDS + "imds", + ImdsCredentialsProvider::builder().build(), + ) + }; + + let mut builder = Config::builder() + .region(bucket_region) + .credentials_provider(credentials_provider); + + if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") { + builder = builder.endpoint_url(endpoint) + } + + Client::from_conf(builder.build()) +} + +async fn list_objects_with_retries( + s3_client: &Client, + s3_target: &S3Target, + continuation_token: Option, +) -> anyhow::Result { + for _ in 0..MAX_RETRIES { + match s3_client + .list_objects_v2() + .bucket(&s3_target.bucket_name) + .prefix(&s3_target.prefix_in_bucket) + .delimiter(&s3_target.delimiter) + .set_continuation_token(continuation_token.clone()) + .send() + .await + { + Ok(response) => return Ok(response), + Err(e) => { + error!("list_objects_v2 query failed: {e}"); + 
tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + + anyhow::bail!("Failed to list objects {MAX_RETRIES} times") +} + +async fn download_object_with_retries( + s3_client: &Client, + bucket_name: &str, + key: &str, +) -> anyhow::Result> { + for _ in 0..MAX_RETRIES { + let mut body_buf = Vec::new(); + let response_stream = match s3_client + .get_object() + .bucket(bucket_name) + .key(key) + .send() + .await + { + Ok(response) => response, + Err(e) => { + error!("Failed to download object for key {key}: {e}"); + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + }; + + match response_stream + .body + .into_async_read() + .read_to_end(&mut body_buf) + .await + { + Ok(bytes_read) => { + tracing::info!("Downloaded {bytes_read} bytes for object object with key {key}"); + return Ok(body_buf); + } + Err(e) => { + error!("Failed to stream object body for key {key}: {e}"); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + + anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") +} diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs new file mode 100644 index 0000000000..3c60723f88 --- /dev/null +++ b/s3_scrubber/src/main.rs @@ -0,0 +1,251 @@ +use std::collections::HashMap; +use std::fmt::Display; +use std::num::NonZeroUsize; +use std::sync::Arc; + +use anyhow::Context; +use aws_sdk_s3::config::Region; +use s3_scrubber::cloud_admin_api::CloudAdminApiClient; +use s3_scrubber::delete_batch_producer::DeleteBatchProducer; +use s3_scrubber::scan_metadata::scan_metadata; +use s3_scrubber::{ + checks, get_cloud_admin_api_token_or_exit, init_logging, init_s3_client, BucketConfig, + ConsoleConfig, RootTarget, S3Deleter, S3Target, TraversingDepth, CLI_NAME, +}; +use tracing::{info, warn}; + +use clap::{Parser, Subcommand, ValueEnum}; + +#[derive(Parser)] +#[command(author, version, about, long_about = None)] +#[command(arg_required_else_help(true))] +struct Cli { + #[command(subcommand)] + command: Command, + + #[arg(short, long, default_value_t = false)] + delete: bool, +} + +#[derive(ValueEnum, Clone, Copy, Eq, PartialEq)] +enum NodeKind { + Safekeeper, + Pageserver, +} + +impl NodeKind { + fn as_str(&self) -> &'static str { + match self { + Self::Safekeeper => "safekeeper", + Self::Pageserver => "pageserver", + } + } +} + +impl Display for NodeKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[derive(Subcommand)] +enum Command { + Tidy { + #[arg(short, long)] + node_kind: NodeKind, + #[arg(short, long, default_value_t=TraversingDepth::Tenant)] + depth: TraversingDepth, + #[arg(short, long, default_value_t = false)] + skip_validation: bool, + }, + ScanMetadata {}, +} + +async fn tidy( + cli: &Cli, + bucket_config: BucketConfig, + console_config: ConsoleConfig, + node_kind: NodeKind, + depth: TraversingDepth, + skip_validation: bool, +) -> anyhow::Result<()> { + let dry_run = !cli.delete; + let file_name = if dry_run { + format!( + "{}_{}_{}__dry.log", + CLI_NAME, + node_kind, + chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") + ) + } else { + format!( + "{}_{}_{}.log", + CLI_NAME, + node_kind, + chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") + ) + }; + + let _guard = init_logging(&file_name); + + if dry_run { + info!("Dry run, not removing items for real"); + } else { + warn!("Dry run disabled, removing bucket items for real"); + } + + info!("skip_validation={skip_validation}"); + + info!("Starting extra S3 removal in {bucket_config} for node kind '{node_kind}', 
traversing depth: {depth:?}"); + + info!("Starting extra tenant S3 removal in {bucket_config} for node kind '{node_kind}'"); + let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new( + get_cloud_admin_api_token_or_exit(), + console_config.admin_api_url, + )); + + let bucket_region = Region::new(bucket_config.region); + let delimiter = "/".to_string(); + let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region)); + let s3_root = match node_kind { + NodeKind::Pageserver => RootTarget::Pageserver(S3Target { + bucket_name: bucket_config.bucket, + prefix_in_bucket: ["pageserver", "v1", "tenants", ""].join(&delimiter), + delimiter, + }), + NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { + bucket_name: bucket_config.bucket, + prefix_in_bucket: ["safekeeper", "v1", "wal", ""].join(&delimiter), + delimiter, + }), + }; + + let delete_batch_producer = DeleteBatchProducer::start( + Arc::clone(&cloud_admin_api_client), + Arc::clone(&s3_client), + s3_root.clone(), + depth, + ); + + let s3_deleter = S3Deleter::new( + dry_run, + NonZeroUsize::new(15).unwrap(), + Arc::clone(&s3_client), + delete_batch_producer.subscribe(), + s3_root.clone(), + ); + + let (deleter_task_result, batch_producer_task_result) = + tokio::join!(s3_deleter.remove_all(), delete_batch_producer.join()); + + let deletion_stats = deleter_task_result.context("s3 deletion")?; + info!( + "Deleted {} tenants ({} keys) and {} timelines ({} keys) total. Dry run: {}", + deletion_stats.deleted_tenant_keys.len(), + deletion_stats.deleted_tenant_keys.values().sum::(), + deletion_stats.deleted_timeline_keys.len(), + deletion_stats.deleted_timeline_keys.values().sum::(), + dry_run, + ); + info!( + "Total tenant deletion stats: {:?}", + deletion_stats + .deleted_tenant_keys + .into_iter() + .map(|(id, key)| (id.to_string(), key)) + .collect::>() + ); + info!( + "Total timeline deletion stats: {:?}", + deletion_stats + .deleted_timeline_keys + .into_iter() + .map(|(id, key)| (id.to_string(), key)) + .collect::>() + ); + + let batch_producer_stats = batch_producer_task_result.context("delete batch producer join")?; + info!( + "Total bucket tenants listed: {}; for {} active tenants, timelines checked: {}", + batch_producer_stats.tenants_checked(), + batch_producer_stats.active_tenants(), + batch_producer_stats.timelines_checked() + ); + + if node_kind == NodeKind::Pageserver { + info!("node_kind != pageserver, finish without performing validation step"); + return Ok(()); + } + + if skip_validation { + info!("--skip-validation is set, exiting"); + return Ok(()); + } + + info!("validating active tenants and timelines for pageserver S3 data"); + + // TODO kb real stats for validation + better stats for every place: add and print `min`, `max`, `mean` values at least + let validation_stats = checks::validate_pageserver_active_tenant_and_timelines( + s3_client, + s3_root, + cloud_admin_api_client, + batch_producer_stats, + ) + .await + .context("active tenant and timeline validation")?; + info!("Finished active tenant and timeline validation, correct timelines: {}, timeline validation errors: {}", + validation_stats.normal_timelines.len(), validation_stats.timelines_with_errors.len()); + if !validation_stats.timelines_with_errors.is_empty() { + warn!( + "Validation errors: {:#?}", + validation_stats + .timelines_with_errors + .into_iter() + .map(|(id, errors)| (id.to_string(), format!("{errors:?}"))) + .collect::>() + ); + } + + info!("Done"); + Ok(()) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { 
+ let cli = Cli::parse(); + + let bucket_config = BucketConfig::from_env()?; + + match cli.command { + Command::Tidy { + node_kind, + depth, + skip_validation, + } => { + let console_config = ConsoleConfig::from_env()?; + tidy( + &cli, + bucket_config, + console_config, + node_kind, + depth, + skip_validation, + ) + .await + } + Command::ScanMetadata {} => match scan_metadata(bucket_config).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) + } + Ok(summary) => { + println!("{}", summary.summary_string()); + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else { + Ok(()) + } + } + }, + } +} diff --git a/s3_scrubber/src/metadata_stream.rs b/s3_scrubber/src/metadata_stream.rs new file mode 100644 index 0000000000..4e500a96cf --- /dev/null +++ b/s3_scrubber/src/metadata_stream.rs @@ -0,0 +1,106 @@ +use anyhow::Context; +use async_stream::{stream, try_stream}; +use aws_sdk_s3::Client; +use tokio_stream::Stream; + +use crate::{list_objects_with_retries, RootTarget, TenantId}; +use utils::id::{TenantTimelineId, TimelineId}; + +/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 +pub fn stream_tenants<'a>( + s3_client: &'a Client, + target: &'a RootTarget, +) -> impl Stream> + 'a { + try_stream! { + let mut continuation_token = None; + loop { + let tenants_target = target.tenants_root(); + let fetch_response = + list_objects_with_retries(s3_client, tenants_target, continuation_token.clone()).await?; + + let new_entry_ids = fetch_response + .common_prefixes() + .unwrap_or_default() + .iter() + .filter_map(|prefix| prefix.prefix()) + .filter_map(|prefix| -> Option<&str> { + prefix + .strip_prefix(&tenants_target.prefix_in_bucket)? + .strip_suffix('/') + }).map(|entry_id_str| { + entry_id_str + .parse() + .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) + }); + + for i in new_entry_ids { + yield i?; + } + + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + } +} + +/// Given a TenantId, output a stream of the timelines within that tenant, discovered +/// using ListObjectsv2. The listing is done before the stream is built, so that this +/// function can be used to generate concurrency on a stream using buffer_unordered. +pub async fn stream_tenant_timelines<'a>( + s3_client: &'a Client, + target: &'a RootTarget, + tenant: TenantId, +) -> anyhow::Result> + 'a> { + let mut timeline_ids: Vec> = Vec::new(); + let mut continuation_token = None; + let timelines_target = target.timelines_root(&tenant); + + loop { + tracing::info!("Listing in {}", tenant); + let fetch_response = + list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone()) + .await; + let fetch_response = match fetch_response { + Err(e) => { + timeline_ids.push(Err(e)); + break; + } + Ok(r) => r, + }; + + let new_entry_ids = fetch_response + .common_prefixes() + .unwrap_or_default() + .iter() + .filter_map(|prefix| prefix.prefix()) + .filter_map(|prefix| -> Option<&str> { + prefix + .strip_prefix(&timelines_target.prefix_in_bucket)? + .strip_suffix('/') + }) + .map(|entry_id_str| { + entry_id_str + .parse::() + .with_context(|| format!("Incorrect entry id str: {entry_id_str}")) + }); + + for i in new_entry_ids { + timeline_ids.push(i); + } + + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + tracing::info!("Yielding for {}", tenant); + Ok(stream! 
{ + for i in timeline_ids { + let id = i?; + yield Ok(TenantTimelineId::new(tenant, id)); + } + }) +} diff --git a/s3_scrubber/src/s3_deletion.rs b/s3_scrubber/src/s3_deletion.rs new file mode 100644 index 0000000000..a03cc65c89 --- /dev/null +++ b/s3_scrubber/src/s3_deletion.rs @@ -0,0 +1,434 @@ +use std::collections::BTreeMap; +use std::num::NonZeroUsize; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::Context; +use aws_sdk_s3::types::{Delete, ObjectIdentifier}; +use aws_sdk_s3::Client; +use tokio::sync::mpsc::error::TryRecvError; +use tokio::sync::mpsc::UnboundedReceiver; +use tokio::sync::Mutex; +use tokio::task::JoinSet; +use tracing::{debug, error, info, info_span, Instrument}; + +use crate::delete_batch_producer::DeleteBatch; +use crate::{list_objects_with_retries, RootTarget, S3Target, TenantId, MAX_RETRIES}; +use utils::id::TenantTimelineId; + +pub struct S3Deleter { + dry_run: bool, + concurrent_tasks_count: NonZeroUsize, + delete_batch_receiver: Arc>>, + s3_client: Arc, + s3_target: RootTarget, +} + +impl S3Deleter { + pub fn new( + dry_run: bool, + concurrent_tasks_count: NonZeroUsize, + s3_client: Arc, + delete_batch_receiver: Arc>>, + s3_target: RootTarget, + ) -> Self { + Self { + dry_run, + concurrent_tasks_count, + delete_batch_receiver, + s3_client, + s3_target, + } + } + + pub async fn remove_all(self) -> anyhow::Result { + let mut deletion_tasks = JoinSet::new(); + for id in 0..self.concurrent_tasks_count.get() { + let closure_client = Arc::clone(&self.s3_client); + let closure_s3_target = self.s3_target.clone(); + let closure_batch_receiver = Arc::clone(&self.delete_batch_receiver); + let dry_run = self.dry_run; + deletion_tasks.spawn( + async move { + info!("Task started"); + ( + id, + async move { + let mut task_stats = DeletionStats::default(); + loop { + let mut guard = closure_batch_receiver.lock().await; + let receiver_result = guard.try_recv(); + drop(guard); + match receiver_result { + Ok(batch) => { + let stats = delete_batch( + &closure_client, + &closure_s3_target, + batch, + dry_run, + ) + .await + .context("batch deletion")?; + debug!( + "Batch processed, number of objects deleted per tenant in the batch is: {}, per timeline — {}", + stats.deleted_tenant_keys.len(), + stats.deleted_timeline_keys.len(), + ); + task_stats.merge(stats); + } + Err(TryRecvError::Empty) => { + debug!("No tasks yet, waiting"); + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + Err(TryRecvError::Disconnected) => { + info!("Task finished: sender dropped"); + return Ok(task_stats); + } + } + } + } + .in_current_span() + .await, + ) + } + .instrument(info_span!("deletion_task", %id)), + ); + } + + let mut total_stats = DeletionStats::default(); + while let Some(task_result) = deletion_tasks.join_next().await { + match task_result { + Ok((id, Ok(task_stats))) => { + info!("Task {id} completed"); + total_stats.merge(task_stats); + } + Ok((id, Err(e))) => { + error!("Task {id} failed: {e:#}"); + return Err(e); + } + Err(join_error) => anyhow::bail!("Failed to join on a task: {join_error:?}"), + } + } + + Ok(total_stats) + } +} + +/// S3 delete_objects allows up to 1000 keys to be passed in a single request. +/// Yet if you pass too many key requests, apparently S3 could return with OK and +/// actually delete nothing, so keep the number lower. 
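As an aside on the constant introduced just below: the chunking it drives can be sketched in a few lines. This is illustration only and not part of the patch; it assumes the `send_delete_request` helper and the imports already present in this file, and the helper name `delete_keys_chunked` is invented here.

// Illustration (not part of the diff): send keys to S3 in chunks of at most
// MAX_ITEMS_TO_DELETE, staying well below the 1000-key delete_objects limit.
async fn delete_keys_chunked(
    s3_client: &Client,
    bucket_name: &str,
    keys: Vec<ObjectIdentifier>,
    dry_run: bool,
) -> anyhow::Result<()> {
    for chunk in keys.chunks(MAX_ITEMS_TO_DELETE) {
        // Each request carries one bounded batch; the first failed batch aborts the run.
        send_delete_request(s3_client, bucket_name, chunk.to_vec(), dry_run).await?;
    }
    Ok(())
}

In the actual `delete_elements` function further down, the same bound is applied incrementally while objects are still being listed, rather than after collecting all keys.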
+const MAX_ITEMS_TO_DELETE: usize = 200; + +#[derive(Debug, Default)] +pub struct DeletionStats { + pub deleted_tenant_keys: BTreeMap, + pub deleted_timeline_keys: BTreeMap, +} + +impl DeletionStats { + fn merge(&mut self, other: Self) { + self.deleted_tenant_keys.extend(other.deleted_tenant_keys); + self.deleted_timeline_keys + .extend(other.deleted_timeline_keys); + } +} + +async fn delete_batch( + s3_client: &Client, + s3_target: &RootTarget, + batch: DeleteBatch, + dry_run: bool, +) -> anyhow::Result { + let (deleted_tenant_keys, deleted_timeline_keys) = tokio::join!( + delete_tenants_batch(batch.tenants, s3_target, s3_client, dry_run), + delete_timelines_batch(batch.timelines, s3_target, s3_client, dry_run), + ); + + Ok(DeletionStats { + deleted_tenant_keys: deleted_tenant_keys.context("tenant batch deletion")?, + deleted_timeline_keys: deleted_timeline_keys.context("timeline batch deletion")?, + }) +} + +async fn delete_tenants_batch( + batched_tenants: Vec, + s3_target: &RootTarget, + s3_client: &Client, + dry_run: bool, +) -> Result, anyhow::Error> { + info!("Deleting tenants batch of size {}", batched_tenants.len()); + info!("Tenant ids to remove: {batched_tenants:?}"); + let deleted_keys = delete_elements( + &batched_tenants, + s3_target, + s3_client, + dry_run, + |root_target, tenant_to_delete| root_target.tenant_root(&tenant_to_delete), + ) + .await?; + + if !dry_run { + let mut last_err = None; + for _ in 0..MAX_RETRIES { + match ensure_tenant_batch_deleted(s3_client, s3_target, &batched_tenants).await { + Ok(()) => { + last_err = None; + break; + } + Err(e) => { + error!("Failed to ensure the tenant batch is deleted: {e}"); + last_err = Some(e); + } + } + } + + if let Some(e) = last_err { + anyhow::bail!( + "Failed to ensure that tenant batch is deleted {MAX_RETRIES} times: {e:?}" + ); + } + } + + Ok(deleted_keys) +} + +async fn delete_timelines_batch( + batched_timelines: Vec, + s3_target: &RootTarget, + s3_client: &Client, + dry_run: bool, +) -> Result, anyhow::Error> { + info!( + "Deleting timelines batch of size {}", + batched_timelines.len() + ); + info!( + "Timeline ids to remove: {:?}", + batched_timelines + .iter() + .map(|id| id.to_string()) + .collect::>() + ); + let deleted_keys = delete_elements( + &batched_timelines, + s3_target, + s3_client, + dry_run, + |root_target, timeline_to_delete| root_target.timeline_root(&timeline_to_delete), + ) + .await?; + + if !dry_run { + let mut last_err = None; + for _ in 0..MAX_RETRIES { + match ensure_timeline_batch_deleted(s3_client, s3_target, &batched_timelines).await { + Ok(()) => { + last_err = None; + break; + } + Err(e) => { + error!("Failed to ensure the timelines batch is deleted: {e}"); + last_err = Some(e); + } + } + } + + if let Some(e) = last_err { + anyhow::bail!( + "Failed to ensure that timeline batch is deleted {MAX_RETRIES} times: {e:?}" + ); + } + } + Ok(deleted_keys) +} + +async fn delete_elements( + batched_ids: &Vec, + s3_target: &RootTarget, + s3_client: &Client, + dry_run: bool, + target_producer: impl Fn(&RootTarget, I) -> S3Target, +) -> Result, anyhow::Error> +where + I: Ord + PartialOrd + Copy, +{ + let mut deleted_keys = BTreeMap::new(); + let mut object_ids_to_delete = Vec::with_capacity(MAX_ITEMS_TO_DELETE); + for &id_to_delete in batched_ids { + let mut continuation_token = None; + let mut subtargets = vec![target_producer(s3_target, id_to_delete)]; + while let Some(current_target) = subtargets.pop() { + loop { + let fetch_response = list_objects_with_retries( + s3_client, + ¤t_target, + 
continuation_token.clone(), + ) + .await?; + + for object_id in fetch_response + .contents() + .unwrap_or_default() + .iter() + .filter_map(|object| object.key()) + .map(|key| ObjectIdentifier::builder().key(key).build()) + { + if object_ids_to_delete.len() >= MAX_ITEMS_TO_DELETE { + let object_ids_for_request = std::mem::replace( + &mut object_ids_to_delete, + Vec::with_capacity(MAX_ITEMS_TO_DELETE), + ); + send_delete_request( + s3_client, + s3_target.bucket_name(), + object_ids_for_request, + dry_run, + ) + .await + .context("object ids deletion")?; + } + + object_ids_to_delete.push(object_id); + *deleted_keys.entry(id_to_delete).or_default() += 1; + } + + subtargets.extend( + fetch_response + .common_prefixes() + .unwrap_or_default() + .iter() + .filter_map(|common_prefix| common_prefix.prefix()) + .map(|prefix| { + let mut new_target = current_target.clone(); + new_target.prefix_in_bucket = prefix.to_string(); + new_target + }), + ); + + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + } + } + if !object_ids_to_delete.is_empty() { + info!("Removing last objects of the batch"); + send_delete_request( + s3_client, + s3_target.bucket_name(), + object_ids_to_delete, + dry_run, + ) + .await + .context("Last object ids deletion")?; + } + Ok(deleted_keys) +} + +pub async fn send_delete_request( + s3_client: &Client, + bucket_name: &str, + ids: Vec, + dry_run: bool, +) -> anyhow::Result<()> { + info!("Removing {} object ids from S3", ids.len()); + info!("Object ids to remove: {ids:?}"); + let delete_request = s3_client + .delete_objects() + .bucket(bucket_name) + .delete(Delete::builder().set_objects(Some(ids)).build()); + if dry_run { + info!("Dry run, skipping the actual removal"); + Ok(()) + } else { + let original_request = delete_request.clone(); + + for _ in 0..MAX_RETRIES { + match delete_request + .clone() + .send() + .await + .context("delete request processing") + { + Ok(delete_response) => { + info!("Delete response: {delete_response:?}"); + match delete_response.errors() { + Some(delete_errors) => { + error!("Delete request returned errors: {delete_errors:?}"); + tokio::time::sleep(Duration::from_secs(1)).await; + } + None => { + info!("Successfully removed an object batch from S3"); + return Ok(()); + } + } + } + Err(e) => { + error!("Failed to send a delete request: {e:#}"); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + + error!("Failed to do deletion, request: {original_request:?}"); + anyhow::bail!("Failed to run deletion request {MAX_RETRIES} times"); + } +} + +async fn ensure_tenant_batch_deleted( + s3_client: &Client, + s3_target: &RootTarget, + batch: &[TenantId], +) -> anyhow::Result<()> { + let mut not_deleted_tenants = Vec::with_capacity(batch.len()); + + for &tenant_id in batch { + let fetch_response = + list_objects_with_retries(s3_client, &s3_target.tenant_root(&tenant_id), None).await?; + + if fetch_response.is_truncated() + || fetch_response.contents().is_some() + || fetch_response.common_prefixes().is_some() + { + error!( + "Tenant {tenant_id} should be deleted, but its list response is {fetch_response:?}" + ); + not_deleted_tenants.push(tenant_id); + } + } + + anyhow::ensure!( + not_deleted_tenants.is_empty(), + "Failed to delete all tenants in a batch. Tenants {not_deleted_tenants:?} should be deleted." 
+ ); + Ok(()) +} + +async fn ensure_timeline_batch_deleted( + s3_client: &Client, + s3_target: &RootTarget, + batch: &[TenantTimelineId], +) -> anyhow::Result<()> { + let mut not_deleted_timelines = Vec::with_capacity(batch.len()); + + for &id in batch { + let fetch_response = + list_objects_with_retries(s3_client, &s3_target.timeline_root(&id), None).await?; + + if fetch_response.is_truncated() + || fetch_response.contents().is_some() + || fetch_response.common_prefixes().is_some() + { + error!("Timeline {id} should be deleted, but its list response is {fetch_response:?}"); + not_deleted_timelines.push(id); + } + } + + anyhow::ensure!( + not_deleted_timelines.is_empty(), + "Failed to delete all timelines in a batch" + ); + Ok(()) +} diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs new file mode 100644 index 0000000000..f75d7645a8 --- /dev/null +++ b/s3_scrubber/src/scan_metadata.rs @@ -0,0 +1,234 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use crate::checks::{ + branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData, + TimelineAnalysis, +}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{init_logging, init_s3_client, BucketConfig, RootTarget, S3Target, CLI_NAME}; +use aws_sdk_s3::Client; +use aws_types::region::Region; +use futures_util::{pin_mut, StreamExt, TryStreamExt}; +use histogram::Histogram; +use pageserver::tenant::{IndexPart, TENANTS_SEGMENT_NAME}; +use utils::id::TenantTimelineId; + +pub struct MetadataSummary { + count: usize, + with_errors: HashSet, + with_warnings: HashSet, + with_garbage: HashSet, + indices_by_version: HashMap, + + layer_count: MinMaxHisto, + timeline_size_bytes: MinMaxHisto, + layer_size_bytes: MinMaxHisto, +} + +/// A histogram plus minimum and maximum tracking +struct MinMaxHisto { + histo: Histogram, + min: u64, + max: u64, +} + +impl MinMaxHisto { + fn new() -> Self { + Self { + histo: histogram::Histogram::builder() + .build() + .expect("Bad histogram params"), + min: u64::MAX, + max: 0, + } + } + + fn sample(&mut self, v: u64) -> Result<(), histogram::Error> { + self.min = std::cmp::min(self.min, v); + self.max = std::cmp::max(self.max, v); + let r = self.histo.increment(v, 1); + + if r.is_err() { + tracing::warn!("Bad histogram sample: {v}"); + } + + r + } + + fn oneline(&self) -> String { + let percentiles = match self.histo.percentiles(&[1.0, 10.0, 50.0, 90.0, 99.0]) { + Ok(p) => p, + Err(e) => return format!("No data: {}", e), + }; + + let percentiles: Vec = percentiles + .iter() + .map(|p| p.bucket().low() + p.bucket().high() / 2) + .collect(); + + format!( + "min {}, 1% {}, 10% {}, 50% {}, 90% {}, 99% {}, max {}", + self.min, + percentiles[0], + percentiles[1], + percentiles[2], + percentiles[3], + percentiles[4], + self.max, + ) + } +} + +impl MetadataSummary { + fn new() -> Self { + Self { + count: 0, + with_errors: HashSet::new(), + with_warnings: HashSet::new(), + with_garbage: HashSet::new(), + indices_by_version: HashMap::new(), + layer_count: MinMaxHisto::new(), + timeline_size_bytes: MinMaxHisto::new(), + layer_size_bytes: MinMaxHisto::new(), + } + } + + fn update_histograms(&mut self, index_part: &IndexPart) -> Result<(), histogram::Error> { + self.layer_count + .sample(index_part.layer_metadata.len() as u64)?; + let mut total_size: u64 = 0; + for meta in index_part.layer_metadata.values() { + total_size += meta.file_size; + self.layer_size_bytes.sample(meta.file_size)?; + } + 
self.timeline_size_bytes.sample(total_size)?; + + Ok(()) + } + + fn update_data(&mut self, data: &S3TimelineBlobData) { + self.count += 1; + if let BlobDataParseResult::Parsed { + index_part, + s3_layers: _, + } = &data.blob_data + { + *self + .indices_by_version + .entry(index_part.get_version()) + .or_insert(0) += 1; + + if let Err(e) = self.update_histograms(index_part) { + // Value out of range? Warn that the results are untrustworthy + tracing::warn!( + "Error updating histograms, summary stats may be wrong: {}", + e + ); + } + } + } + + fn update_analysis(&mut self, id: &TenantTimelineId, analysis: &TimelineAnalysis) { + if !analysis.errors.is_empty() { + self.with_errors.insert(*id); + } + + if !analysis.warnings.is_empty() { + self.with_warnings.insert(*id); + } + } + + /// Long-form output for printing at end of a scan + pub fn summary_string(&self) -> String { + let version_summary: String = itertools::join( + self.indices_by_version + .iter() + .map(|(k, v)| format!("{k}: {v}")), + ", ", + ); + + format!( + "Timelines: {0} +With errors: {1} +With warnings: {2} +With garbage: {3} +Index versions: {version_summary} +Timeline size bytes: {4} +Layer size bytes: {5} +Timeline layer count: {6} +", + self.count, + self.with_errors.len(), + self.with_warnings.len(), + self.with_garbage.len(), + self.timeline_size_bytes.oneline(), + self.layer_size_bytes.oneline(), + self.layer_count.oneline(), + ) + } + + pub fn is_fatal(&self) -> bool { + !self.with_errors.is_empty() + } +} + +/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. +pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result { + let file_name = format!( + "{}_scan_metadata_{}_{}.log", + CLI_NAME, + bucket_config.bucket, + chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") + ); + + let _guard = init_logging(&file_name); + + let s3_client = Arc::new(init_s3_client( + bucket_config.sso_account_id, + Region::new(bucket_config.region), + )); + let delimiter = "/"; + let target = RootTarget::Pageserver(S3Target { + bucket_name: bucket_config.bucket.to_string(), + prefix_in_bucket: ["pageserver", "v1", TENANTS_SEGMENT_NAME, ""].join(delimiter), + delimiter: delimiter.to_string(), + }); + + let tenants = stream_tenants(&s3_client, &target); + + // How many tenants to process in parallel. We need to be mindful of pageservers + // accessing the same per tenant prefixes, so use a lower setting than pageservers. 
+ const CONCURRENCY: usize = 32; + + // Generate a stream of TenantTimelineId + let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t)); + let timelines = timelines.try_buffer_unordered(CONCURRENCY); + let timelines = timelines.try_flatten(); + + // Generate a stream of S3TimelineBlobData + async fn report_on_timeline( + s3_client: &Client, + target: &RootTarget, + ttid: TenantTimelineId, + ) -> anyhow::Result<(TenantTimelineId, S3TimelineBlobData)> { + let data = list_timeline_blobs(s3_client, ttid, target).await?; + Ok((ttid, data)) + } + let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid)); + let timelines = timelines.try_buffer_unordered(CONCURRENCY); + + let mut summary = MetadataSummary::new(); + pin_mut!(timelines); + while let Some(i) = timelines.next().await { + let (ttid, data) = i?; + summary.update_data(&data); + + let analysis = + branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data)).await; + + summary.update_analysis(&ttid, &analysis); + } + + Ok(summary) +} diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index d728312de4..4ee66ddc8e 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -11,7 +11,7 @@ use anyhow::{bail, Context, Result}; use bytes::Bytes; use futures::future::BoxFuture; use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; -use postgres_ffi::{XLogSegNo, PG_TLI}; +use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; use std::cmp::{max, min}; use std::io::{self, SeekFrom}; @@ -138,19 +138,13 @@ impl PhysicalStorage { let write_lsn = if state.commit_lsn == Lsn(0) { Lsn(0) } else { - match state.server.pg_version / 10000 { - 14 => postgres_ffi::v14::xlog_utils::find_end_of_wal( - &timeline_dir, - wal_seg_size, - state.commit_lsn, - )?, - 15 => postgres_ffi::v15::xlog_utils::find_end_of_wal( - &timeline_dir, - wal_seg_size, - state.commit_lsn, - )?, - _ => bail!("unsupported postgres version: {}", state.server.pg_version), - } + let version = state.server.pg_version / 10000; + + dispatch_pgversion!( + version, + pgv::xlog_utils::find_end_of_wal(&timeline_dir, wal_seg_size, state.commit_lsn,)?, + bail!("unsupported postgres version: {}", version) + ) }; // TODO: do we really know that write_lsn is fully flushed to disk? diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js index 1410b8a0ca..89befda71f 100755 --- a/scripts/comment-test-report.js +++ b/scripts/comment-test-report.js @@ -18,6 +18,10 @@ // reportUrl: "...", // reportJsonUrl: "...", // }, +// coverage: { +// coverageUrl: "...", +// summaryJsonUrl: "...", +// } // }) // @@ -135,7 +139,7 @@ const reportSummary = async (params) => { // Print test resuls from the newest to the oldest Postgres version for release and debug builds. 
     for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
         if (Object.keys(failedTests[pgVersion]).length > 0) {
-            summary += `#### Failures on Posgres ${pgVersion}\n\n`
+            summary += `#### Failures on Postgres ${pgVersion}\n\n`
             for (const [testName, tests] of Object.entries(failedTests[pgVersion])) {
                 const links = []
                 for (const test of tests) {
@@ -183,11 +187,40 @@ const reportSummary = async (params) => {
     return summary
 }
-module.exports = async ({ github, context, fetch, report }) => {
-    // Marker to find the comment in the subsequent runs
-    const startMarker = ``
+const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => {
+    let summary = `\n### Code coverage ([full report](${coverageUrl}))\n`
+
+    const coverage = await (await fetch(summaryJsonUrl)).json()
+    for (const covType of Object.keys(coverage).sort()) {
+        if (!coverage.hasOwnProperty(covType)) {
+            continue
+        }
+
+        summary += `- \`${covType}s\`: \`${coverage[covType]["_summary"]}\`\n`
+    }
+
+    summary += `\n___\n`
+
+    return summary
+}
+
+module.exports = async ({ github, context, fetch, report, coverage }) => {
     // If we run the script in the PR or in the branch (main/release/...)
     const isPullRequest = !!context.payload.pull_request
+    // Which PR to comment (for ci-run/pr-* it will comment the parent PR, not the ci-run/pr-* PR)
+    let prToComment
+    if (isPullRequest) {
+        const branchName = context.payload.pull_request.head.ref.replace(/^refs\/heads\//, "")
+        const match = branchName.match(/ci-run\/pr-(?<prNumber>\d+)/)?.groups
+        if (match) {
+            const { prNumber } = match
+            prToComment = parseInt(prNumber, 10)
+        } else {
+            prToComment = context.payload.number
+        }
+    }
+    // Marker to find the comment in the subsequent runs
+    const startMarker = ``
     // Latest commit in PR or in the branch
     const commitSha = isPullRequest ?
context.payload.pull_request.head.sha : context.sha // Let users know that the comment is updated automatically @@ -204,7 +237,6 @@ module.exports = async ({ github, context, fetch, report }) => { } const {reportUrl, reportJsonUrl} = report - if (reportUrl && reportJsonUrl) { try { const parsed = await parseReportJson({ reportJsonUrl, fetch }) @@ -223,6 +255,22 @@ module.exports = async ({ github, context, fetch, report }) => { } else { commentBody += `#### No tests were run or test report is not available\n` } + + const { coverageUrl, summaryJsonUrl } = coverage + if (coverageUrl && summaryJsonUrl) { + try { + commentBody += await parseCoverageSummary({ summaryJsonUrl, coverageUrl, fetch }) + } catch (error) { + commentBody += `### [full report](${coverageUrl})\n___\n` + commentBody += `#### Failed to create a coverage summary for the test run: \n` + commentBody += "```\n" + commentBody += `${error.stack}\n` + commentBody += "```\n" + } + } else { + commentBody += `\n#### Test coverage report is not available\n` + } + commentBody += autoupdateNotice let createCommentFn, listCommentsFn, updateCommentFn, issueNumberOrSha @@ -231,7 +279,7 @@ module.exports = async ({ github, context, fetch, report }) => { listCommentsFn = github.rest.issues.listComments updateCommentFn = github.rest.issues.updateComment issueNumberOrSha = { - issue_number: context.payload.number, + issue_number: prToComment, } } else { updateCommentFn = github.rest.repos.updateCommitComment diff --git a/scripts/download_basebackup.py b/scripts/download_basebackup.py new file mode 100755 index 0000000000..1f84e41fef --- /dev/null +++ b/scripts/download_basebackup.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# +# Script to download the basebackup from a pageserver to a tar file. +# +# This can be useful in disaster recovery. 
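For context, a typical invocation of this script would look something like the line below. Every value shown is a placeholder chosen for illustration; only the flag names come from the argument parser defined in the script itself.

    python3 scripts/download_basebackup.py \
        --pageserver-connstr "postgresql://localhost:6400" \
        --tenant-id <tenant_id> \
        --timeline-id <timeline_id> \
        --lsn 0/169C3C8 \
        --output basebackup.tar

The resulting file is the raw basebackup tar stream copied straight from the pageserver's `basebackup` command.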
+# +import argparse + +import psycopg2 +from psycopg2.extensions import connection as PgConnection + + +def main(args: argparse.Namespace): + pageserver_connstr = args.pageserver_connstr + tenant_id = args.tenant + timeline_id = args.timeline + lsn = args.lsn + output_path = args.output_path + + psconn: PgConnection = psycopg2.connect(pageserver_connstr) + psconn.autocommit = True + + output = open(output_path, "wb") + + with psconn.cursor() as pscur: + pscur.copy_expert(f"basebackup {tenant_id} {timeline_id} {lsn}", output) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--tenant-id", + dest="tenant", + required=True, + help="Id of the tenant", + ) + parser.add_argument( + "--timeline-id", + dest="timeline", + required=True, + help="Id of the timeline", + ) + parser.add_argument( + "--lsn", + dest="lsn", + required=True, + help="LSN to take the basebackup at", + ) + parser.add_argument( + "--pageserver-connstr", + dest="pageserver_connstr", + required=True, + help="libpq connection string of the pageserver", + ) + parser.add_argument( + "--output", + dest="output_path", + required=True, + help="output path to write the basebackup to", + ) + args = parser.parse_args() + main(args) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 1c36c1ed02..200c9c3740 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,6 +1,7 @@ pytest_plugins = ( "fixtures.pg_version", "fixtures.parametrize", + "fixtures.httpserver", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index a10ef70aa2..1254c4e779 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -14,6 +14,7 @@ from fixtures.neon_fixtures import ( PgProtocol, RemotePostgres, VanillaPostgres, + wait_for_last_flush_lsn, ) from fixtures.pg_stats import PgStatTable @@ -129,6 +130,7 @@ class NeonCompare(PgCompare): return self._pg_bin def flush(self): + wait_for_last_flush_lsn(self.env, self._pg, self.tenant, self.timeline) self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline) self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0) diff --git a/test_runner/fixtures/httpserver.py b/test_runner/fixtures/httpserver.py new file mode 100644 index 0000000000..a321d59266 --- /dev/null +++ b/test_runner/fixtures/httpserver.py @@ -0,0 +1,45 @@ +from typing import Tuple + +import pytest +from pytest_httpserver import HTTPServer + +# TODO: mypy fails with: +# Module "fixtures.neon_fixtures" does not explicitly export attribute "PortDistributor" [attr-defined] +# from fixtures.neon_fixtures import PortDistributor + +# compared to the fixtures from pytest_httpserver with same names, these are +# always function scoped, so you can check and stop the server in tests. 
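To show how the function-scoped fixtures below are intended to be used, here is a small hypothetical test. The test name, route, and payload are invented for this example; `expect_request`, `respond_with_json`, and `url_for` are the standard `pytest_httpserver` API, and `requests` stands in for whatever component would normally call the server.

import requests

def test_reports_to_http_server(httpserver):
    # Register a canned response on this test's private server instance.
    httpserver.expect_request("/callback").respond_with_json({"ok": True})

    # Point the code under test at httpserver.url_for("/callback"); here we call it directly.
    assert requests.get(httpserver.url_for("/callback")).json() == {"ok": True}

    # Because the fixture is function-scoped, the server can be inspected and stopped within the test.
    httpserver.stop()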
+ + +@pytest.fixture(scope="function") +def httpserver_ssl_context(): + return None + + +@pytest.fixture(scope="function") +def make_httpserver(httpserver_listen_address, httpserver_ssl_context): + host, port = httpserver_listen_address + if not host: + host = HTTPServer.DEFAULT_LISTEN_HOST + if not port: + port = HTTPServer.DEFAULT_LISTEN_PORT + + server = HTTPServer(host=host, port=port, ssl_context=httpserver_ssl_context) + server.start() + yield server + server.clear() + if server.is_running(): + server.stop() + + +@pytest.fixture(scope="function") +def httpserver(make_httpserver): + server = make_httpserver + yield server + server.clear() + + +@pytest.fixture(scope="function") +def httpserver_listen_address(port_distributor) -> Tuple[str, int]: + port = port_distributor.get_port() + return ("localhost", port) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b2cd0fe968..0667403ba3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -24,7 +24,6 @@ from urllib.parse import urlparse import asyncpg import backoff -import boto3 import jwt import psycopg2 import pytest @@ -32,7 +31,6 @@ import requests from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest -from mypy_boto3_s3 import S3Client # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -47,11 +45,10 @@ from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( - LocalFsStorage, MockS3Server, RemoteStorage, RemoteStorageKind, - RemoteStorageUsers, + RemoteStorageUser, S3Storage, remote_storage_to_toml_inline_table, ) @@ -226,12 +223,6 @@ def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistrib return PortDistributor(base_port=worker_base_port, port_number=worker_port_num) -@pytest.fixture(scope="session") -def httpserver_listen_address(port_distributor: PortDistributor): - port = port_distributor.get_port() - return ("localhost", port) - - @pytest.fixture(scope="function") def default_broker( port_distributor: PortDistributor, @@ -414,10 +405,12 @@ class NeonEnvBuilder: neon_binpath: Path, pg_distrib_dir: Path, pg_version: PgVersion, - remote_storage: Optional[RemoteStorage] = None, - remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, + test_name: str, + test_output_dir: Path, + pageserver_remote_storage: Optional[RemoteStorage] = None, pageserver_config_override: Optional[str] = None, num_safekeepers: int = 1, + num_pageservers: int = 1, # Use non-standard SK ids to check for various parsing bugs safekeepers_id_start: int = 0, # fsync is disabled by default to make the tests go faster @@ -432,21 +425,25 @@ class NeonEnvBuilder: self.repo_dir = repo_dir self.rust_log_override = rust_log_override self.port_distributor = port_distributor - self.remote_storage = remote_storage + + # Pageserver remote storage + self.pageserver_remote_storage = pageserver_remote_storage + # Extensions remote storage self.ext_remote_storage: Optional[S3Storage] = None - self.remote_storage_client: Optional[S3Client] = None - self.remote_storage_users = remote_storage_users + # Safekeepers remote storage + self.sk_remote_storage: Optional[RemoteStorage] = None + self.broker = broker self.run_id = run_id self.mock_s3_server: MockS3Server = mock_s3_server 
self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers + self.num_pageservers = num_pageservers self.safekeepers_id_start = safekeepers_id_start self.safekeepers_enable_fsync = safekeepers_enable_fsync self.auth_enabled = auth_enabled self.default_branch_name = default_branch_name self.env: Optional[NeonEnv] = None - self.remote_storage_prefix: Optional[str] = None self.keep_remote_storage_contents: bool = True self.neon_binpath = neon_binpath self.pg_distrib_dir = pg_distrib_dir @@ -454,6 +451,14 @@ class NeonEnvBuilder: self.preserve_database_files = preserve_database_files self.initial_tenant = initial_tenant or TenantId.generate() self.initial_timeline = initial_timeline or TimelineId.generate() + self.enable_generations = False + self.scrub_on_exit = False + self.test_output_dir = test_output_dir + + assert test_name.startswith( + "test_" + ), "Unexpectedly instantiated from outside a test function" + self.test_name = test_name def init_configs(self) -> NeonEnv: # Cannot create more than one environment from one builder @@ -483,140 +488,83 @@ class NeonEnvBuilder: return env - def enable_remote_storage( + def enable_scrub_on_exit(self): + """ + Call this if you would like the fixture to automatically run + s3_scrubber at the end of the test, as a bidirectional test + that the scrubber is working properly, and that the code within + the test didn't produce any invalid remote state. + """ + + if not isinstance(self.pageserver_remote_storage, S3Storage): + # The scrubber can't talk to e.g. LocalFS -- it needs + # an HTTP endpoint (mock is fine) to connect to. + raise RuntimeError( + "Cannot scrub with remote_storage={self.pageserver_remote_storage}, require an S3 endpoint" + ) + + self.scrub_on_exit = True + + def enable_pageserver_remote_storage( self, remote_storage_kind: RemoteStorageKind, - test_name: str, - force_enable: bool = True, - enable_remote_extensions: bool = False, ): - if remote_storage_kind == RemoteStorageKind.NOOP: - return - elif remote_storage_kind == RemoteStorageKind.LOCAL_FS: - self.enable_local_fs_remote_storage(force_enable=force_enable) - elif remote_storage_kind == RemoteStorageKind.MOCK_S3: - self.enable_mock_s3_remote_storage( - bucket_name=test_name, - force_enable=force_enable, - enable_remote_extensions=enable_remote_extensions, - ) - elif remote_storage_kind == RemoteStorageKind.REAL_S3: - self.enable_real_s3_remote_storage( - test_name=test_name, - force_enable=force_enable, - enable_remote_extensions=enable_remote_extensions, - ) - else: - raise RuntimeError(f"Unknown storage type: {remote_storage_kind}") + assert self.pageserver_remote_storage is None, "remote storage is enabled already" + ret = self._configure_and_create_remote_storage( + remote_storage_kind, RemoteStorageUser.PAGESERVER + ) + self.pageserver_remote_storage = ret - self.remote_storage_kind = remote_storage_kind + def enable_extensions_remote_storage(self, kind: RemoteStorageKind): + assert self.ext_remote_storage is None, "already configured extensions remote storage" - def enable_local_fs_remote_storage(self, force_enable: bool = True): - """ - Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path. - Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. 
- """ - assert force_enable or self.remote_storage is None, "remote storage is enabled already" - self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage")) + # there is an assumption that REAL_S3 for extensions is never + # cleaned up these are also special in that they have a hardcoded + # bucket and region, which is most likely the same as our normal + ext = self._configure_and_create_remote_storage( + kind, + RemoteStorageUser.EXTENSIONS, + bucket_name="neon-dev-extensions-eu-central-1", + bucket_region="eu-central-1", + ) + assert isinstance( + ext, S3Storage + ), "unsure why, but only MOCK_S3 and REAL_S3 are currently supported for extensions" + ext.cleanup = False + self.ext_remote_storage = ext - def enable_mock_s3_remote_storage( + def enable_safekeeper_remote_storage(self, kind: RemoteStorageKind): + assert self.sk_remote_storage is None, "sk_remote_storage already configured" + + self.sk_remote_storage = self._configure_and_create_remote_storage( + kind, RemoteStorageUser.SAFEKEEPER + ) + + def _configure_and_create_remote_storage( self, - bucket_name: str, - force_enable: bool = True, - enable_remote_extensions: bool = False, - ): - """ - Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already. - Starts up the mock server, if that does not run yet. - Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. - - Also creates the bucket for extensions, self.ext_remote_storage bucket - """ - assert force_enable or self.remote_storage is None, "remote storage is enabled already" - mock_endpoint = self.mock_s3_server.endpoint() - mock_region = self.mock_s3_server.region() - - self.remote_storage_client = boto3.client( - "s3", - endpoint_url=mock_endpoint, - region_name=mock_region, - aws_access_key_id=self.mock_s3_server.access_key(), - aws_secret_access_key=self.mock_s3_server.secret_key(), - ) - self.remote_storage_client.create_bucket(Bucket=bucket_name) - - self.remote_storage = S3Storage( + kind: RemoteStorageKind, + user: RemoteStorageUser, + bucket_name: Optional[str] = None, + bucket_region: Optional[str] = None, + ) -> Optional[RemoteStorage]: + ret = kind.configure( + self.repo_dir, + self.mock_s3_server, + str(self.run_id), + self.test_name, + user, bucket_name=bucket_name, - endpoint=mock_endpoint, - bucket_region=mock_region, - access_key=self.mock_s3_server.access_key(), - secret_key=self.mock_s3_server.secret_key(), - prefix_in_bucket="pageserver", + bucket_region=bucket_region, ) - if enable_remote_extensions: - self.ext_remote_storage = S3Storage( - bucket_name=bucket_name, - endpoint=mock_endpoint, - bucket_region=mock_region, - access_key=self.mock_s3_server.access_key(), - secret_key=self.mock_s3_server.secret_key(), - prefix_in_bucket="ext", - ) + if kind == RemoteStorageKind.MOCK_S3: + assert isinstance(ret, S3Storage) + ret.client.create_bucket(Bucket=ret.bucket_name) + elif kind == RemoteStorageKind.REAL_S3: + assert isinstance(ret, S3Storage) + assert ret.cleanup, "we should not leave files in REAL_S3" - def enable_real_s3_remote_storage( - self, - test_name: str, - force_enable: bool = True, - enable_remote_extensions: bool = False, - ): - """ - Sets up configuration to use real s3 endpoint without mock server - """ - assert force_enable or self.remote_storage is None, "remote storage is enabled already" - - access_key = os.getenv("AWS_ACCESS_KEY_ID") - assert access_key, "no aws access key provided" - secret_key = 
os.getenv("AWS_SECRET_ACCESS_KEY") - assert secret_key, "no aws access key provided" - - # session token is needed for local runs with sso auth - session_token = os.getenv("AWS_SESSION_TOKEN") - - bucket_name = os.getenv("REMOTE_STORAGE_S3_BUCKET") - assert bucket_name, "no remote storage bucket name provided" - region = os.getenv("REMOTE_STORAGE_S3_REGION") - assert region, "no remote storage region provided" - - # do not leave data in real s3 - self.keep_remote_storage_contents = False - - # construct a prefix inside bucket for the particular test case and test run - self.remote_storage_prefix = f"{self.run_id}/{test_name}" - - self.remote_storage_client = boto3.client( - "s3", - region_name=region, - aws_access_key_id=access_key, - aws_secret_access_key=secret_key, - aws_session_token=session_token, - ) - self.remote_storage = S3Storage( - bucket_name=bucket_name, - bucket_region=region, - access_key=access_key, - secret_key=secret_key, - prefix_in_bucket=self.remote_storage_prefix, - ) - - if enable_remote_extensions: - self.ext_remote_storage = S3Storage( - bucket_name="neon-dev-extensions-eu-central-1", - bucket_region="eu-central-1", - access_key=access_key, - secret_key=secret_key, - prefix_in_bucket=None, - ) + return ret def cleanup_local_storage(self): if self.preserve_database_files: @@ -641,60 +589,10 @@ class NeonEnvBuilder: directory_to_clean.rmdir() def cleanup_remote_storage(self): - # here wee check for true remote storage, no the local one - # local cleanup is not needed after test because in ci all env will be destroyed anyway - if self.remote_storage_prefix is None: - log.info("no remote storage was set up, skipping cleanup") - return - - # Making mypy happy with allowing only `S3Storage` further. - # `self.remote_storage_prefix` is coupled with `S3Storage` storage type, - # so this line effectively a no-op - assert isinstance(self.remote_storage, S3Storage) - assert self.remote_storage_client is not None - - if self.keep_remote_storage_contents: - log.info("keep_remote_storage_contents skipping remote storage cleanup") - return - - log.info( - "removing data from test s3 bucket %s by prefix %s", - self.remote_storage.bucket_name, - self.remote_storage_prefix, - ) - paginator = self.remote_storage_client.get_paginator("list_objects_v2") - pages = paginator.paginate( - Bucket=self.remote_storage.bucket_name, - Prefix=self.remote_storage_prefix, - ) - - # Using Any because DeleteTypeDef (from boto3-stubs) doesn't fit our case - objects_to_delete: Any = {"Objects": []} - cnt = 0 - for item in pages.search("Contents"): - # weirdly when nothing is found it returns [None] - if item is None: - break - - objects_to_delete["Objects"].append({"Key": item["Key"]}) - - # flush once aws limit reached - if len(objects_to_delete["Objects"]) >= 1000: - self.remote_storage_client.delete_objects( - Bucket=self.remote_storage.bucket_name, - Delete=objects_to_delete, - ) - objects_to_delete = {"Objects": []} - cnt += 1 - - # flush rest - if len(objects_to_delete["Objects"]): - self.remote_storage_client.delete_objects( - Bucket=self.remote_storage.bucket_name, - Delete=objects_to_delete, - ) - - log.info(f"deleted {cnt} objects from remote storage") + # extensions are currently not cleaned up, disabled when creating + for x in [self.pageserver_remote_storage, self.ext_remote_storage, self.sk_remote_storage]: + if isinstance(x, S3Storage): + x.do_cleanup() def __enter__(self) -> "NeonEnvBuilder": return self @@ -711,14 +609,28 @@ class NeonEnvBuilder: self.env.endpoints.stop_all() for sk 
in self.env.safekeepers: sk.stop(immediate=True) - self.env.pageserver.stop(immediate=True) + + for pageserver in self.env.pageservers: + pageserver.stop(immediate=True) + + if self.env.attachment_service is not None: + self.env.attachment_service.stop(immediate=True) cleanup_error = None + + if self.scrub_on_exit: + try: + S3Scrubber(self.test_output_dir, self).scan_metadata() + except Exception as e: + log.error(f"Error during remote storage scrub: {e}") + cleanup_error = e + try: self.cleanup_remote_storage() except Exception as e: log.error(f"Error during remote storage cleanup: {e}") - cleanup_error = e + if cleanup_error is not None: + cleanup_error = e try: self.cleanup_local_storage() @@ -730,7 +642,8 @@ class NeonEnvBuilder: if cleanup_error is not None: raise cleanup_error - self.env.pageserver.assert_no_errors() + for pageserver in self.env.pageservers: + pageserver.assert_no_errors() class NeonEnv: @@ -750,8 +663,7 @@ class NeonEnv: postgres - A factory object for creating postgres compute nodes. - pageserver - An object that contains functions for manipulating and - connecting to the pageserver + pageservers - An array containing objects representing the pageservers safekeepers - An array containing objects representing the safekeepers @@ -766,6 +678,8 @@ class NeonEnv: the tenant id """ + BASE_PAGESERVER_ID = 1 + def __init__(self, config: NeonEnvBuilder): self.repo_dir = config.repo_dir self.rust_log_override = config.rust_log_override @@ -774,21 +688,34 @@ class NeonEnv: self.neon_cli = NeonCli(env=self) self.endpoints = EndpointFactory(self) self.safekeepers: List[Safekeeper] = [] + self.pageservers: List[NeonPageserver] = [] self.broker = config.broker - self.remote_storage = config.remote_storage - self.remote_storage_users = config.remote_storage_users + self.pageserver_remote_storage = config.pageserver_remote_storage + self.ext_remote_storage = config.ext_remote_storage + self.safekeepers_remote_storage = config.sk_remote_storage self.pg_version = config.pg_version + # Binary path for pageserver, safekeeper, etc self.neon_binpath = config.neon_binpath + # Binary path for neon_local test-specific binaries: may be overridden + # after construction for compat testing + self.neon_local_binpath = config.neon_binpath self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 - self.remote_storage_client = config.remote_storage_client - self.ext_remote_storage = config.ext_remote_storage + self.pageserver_config_override = config.pageserver_config_override # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline + if config.enable_generations: + attachment_service_port = self.port_distributor.get_port() + self.control_plane_api: Optional[str] = f"http://127.0.0.1:{attachment_service_port}" + self.attachment_service: Optional[NeonAttachmentService] = NeonAttachmentService(self) + else: + self.control_plane_api = None + self.attachment_service = None + # Create a config file corresponding to the options toml = textwrap.dedent( f""" @@ -796,6 +723,13 @@ class NeonEnv: """ ) + if self.control_plane_api is not None: + toml += textwrap.dedent( + f""" + control_plane_api = '{self.control_plane_api}' + """ + ) + toml += textwrap.dedent( f""" [broker] @@ -804,29 +738,36 @@ class NeonEnv: ) # Create config for pageserver - pageserver_port = PageserverPort( - pg=self.port_distributor.get_port(), - http=self.port_distributor.get_port(), - ) http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" + for ps_id in range( + self.BASE_PAGESERVER_ID, self.BASE_PAGESERVER_ID + config.num_pageservers + ): + pageserver_port = PageserverPort( + pg=self.port_distributor.get_port(), + http=self.port_distributor.get_port(), + ) - toml += textwrap.dedent( - f""" - [pageserver] - id=1 - listen_pg_addr = 'localhost:{pageserver_port.pg}' - listen_http_addr = 'localhost:{pageserver_port.http}' - pg_auth_type = '{pg_auth_type}' - http_auth_type = '{http_auth_type}' - """ - ) - - # Create a corresponding NeonPageserver object - self.pageserver = NeonPageserver( - self, port=pageserver_port, config_override=config.pageserver_config_override - ) + toml += textwrap.dedent( + f""" + [[pageservers]] + id={ps_id} + listen_pg_addr = 'localhost:{pageserver_port.pg}' + listen_http_addr = 'localhost:{pageserver_port.http}' + pg_auth_type = '{pg_auth_type}' + http_auth_type = '{http_auth_type}' + """ + ) + # Create a corresponding NeonPageserver object + self.pageservers.append( + NeonPageserver( + self, + ps_id, + port=pageserver_port, + config_override=config.pageserver_config_override, + ) + ) # Create config and a Safekeeper object for each safekeeper for i in range(1, config.num_safekeepers + 1): port = SafekeeperPort( @@ -850,13 +791,10 @@ class NeonEnv: auth_enabled = true """ ) - if ( - bool(self.remote_storage_users & RemoteStorageUsers.SAFEKEEPER) - and self.remote_storage is not None - ): + if config.sk_remote_storage is not None: toml += textwrap.dedent( f""" - remote_storage = "{remote_storage_to_toml_inline_table(self.remote_storage)}" + remote_storage = "{remote_storage_to_toml_inline_table(config.sk_remote_storage)}" """ ) safekeeper = Safekeeper(env=self, id=id, port=port) @@ -868,26 +806,47 @@ class NeonEnv: def start(self): # Start up broker, pageserver and all safekeepers self.broker.try_start() - self.pageserver.start() + + if self.attachment_service is not None: + self.attachment_service.start() + + for pageserver in self.pageservers: + pageserver.start() for safekeeper in self.safekeepers: safekeeper.start() + @property + def pageserver(self) -> NeonPageserver: + """ + For tests that are naive to multiple pageservers: give them the 1st in the list, and + assert that there is only one. Tests with multiple pageservers should always use + get_pageserver with an explicit ID. + """ + assert len(self.pageservers) == 1 + return self.pageservers[0] + + def get_pageserver(self, id: Optional[int]) -> NeonPageserver: + """ + Look up a pageserver by its node ID. 
+ + As a convenience for tests that do not use multiple pageservers, passing None + will yield the same default pageserver as `self.pageserver`. + """ + + if id is None: + return self.pageserver + + for ps in self.pageservers: + if ps.id == id: + return ps + + raise RuntimeError(f"Pageserver with ID {id} not found") + def get_safekeeper_connstrs(self) -> str: """Get list of safekeeper endpoints suitable for safekeepers GUC""" return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers) - def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: - """Get a timeline directory's path based on the repo directory of the test environment""" - return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id) - - def tenant_dir( - self, - tenant_id: TenantId, - ) -> Path: - """Get a tenant directory's path based on the repo directory of the test environment""" - return self.repo_dir / "tenants" / str(tenant_id) - def get_pageserver_version(self) -> str: bin_pageserver = str(self.neon_binpath / "pageserver") res = subprocess.run( @@ -922,6 +881,7 @@ def _shared_simple_env( default_broker: NeonBroker, run_id: uuid.UUID, top_output_dir: Path, + test_output_dir: Path, neon_binpath: Path, pg_distrib_dir: Path, pg_version: PgVersion, @@ -949,6 +909,8 @@ def _shared_simple_env( pg_version=pg_version, run_id=run_id, preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + test_name=request.node.name, + test_output_dir=test_output_dir, ) as builder: env = builder.init_start() @@ -976,7 +938,7 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: @pytest.fixture(scope="function") def neon_env_builder( pytestconfig: Config, - test_output_dir: str, + test_output_dir: Path, port_distributor: PortDistributor, mock_s3_server: MockS3Server, neon_binpath: Path, @@ -984,6 +946,7 @@ def neon_env_builder( pg_version: PgVersion, default_broker: NeonBroker, run_id: uuid.UUID, + request: FixtureRequest, ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. @@ -1012,6 +975,8 @@ def neon_env_builder( broker=default_broker, run_id=run_id, preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + test_name=request.node.name, + test_output_dir=test_output_dir, ) as builder: yield builder @@ -1048,6 +1013,7 @@ class AbstractNeonCli(abc.ABC): extra_env_vars: Optional[Dict[str, str]] = None, check_return_code=True, timeout=None, + local_binpath=False, ) -> "subprocess.CompletedProcess[str]": """ Run the command with the specified arguments. @@ -1061,12 +1027,19 @@ class AbstractNeonCli(abc.ABC): >>> log.info(result.stdout) If `check_return_code`, on non-zero exit code logs failure and raises. 
+ + If `local_binpath` is true, then we are invoking a test utility """ assert type(arguments) == list assert type(self.COMMAND) == str - bin_neon = str(self.env.neon_binpath / self.COMMAND) + if local_binpath: + # Test utility + bin_neon = str(self.env.neon_local_binpath / self.COMMAND) + else: + # Normal binary + bin_neon = str(self.env.neon_binpath / self.COMMAND) args = [bin_neon] + arguments log.info('Running command "{}"'.format(" ".join(args))) @@ -1120,6 +1093,10 @@ class NeonCli(AbstractNeonCli): COMMAND = "neon_local" + def raw_cli(self, *args, **kwargs) -> subprocess.CompletedProcess[str]: + kwargs["local_binpath"] = True + return super().raw_cli(*args, **kwargs) + def create_tenant( self, tenant_id: Optional[TenantId] = None, @@ -1273,43 +1250,53 @@ class NeonCli(AbstractNeonCli): cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version] + storage = self.env.pageserver_remote_storage + append_pageserver_param_overrides( params_to_update=cmd, - remote_storage=self.env.remote_storage, - remote_storage_users=self.env.remote_storage_users, - pageserver_config_override=self.env.pageserver.config_override, + remote_storage=storage, + pageserver_config_override=self.env.pageserver_config_override, ) s3_env_vars = None - if self.env.remote_storage is not None and isinstance( - self.env.remote_storage, S3Storage - ): - s3_env_vars = self.env.remote_storage.access_env_vars() + if isinstance(storage, S3Storage): + s3_env_vars = storage.access_env_vars() res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) res.check_returncode() return res + def attachment_service_start(self): + cmd = ["attachment_service", "start"] + return self.raw_cli(cmd) + + def attachment_service_stop(self, immediate: bool): + cmd = ["attachment_service", "stop"] + if immediate: + cmd.extend(["-m", "immediate"]) + return self.raw_cli(cmd) + def pageserver_start( self, + id: int, overrides: Tuple[str, ...] 
= (), extra_env_vars: Optional[Dict[str, str]] = None, ) -> "subprocess.CompletedProcess[str]": - start_args = ["pageserver", "start", *overrides] + start_args = ["pageserver", "start", f"--id={id}", *overrides] + storage = self.env.pageserver_remote_storage append_pageserver_param_overrides( params_to_update=start_args, - remote_storage=self.env.remote_storage, - remote_storage_users=self.env.remote_storage_users, - pageserver_config_override=self.env.pageserver.config_override, + remote_storage=storage, + pageserver_config_override=self.env.pageserver_config_override, ) - if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage): - s3_env_vars = self.env.remote_storage.access_env_vars() + if isinstance(storage, S3Storage): + s3_env_vars = storage.access_env_vars() extra_env_vars = (extra_env_vars or {}) | s3_env_vars return self.raw_cli(start_args, extra_env_vars=extra_env_vars) - def pageserver_stop(self, immediate=False) -> "subprocess.CompletedProcess[str]": - cmd = ["pageserver", "stop"] + def pageserver_stop(self, id: int, immediate=False) -> "subprocess.CompletedProcess[str]": + cmd = ["pageserver", "stop", f"--id={id}"] if immediate: cmd.extend(["-m", "immediate"]) @@ -1320,8 +1307,8 @@ class NeonCli(AbstractNeonCli): self, id: int, extra_opts: Optional[List[str]] = None ) -> "subprocess.CompletedProcess[str]": s3_env_vars = None - if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage): - s3_env_vars = self.env.remote_storage.access_env_vars() + if isinstance(self.env.safekeepers_remote_storage, S3Storage): + s3_env_vars = self.env.safekeepers_remote_storage.access_env_vars() if extra_opts is not None: extra_opts = [f"-e={opt}" for opt in extra_opts] @@ -1350,6 +1337,7 @@ class NeonCli(AbstractNeonCli): tenant_id: Optional[TenantId] = None, hot_standby: bool = False, lsn: Optional[Lsn] = None, + pageserver_id: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1371,6 +1359,8 @@ class NeonCli(AbstractNeonCli): args.append(endpoint_id) if hot_standby: args.extend(["--hot-standby", "true"]) + if pageserver_id is not None: + args.extend(["--pageserver-id", str(pageserver_id)]) res = self.raw_cli(args) res.check_returncode() @@ -1386,6 +1376,7 @@ class NeonCli(AbstractNeonCli): lsn: Optional[Lsn] = None, branch_name: Optional[str] = None, remote_ext_config: Optional[str] = None, + pageserver_id: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1408,10 +1399,13 @@ class NeonCli(AbstractNeonCli): args.extend(["--branch-name", branch_name]) if endpoint_id is not None: args.append(endpoint_id) + if pageserver_id is not None: + args.extend(["--pageserver-id", str(pageserver_id)]) + storage = self.env.ext_remote_storage s3_env_vars = None - if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage): - s3_env_vars = self.env.remote_storage.access_env_vars() + if isinstance(storage, S3Storage): + s3_env_vars = storage.access_env_vars() res = self.raw_cli(args, extra_env_vars=s3_env_vars) res.check_returncode() @@ -1470,6 +1464,35 @@ class ComputeCtl(AbstractNeonCli): COMMAND = "compute_ctl" +class NeonAttachmentService: + def __init__(self, env: NeonEnv): + self.env = env + self.running = False + + def start(self): + assert not self.running + self.env.neon_cli.attachment_service_start() + self.running = True + return self + + def stop(self, immediate: bool = False) -> "NeonAttachmentService": + if self.running: + 
self.env.neon_cli.attachment_service_stop(immediate) + self.running = False + return self + + def __enter__(self) -> "NeonAttachmentService": + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): + self.stop(immediate=True) + + class NeonPageserver(PgProtocol): """ An object representing a running pageserver. @@ -1477,9 +1500,12 @@ class NeonPageserver(PgProtocol): TEMP_FILE_SUFFIX = "___temp" - def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional[str] = None): + def __init__( + self, env: NeonEnv, id: int, port: PageserverPort, config_override: Optional[str] = None + ): super().__init__(host="localhost", port=port.pg, user="cloud_admin") self.env = env + self.id = id self.running = False self.service_port = port self.config_override = config_override @@ -1539,8 +1565,24 @@ class NeonPageserver(PgProtocol): ".*took more than expected to complete.*", # these can happen during shutdown, but it should not be a reason to fail a test ".*completed, took longer than expected.*", + '.*registered custom resource manager "neon".*', ] + def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path: + """Get a timeline directory's path based on the repo directory of the test environment""" + if timeline_id is None: + return self.tenant_dir(tenant_id) / "timelines" + return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id) + + def tenant_dir( + self, + tenant_id: Optional[TenantId] = None, + ) -> Path: + """Get a tenant directory's path based on the repo directory of the test environment""" + if tenant_id is None: + return self.workdir / "tenants" + return self.workdir / "tenants" / str(tenant_id) + def start( self, overrides: Tuple[str, ...] = (), @@ -1553,7 +1595,9 @@ class NeonPageserver(PgProtocol): """ assert self.running is False - self.env.neon_cli.pageserver_start(overrides=overrides, extra_env_vars=extra_env_vars) + self.env.neon_cli.pageserver_start( + self.id, overrides=overrides, extra_env_vars=extra_env_vars + ) self.running = True return self @@ -1563,7 +1607,7 @@ class NeonPageserver(PgProtocol): Returns self. """ if self.running: - self.env.neon_cli.pageserver_stop(immediate) + self.env.neon_cli.pageserver_stop(self.id, immediate) self.running = False return self @@ -1589,8 +1633,12 @@ class NeonPageserver(PgProtocol): is_testing_enabled_or_skip=self.is_testing_enabled_or_skip, ) + @property + def workdir(self) -> Path: + return Path(os.path.join(self.env.repo_dir, f"pageserver_{self.id}")) + def assert_no_errors(self): - logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r") + logfile = open(os.path.join(self.workdir, "pageserver.log"), "r") error_or_warn = re.compile(r"\s(ERROR|WARN)") errors = [] while True: @@ -1613,7 +1661,7 @@ class NeonPageserver(PgProtocol): def log_contains(self, pattern: str) -> Optional[str]: """Check that the pageserver log contains a line that matches the given regex""" - logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r") + logfile = open(os.path.join(self.workdir, "pageserver.log"), "r") contains_re = re.compile(pattern) @@ -1633,14 +1681,33 @@ class NeonPageserver(PgProtocol): return None + def tenant_attach( + self, tenant_id: TenantId, config: None | Dict[str, Any] = None, config_null: bool = False + ): + """ + Tenant attachment passes through here to acquire a generation number before proceeding + to call into the pageserver HTTP client. 
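+        If the environment has no attachment service, no generation number is
+        obtained and the attach call is made with generation=None.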
+ """ + if self.env.attachment_service is not None: + response = requests.post( + f"{self.env.control_plane_api}/attach_hook", + json={"tenant_id": str(tenant_id), "pageserver_id": self.id}, + ) + response.raise_for_status() + generation = response.json()["gen"] + else: + generation = None + + client = self.http_client() + return client.tenant_attach(tenant_id, config, config_null, generation=generation) + def append_pageserver_param_overrides( params_to_update: List[str], remote_storage: Optional[RemoteStorage], - remote_storage_users: RemoteStorageUsers, pageserver_config_override: Optional[str] = None, ): - if bool(remote_storage_users & RemoteStorageUsers.PAGESERVER) and remote_storage is not None: + if remote_storage is not None: remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) params_to_update.append( @@ -1718,7 +1785,10 @@ class PgBin: self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) - return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs) + base_path, _, _ = subprocess_capture( + self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs + ) + return base_path @pytest.fixture(scope="function") @@ -2063,6 +2133,28 @@ class NeonProxy(PgProtocol): def _wait_until_ready(self): requests.get(f"http://{self.host}:{self.http_port}/v1/status") + def http_query(self, query, args, **kwargs): + # TODO maybe use default values if not provided + user = kwargs["user"] + password = kwargs["password"] + expected_code = kwargs.get("expected_code") + + connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres" + response = requests.post( + f"https://{self.domain}:{self.external_http_port}/sql", + data=json.dumps({"query": query, "params": args}), + headers={ + "Content-Type": "application/sql", + "Neon-Connection-String": connstr, + "Neon-Pool-Opt-In": "true", + }, + verify=str(self.test_output_dir / "proxy.crt"), + ) + + if expected_code is not None: + assert response.status_code == kwargs["expected_code"], f"response: {response.json()}" + return response.json() + def get_metrics(self) -> str: request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") request_result.raise_for_status() @@ -2239,6 +2331,7 @@ class Endpoint(PgProtocol): hot_standby: bool = False, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, + pageserver_id: Optional[int] = None, ) -> "Endpoint": """ Create a new Postgres endpoint. @@ -2260,6 +2353,7 @@ class Endpoint(PgProtocol): hot_standby=hot_standby, pg_port=self.pg_port, http_port=self.http_port, + pageserver_id=pageserver_id, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) @@ -2273,7 +2367,9 @@ class Endpoint(PgProtocol): return self - def start(self, remote_ext_config: Optional[str] = None) -> "Endpoint": + def start( + self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None + ) -> "Endpoint": """ Start the Postgres instance. Returns self. 
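The changes in this file thread an optional pageserver_id through endpoint creation/start, the LSN-wait helpers, and the per-pageserver workdir, log, and timeline-dir paths. A minimal usage sketch built only from the helpers and parameters shown in this diff (the test name, table, and SQL below are invented for illustration):

from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
from fixtures.types import TimelineId


def test_pinned_pageserver_sketch(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    ps_id = env.pageserver.id  # numeric pageserver id added in this change

    # Endpoint creation/start and the wait helpers now accept an optional pageserver_id.
    endpoint = env.endpoints.create_start("main", pageserver_id=ps_id)
    endpoint.safe_psql("CREATE TABLE t AS SELECT generate_series(1, 1000) AS x")

    timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
    wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id, pageserver_id=ps_id)

    # Per-pageserver directories (and logs) are addressed through the pageserver object.
    assert env.get_pageserver(ps_id).timeline_dir(endpoint.tenant_id, timeline_id).exists()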
@@ -2290,6 +2386,7 @@ class Endpoint(PgProtocol): tenant_id=self.tenant_id, safekeepers=self.active_safekeepers, remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, ) self.running = True @@ -2380,6 +2477,7 @@ class Endpoint(PgProtocol): lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, + pageserver_id: Optional[int] = None, ) -> "Endpoint": """ Create an endpoint, apply config, and start Postgres. @@ -2394,6 +2492,7 @@ class Endpoint(PgProtocol): config_lines=config_lines, hot_standby=hot_standby, lsn=lsn, + pageserver_id=pageserver_id, ).start(remote_ext_config=remote_ext_config) log.info(f"Postgres startup took {time.time() - started_at} seconds") @@ -2429,6 +2528,7 @@ class EndpointFactory: hot_standby: bool = False, config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, + pageserver_id: Optional[int] = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -2446,6 +2546,7 @@ class EndpointFactory: config_lines=config_lines, lsn=lsn, remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, ) def create( @@ -2724,6 +2825,41 @@ class SafekeeperHttpClient(requests.Session): return metrics +class S3Scrubber: + def __init__(self, log_dir: Path, env: NeonEnvBuilder): + self.env = env + self.log_dir = log_dir + + def scrubber_cli(self, args, timeout): + assert isinstance(self.env.pageserver_remote_storage, S3Storage) + s3_storage = self.env.pageserver_remote_storage + + env = { + "REGION": s3_storage.bucket_region, + "BUCKET": s3_storage.bucket_name, + } + env.update(s3_storage.access_env_vars()) + + if s3_storage.endpoint is not None: + env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) + + base_args = [self.env.neon_binpath / "s3_scrubber"] + args = base_args + args + + (output_path, _, status_code) = subprocess_capture( + self.log_dir, args, echo_stderr=True, echo_stdout=True, env=env, check=False + ) + if status_code: + log.warning(f"Scrub command {args} failed") + log.warning(f"Scrub environment: {env}") + log.warning(f"Output at: {output_path}") + + raise RuntimeError("Remote storage scrub failed") + + def scan_metadata(self): + self.scrubber_cli(["scan-metadata"], timeout=30) + + def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: """Compute the working directory for an individual test.""" test_name = request.node.name @@ -2845,9 +2981,7 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]: # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content( - test_output_dir: Path, - env: NeonEnv, - endpoint: Endpoint, + test_output_dir: Path, env: NeonEnv, endpoint: Endpoint, pageserver_id: Optional[int] = None ): # Get the timeline ID. 
We need it for the 'basebackup' command timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) @@ -2872,7 +3006,7 @@ def check_restored_datadir_content( cmd = rf""" {psql_path} \ --no-psqlrc \ - postgres://localhost:{env.pageserver.service_port.pg} \ + postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg} \ -c 'basebackup {endpoint.tenant_id} {timeline_id}' \ | tar -x -C {restored_dir_path} """ @@ -2922,19 +3056,32 @@ def check_restored_datadir_content( def wait_for_last_flush_lsn( - env: NeonEnv, endpoint: Endpoint, tenant: TenantId, timeline: TimelineId + env: NeonEnv, + endpoint: Endpoint, + tenant: TenantId, + timeline: TimelineId, + pageserver_id: Optional[int] = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) + return wait_for_last_record_lsn( + env.get_pageserver(pageserver_id).http_client(), tenant, timeline, last_flush_lsn + ) def wait_for_wal_insert_lsn( - env: NeonEnv, endpoint: Endpoint, tenant: TenantId, timeline: TimelineId + env: NeonEnv, + endpoint: Endpoint, + tenant: TenantId, + timeline: TimelineId, + pageserver_id: Optional[int] = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0]) - return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) + return wait_for_last_record_lsn( + env.get_pageserver(pageserver_id).http_client(), tenant, timeline, last_flush_lsn + ) def fork_at_current_lsn( @@ -2954,15 +3101,21 @@ def fork_at_current_lsn( def last_flush_lsn_upload( - env: NeonEnv, endpoint: Endpoint, tenant_id: TenantId, timeline_id: TimelineId + env: NeonEnv, + endpoint: Endpoint, + tenant_id: TenantId, + timeline_id: TimelineId, + pageserver_id: Optional[int] = None, ) -> Lsn: """ Wait for pageserver to catch to the latest flush LSN of given endpoint, checkpoint pageserver, and wait for it to be uploaded (remote_consistent_lsn reaching flush LSN). 
""" - last_flush_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - ps_http = env.pageserver.http_client() + last_flush_lsn = wait_for_last_flush_lsn( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver_id + ) + ps_http = env.get_pageserver(pageserver_id).http_client() wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_flush_lsn) # force a checkpoint to trigger upload ps_http.timeline_checkpoint(tenant_id, timeline_id) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index a179ebdd09..9373073abf 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -186,18 +186,25 @@ class PageserverHttpClient(requests.Session): return TenantId(new_tenant_id) def tenant_attach( - self, tenant_id: TenantId, config: None | Dict[str, Any] = None, config_null: bool = False + self, + tenant_id: TenantId, + config: None | Dict[str, Any] = None, + config_null: bool = False, + generation: Optional[int] = None, ): if config_null: assert config is None - body = "null" + body: Any = None else: # null-config is prohibited by the API config = config or {} - body = json.dumps({"config": config}) + body = {"config": config} + if generation is not None: + body.update({"generation": generation}) + res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach", - data=body, + data=json.dumps(body), headers={"Content-Type": "application/json"}, ) self.verbose_error(res) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 708e90d9b2..2e5d75a0fc 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -154,16 +154,17 @@ def wait_for_last_record_lsn( lsn: Lsn, ) -> Lsn: """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" - for i in range(10): + for i in range(100): current_lsn = last_record_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: return current_lsn - log.info( - "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - lsn, current_lsn, i + 1 + if i % 10 == 0: + log.info( + "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( + lsn, current_lsn, i + 1 + ) ) - ) - time.sleep(1) + time.sleep(0.1) raise Exception( "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn) ) @@ -235,15 +236,27 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str response = list_prefix(neon_env_builder, prefix) keys = response["KeyCount"] objects = response.get("Contents", []) + common_prefixes = response.get("CommonPrefixes", []) - if keys != 0 and len(objects) == 0: - # this has been seen in one case with mock_s3: - # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f - # looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes - common_prefixes = response.get("CommonPrefixes", []) - log.warn( - f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}" - ) + remote_storage = neon_env_builder.pageserver_remote_storage + is_mock_s3 = isinstance(remote_storage, S3Storage) and not remote_storage.cleanup + + if is_mock_s3: + if keys == 1 and len(objects) == 0 and len(common_prefixes) == 1: + # this has been seen in the wild by tests with the below 
contradicting logging + # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865 + # this seems like a mock_s3 issue + log.warn( + f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0" + ) + keys = 0 + elif keys != 0 and len(objects) == 0: + # this has been seen in one case with mock_s3: + # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f + # looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes + log.warn( + f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}" + ) assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}" @@ -260,15 +273,11 @@ def list_prefix( Note that this function takes into account prefix_in_bucket. """ # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api. - assert neon_env_builder.remote_storage_kind in ( - RemoteStorageKind.MOCK_S3, - RemoteStorageKind.REAL_S3, - ) - # For mypy - assert isinstance(neon_env_builder.remote_storage, S3Storage) - assert neon_env_builder.remote_storage_client is not None + remote = neon_env_builder.pageserver_remote_storage + assert isinstance(remote, S3Storage), "localfs is currently not supported" + assert remote.client is not None - prefix_in_bucket = neon_env_builder.remote_storage.prefix_in_bucket or "" + prefix_in_bucket = remote.prefix_in_bucket or "" if not prefix: prefix = prefix_in_bucket else: @@ -277,9 +286,9 @@ def list_prefix( prefix = "/".join((prefix_in_bucket, prefix)) # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. - response = neon_env_builder.remote_storage_client.list_objects_v2( + response = remote.client.list_objects_v2( Delimiter="/", - Bucket=neon_env_builder.remote_storage.bucket_name, + Bucket=remote.bucket_name, Prefix=prefix, ) return response diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index b61f52be3c..657718da00 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -17,6 +17,7 @@ This fixture is used to determine which version of Postgres to use for tests. 
class PgVersion(str, enum.Enum): V14 = "14" V15 = "15" + V16 = "16" # Instead of making version an optional parameter in methods, we can use this fake entry # to explicitly rely on the default server version (could be different from pg_version fixture value) NOT_SET = "<-POSTRGRES VERSION IS NOT SET->" diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 320e658639..f7cddbc821 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -1,10 +1,15 @@ import enum +import hashlib import json import os +import re import subprocess from dataclasses import dataclass from pathlib import Path -from typing import Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union + +import boto3 +from mypy_boto3_s3 import S3Client from fixtures.log_helper import log from fixtures.types import TenantId, TimelineId @@ -12,6 +17,20 @@ from fixtures.types import TenantId, TimelineId TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" +@enum.unique +class RemoteStorageUser(str, enum.Enum): + """ + Instead of using strings for the users, use a more strict enum. + """ + + PAGESERVER = "pageserver" + EXTENSIONS = "ext" + SAFEKEEPER = "safekeeper" + + def __str__(self) -> str: + return self.value + + class MockS3Server: """ Starts a mock S3 server for testing on a port given, errors if the server fails to start or exits prematurely. @@ -58,6 +77,126 @@ class MockS3Server: self.subprocess.kill() +@dataclass +class LocalFsStorage: + root: Path + + def tenant_path(self, tenant_id: TenantId) -> Path: + return self.root / "tenants" / str(tenant_id) + + def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: + return self.tenant_path(tenant_id) / "timelines" / str(timeline_id) + + def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: + return self.timeline_path(tenant_id, timeline_id) / TIMELINE_INDEX_PART_FILE_NAME + + def index_content(self, tenant_id: TenantId, timeline_id: TimelineId): + with self.index_path(tenant_id, timeline_id).open("r") as f: + return json.load(f) + + def to_toml_inline_table(self) -> str: + return f"local_path='{self.root}'" + + def cleanup(self): + # no cleanup is done here, because there's NeonEnvBuilder.cleanup_local_storage which will remove everything, including localfs files + pass + + @staticmethod + def component_path(repo_dir: Path, user: RemoteStorageUser) -> Path: + return repo_dir / "local_fs_remote_storage" / str(user) + + +@dataclass +class S3Storage: + bucket_name: str + bucket_region: str + access_key: str + secret_key: str + prefix_in_bucket: str + client: S3Client + cleanup: bool + """Is this MOCK_S3 (false) or REAL_S3 (true)""" + real: bool + endpoint: Optional[str] = None + + def access_env_vars(self) -> Dict[str, str]: + return { + "AWS_ACCESS_KEY_ID": self.access_key, + "AWS_SECRET_ACCESS_KEY": self.secret_key, + } + + def to_string(self) -> str: + return json.dumps( + { + "bucket": self.bucket_name, + "region": self.bucket_region, + "endpoint": self.endpoint, + "prefix": self.prefix_in_bucket, + } + ) + + def to_toml_inline_table(self) -> str: + s = [ + f"bucket_name='{self.bucket_name}'", + f"bucket_region='{self.bucket_region}'", + ] + + if self.prefix_in_bucket is not None: + s.append(f"prefix_in_bucket='{self.prefix_in_bucket}'") + + if self.endpoint is not None: + s.append(f"endpoint='{self.endpoint}'") + + return ",".join(s) + + def do_cleanup(self): + if not self.cleanup: + # handles previous 
keep_remote_storage_contents + return + + log.info( + "removing data from test s3 bucket %s by prefix %s", + self.bucket_name, + self.prefix_in_bucket, + ) + paginator = self.client.get_paginator("list_objects_v2") + pages = paginator.paginate( + Bucket=self.bucket_name, + Prefix=self.prefix_in_bucket, + ) + + # Using Any because DeleteTypeDef (from boto3-stubs) doesn't fit our case + objects_to_delete: Any = {"Objects": []} + cnt = 0 + for item in pages.search("Contents"): + # weirdly when nothing is found it returns [None] + if item is None: + break + + objects_to_delete["Objects"].append({"Key": item["Key"]}) + + # flush once aws limit reached + if len(objects_to_delete["Objects"]) >= 1000: + self.client.delete_objects( + Bucket=self.bucket_name, + Delete=objects_to_delete, + ) + objects_to_delete = {"Objects": []} + cnt += 1 + + # flush rest + if len(objects_to_delete["Objects"]): + self.client.delete_objects( + Bucket=self.bucket_name, + Delete=objects_to_delete, + ) + + log.info(f"deleted {cnt} objects from remote storage") + + +RemoteStorage = Union[LocalFsStorage, S3Storage] + + @enum.unique class RemoteStorageKind(str, enum.Enum): LOCAL_FS = "local_fs" @@ -67,6 +206,106 @@ class RemoteStorageKind(str, enum.Enum): # to ensure the test pass with or without the remote storage NOOP = "noop" + def configure( + self, + repo_dir: Path, + mock_s3_server, + run_id: str, + test_name: str, + user: RemoteStorageUser, + bucket_name: Optional[str] = None, + bucket_region: Optional[str] = None, + ) -> Optional[RemoteStorage]: + if self == RemoteStorageKind.NOOP: + return None + + if self == RemoteStorageKind.LOCAL_FS: + return LocalFsStorage(LocalFsStorage.component_path(repo_dir, user)) + + # real_s3 uses this as part of prefix, mock_s3 uses this as part of + # bucket name, giving all users unique buckets because we have to + # create them + test_name = re.sub(r"[_\[\]]", "-", test_name) + + def to_bucket_name(user: str, test_name: str) -> str: + s = f"{user}-{test_name}" + + if len(s) > 63: + prefix = s[:30] + suffix = hashlib.sha256(test_name.encode()).hexdigest()[:32] + s = f"{prefix}-{suffix}" + assert len(s) == 63 + + return s + + if self == RemoteStorageKind.MOCK_S3: + # there's a single mock_s3 server for each process running the tests + mock_endpoint = mock_s3_server.endpoint() + mock_region = mock_s3_server.region() + + access_key, secret_key = mock_s3_server.access_key(), mock_s3_server.secret_key() + + client = boto3.client( + "s3", + endpoint_url=mock_endpoint, + region_name=mock_region, + aws_access_key_id=access_key, + aws_secret_access_key=secret_key, + ) + + bucket_name = to_bucket_name(user, test_name) + log.info( + f"using mock_s3 bucket name {bucket_name} for user={user}, test_name={test_name}" + ) + + return S3Storage( + bucket_name=bucket_name, + endpoint=mock_endpoint, + bucket_region=mock_region, + access_key=access_key, + secret_key=secret_key, + prefix_in_bucket="", + client=client, + cleanup=False, + real=False, + ) + + assert self == RemoteStorageKind.REAL_S3 + + env_access_key = os.getenv("AWS_ACCESS_KEY_ID") + assert env_access_key, "no aws access key provided" + env_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") + assert env_secret_key, "no aws access key provided" + + # session token is needed for local runs with sso auth + session_token = os.getenv("AWS_SESSION_TOKEN") + + bucket_name = bucket_name or os.getenv("REMOTE_STORAGE_S3_BUCKET") + assert bucket_name is not None, "no remote storage bucket name provided" + bucket_region = bucket_region or 
os.getenv("REMOTE_STORAGE_S3_REGION") + assert bucket_region is not None, "no remote storage region provided" + + prefix_in_bucket = f"{run_id}/{test_name}/{user}" + + client = boto3.client( + "s3", + region_name=bucket_region, + aws_access_key_id=env_access_key, + aws_secret_access_key=env_secret_key, + aws_session_token=session_token, + ) + + return S3Storage( + bucket_name=bucket_name, + bucket_region=bucket_region, + access_key=env_access_key, + secret_key=env_secret_key, + prefix_in_bucket=prefix_in_bucket, + client=client, + cleanup=True, + real=True, + ) + def available_remote_storages() -> List[RemoteStorageKind]: remote_storages = [RemoteStorageKind.LOCAL_FS, RemoteStorageKind.MOCK_S3] @@ -88,72 +327,22 @@ def available_s3_storages() -> List[RemoteStorageKind]: return remote_storages -@dataclass -class LocalFsStorage: - root: Path +def s3_storage() -> RemoteStorageKind: + """ + For tests that require a remote storage impl that exposes an S3 + endpoint, but don't want to parametrize over multiple storage types. - def tenant_path(self, tenant_id: TenantId) -> Path: - return self.root / "tenants" / str(tenant_id) - - def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: - return self.tenant_path(tenant_id) / "timelines" / str(timeline_id) - - def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: - return self.timeline_path(tenant_id, timeline_id) / TIMELINE_INDEX_PART_FILE_NAME - - def index_content(self, tenant_id: TenantId, timeline_id: TimelineId): - with self.index_path(tenant_id, timeline_id).open("r") as f: - return json.load(f) - - -@dataclass -class S3Storage: - bucket_name: str - bucket_region: str - access_key: str - secret_key: str - endpoint: Optional[str] = None - prefix_in_bucket: Optional[str] = "" - - def access_env_vars(self) -> Dict[str, str]: - return { - "AWS_ACCESS_KEY_ID": self.access_key, - "AWS_SECRET_ACCESS_KEY": self.secret_key, - } - - def to_string(self) -> str: - return json.dumps( - { - "bucket": self.bucket_name, - "region": self.bucket_region, - "endpoint": self.endpoint, - "prefix": self.prefix_in_bucket, - } - ) - - -RemoteStorage = Union[LocalFsStorage, S3Storage] + Use real S3 if available, else use MockS3 + """ + if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None: + return RemoteStorageKind.REAL_S3 + else: + return RemoteStorageKind.MOCK_S3 # serialize as toml inline table def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str: - if isinstance(remote_storage, LocalFsStorage): - remote_storage_config = f"local_path='{remote_storage.root}'" - elif isinstance(remote_storage, S3Storage): - remote_storage_config = f"bucket_name='{remote_storage.bucket_name}',\ - bucket_region='{remote_storage.bucket_region}'" - - if remote_storage.prefix_in_bucket is not None: - remote_storage_config += f",prefix_in_bucket='{remote_storage.prefix_in_bucket}'" - - if remote_storage.endpoint is not None: - remote_storage_config += f",endpoint='{remote_storage.endpoint}'" - else: + if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): raise Exception("invalid remote storage type") - return f"{{{remote_storage_config}}}" - - -class RemoteStorageUsers(enum.Flag): - PAGESERVER = enum.auto() - SAFEKEEPER = enum.auto() + return f"{{{remote_storage.to_toml_inline_table()}}}" diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index d03d2e7595..46ab446f99 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -4,9 +4,10 @@ import os import re 
import subprocess import tarfile +import threading import time from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, TypeVar from urllib.parse import urlencode import allure @@ -26,34 +27,100 @@ def get_self_dir() -> Path: return Path(__file__).resolve().parent -def subprocess_capture(capture_dir: Path, cmd: List[str], **kwargs: Any) -> str: - """Run a process and capture its output +def subprocess_capture( + capture_dir: Path, + cmd: List[str], + *, + check=False, + echo_stderr=False, + echo_stdout=False, + capture_stdout=False, + **kwargs: Any, +) -> Tuple[str, Optional[str], int]: + """Run a process and bifurcate its output to files and the `log` logger - Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" + stderr and stdout are always captured in files. They are also optionally + echoed to the log (echo_stderr, echo_stdout), and/or captured and returned + (capture_stdout). + + File output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" where "cmd" is the name of the program and NNN is an incrementing counter. If those files already exist, we will overwrite them. - Returns basepath for files with captured output. + + Returns 3-tuple of: + - The base path for output files + - Captured stdout, or None + - The exit status of the process """ assert isinstance(cmd, list) - base = f"{os.path.basename(cmd[0])}_{global_counter()}" + base_cmd = os.path.basename(cmd[0]) + base = f"{base_cmd}_{global_counter()}" basepath = os.path.join(capture_dir, base) stdout_filename = f"{basepath}.stdout" stderr_filename = f"{basepath}.stderr" + # Since we will stream stdout and stderr concurrently, need to do it in a thread. 
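+    # Each OutputHandler tees one stream: the raw bytes always go to the capture
+    # file, and a line is decoded only when it also has to be echoed to the log
+    # or captured in memory for the caller.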
+ class OutputHandler(threading.Thread): + def __init__(self, in_file, out_file, echo: bool, capture: bool): + super().__init__() + self.in_file = in_file + self.out_file = out_file + self.echo = echo + self.capture = capture + self.captured = "" + + def run(self): + for line in self.in_file: + # Only bother decoding if we are going to do something more than stream to a file + if self.echo or self.capture: + string = line.decode(encoding="utf-8", errors="replace") + + if self.echo: + log.info(string) + + if self.capture: + self.captured += string + + self.out_file.write(line) + + captured = None try: - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: + with open(stdout_filename, "wb") as stdout_f: + with open(stderr_filename, "wb") as stderr_f: log.info(f'Capturing stdout to "{base}.stdout" and stderr to "{base}.stderr"') - subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) + + p = subprocess.Popen( + cmd, + **kwargs, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout_handler = OutputHandler( + p.stdout, stdout_f, echo=echo_stdout, capture=capture_stdout + ) + stdout_handler.start() + stderr_handler = OutputHandler(p.stderr, stderr_f, echo=echo_stderr, capture=False) + stderr_handler.start() + + r = p.wait() + + stdout_handler.join() + stderr_handler.join() + + if check and r != 0: + raise subprocess.CalledProcessError(r, " ".join(cmd)) + + if capture_stdout: + captured = stdout_handler.captured finally: # Remove empty files if there is no output for filename in (stdout_filename, stderr_filename): if os.stat(filename).st_size == 0: os.remove(filename) - return basepath + return (basepath, captured, r) _global_counter = 0 diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index d6e67aa361..aafc315576 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,6 +1,8 @@ +import shutil from contextlib import closing -from fixtures.compare_fixtures import PgCompare +from fixtures.compare_fixtures import NeonCompare, PgCompare +from fixtures.pg_version import PgVersion # @@ -28,3 +30,30 @@ def test_bulk_insert(neon_with_baseline: PgCompare): env.report_peak_memory_use() env.report_size() + + # When testing neon, also check how long it takes the pageserver to reingest the + # wal from safekeepers. If this number is close to total runtime, then the pageserver + # is the bottleneck. 
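+    # (measure_recovery_time below removes the timeline directory and times the
+    # replay of that WAL from the safekeepers)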
+ if isinstance(env, NeonCompare): + measure_recovery_time(env) + + +def measure_recovery_time(env: NeonCompare): + client = env.env.pageserver.http_client() + pg_version = PgVersion(client.timeline_detail(env.tenant, env.timeline)["pg_version"]) + + # Stop pageserver and remove tenant data + env.env.pageserver.stop() + timeline_dir = env.env.pageserver.timeline_dir(env.tenant, env.timeline) + shutil.rmtree(timeline_dir) + + # Start pageserver + env.env.pageserver.start() + + # Measure recovery time + with env.record_duration("wal_recovery"): + # Create the tenant, which will start walingest + client.timeline_create(pg_version, env.tenant, env.timeline) + + # Flush, which will also wait for lsn to catch up + env.flush() diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 39aafa80df..0f7615f7ed 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -1,5 +1,6 @@ +import os from dataclasses import dataclass -from typing import Dict, Tuple +from typing import Dict, List, Tuple import pytest from _pytest.mark import ParameterSet @@ -78,6 +79,15 @@ QUERIES: Tuple[LabelledQuery, ...] = ( ) +def get_scale() -> List[str]: + # We parametrize each tpc-h and clickbench test with scale + # to distinguish them from each other, but don't really use it inside. + # Databases are pre-created and passed through BENCHMARK_CONNSTR env variable. + + scale = os.getenv("TEST_OLAP_SCALE", "noscale") + return [scale] + + def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> None: # prepare connstr: # - cut out password from connstr to pass it via env @@ -100,9 +110,10 @@ def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> N env.pg_bin.run_capture(["psql", connstr, "-c", query], env=environ) +@pytest.mark.parametrize("scale", get_scale()) @pytest.mark.parametrize("query", QUERIES) @pytest.mark.remote_cluster -def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare): +def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare, scale: str): """ An OLAP-style ClickHouse benchmark @@ -128,9 +139,10 @@ def tpch_queuies() -> Tuple[ParameterSet, ...]: ) +@pytest.mark.parametrize("scale", get_scale()) @pytest.mark.parametrize("query", tpch_queuies()) @pytest.mark.remote_cluster -def test_tpch(query: LabelledQuery, remote_compare: RemoteCompare): +def test_tpch(query: LabelledQuery, remote_compare: RemoteCompare, scale: str): """ TCP-H Benchmark diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index fade78504a..366b9b0e68 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -84,7 +84,7 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc # This test sometimes runs for longer than the global 5 minute timeout. 
-@pytest.mark.timeout(600) +@pytest.mark.timeout(900) def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py index 7c20bac399..8381eac946 100644 --- a/test_runner/pg_clients/test_pg_clients.py +++ b/test_runner/pg_clients/test_pg_clients.py @@ -48,6 +48,6 @@ def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: st subprocess_capture(test_output_dir, build_cmd, check=True) run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag] - basepath = subprocess_capture(test_output_dir, run_cmd, check=True) + basepath, _, _ = subprocess_capture(test_output_dir, run_cmd, check=True) assert Path(f"{basepath}.stdout").read_text().strip() == "1" diff --git a/test_runner/regress/data/extension_test/5670669815/v16/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v16/ext_index.json new file mode 100644 index 0000000000..1157e0d032 --- /dev/null +++ b/test_runner/regress/data/extension_test/5670669815/v16/ext_index.json @@ -0,0 +1,7 @@ +{ + "public_extensions": [], + "library_index": { + "TODO": "We still need PG16 extensions" + }, + "extension_data": {} +} \ No newline at end of file diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index bc6afa84a1..892e40b19b 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -14,17 +14,14 @@ from fixtures.utils import wait_until @pytest.fixture def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: - neon_env_builder.enable_remote_storage( - remote_storage_kind=RemoteStorageKind.LOCAL_FS, - test_name="test_attach_tenant_config", - ) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() # eviction might be the first one after an attach to access the layers env.pageserver.allowed_errors.append( ".*unexpectedly on-demand downloading remote layer remote.* for task kind Eviction" ) - assert isinstance(env.remote_storage, LocalFsStorage) + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) return env @@ -37,12 +34,9 @@ class NegativeTests: @pytest.fixture def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, None, None]: - neon_env_builder.enable_remote_storage( - remote_storage_kind=RemoteStorageKind.LOCAL_FS, - test_name="test_attach_tenant_config", - ) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - assert isinstance(env.remote_storage, LocalFsStorage) + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) ps_http = env.pageserver.http_client() (tenant_id, _) = env.neon_cli.create_tenant() diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 57e9413aa3..d0462844f0 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -44,14 +44,14 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): log.info(f"Timeline {tenant0}/{timeline0} is left intact") (tenant1, timeline1, pg1) = tenant_timelines[1] - metadata_path = f"{env.repo_dir}/tenants/{tenant1}/timelines/{timeline1}/metadata" + metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata" f = 
open(metadata_path, "w") f.write("overwritten with garbage!") f.close() log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled") (tenant2, timeline2, pg2) = tenant_timelines[2] - timeline_path = f"{env.repo_dir}/tenants/{tenant2}/timelines/{timeline2}/" + timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. Remove it @@ -61,7 +61,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): ) (tenant3, timeline3, pg3) = tenant_timelines[3] - timeline_path = f"{env.repo_dir}/tenants/{tenant3}/timelines/{timeline3}/" + timeline_path = f"{env.pageserver.workdir}/tenants/{tenant3}/timelines/{timeline3}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. Corrupt it @@ -122,8 +122,8 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): future.result() -def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() env.pageserver.allowed_errors.extend( @@ -133,9 +133,9 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv): ] ) - tenant_id, _ = env.neon_cli.create_tenant() + tenant_id = env.initial_tenant - timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" + timelines_dir = env.pageserver.timeline_dir(tenant_id) old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) initial_timeline_dirs = [d for d in timelines_dir.iterdir()] @@ -145,8 +145,8 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv): _ = env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id) # Restart the page server - env.neon_cli.pageserver_stop(immediate=True) - env.neon_cli.pageserver_start() + env.pageserver.stop(immediate=True) + env.pageserver.start() # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. 
new_tenant_timelines = env.neon_cli.list_timelines(tenant_id) @@ -160,13 +160,13 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv): ), "pageserver should clean its temp timeline files on timeline creation failure" -def test_timeline_create_break_after_uninit_mark(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - tenant_id, _ = env.neon_cli.create_tenant() + tenant_id = env.initial_tenant - timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" + timelines_dir = env.pageserver.timeline_dir(tenant_id) old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) initial_timeline_dirs = [d for d in timelines_dir.iterdir()] diff --git a/test_runner/regress/test_close_fds.py b/test_runner/regress/test_close_fds.py index 7059f3360e..ce9ecb3dc4 100644 --- a/test_runner/regress/test_close_fds.py +++ b/test_runner/regress/test_close_fds.py @@ -33,7 +33,7 @@ def test_lsof_pageserver_pid(neon_simple_env: NeonEnv): workload_thread = threading.Thread(target=start_workload, args=(), daemon=True) workload_thread.start() - path = os.path.join(env.repo_dir, "pageserver.pid") + path = os.path.join(env.pageserver.workdir, "pageserver.pid") lsof = lsof_path() while workload_thread.is_alive(): res = subprocess.run( diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 6979b99708..161662bc99 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -22,6 +22,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, RemoteStorageUser from fixtures.types import Lsn from pytest import FixtureRequest @@ -39,6 +40,54 @@ from pytest import FixtureRequest # - prepare_snapshot copies the snapshot, cleans it up and makes it ready for the current version of Neon (replaces paths and ports in config files). # - check_neon_works performs the test itself, feel free to add more checks there. 
# +# +# How to run `test_backward_compatibility` locally: +# +# export DEFAULT_PG_VERSION=15 +# export BUILD_TYPE=release +# export CHECK_ONDISK_DATA_COMPATIBILITY=true +# +# # Build previous version of binaries and create a data snapshot: +# rm -rf pg_install target +# git checkout +# CARGO_BUILD_FLAGS="--features=testing" make -s -j`nproc` +# ./scripts/pytest -k test_create_snapshot +# +# # Build current version of binaries +# rm -rf pg_install target +# git checkout +# CARGO_BUILD_FLAGS="--features=testing" make -s -j`nproc` +# +# # Run backward compatibility test +# COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} \ +# ./scripts/pytest -k test_backward_compatibility +# +# +# How to run `test_forward_compatibility` locally: +# +# export DEFAULT_PG_VERSION=15 +# export BUILD_TYPE=release +# export CHECK_ONDISK_DATA_COMPATIBILITY=true +# +# # Build previous version of binaries and store them somewhere: +# rm -rf pg_install target +# git checkout +# CARGO_BUILD_FLAGS="--features=testing" make -s -j`nproc` +# mkdir -p neon_previous/target +# cp -a target/${BUILD_TYPE} ./neon_previous/target/${BUILD_TYPE} +# cp -a pg_install ./neon_previous/pg_install +# +# # Build current version of binaries and create a data snapshot: +# rm -rf pg_install target +# git checkout +# CARGO_BUILD_FLAGS="--features=testing" make -s -j`nproc` +# ./scripts/pytest -k test_create_snapshot +# +# # Run forward compatibility test +# COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} \ +# COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install \ +# ./scripts/pytest -k test_forward_compatibility +# check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( os.environ.get("CHECK_ONDISK_DATA_COMPATIBILITY") is None, @@ -61,7 +110,7 @@ def test_create_snapshot( # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it. 
neon_env_builder.pg_version = pg_version neon_env_builder.num_safekeepers = 3 - neon_env_builder.enable_local_fs_remote_storage() + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") @@ -132,7 +181,6 @@ def test_backward_compatibility( prepare_snapshot( from_dir=compatibility_snapshot_dir, to_dir=test_output_dir / "compatibility_snapshot", - neon_binpath=neon_binpath, port_distributor=port_distributor, ) @@ -201,7 +249,6 @@ def test_forward_compatibility( from_dir=compatibility_snapshot_dir, to_dir=test_output_dir / "compatibility_snapshot", port_distributor=port_distributor, - neon_binpath=compatibility_neon_bin, pg_distrib_dir=compatibility_postgres_distrib_dir, ) @@ -233,7 +280,6 @@ def prepare_snapshot( from_dir: Path, to_dir: Path, port_distributor: PortDistributor, - neon_binpath: Path, pg_distrib_dir: Optional[Path] = None, ): assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist" @@ -245,6 +291,9 @@ def prepare_snapshot( repo_dir = to_dir / "repo" + snapshot_config_toml = repo_dir / "config" + snapshot_config = toml.load(snapshot_config_toml) + # Remove old logs to avoid confusion in test artifacts for logfile in repo_dir.glob("**/*.log"): logfile.unlink() @@ -258,29 +307,53 @@ def prepare_snapshot( os.mkdir(repo_dir / "endpoints") # Update paths and ports in config files - pageserver_toml = repo_dir / "pageserver.toml" - pageserver_config = toml.load(pageserver_toml) - pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage") - for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"): - pageserver_config[param] = port_distributor.replace_with_new_port(pageserver_config[param]) + legacy_pageserver_toml = repo_dir / "pageserver.toml" + legacy_bundle = os.path.exists(legacy_pageserver_toml) - # We don't use authentication in compatibility tests - # so just remove authentication related settings. 
- pageserver_config.pop("pg_auth_type", None) - pageserver_config.pop("http_auth_type", None) - - if pg_distrib_dir: - pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir) - - with pageserver_toml.open("w") as f: - toml.dump(pageserver_config, f) - - snapshot_config_toml = repo_dir / "config" - snapshot_config = toml.load(snapshot_config_toml) - for param in ("listen_http_addr", "listen_pg_addr"): - snapshot_config["pageserver"][param] = port_distributor.replace_with_new_port( - snapshot_config["pageserver"][param] + path_to_config: dict[Path, dict[Any, Any]] = {} + if legacy_bundle: + os.mkdir(repo_dir / "pageserver_1") + path_to_config[repo_dir / "pageserver_1" / "pageserver.toml"] = toml.load( + legacy_pageserver_toml ) + os.remove(legacy_pageserver_toml) + os.rename(repo_dir / "tenants", repo_dir / "pageserver_1" / "tenants") + else: + for ps_conf in snapshot_config["pageservers"]: + config_path = repo_dir / f"pageserver_{ps_conf['id']}" / "pageserver.toml" + path_to_config[config_path] = toml.load(config_path) + + # For each pageserver config, edit it and rewrite + for config_path, pageserver_config in path_to_config.items(): + pageserver_config["remote_storage"]["local_path"] = str( + LocalFsStorage.component_path(repo_dir, RemoteStorageUser.PAGESERVER) + ) + + for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"): + pageserver_config[param] = port_distributor.replace_with_new_port( + pageserver_config[param] + ) + + # We don't use authentication in compatibility tests + # so just remove authentication related settings. + pageserver_config.pop("pg_auth_type", None) + pageserver_config.pop("http_auth_type", None) + + if pg_distrib_dir: + pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir) + + with config_path.open("w") as f: + toml.dump(pageserver_config, f) + + # neon_local config doesn't have to be backward compatible. If we're using a dump from before + # it supported multiple pageservers, fix it up. 
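+    # (a legacy single "pageserver" entry becomes a one-element "pageservers" list)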
+ if "pageservers" not in snapshot_config: + snapshot_config["pageservers"] = [snapshot_config["pageserver"]] + del snapshot_config["pageserver"] + + for param in ("listen_http_addr", "listen_pg_addr"): + for pageserver in snapshot_config["pageservers"]: + pageserver[param] = port_distributor.replace_with_new_port(pageserver[param]) snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port( snapshot_config["broker"]["listen_addr"] ) @@ -338,10 +411,15 @@ def check_neon_works( config.initial_tenant = snapshot_config["default_tenant_id"] config.pg_distrib_dir = pg_distrib_dir config.remote_storage = None + config.ext_remote_storage = None + config.sk_remote_storage = None # Use the "target" binaries to launch the storage nodes config_target = config config_target.neon_binpath = neon_target_binpath + # We are using maybe-old binaries for neon services, but want to use current + # binaries for test utilities like neon_local + config_target.neon_local_binpath = neon_current_binpath cli_target = NeonCli(config_target) # And the current binaries to launch computes @@ -374,7 +452,7 @@ def check_neon_works( # loosely based on https://github.com/neondatabase/cloud/wiki/Recovery-from-WAL tenant_id = snapshot_config["default_tenant_id"] timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] - pageserver_port = snapshot_config["pageserver"]["listen_http_addr"].split(":")[-1] + pageserver_port = snapshot_config["pageservers"][0]["listen_http_addr"].split(":")[-1] pageserver_http = PageserverHttpClient( port=pageserver_port, is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py index 68035b1b14..500d19cf31 100644 --- a/test_runner/regress/test_createdropdb.py +++ b/test_runner/regress/test_createdropdb.py @@ -1,16 +1,22 @@ import os import pathlib +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content +from fixtures.pg_version import PgVersion from fixtures.utils import query_scalar # # Test CREATE DATABASE when there have been relmapper changes # -def test_createdb(neon_simple_env: NeonEnv): +@pytest.mark.parametrize("strategy", ["file_copy", "wal_log"]) +def test_createdb(neon_simple_env: NeonEnv, strategy: str): env = neon_simple_env + if env.pg_version == PgVersion.V14 and strategy == "wal_log": + pytest.skip("wal_log strategy not supported on PostgreSQL 14") + env.neon_cli.create_branch("test_createdb", "empty") endpoint = env.endpoints.create_start("test_createdb") @@ -20,7 +26,10 @@ def test_createdb(neon_simple_env: NeonEnv): # Cause a 'relmapper' change in the original branch cur.execute("VACUUM FULL pg_class") - cur.execute("CREATE DATABASE foodb") + if env.pg_version == PgVersion.V14: + cur.execute("CREATE DATABASE foodb") + else: + cur.execute(f"CREATE DATABASE foodb STRATEGY={strategy}") lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index ebd836ecbc..740e489759 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -42,12 +42,11 @@ def handle_role(dbs, roles, operation): raise ValueError("Invalid op") -fail = False - - -def ddl_forward_handler(request: Request, dbs: Dict[str, str], roles: Dict[str, str]) -> Response: +def ddl_forward_handler( + request: Request, dbs: Dict[str, str], 
roles: Dict[str, str], ddl: "DdlForwardingContext" +) -> Response: log.info(f"Received request with data {request.get_data(as_text=True)}") - if fail: + if ddl.fail: log.info("FAILING") return Response(status=500, response="Failed just cuz") if request.json is None: @@ -72,6 +71,7 @@ class DdlForwardingContext: self.port = port self.dbs: Dict[str, str] = {} self.roles: Dict[str, str] = {} + self.fail = False endpoint = "/management/api/v2/roles_and_databases" ddl_url = f"http://{host}:{port}{endpoint}" self.pg.configure( @@ -82,7 +82,7 @@ class DdlForwardingContext: ) log.info(f"Listening on {ddl_url}") self.server.expect_request(endpoint, method="PATCH").respond_with_handler( - lambda request: ddl_forward_handler(request, self.dbs, self.roles) + lambda request: ddl_forward_handler(request, self.dbs, self.roles, self) ) def __enter__(self): @@ -103,6 +103,9 @@ class DdlForwardingContext: def wait(self, timeout=3): self.server.wait(timeout=timeout) + def failures(self, bool): + self.fail = bool + def send_and_wait(self, query: str, timeout=3) -> List[Tuple[Any, ...]]: res = self.send(query) self.wait(timeout=timeout) @@ -203,9 +206,9 @@ def test_ddl_forwarding(ddl: DdlForwardingContext): assert ddl.dbs == {"stork": "cork"} with pytest.raises(psycopg2.InternalError): - global fail - fail = True + ddl.failures(True) cur.execute("CREATE DATABASE failure WITH OWNER=cork") ddl.wait() + ddl.failures(False) conn.close() diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 182069315e..ae62fdf4a4 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -135,7 +135,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev log.info(f"setting up eviction_env for test {request.node.name}") - neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}") + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) # initial tenant will not be present on this pageserver env = neon_env_builder.init_configs() @@ -417,7 +417,7 @@ def poor_mans_du( largest_layer = 0 smallest_layer = None for tenant_id, timeline_id in timelines: - timeline_dir = env.timeline_dir(tenant_id, timeline_id) + timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}" total = 0 for file in timeline_dir.iterdir(): diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index b208616345..775ad10241 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -9,8 +9,12 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, ) -from fixtures.pg_version import PgVersion -from fixtures.remote_storage import RemoteStorageKind, available_s3_storages +from fixtures.pg_version import PgVersion, skip_on_postgres +from fixtures.remote_storage import ( + RemoteStorageKind, + S3Storage, + available_s3_storages, +) # Cleaning up downloaded files is important for local tests @@ -72,7 +76,8 @@ def upload_files(env): with open(full_path, "rb") as f: log.info(f"UPLOAD {full_path} to ext/{full_path}") - env.remote_storage_client.upload_fileobj( + assert isinstance(env.pageserver_remote_storage, S3Storage) + env.pageserver_remote_storage.client.upload_fileobj( f, env.ext_remote_storage.bucket_name, f"ext/{full_path}", @@ -81,6 
+86,7 @@ def upload_files(env): # Test downloading remote extension. +@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") @pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) @pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_remote_extensions( @@ -88,17 +94,12 @@ def test_remote_extensions( remote_storage_kind: RemoteStorageKind, pg_version: PgVersion, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_remote_extensions", - enable_remote_extensions=True, - ) + neon_env_builder.enable_extensions_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() tenant_id, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline("test_remote_extensions", tenant_id=tenant_id) assert env.ext_remote_storage is not None # satisfy mypy - assert env.remote_storage_client is not None # satisfy mypy # For MOCK_S3 we upload test files. # For REAL_S3 we use the files already in the bucket @@ -148,6 +149,7 @@ def test_remote_extensions( # Test downloading remote library. +@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") @pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) @pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_remote_library( @@ -155,17 +157,12 @@ def test_remote_library( remote_storage_kind: RemoteStorageKind, pg_version: PgVersion, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_remote_library", - enable_remote_extensions=True, - ) + neon_env_builder.enable_extensions_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() tenant_id, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline("test_remote_library", tenant_id=tenant_id) assert env.ext_remote_storage is not None # satisfy mypy - assert env.remote_storage_client is not None # satisfy mypy # For MOCK_S3 we upload test files. 
# For REAL_S3 we use the files already in the bucket @@ -211,22 +208,18 @@ def test_remote_library( # RemoteStorageKind.REAL_S3 not in available_s3_storages(), # reason="skipping test because real s3 not enabled", # ) +@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") @pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_multiple_extensions_one_archive( neon_env_builder: NeonEnvBuilder, pg_version: PgVersion, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=RemoteStorageKind.REAL_S3, - test_name="test_multiple_extensions_one_archive", - enable_remote_extensions=True, - ) + neon_env_builder.enable_extensions_remote_storage(RemoteStorageKind.REAL_S3) env = neon_env_builder.init_start() tenant_id, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline("test_multiple_extensions_one_archive", tenant_id=tenant_id) assert env.ext_remote_storage is not None # satisfy mypy - assert env.remote_storage_client is not None # satisfy mypy endpoint = env.endpoints.create_start( "test_multiple_extensions_one_archive", @@ -261,20 +254,16 @@ def test_extension_download_after_restart( neon_env_builder: NeonEnvBuilder, pg_version: PgVersion, ): - if "15" in pg_version: # SKIP v15 for now because test set only has extension built for v14 + # TODO: PG15 + PG16 extension building + if "v14" not in pg_version: # test set only has extension built for v14 return None - neon_env_builder.enable_remote_storage( - remote_storage_kind=RemoteStorageKind.MOCK_S3, - test_name="test_extension_download_after_restart", - enable_remote_extensions=True, - ) + neon_env_builder.enable_extensions_remote_storage(RemoteStorageKind.MOCK_S3) env = neon_env_builder.init_start() tenant_id, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline("test_extension_download_after_restart", tenant_id=tenant_id) assert env.ext_remote_storage is not None # satisfy mypy - assert env.remote_storage_client is not None # satisfy mypy # For MOCK_S3 we upload test files. upload_files(env) diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index be817521cd..017a38f85c 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -15,45 +15,45 @@ from fixtures.types import TimelineId # Test configuration # -# Create a table with {num_rows} rows, and perform {updates_to_perform} random -# UPDATEs on it, using {num_connections} separate connections. 
-num_connections = 10 -num_rows = 100000 -updates_to_perform = 10000 - -updates_performed = 0 - - -# Run random UPDATEs on test table -async def update_table(endpoint: Endpoint): - global updates_performed - pg_conn = await endpoint.connect_async() - - while updates_performed < updates_to_perform: - updates_performed += 1 - id = random.randrange(1, num_rows) - await pg_conn.fetchrow(f"UPDATE foo SET counter = counter + 1 WHERE id = {id}") - - -# Perform aggressive GC with 0 horizon -async def gc(env: NeonEnv, timeline: TimelineId): - pageserver_http = env.pageserver.http_client() - - loop = asyncio.get_running_loop() - - def do_gc(): - pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) - pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) - - with concurrent.futures.ThreadPoolExecutor() as pool: - while updates_performed < updates_to_perform: - await loop.run_in_executor(pool, do_gc) +# Create a table with {NUM_ROWS} rows, and perform {UPDATES_TO_PERFORM} random +# UPDATEs on it, using {NUM_CONNECTIONS} separate connections. +NUM_CONNECTIONS = 10 +NUM_ROWS = 100000 +UPDATES_TO_PERFORM = 10000 # At the same time, run UPDATEs and GC async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId): workers = [] - for _ in range(num_connections): + updates_performed = 0 + + # Perform aggressive GC with 0 horizon + async def gc(env: NeonEnv, timeline: TimelineId): + pageserver_http = env.pageserver.http_client() + nonlocal updates_performed + global UPDATES_TO_PERFORM + + loop = asyncio.get_running_loop() + + def do_gc(): + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) + pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + + with concurrent.futures.ThreadPoolExecutor() as pool: + while updates_performed < UPDATES_TO_PERFORM: + await loop.run_in_executor(pool, do_gc) + + # Run random UPDATEs on test table + async def update_table(endpoint: Endpoint): + pg_conn = await endpoint.connect_async() + nonlocal updates_performed + + while updates_performed < UPDATES_TO_PERFORM: + updates_performed += 1 + id = random.randrange(1, NUM_ROWS) + await pg_conn.fetchrow(f"UPDATE foo SET counter = counter + 1 WHERE id = {id}") + + for _ in range(NUM_CONNECTIONS): workers.append(asyncio.create_task(update_table(endpoint))) workers.append(asyncio.create_task(gc(env, timeline))) @@ -81,7 +81,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): f""" INSERT INTO foo SELECT g, 0, 'long string to consume some space' || g - FROM generate_series(1, {num_rows}) g + FROM generate_series(1, {NUM_ROWS}) g """ ) cur.execute("CREATE INDEX ON foo(id)") @@ -91,7 +91,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): cur.execute("SELECT COUNT(*), SUM(counter) FROM foo") r = cur.fetchone() assert r is not None - assert r == (num_rows, updates_to_perform) + assert r == (NUM_ROWS, UPDATES_TO_PERFORM) # @@ -99,11 +99,9 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + num_index_uploads = 0 - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_gc_index_upload", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() tenant_id = env.initial_tenant @@ -163,5 
+161,5 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: log.info(f"{num_index_uploads} index uploads after GC iteration {i}") after = num_index_uploads - log.info(f"{after-before} new index uploads during test") + log.info(f"{after - before} new index uploads during test") assert after - before < 5 diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 68086d3f84..d357bd0ee4 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -19,6 +19,7 @@ from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, ) +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import subprocess_capture @@ -80,7 +81,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build timeline = TimelineId.generate() # Set up pageserver for import - neon_env_builder.enable_local_fs_remote_storage() + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -163,7 +164,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_local_fs_remote_storage() + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() # FIXME: Is this expected? @@ -185,7 +186,7 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu # @pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build") @pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2255") def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_local_fs_remote_storage() + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() timeline = env.neon_cli.create_branch("test_import_from_pageserver_multisegment") @@ -270,7 +271,7 @@ def _import( env.endpoints.stop_all() env.pageserver.stop() - dir_to_clear = Path(env.repo_dir) / "tenants" + dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) diff --git a/test_runner/regress/test_large_schema.py b/test_runner/regress/test_large_schema.py index 72bf32fcd3..b6ac1aa41f 100644 --- a/test_runner/regress/test_large_schema.py +++ b/test_runner/regress/test_large_schema.py @@ -75,7 +75,7 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): # Check layer file sizes timeline_path = "{}/tenants/{}/timelines/{}/".format( - env.repo_dir, env.initial_tenant, env.initial_timeline + env.pageserver.workdir, env.initial_tenant, env.initial_timeline ) for filename in os.listdir(timeline_path): if filename.startswith("00000"): diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 8f627defb5..3e23a8e165 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -19,10 +19,7 @@ def test_basic_eviction( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_download_remote_layers_api", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = 
neon_env_builder.init_start( initial_tenant_conf={ @@ -58,7 +55,7 @@ def test_basic_eviction( for sk in env.safekeepers: sk.stop() - timeline_path = env.timeline_dir(tenant_id, timeline_id) + timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id) initial_local_layers = sorted( list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) ) @@ -155,10 +152,7 @@ def test_basic_eviction( def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_remote_storage( - remote_storage_kind=RemoteStorageKind.LOCAL_FS, - test_name="test_gc_of_remote_layers", - ) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() @@ -249,7 +243,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): assert by_kind["Image"] > 0 assert by_kind["Delta"] > 0 assert by_kind["InMemory"] == 0 - resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*")) + resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*")) log.info("resident layers count before eviction: %s", len(resident_layers)) log.info("evict all layers") @@ -257,7 +251,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): def ensure_resident_and_remote_size_metrics(): log.info("ensure that all the layers are gone") - resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*")) + resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*")) # we have disabled all background loops, so, this should hold assert len(resident_layers) == 0 diff --git a/test_runner/regress/test_layer_writers_fail.py b/test_runner/regress/test_layer_writers_fail.py index 5ffc12b5b3..7298635abe 100644 --- a/test_runner/regress/test_layer_writers_fail.py +++ b/test_runner/regress/test_layer_writers_fail.py @@ -38,7 +38,7 @@ def test_image_layer_writer_fail_before_finish(neon_simple_env: NeonEnv): new_temp_layer_files = list( filter( lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX), - [path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()], + [path for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir()], ) ) @@ -84,7 +84,7 @@ def test_delta_layer_writer_fail_before_finish(neon_simple_env: NeonEnv): new_temp_layer_files = list( filter( lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX), - [path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()], + [path for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir()], ) ) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py new file mode 100644 index 0000000000..5c68a63d06 --- /dev/null +++ b/test_runner/regress/test_lfc_resize.py @@ -0,0 +1,44 @@ +import threading +import time + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, PgBin + + +# +# Test resizing the local file cache (LFC) while a pgbench workload is running +# +@pytest.mark.timeout(600) +def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): + env = neon_simple_env + env.neon_cli.create_branch("test_lfc_resize", "empty") + endpoint = env.endpoints.create_start( + "test_lfc_resize", + config_lines=[ + "neon.file_cache_path='file.cache'", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + ], + ) + n_resize = 10 + scale = 10 + log.info("postgres is running on 'test_lfc_resize' branch") + + def run_pgbench(connstr: str): + log.info(f"Start a pgbench workload on pg
{connstr}") + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", "-c4", f"-T{n_resize}", "-Mprepared", connstr]) + + thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) + thread.start() + + conn = endpoint.connect() + cur = conn.cursor() + + for i in range(n_resize): + cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'") + cur.execute("select pg_reload_conf()") + time.sleep(1) + + thread.join() diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py deleted file mode 100644 index 3f4b42707a..0000000000 --- a/test_runner/regress/test_metric_collection.py +++ /dev/null @@ -1,268 +0,0 @@ -# -# Test for collecting metrics from pageserver and proxy. -# Use mock HTTP server to receive metrics and verify that they look sane. -# - -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - PSQL, - NeonEnvBuilder, - NeonProxy, - VanillaPostgres, - wait_for_last_flush_lsn, -) -from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import TenantId -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.request import Request -from werkzeug.wrappers.response import Response - -# ============================================================================== -# Storage metrics tests -# ============================================================================== - -initial_tenant = TenantId.generate() -remote_uploaded = 0 -checks = { - "written_size": lambda value: value > 0, - "resident_size": lambda value: value >= 0, - # >= 0 check here is to avoid race condition when we receive metrics before - # remote_uploaded is updated - "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0, - # logical size may lag behind the actual size, so allow 0 here - "timeline_logical_size": lambda value: value >= 0, -} - -metric_kinds_checked = set([]) - - -# -# verify that metrics look minilally sane -# -def metrics_handler(request: Request) -> Response: - if request.json is None: - return Response(status=400) - - events = request.json["events"] - log.info("received events:") - log.info(events) - - for event in events: - assert event["tenant_id"] == str( - initial_tenant - ), "Expecting metrics only from the initial tenant" - metric_name = event["metric"] - - check = checks.get(metric_name) - # calm down mypy - if check is not None: - assert check(event["value"]), f"{metric_name} isn't valid" - global metric_kinds_checked - metric_kinds_checked.add(metric_name) - - return Response(status=200) - - -@pytest.mark.parametrize( - "remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS] -) -def test_metric_collection( - httpserver: HTTPServer, - neon_env_builder: NeonEnvBuilder, - httpserver_listen_address, - remote_storage_kind: RemoteStorageKind, -): - (host, port) = httpserver_listen_address - metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - # Require collecting metrics frequently, since we change - # the timeline and want something to be logged about it. 
- # - # Disable time-based pitr, we will use the manual GC calls - # to trigger remote storage operations in a controlled way - neon_env_builder.pageserver_config_override = ( - f""" - metric_collection_interval="1s" - metric_collection_endpoint="{metric_collection_endpoint}" - """ - + "tenant_config={pitr_interval = '0 sec'}" - ) - - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_metric_collection", - ) - - log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}") - - # Set initial tenant of the test, that we expect the logs from - global initial_tenant - initial_tenant = neon_env_builder.initial_tenant - # mock http server that returns OK for the metrics - httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler( - metrics_handler - ) - - # spin up neon, after http server is ready - env = neon_env_builder.init_start() - # Order of fixtures shutdown is not specified, and if http server gets down - # before pageserver, pageserver log might contain such errors in the end. - env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") - tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_metric_collection") - endpoint = env.endpoints.create_start("test_metric_collection") - - pg_conn = endpoint.connect() - cur = pg_conn.cursor() - - cur.execute("CREATE TABLE foo (id int, counter int, t text)") - cur.execute( - """ - INSERT INTO foo - SELECT g, 0, 'long string to consume some space' || g - FROM generate_series(1, 100000) g - """ - ) - - # Helper function that gets the number of given kind of remote ops from the metrics - def get_num_remote_ops(file_kind: str, op_kind: str) -> int: - ps_metrics = env.pageserver.http_client().get_metrics() - total = 0.0 - for sample in ps_metrics.query_all( - name="pageserver_remote_operation_seconds_count", - filter={ - "file_kind": str(file_kind), - "op_kind": str(op_kind), - }, - ): - total += sample[2] - return int(total) - - # upload some data to remote storage - if remote_storage_kind == RemoteStorageKind.LOCAL_FS: - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - pageserver_http = env.pageserver.http_client() - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - pageserver_http.timeline_gc(tenant_id, timeline_id, 10000) - global remote_uploaded - remote_uploaded = get_num_remote_ops("index", "upload") - assert remote_uploaded > 0 - - # wait longer than collecting interval and check that all requests are served - time.sleep(3) - httpserver.check() - global metric_kinds_checked, checks - expected_checks = set(checks.keys()) - assert len(metric_kinds_checked) == len( - checks - ), f"Expected to receive and check all kind of metrics, but {expected_checks - metric_kinds_checked} got uncovered" - - -# ============================================================================== -# Proxy metrics tests -# ============================================================================== - - -def proxy_metrics_handler(request: Request) -> Response: - if request.json is None: - return Response(status=400) - - events = request.json["events"] - log.info("received events:") - log.info(events) - - # perform basic sanity checks - for event in events: - assert event["metric"] == "proxy_io_bytes_per_client" - assert event["endpoint_id"] == "test_endpoint_id" - assert event["value"] >= 0 - assert event["stop_time"] >= event["start_time"] - - return Response(status=200) - - 
-@pytest.fixture(scope="function") -def proxy_with_metric_collector( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes through link auth and has metric collection enabled.""" - - http_port = port_distributor.get_port() - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - - (host, port) = httpserver_listen_address - metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - metric_collection_interval = "5s" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - metric_collection_endpoint=metric_collection_endpoint, - metric_collection_interval=metric_collection_interval, - auth_backend=NeonProxy.Link(), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_metric_collection( - httpserver: HTTPServer, - proxy_with_metric_collector: NeonProxy, - vanilla_pg: VanillaPostgres, -): - # mock http server that returns OK for the metrics - httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler( - proxy_metrics_handler - ) - - # do something to generate load to generate metrics - # sleep for 5 seconds to give metric collector time to collect metrics - psql = await PSQL( - host=proxy_with_metric_collector.host, port=proxy_with_metric_collector.proxy_port - ).run( - "create table tbl as select * from generate_series(0,1000); select pg_sleep(5); select 42" - ) - - base_uri = proxy_with_metric_collector.link_auth_uri - link = await NeonProxy.find_auth_link(base_uri, psql) - - psql_session_id = NeonProxy.get_session_id(base_uri, link) - await NeonProxy.activate_link_auth(vanilla_pg, proxy_with_metric_collector, psql_session_id) - - assert psql.stdout is not None - out = (await psql.stdout.read()).decode("utf-8").strip() - assert out == "42" - - # do something to generate load to generate metrics - # sleep for 5 seconds to give metric collector time to collect metrics - psql = await PSQL( - host=proxy_with_metric_collector.host, port=proxy_with_metric_collector.proxy_port - ).run("insert into tbl select * from generate_series(0,1000); select pg_sleep(5); select 42") - - link = await NeonProxy.find_auth_link(base_uri, psql) - psql_session_id = NeonProxy.get_session_id(base_uri, link) - await NeonProxy.activate_link_auth( - vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=False - ) - - assert psql.stdout is not None - out = (await psql.stdout.read()).decode("utf-8").strip() - assert out == "42" - - httpserver.check() diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 9d24594cb6..1b3984583a 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -124,10 +124,14 @@ def test_cli_ipv4_listeners(neon_env_builder: NeonEnvBuilder): def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): + """ + Basic start/stop with default single-instance config for + safekeeper and pageserver + """ env = neon_env_builder.init_start() # Stop default ps/sk - env.neon_cli.pageserver_stop() + env.neon_cli.pageserver_stop(env.pageserver.id) env.neon_cli.safekeeper_stop() # Default start @@ -139,6 +143,38 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): res.check_returncode() 
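+# With more than one pageserver configured, the single-instance env.pageserver shortcut is expected to raise an AssertionError, so the multi-instance test below addresses pageservers (and safekeepers) by explicit id.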
+def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): + """ + Basic start/stop with explicitly configured counts of pageserver + and safekeeper + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.num_safekeepers = 2 + env = neon_env_builder.init_start() + + env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID) + env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 1) + + # Addressing a nonexistent ID throws + with pytest.raises(RuntimeError): + env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 100) + + # Using the single-pageserver shortcut property throws when there are multiple pageservers + with pytest.raises(AssertionError): + _drop = env.pageserver + + env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 1) + env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2) + + # Default start + res = env.neon_cli.raw_cli(["start"]) + res.check_returncode() + + # Default stop + res = env.neon_cli.raw_cli(["stop"]) + res.check_returncode() + + @skip_on_postgres(PgVersion.V14, reason="does not use postgres") @pytest.mark.skipif( os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works" diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index a4e86e0519..a38a517100 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -3,7 +3,6 @@ import time from collections import defaultdict -from pathlib import Path from typing import Any, DefaultDict, Dict, Tuple import pytest @@ -52,10 +51,7 @@ def test_ondemand_download_large_rel( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_ondemand_download_large_rel", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) # thinking about using a shared environment? the test assumes that global # metrics are for single tenant. @@ -118,7 +114,7 @@ def test_ondemand_download_large_rel( env.pageserver.stop() # remove all the layer files - for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"): + for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"): log.info(f"unlinking layer {layer}") layer.unlink() @@ -155,10 +151,7 @@ def test_ondemand_download_timetravel( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_ondemand_download_timetravel", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) # thinking about using a shared environment? the test assumes that global # metrics are for single tenant. @@ -243,7 +236,7 @@ def test_ondemand_download_timetravel( env.pageserver.stop() # remove all the layer files - for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"): + for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"): log.info(f"unlinking layer {layer}") layer.unlink() @@ -307,6 +300,7 @@ def test_ondemand_download_timetravel( # they are present only in the remote storage, only locally, or both. # It should not change. 
assert filled_current_physical == get_api_current_physical_size() + endpoint_old.stop() # @@ -317,10 +311,7 @@ def test_download_remote_layers_api( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_download_remote_layers_api", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) ##### First start, insert data and upload it to the remote storage env = neon_env_builder.init_start( @@ -332,8 +323,8 @@ def test_download_remote_layers_api( "compaction_period": "0s", # small checkpoint distance to create more delta layer files "checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB - "compaction_threshold": "1", - "image_creation_threshold": "1", + "compaction_threshold": "999999", + "image_creation_threshold": "999999", "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB } ) @@ -366,8 +357,20 @@ def test_download_remote_layers_api( tenant_id, timeline_id, "pageserver_resident_physical_size" ) + # Shut down safekeepers before starting the pageserver. + # If we don't, they might stream us more WAL. + for sk in env.safekeepers: + sk.stop() + + # it is sad we cannot do a flush inmem layer without compaction, but + # working around with very high layer0 count and image layer creation + # threshold + client.timeline_checkpoint(tenant_id, timeline_id) + + wait_for_upload_queue_empty(client, tenant_id, timeline_id) + filled_current_physical = get_api_current_physical_size() - log.info(filled_current_physical) + log.info(f"filled_current_physical: {filled_current_physical}") filled_size = get_resident_physical_size() log.info(f"filled_size: {filled_size}") assert filled_current_physical == filled_size, "we don't yet do layer eviction" @@ -375,18 +378,10 @@ def test_download_remote_layers_api( env.pageserver.stop() # remove all the layer files - # XXX only delete some of the layer files, to show that it really just downloads all the layers - for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"): + for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"): log.info(f"unlinking layer {layer.name}") layer.unlink() - # Shut down safekeepers before starting the pageserver. - # If we don't, the tenant's walreceiver handler will trigger the - # the logical size computation task, and that downloads layes, - # which makes our assertions on size fail. 
- for sk in env.safekeepers: - sk.stop(immediate=True) - ##### Second start, restore the data and ensure it's the same env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"}) env.pageserver.allowed_errors.extend( @@ -399,23 +394,22 @@ def test_download_remote_layers_api( wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) ###### Phase 1: exercise download error code path + + this_time = get_api_current_physical_size() assert ( - filled_current_physical == get_api_current_physical_size() + filled_current_physical == this_time ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote" + post_unlink_size = get_resident_physical_size() log.info(f"post_unlink_size: {post_unlink_size}") assert ( post_unlink_size < filled_size ), "we just deleted layers and didn't cause anything to re-download them yet" - assert filled_size - post_unlink_size > 5 * ( - 1024**2 - ), "we may be downloading some layers as part of tenant activation" # issue downloads that we know will fail info = client.timeline_download_remote_layers( tenant_id, timeline_id, - # allow some concurrency to unveil potential concurrency bugs max_concurrent_downloads=10, errors_ok=True, at_least_one_download=False, @@ -424,9 +418,9 @@ def test_download_remote_layers_api( assert info["state"] == "Completed" assert info["total_layer_count"] > 0 assert info["successful_download_count"] == 0 - assert ( - info["failed_download_count"] > 0 - ) # can't assert == total_layer_count because attach + tenant status downloads some layers + # can't assert == total_layer_count because timeline_detail also tries to + # download layers for logical size, but this might not always hold. + assert info["failed_download_count"] > 0 assert ( info["total_layer_count"] == info["successful_download_count"] + info["failed_download_count"] @@ -435,7 +429,6 @@ def test_download_remote_layers_api( assert ( get_resident_physical_size() == post_unlink_size ), "didn't download anything new due to failpoint" - # would be nice to assert that the layers in the layer map are still RemoteLayer ##### Retry, this time without failpoints client.configure_failpoints(("remote-storage-download-pre-rename", "off")) @@ -479,10 +472,7 @@ def test_compaction_downloads_on_demand_without_image_creation( """ Create a few layers, then evict, then make sure compaction runs successfully. """ - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_compaction_downloads_on_demand_without_image_creation", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) conf = { # Disable background GC & compaction @@ -567,10 +557,7 @@ def test_compaction_downloads_on_demand_with_image_creation( Due to current implementation, this will make image creation on-demand download layers, but we cannot really directly test for it. 
""" - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_compaction_downloads_on_demand", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) conf = { # Disable background GC & compaction @@ -668,10 +655,7 @@ def test_ondemand_download_failure_to_replace( See: https://github.com/neondatabase/neon/issues/3533 """ - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_ondemand_download_failure_to_replace", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) # disable gc and compaction via default tenant config because config is lost while detaching # so that compaction will not be the one to download the layer but the http handler is diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 28732872df..f5bcfd52f9 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -17,13 +17,13 @@ from fixtures.utils import wait_until def test_pageserver_init_node_id( neon_simple_env: NeonEnv, neon_binpath: Path, pg_distrib_dir: Path ): - repo_dir = neon_simple_env.repo_dir - pageserver_config = repo_dir / "pageserver.toml" + workdir = neon_simple_env.pageserver.workdir + pageserver_config = workdir / "pageserver.toml" pageserver_bin = neon_binpath / "pageserver" def run_pageserver(args): return subprocess.run( - [str(pageserver_bin), "-D", str(repo_dir), *args], + [str(pageserver_bin), "-D", str(workdir), *args], check=False, universal_newlines=True, stdout=subprocess.PIPE, diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py new file mode 100644 index 0000000000..dae39d2752 --- /dev/null +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -0,0 +1,481 @@ +import json +import time +from dataclasses import dataclass +from pathlib import Path +from queue import SimpleQueue +from typing import Any, Dict, Set + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + wait_for_last_flush_lsn, +) +from fixtures.remote_storage import RemoteStorageKind +from fixtures.types import TenantId, TimelineId +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + + +@pytest.mark.parametrize( + "remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS] +) +def test_metric_collection( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, + remote_storage_kind: RemoteStorageKind, +): + (host, port) = httpserver_listen_address + metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" + + # this should be Union[str, Tuple[List[Any], bool]], but it will make unpacking much more verbose + uploads: SimpleQueue[Any] = SimpleQueue() + + def metrics_handler(request: Request) -> Response: + if request.json is None: + return Response(status=400) + + events = request.json["events"] + is_last = request.headers["pageserver-metrics-last-upload-in-batch"] + assert is_last in ["true", "false"] + uploads.put((events, is_last == "true")) + return Response(status=200) + + # Require collecting metrics frequently, since we change + # the timeline and want something to be logged about it. 
+ # + # Disable time-based pitr, we will use the manual GC calls + # to trigger remote storage operations in a controlled way + neon_env_builder.pageserver_config_override = f""" + metric_collection_interval="1s" + metric_collection_endpoint="{metric_collection_endpoint}" + cached_metric_collection_interval="0s" + synthetic_size_calculation_interval="3s" + """ + + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}") + + # mock http server that returns OK for the metrics + httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler( + metrics_handler + ) + + # spin up neon, after http server is ready + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) + # httpserver is shut down before pageserver during passing run + env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE TABLE foo (id int, counter int, t text)") + cur.execute( + """ + INSERT INTO foo + SELECT g, 0, 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Helper function that gets the number of given kind of remote ops from the metrics + def get_num_remote_ops(file_kind: str, op_kind: str) -> int: + ps_metrics = env.pageserver.http_client().get_metrics() + total = 0.0 + for sample in ps_metrics.query_all( + name="pageserver_remote_operation_seconds_count", + filter={ + "file_kind": str(file_kind), + "op_kind": str(op_kind), + }, + ): + total += sample[2] + return int(total) + + remote_uploaded = 0 + + # upload some data to remote storage + if remote_storage_kind == RemoteStorageKind.LOCAL_FS: + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + pageserver_http = env.pageserver.http_client() + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + pageserver_http.timeline_gc(tenant_id, timeline_id, 10000) + + remote_uploaded = get_num_remote_ops("index", "upload") + assert remote_uploaded > 0 + + # we expect uploads at 1Hz, on busy runners this could be too optimistic, + # so give 5s we only want to get the following upload after "ready" value. 
+ timeout = 5 + + # these strings in the upload queue allow synchronizing with the uploads + # and the main test execution + uploads.put("ready") + + # note that this verifier graph should live across restarts as long as the + # cache file lives + v = MetricsVerifier() + + while True: + events = uploads.get(timeout=timeout) + + if events == "ready": + (events, is_last) = uploads.get(timeout=timeout) + v.ingest(events, is_last) + break + else: + (events, is_last) = events + v.ingest(events, is_last) + + if "synthetic_storage_size" not in v.accepted_event_names(): + log.info("waiting for synthetic storage size to be calculated and uploaded...") + + rounds = 0 + while "synthetic_storage_size" not in v.accepted_event_names(): + (events, is_last) = uploads.get(timeout=timeout) + v.ingest(events, is_last) + rounds += 1 + assert rounds < 10, "did not get synthetic_storage_size in 10 uploads" + # once we have it in verifiers, it will assert that future batches will contain it + + env.pageserver.stop() + time.sleep(1) + uploads.put("ready") + env.pageserver.start() + + while True: + events = uploads.get(timeout=timeout) + + if events == "ready": + (events, is_last) = uploads.get(timeout=timeout * 3) + v.ingest(events, is_last) + (events, is_last) = uploads.get(timeout=timeout) + v.ingest(events, is_last) + break + else: + (events, is_last) = events + v.ingest(events, is_last) + + httpserver.check() + + +def test_metric_collection_cleans_up_tempfile( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, +): + (host, port) = httpserver_listen_address + metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" + + # this should be Union[str, Tuple[List[Any], bool]], but it will make unpacking much more verbose + uploads: SimpleQueue[Any] = SimpleQueue() + + def metrics_handler(request: Request) -> Response: + if request.json is None: + return Response(status=400) + + events = request.json["events"] + is_last = request.headers["pageserver-metrics-last-upload-in-batch"] + assert is_last in ["true", "false"] + uploads.put((events, is_last == "true")) + return Response(status=200) + + # Require collecting metrics frequently, since we change + # the timeline and want something to be logged about it. 
+ # + # Disable time-based pitr, we will use the manual GC calls + # to trigger remote storage operations in a controlled way + neon_env_builder.pageserver_config_override = f""" + metric_collection_interval="1s" + metric_collection_endpoint="{metric_collection_endpoint}" + cached_metric_collection_interval="0s" + synthetic_size_calculation_interval="3s" + """ + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + # mock http server that returns OK for the metrics + httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler( + metrics_handler + ) + + # spin up neon, after http server is ready + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) + pageserver_http = env.pageserver.http_client() + + # httpserver is shut down before pageserver during passing run + env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE TABLE foo (id int, counter int, t text)") + cur.execute( + """ + INSERT INTO foo + SELECT g, 0, 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + # we expect uploads at 1Hz, on busy runners this could be too optimistic, + # so give 5s we only want to get the following upload after "ready" value. + timeout = 5 + + # these strings in the upload queue allow synchronizing with the uploads + # and the main test execution + uploads.put("ready") + + while True: + events = uploads.get(timeout=timeout) + + if events == "ready": + (events, _) = uploads.get(timeout=timeout) + break + + # should really configure an env? + pageserver_http.configure_failpoints(("before-persist-last-metrics-collected", "exit")) + + time.sleep(3) + + env.pageserver.stop() + + initially = iterate_pageserver_workdir(env.pageserver.workdir, "last_consumption_metrics.json") + + assert ( + len(initially.matching) == 2 + ), f"expecting actual file and tempfile, but not found: {initially.matching}" + + uploads.put("ready") + env.pageserver.start() + + while True: + events = uploads.get(timeout=timeout * 3) + + if events == "ready": + (events, _) = uploads.get(timeout=timeout) + break + + env.pageserver.stop() + + later = iterate_pageserver_workdir(env.pageserver.workdir, "last_consumption_metrics.json") + + # it is possible we shutdown the pageserver right at the correct time, so the old tempfile + # is gone, but we also have a new one. 
+ only = set(["last_consumption_metrics.json"]) + assert ( + initially.matching.intersection(later.matching) == only + ), "only initial tempfile should have been removed" + assert initially.other.issuperset(later.other), "no other files should have been removed" + + +@dataclass +class PrefixPartitionedFiles: + matching: Set[str] + other: Set[str] + + +def iterate_pageserver_workdir(path: Path, prefix: str) -> PrefixPartitionedFiles: + """ + Iterates the files in the workdir, returns two sets: + - files with the prefix + - files without the prefix + """ + + matching = set() + other = set() + for entry in path.iterdir(): + if not entry.is_file(): + continue + + if not entry.name.startswith(prefix): + other.add(entry.name) + else: + matching.add(entry.name) + + return PrefixPartitionedFiles(matching, other) + + +class MetricsVerifier: + """ + A graph of per tenant per timeline verifiers, allowing one for each + metric + """ + + def __init__(self): + self.tenants: Dict[TenantId, TenantMetricsVerifier] = {} + pass + + def ingest(self, events, is_last): + stringified = json.dumps(events, indent=2) + log.info(f"ingesting: {stringified}") + for event in events: + id = TenantId(event["tenant_id"]) + if id not in self.tenants: + self.tenants[id] = TenantMetricsVerifier(id) + + self.tenants[id].ingest(event) + + if is_last: + for t in self.tenants.values(): + t.post_batch() + + def accepted_event_names(self) -> Set[str]: + names: Set[str] = set() + for t in self.tenants.values(): + names = names.union(t.accepted_event_names()) + return names + + +class TenantMetricsVerifier: + def __init__(self, id: TenantId): + self.id = id + self.timelines: Dict[TimelineId, TimelineMetricsVerifier] = {} + self.state: Dict[str, Any] = {} + + def ingest(self, event): + assert TenantId(event["tenant_id"]) == self.id + + if "timeline_id" in event: + id = TimelineId(event["timeline_id"]) + if id not in self.timelines: + self.timelines[id] = TimelineMetricsVerifier(self.id, id) + + self.timelines[id].ingest(event) + else: + name = event["metric"] + if name not in self.state: + self.state[name] = PER_METRIC_VERIFIERS[name]() + self.state[name].ingest(event, self) + + def post_batch(self): + for v in self.state.values(): + v.post_batch(self) + + for tl in self.timelines.values(): + tl.post_batch(self) + + def accepted_event_names(self) -> Set[str]: + names = set(self.state.keys()) + for t in self.timelines.values(): + names = names.union(t.accepted_event_names()) + return names + + +class TimelineMetricsVerifier: + def __init__(self, tenant_id: TenantId, timeline_id: TimelineId): + self.id = timeline_id + self.state: Dict[str, Any] = {} + + def ingest(self, event): + name = event["metric"] + if name not in self.state: + self.state[name] = PER_METRIC_VERIFIERS[name]() + self.state[name].ingest(event, self) + + def post_batch(self, parent): + for v in self.state.values(): + v.post_batch(self) + + def accepted_event_names(self) -> Set[str]: + return set(self.state.keys()) + + +class CannotVerifyAnything: + """We can only assert types, but rust already has types, so no need.""" + + def __init__(self): + pass + + def ingest(self, event, parent): + pass + + def post_batch(self, parent): + pass + + +class WrittenDataVerifier: + def __init__(self): + self.values = [] + pass + + def ingest(self, event, parent): + self.values.append(event["value"]) + + def post_batch(self, parent): + pass + + +class WrittenDataDeltaVerifier: + def __init__(self): + self.value = None + self.sum = 0 + self.timerange = None + pass + + def ingest(self,
event, parent): + assert event["type"] == "incremental" + self.value = event["value"] + self.sum += event["value"] + start = event["start_time"] + stop = event["stop_time"] + timerange = (start, stop) + if self.timerange is not None: + # this holds across restarts + assert self.timerange[1] == timerange[0], "time ranges should be continuous" + self.timerange = timerange + + def post_batch(self, parent): + absolute = parent.state["written_size"] + if len(absolute.values) == 1: + # in tests this comes up as initdb execution, so we can have 0 or + # about 30MB on the first event. it is not consistent. + assert self.value is not None + else: + assert self.value == absolute.values[-1] - absolute.values[-2] + # sounds like this should hold, but it will not for branches -- probably related to timing + # assert self.sum == absolute.latest + + +class SyntheticSizeVerifier: + def __init__(self): + self.prev = None + self.value = None + pass + + def ingest(self, event, parent): + assert isinstance(parent, TenantMetricsVerifier) + assert event["type"] == "absolute" + value = event["value"] + self.value = value + + def post_batch(self, parent): + if self.prev is not None: + # this is assuming no one goes and deletes the cache file + assert ( + self.value is not None + ), "after calculating first synthetic size, cached or more recent should be sent" + self.prev = self.value + self.value = None + + +PER_METRIC_VERIFIERS = { + "remote_storage_size": CannotVerifyAnything, + "resident_size": CannotVerifyAnything, + "written_size": WrittenDataVerifier, + "written_data_bytes_delta": WrittenDataDeltaVerifier, + "timeline_logical_size": CannotVerifyAnything, + "synthetic_storage_size": SyntheticSizeVerifier, +} diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 1e41ebd15b..2965a354bd 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -3,11 +3,17 @@ from contextlib import closing import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.remote_storage import s3_storage # Test restarting page server, while safekeeper and compute node keep # running. -def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("generations", [True, False]) +def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool): + neon_env_builder.enable_generations = generations + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start() env.neon_cli.create_branch("test_pageserver_restart") @@ -109,6 +115,9 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # safekeeper and compute node keep running. @pytest.mark.timeout(540) def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start() # Use a tiny checkpoint distance, to create a lot of layers quickly.
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 598a1bd084..c542ab05ae 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -346,23 +346,13 @@ def test_sql_over_http_pool(static_proxy: NeonProxy): static_proxy.safe_psql("create user http_auth with password 'http' superuser") def get_pid(status: int, pw: str) -> Any: - connstr = ( - f"postgresql://http_auth:{pw}@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" + return static_proxy.http_query( + "SELECT pid FROM pg_stat_activity WHERE state = 'active'", + [], + user="http_auth", + password=pw, + expected_code=status, ) - response = requests.post( - f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", - data=json.dumps( - {"query": "SELECT pid FROM pg_stat_activity WHERE state = 'active'", "params": []} - ), - headers={ - "Content-Type": "application/sql", - "Neon-Connection-String": connstr, - "Neon-Pool-Opt-In": "true", - }, - verify=str(static_proxy.test_output_dir / "proxy.crt"), - ) - assert response.status_code == status - return response.json() pid1 = get_pid(200, "http")["rows"][0]["pid"] @@ -387,3 +377,23 @@ def test_sql_over_http_pool(static_proxy: NeonProxy): # old password should not work res = get_pid(400, "http") assert "password authentication failed for user" in res["message"] + + +# Beginning a transaction should not impact the next query, +# which might come from a completely different client. +@pytest.mark.xfail(reason="not implemented") +def test_http_pool_begin(static_proxy: NeonProxy): + static_proxy.safe_psql("create user http_auth with password 'http' superuser") + + def query(status: int, query: str, *args) -> Any: + static_proxy.http_query( + query, + args, + user="http_auth", + password="http", + expected_code=status, + ) + + query(200, "BEGIN;") + query(400, "garbage-lol(&(&(&(&") # Intentional error to break the transaction + query(200, "SELECT 1;") # Query that should succeed regardless of the transaction diff --git a/test_runner/regress/test_proxy_metric_collection.py b/test_runner/regress/test_proxy_metric_collection.py new file mode 100644 index 0000000000..f57b47f4da --- /dev/null +++ b/test_runner/regress/test_proxy_metric_collection.py @@ -0,0 +1,113 @@ +from pathlib import Path +from typing import Iterator + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + PSQL, + NeonProxy, + VanillaPostgres, +) +from fixtures.port_distributor import PortDistributor +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + + +def proxy_metrics_handler(request: Request) -> Response: + if request.json is None: + return Response(status=400) + + events = request.json["events"] + log.info("received events:") + log.info(events) + + # perform basic sanity checks + for event in events: + assert event["metric"] == "proxy_io_bytes_per_client" + assert event["endpoint_id"] == "test_endpoint_id" + assert event["value"] >= 0 + assert event["stop_time"] >= event["start_time"] + + return Response(status=200) + + +@pytest.fixture(scope="function") +def proxy_with_metric_collector( + port_distributor: PortDistributor, + neon_binpath: Path, + httpserver_listen_address, + test_output_dir: Path, +) -> Iterator[NeonProxy]: + """Neon proxy that routes through link auth and has metric collection enabled.""" + + http_port = port_distributor.get_port() + proxy_port = port_distributor.get_port() + mgmt_port = 
port_distributor.get_port() + external_http_port = port_distributor.get_port() + + (host, port) = httpserver_listen_address + metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" + metric_collection_interval = "5s" + + with NeonProxy( + neon_binpath=neon_binpath, + test_output_dir=test_output_dir, + proxy_port=proxy_port, + http_port=http_port, + mgmt_port=mgmt_port, + external_http_port=external_http_port, + metric_collection_endpoint=metric_collection_endpoint, + metric_collection_interval=metric_collection_interval, + auth_backend=NeonProxy.Link(), + ) as proxy: + proxy.start() + yield proxy + + +@pytest.mark.asyncio +async def test_proxy_metric_collection( + httpserver: HTTPServer, + proxy_with_metric_collector: NeonProxy, + vanilla_pg: VanillaPostgres, +): + # mock http server that returns OK for the metrics + httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler( + proxy_metrics_handler + ) + + # do something to generate load to generate metrics + # sleep for 5 seconds to give metric collector time to collect metrics + psql = await PSQL( + host=proxy_with_metric_collector.host, port=proxy_with_metric_collector.proxy_port + ).run( + "create table tbl as select * from generate_series(0,1000); select pg_sleep(5); select 42" + ) + + base_uri = proxy_with_metric_collector.link_auth_uri + link = await NeonProxy.find_auth_link(base_uri, psql) + + psql_session_id = NeonProxy.get_session_id(base_uri, link) + await NeonProxy.activate_link_auth(vanilla_pg, proxy_with_metric_collector, psql_session_id) + + assert psql.stdout is not None + out = (await psql.stdout.read()).decode("utf-8").strip() + assert out == "42" + + # do something to generate load to generate metrics + # sleep for 5 seconds to give metric collector time to collect metrics + psql = await PSQL( + host=proxy_with_metric_collector.host, port=proxy_with_metric_collector.proxy_port + ).run("insert into tbl select * from generate_series(0,1000); select pg_sleep(5); select 42") + + link = await NeonProxy.find_auth_link(base_uri, psql) + psql_session_id = NeonProxy.get_session_id(base_uri, link) + await NeonProxy.activate_link_auth( + vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=False + ) + + assert psql.stdout is not None + out = (await psql.stdout.read()).decode("utf-8").strip() + assert out == "42" + + httpserver.check() diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py index cae8ca3919..e6b3ccd7ec 100644 --- a/test_runner/regress/test_read_trace.py +++ b/test_runner/regress/test_read_trace.py @@ -35,5 +35,5 @@ def test_read_request_tracing(neon_env_builder: NeonEnvBuilder): # Stop postgres so we drop the connection and flush the traces endpoint.stop() - trace_path = env.repo_dir / "traces" / str(tenant_id) / str(timeline_id) + trace_path = env.pageserver.workdir / "traces" / str(tenant_id) / str(timeline_id) assert trace_path.exists() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 0bd365efaa..f316b42d1c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -6,7 +6,6 @@ import queue import shutil import threading import time -from pathlib import Path from typing import Dict, List, Optional, Tuple import pytest @@ -52,18 +51,17 @@ from requests import ReadTimeout # # The tests are done for all types of remote storage pageserver supports. 
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +@pytest.mark.parametrize("generations", [True, False]) def test_remote_storage_backup_and_restore( - neon_env_builder: NeonEnvBuilder, - remote_storage_kind: RemoteStorageKind, + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, generations: bool ): # Use this test to check more realistic SK ids: some etcd key parsing bugs were related, # and this test needs SK to write data to pageserver, so it will be visible neon_env_builder.safekeepers_id_start = 12 - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_remote_storage_backup_and_restore", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + neon_env_builder.enable_generations = generations # Exercise retry code path by making all uploads and downloads fail for the # first time. The retries print INFO-messages to the log; we will check @@ -138,7 +136,7 @@ def test_remote_storage_backup_and_restore( env.endpoints.stop_all() env.pageserver.stop() - dir_to_clear = Path(env.repo_dir) / "tenants" + dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) @@ -155,7 +153,7 @@ def test_remote_storage_backup_and_restore( # background task to load the tenant. In that background task, # listing the remote timelines will fail because of the failpoint, # and the tenant will be marked as Broken. - client.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) tenant_info = wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) assert tenant_info["attachment_status"] == { @@ -165,7 +163,7 @@ def test_remote_storage_backup_and_restore( # Ensure that even though the tenant is broken, we can't attach it again. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"): - client.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) # Restart again, this implicitly clears the failpoint. # test_remote_failures=1 remains active, though, as it's in the pageserver config. @@ -183,7 +181,7 @@ def test_remote_storage_backup_and_restore( # Ensure that the pageserver remembers that the tenant was attaching, by # trying to attach it again. It should fail. with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"): - client.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) log.info("waiting for tenant to become active. 
this should be quick with on-demand download") wait_until_tenant_active( @@ -223,10 +221,7 @@ def test_remote_storage_upload_queue_retries( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_remote_storage_upload_queue_retries", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() @@ -357,14 +352,14 @@ def test_remote_storage_upload_queue_retries( env.pageserver.stop(immediate=True) env.endpoints.stop_all() - dir_to_clear = Path(env.repo_dir) / "tenants" + dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) env.pageserver.start() client = env.pageserver.http_client() - client.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) wait_until_tenant_active(client, tenant_id) @@ -379,10 +374,7 @@ def test_remote_timeline_client_calls_started_metric( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_remote_timeline_client_metrics", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) # thinking about using a shared environment? the test assumes that global # metrics are for single tenant. @@ -495,14 +487,14 @@ def test_remote_timeline_client_calls_started_metric( env.pageserver.stop(immediate=True) env.endpoints.stop_all() - dir_to_clear = Path(env.repo_dir) / "tenants" + dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) env.pageserver.start() client = env.pageserver.http_client() - client.tenant_attach(tenant_id) + env.pageserver.tenant_attach(tenant_id) wait_until_tenant_active(client, tenant_id) @@ -522,10 +514,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_timeline_deletion_with_files_stuck_in_upload_queue", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start( initial_tenant_conf={ @@ -543,7 +532,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( tenant_id = env.initial_tenant timeline_id = env.initial_timeline - timeline_path = env.timeline_dir(tenant_id, timeline_id) + timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id) client = env.pageserver.http_client() @@ -614,8 +603,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( assert not timeline_path.exists() # to please mypy - assert isinstance(env.remote_storage, LocalFsStorage) - remote_timeline_path = env.remote_storage.timeline_path(tenant_id, timeline_id) + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + remote_timeline_path = env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) assert not list(remote_timeline_path.iterdir()) @@ -640,10 +629,7 @@ def test_empty_branch_remote_storage_upload( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_empty_branch_remote_storage_upload", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -692,10 +678,7 @@ 
def test_empty_branch_remote_storage_upload_on_restart( — the upload should be scheduled by load, and create_timeline should await for it even though it gets 409 Conflict. """ - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_empty_branch_remote_storage_upload_on_restart", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -720,12 +703,14 @@ def test_empty_branch_remote_storage_upload_on_restart( # index upload is now hitting the failpoint, it should block the shutdown env.pageserver.stop(immediate=True) - local_metadata = env.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata" + local_metadata = ( + env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata" + ) assert local_metadata.is_file() - assert isinstance(env.remote_storage, LocalFsStorage) + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) - new_branch_on_remote_storage = env.remote_storage.timeline_path( + new_branch_on_remote_storage = env.pageserver_remote_storage.timeline_path( env.initial_tenant, new_branch_timeline_id ) assert ( @@ -790,10 +775,7 @@ def test_compaction_delete_before_upload( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_compaction_delete_before_upload", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start( initial_tenant_conf={ diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 60ec532db4..2fdcfca671 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -292,17 +292,14 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_remote_storage( - remote_storage_kind=RemoteStorageKind.LOCAL_FS, - test_name="test_creating_tenant_conf_after_attach", - ) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - assert isinstance(env.remote_storage, LocalFsStorage) + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) # tenant is created with defaults, as in without config file (tenant_id, timeline_id) = env.neon_cli.create_tenant() - config_path = env.repo_dir / "tenants" / str(tenant_id) / "config" + config_path = env.pageserver.tenant_dir(tenant_id) / "config" assert config_path.exists(), "config file is always initially created" http_client = env.pageserver.http_client() @@ -337,13 +334,10 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( neon_env_builder: NeonEnvBuilder, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=RemoteStorageKind.LOCAL_FS, - test_name="test_live_reconfig_get_evictions_low_residence_duration_metric_threshold", - ) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - assert isinstance(env.remote_storage, LocalFsStorage) + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) (tenant_id, timeline_id) = env.neon_cli.create_tenant() ps_http = env.pageserver.http_client() diff --git 
a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 448dcfaff7..c6ddb54ee6 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,7 +1,6 @@ import enum import os import shutil -from pathlib import Path import pytest from fixtures.log_helper import log @@ -41,10 +40,7 @@ def test_tenant_delete_smoke( ): neon_env_builder.pageserver_config_override = "test_remote_failures=1" - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_tenant_delete_smoke", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() @@ -93,7 +89,7 @@ def test_tenant_delete_smoke( tenant_delete_wait_completed(ps_http, tenant_id, iterations) - tenant_path = env.tenant_dir(tenant_id=tenant_id) + tenant_path = env.pageserver.tenant_dir(tenant_id) assert not tenant_path.exists() if remote_storage_kind in available_s3_storages(): @@ -177,9 +173,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( if simulate_failures: neon_env_builder.pageserver_config_override = "test_remote_failures=1" - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints" - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) @@ -192,7 +186,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # We may leave some upload tasks in the queue. They're likely deletes. # For uploads we explicitly wait with `last_flush_lsn_upload` below. 
# So by ignoring these instead of waiting for empty upload queue @@ -275,7 +269,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) - tenant_dir = env.tenant_dir(tenant_id) + tenant_dir = env.pageserver.tenant_dir(tenant_id) # Check local is empty assert not tenant_dir.exists() @@ -298,10 +292,7 @@ def test_tenant_delete_is_resumed_on_attach( remote_storage_kind: RemoteStorageKind, pg_bin: PgBin, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_deleted_tenant_ignored_on_attach", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) @@ -338,7 +329,7 @@ def test_tenant_delete_is_resumed_on_attach( # From deletion polling f".*NotFound: tenant {env.initial_tenant}.*", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*', @@ -375,7 +366,7 @@ def test_tenant_delete_is_resumed_on_attach( env.endpoints.stop_all() env.pageserver.stop() - dir_to_clear = Path(env.repo_dir) / "tenants" + dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) @@ -388,7 +379,7 @@ def test_tenant_delete_is_resumed_on_attach( wait_tenant_status_404(ps_http, tenant_id, iterations) # we shouldn've created tenant dir on disk - tenant_path = env.tenant_dir(tenant_id=tenant_id) + tenant_path = env.pageserver.tenant_dir(tenant_id) assert not tenant_path.exists() if remote_storage_kind in available_s3_storages(): diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 07b751bcca..8ccbcf551d 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -44,10 +44,7 @@ def test_tenant_reattach( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_tenant_reattach", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) # Exercise retry code path by making all uploads and downloads fail for the # first time. The retries print INFO-messages to the log; we will check @@ -122,65 +119,6 @@ def test_tenant_reattach( num_connections = 10 num_rows = 100000 -updates_to_perform = 0 - -updates_started = 0 -updates_finished = 0 - - -# Run random UPDATEs on test table. On failure, try again. 
-async def update_table(pg_conn: asyncpg.Connection): - global updates_started, updates_finished, updates_to_perform - - while updates_started < updates_to_perform or updates_to_perform == 0: - updates_started += 1 - id = random.randrange(1, num_rows) - - # Loop to retry until the UPDATE succeeds - while True: - try: - await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}") - updates_finished += 1 - if updates_finished % 1000 == 0: - log.info(f"update {updates_finished} / {updates_to_perform}") - break - except asyncpg.PostgresError as e: - # Received error from Postgres. Log it, sleep a little, and continue - log.info(f"UPDATE error: {e}") - await asyncio.sleep(0.1) - - -async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId): - global updates_started, updates_finished, updates_to_perform - - # Wait until we have performed some updates - wait_until(20, 0.5, lambda: updates_finished > 500) - - log.info("Detaching tenant") - pageserver_http.tenant_detach(tenant_id) - await asyncio.sleep(1) - log.info("Re-attaching tenant") - pageserver_http.tenant_attach(tenant_id) - log.info("Re-attach finished") - - # Continue with 5000 more updates - updates_to_perform = updates_started + 5000 - - -# async guts of test_tenant_reattach_while_bysy test -async def reattach_while_busy( - env: NeonEnv, endpoint: Endpoint, pageserver_http: PageserverHttpClient, tenant_id: TenantId -): - workers = [] - for _ in range(num_connections): - pg_conn = await endpoint.connect_async() - workers.append(asyncio.create_task(update_table(pg_conn))) - - workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id))) - await asyncio.gather(*workers) - - assert updates_finished == updates_to_perform - # Detach and re-attach tenant, while compute is busy running queries. # @@ -229,10 +167,63 @@ def test_tenant_reattach_while_busy( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_tenant_reattach_while_busy", - ) + updates_started = 0 + updates_finished = 0 + updates_to_perform = 0 + + # Run random UPDATEs on test table. On failure, try again. + async def update_table(pg_conn: asyncpg.Connection): + nonlocal updates_started, updates_finished, updates_to_perform + + while updates_started < updates_to_perform or updates_to_perform == 0: + updates_started += 1 + id = random.randrange(1, num_rows) + + # Loop to retry until the UPDATE succeeds + while True: + try: + await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}") + updates_finished += 1 + if updates_finished % 1000 == 0: + log.info(f"update {updates_finished} / {updates_to_perform}") + break + except asyncpg.PostgresError as e: + # Received error from Postgres. 
Log it, sleep a little, and continue + log.info(f"UPDATE error: {e}") + await asyncio.sleep(0.1) + + async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId): + nonlocal updates_started, updates_finished, updates_to_perform + + # Wait until we have performed some updates + wait_until(20, 0.5, lambda: updates_finished > 500) + + log.info("Detaching tenant") + pageserver_http.tenant_detach(tenant_id) + await asyncio.sleep(1) + log.info("Re-attaching tenant") + pageserver_http.tenant_attach(tenant_id) + log.info("Re-attach finished") + + # Continue with 5000 more updates + updates_to_perform = updates_started + 5000 + + # async guts of test_tenant_reattach_while_bysy test + async def reattach_while_busy( + env: NeonEnv, endpoint: Endpoint, pageserver_http: PageserverHttpClient, tenant_id: TenantId + ): + nonlocal updates_to_perform, updates_finished + workers = [] + for _ in range(num_connections): + pg_conn = await endpoint.connect_async() + workers.append(asyncio.create_task(update_table(pg_conn))) + + workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id))) + await asyncio.gather(*workers) + + assert updates_finished == updates_to_perform + + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -295,7 +286,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): ) # assert tenant exists on disk - assert (env.repo_dir / "tenants" / str(tenant_id)).exists() + assert env.pageserver.tenant_dir(tenant_id).exists() endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement @@ -338,7 +329,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): log.info("gc thread returned") # check that nothing is left on disk for deleted tenant - assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() + assert not env.pageserver.tenant_dir(tenant_id).exists() with pytest.raises( expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}" @@ -363,7 +354,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): ) # assert tenant exists on disk - assert (env.repo_dir / "tenants" / str(tenant_id)).exists() + assert env.pageserver.tenant_dir(tenant_id).exists() endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement @@ -392,7 +383,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): log.info("ignored tenant detached without error") # check that nothing is left on disk for deleted tenant - assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() + assert not env.pageserver.tenant_dir(tenant_id).exists() # assert the tenant does not exists in the Pageserver tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] @@ -419,7 +410,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): ) # assert tenant exists on disk - assert (env.repo_dir / "tenants" / str(tenant_id)).exists() + assert env.pageserver.tenant_dir(tenant_id).exists() endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement @@ -436,7 +427,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): log.info("regular tenant detached without error") # check that nothing is left on disk for deleted tenant - assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() + assert not 
env.pageserver.tenant_dir(tenant_id).exists() # assert the tenant does not exists in the Pageserver tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] @@ -451,10 +442,7 @@ def test_detach_while_attaching( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_detach_while_attaching", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() @@ -535,15 +523,12 @@ def test_detach_while_attaching( def test_ignored_tenant_reattach( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_ignored_tenant_reattach", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() ignored_tenant_id, _ = env.neon_cli.create_tenant() - tenant_dir = env.repo_dir / "tenants" / str(ignored_tenant_id) + tenant_dir = env.pageserver.tenant_dir(ignored_tenant_id) tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] tenants_before_ignore.sort() timelines_before_ignore = [ @@ -607,10 +592,7 @@ def test_ignored_tenant_reattach( def test_ignored_tenant_download_missing_layers( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_ignored_tenant_download_and_attach", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") @@ -637,7 +619,7 @@ def test_ignored_tenant_download_missing_layers( # ignore the tenant and remove its layers pageserver_http.tenant_ignore(tenant_id) - timeline_dir = env.timeline_dir(tenant_id, timeline_id) + timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) layers_removed = False for dir_entry in timeline_dir.iterdir(): if dir_entry.name.startswith("00000"): @@ -673,10 +655,7 @@ def test_ignored_tenant_download_missing_layers( def test_ignored_tenant_stays_broken_without_metadata( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_ignored_tenant_stays_broken_without_metadata", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() env.endpoints.create_start("main") @@ -688,12 +667,12 @@ def test_ignored_tenant_stays_broken_without_metadata( # temporarily detached produces these errors in the pageserver log. env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") env.pageserver.allowed_errors.append( - f".*Tenant {tenant_id} will not become active\\. Current state: Broken.*" + f".*Tenant {tenant_id} will not become active\\. 
Current state: (Broken|Stopping).*" ) # ignore the tenant and remove its metadata pageserver_http.tenant_ignore(tenant_id) - timeline_dir = env.timeline_dir(tenant_id, timeline_id) + timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) metadata_removed = False for dir_entry in timeline_dir.iterdir(): if dir_entry.name == "metadata": @@ -717,10 +696,7 @@ def test_ignored_tenant_stays_broken_without_metadata( def test_load_attach_negatives( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_load_attach_negatives", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() env.endpoints.create_start("main") @@ -762,10 +738,7 @@ def test_ignore_while_attaching( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_ignore_while_attaching", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -866,10 +839,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_metrics_while_ignoring_broken_tenant_and_reloading", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 32ad5381b4..8be0f0449b 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -22,7 +22,11 @@ from fixtures.pageserver.utils import ( wait_tenant_status_404, ) from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import RemoteStorageKind, available_remote_storages +from fixtures.remote_storage import ( + LocalFsStorage, + RemoteStorageKind, + available_remote_storages, +) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( query_scalar, @@ -212,7 +216,7 @@ def switch_pg_to_new_pageserver( endpoint.start() - timeline_to_detach_local_path = env.timeline_dir(tenant_id, timeline_id) + timeline_to_detach_local_path = env.pageserver.timeline_dir(tenant_id, timeline_id) files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( "metadata" in files_before_detach @@ -264,7 +268,7 @@ def test_tenant_relocation( method: str, with_load: str, ): - neon_env_builder.enable_local_fs_remote_storage() + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() @@ -278,8 +282,8 @@ def test_tenant_relocation( # Needed for detach polling. 
env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") - # create folder for remote storage mock - remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage" + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + remote_storage_mock_path = env.pageserver_remote_storage.root # we use two branches to check that they are both relocated # first branch is used for load, compute for second one is used to @@ -524,10 +528,7 @@ def test_emergency_relocate_with_branches_slow_replay( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_emergency_relocate_with_branches_slow_replay", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() env.pageserver.is_testing_enabled_or_skip() @@ -560,7 +561,7 @@ def test_emergency_relocate_with_branches_slow_replay( # simpler than initializing a new one from scratch, but the effect on the single tenant # is the same. env.pageserver.stop(immediate=True) - shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id)) + shutil.rmtree(env.pageserver.tenant_dir(tenant_id)) env.pageserver.start() # This fail point will pause the WAL ingestion on the main branch, after the @@ -681,10 +682,7 @@ def test_emergency_relocate_with_branches_createdb( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_emergency_relocate_with_branches_createdb", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -711,7 +709,7 @@ def test_emergency_relocate_with_branches_createdb( # Kill the pageserver, remove the tenant directory, and restart env.pageserver.stop(immediate=True) - shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id)) + shutil.rmtree(env.pageserver.tenant_dir(tenant_id)) env.pageserver.start() # Wait before ingesting the WAL for CREATE DATABASE on the main branch. 
The original diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 25c6634108..49a6ca5a53 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -11,7 +11,10 @@ from fixtures.neon_fixtures import ( wait_for_wal_insert_lsn, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.utils import timeline_delete_wait_completed +from fixtures.pageserver.utils import ( + timeline_delete_wait_completed, + wait_until_tenant_active, +) from fixtures.pg_version import PgVersion, xfail_on_postgres from fixtures.types import Lsn, TenantId, TimelineId @@ -517,6 +520,8 @@ def test_single_branch_get_tenant_size_grows( env.pageserver.stop() env.pageserver.start() + wait_until_tenant_active(http_client, tenant_id) + size_after = http_client.tenant_size(tenant_id) size_debug = http_client.tenant_size_debug(tenant_id) size_debug_file.write(size_debug) @@ -624,6 +629,8 @@ def test_get_tenant_size_with_multiple_branches( env.pageserver.stop() env.pageserver.start() + wait_until_tenant_active(http_client, tenant_id) + # chance of compaction and gc on startup might have an effect on the # tenant_size but so far this has been reliable, even though at least gc # and tenant_size race for the same locks diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 19bc3ed37c..677c0d18e8 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -27,7 +27,7 @@ from prometheus_client.samples import Sample def test_tenant_creation_fails(neon_simple_env: NeonEnv): - tenants_dir = Path(neon_simple_env.repo_dir) / "tenants" + tenants_dir = neon_simple_env.pageserver.tenant_dir() initial_tenants = sorted( map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) ) @@ -242,10 +242,7 @@ def test_pageserver_metrics_removed_after_detach( ): """Tests that when a tenant is detached, the tenant specific metrics are not left behind""" - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_pageserver_metrics_removed_after_detach", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) neon_env_builder.num_safekeepers = 3 @@ -303,10 +300,7 @@ def test_pageserver_metrics_removed_after_detach( def test_pageserver_with_empty_tenants( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_pageserver_with_empty_tenants", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() @@ -326,10 +320,7 @@ def test_pageserver_with_empty_tenants( ) files_in_timelines_dir = sum( - 1 - for _p in Path.iterdir( - Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines) / "timelines" - ) + 1 for _p in Path.iterdir(env.pageserver.timeline_dir(tenant_with_empty_timelines)) ) assert ( files_in_timelines_dir == 0 @@ -340,7 +331,7 @@ def test_pageserver_with_empty_tenants( env.pageserver.stop() tenant_without_timelines_dir = env.initial_tenant - shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines") + shutil.rmtree(env.pageserver.timeline_dir(tenant_without_timelines_dir)) env.pageserver.start() diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 2925f8c2da..d42e566c36 
100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -62,10 +62,7 @@ async def all_tenants_workload(env: NeonEnv, tenants_endpoints): @pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_tenants_many", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() @@ -115,10 +112,7 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem def test_tenants_attached_after_download( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="remote_storage_kind", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) data_id = 1 data_secret = "very secret secret" @@ -185,7 +179,7 @@ def test_tenants_attached_after_download( env.pageserver.stop() - timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) local_layer_deleted = False for path in Path.iterdir(timeline_dir): if path.name.startswith("00000"): @@ -230,14 +224,11 @@ def test_tenant_redownloads_truncated_file_on_startup( remote_storage_kind = RemoteStorageKind.LOCAL_FS # since we now store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it. - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_tenant_redownloads_truncated_file_on_startup", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() - assert isinstance(env.remote_storage, LocalFsStorage) + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) env.pageserver.allowed_errors.append( ".*removing local file .* because it has unexpected length.*" @@ -266,7 +257,7 @@ def test_tenant_redownloads_truncated_file_on_startup( env.endpoints.stop_all() env.pageserver.stop() - timeline_dir = env.timeline_dir(tenant_id, timeline_id) + timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) local_layer_truncated = None for path in Path.iterdir(timeline_dir): if path.name.startswith("00000"): @@ -281,7 +272,7 @@ def test_tenant_redownloads_truncated_file_on_startup( (path, expected_size) = local_layer_truncated # ensure the same size is found from the index_part.json - index_part = env.remote_storage.index_content(tenant_id, timeline_id) + index_part = env.pageserver_remote_storage.index_content(tenant_id, timeline_id) assert index_part["layer_metadata"][path.name]["file_size"] == expected_size ## Start the pageserver. 
It will notice that the file size doesn't match, and @@ -311,7 +302,9 @@ def test_tenant_redownloads_truncated_file_on_startup( assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded" # the remote side of local_layer_truncated - remote_layer_path = env.remote_storage.timeline_path(tenant_id, timeline_id) / path.name + remote_layer_path = ( + env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / path.name + ) # if the upload ever was ongoing, this check would be racy, but at least one # extra http request has been made in between so assume it's enough delay diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index a0e423e7ff..12866accc7 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -16,13 +16,12 @@ from pytest_httpserver import HTTPServer def test_threshold_based_eviction( - request, httpserver: HTTPServer, httpserver_listen_address, pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, ): - neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}") + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) # Start with metrics collection enabled, so that the eviction task # imitates its accesses. We'll use a non-existent endpoint to make it fail. @@ -33,7 +32,9 @@ def test_threshold_based_eviction( synthetic_size_calculation_interval="2s" metric_collection_endpoint="http://{host}:{port}/nonexistent" """ - metrics_refused_log_line = ".*metrics endpoint refused the sent metrics.*/nonexistent.*" + metrics_refused_log_line = ( + ".*metrics_collection:.* upload consumption_metrics (still failed|failed, will retry).*" + ) env = neon_env_builder.init_start() env.pageserver.allowed_errors.append(metrics_refused_log_line) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 916c0111f7..0e4df21d83 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -3,7 +3,6 @@ import os import queue import shutil import threading -from pathlib import Path import pytest import requests @@ -72,9 +71,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent" ) - timeline_path = ( - env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(parent_timeline_id) - ) + timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id) with pytest.raises( PageserverApiException, match="Cannot delete timeline which has child timelines" @@ -85,9 +82,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): assert exc.value.status_code == 412 - timeline_path = ( - env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id) - ) + timeline_path = env.pageserver.timeline_dir(env.initial_tenant, leaf_timeline_id) assert timeline_path.exists() # retry deletes when compaction or gc is running in pageserver @@ -120,6 +115,8 @@ def test_timeline_delete(neon_simple_env: NeonEnv): env.pageserver.stop(immediate=True) env.pageserver.start() + wait_until_tenant_active(ps_http, env.initial_tenant) + with pytest.raises( PageserverApiException, match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found", @@ -191,9 +188,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( 8. 
Retry or restart without the failpoint and check the result. """ - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints" - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start( initial_tenant_conf={ @@ -231,7 +226,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}") # It appears when we stopped flush loop during deletion and then pageserver is stopped env.pageserver.allowed_errors.append( - ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited" + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", ) # This happens when we fail before scheduling background operation. # Timeline is left in stopping state and retry tries to stop it again. @@ -328,7 +323,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ), ) - timeline_dir = env.timeline_dir(env.initial_tenant, timeline_id) + timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, timeline_id) # Check local is empty assert not timeline_dir.exists() # Check no delete mark present @@ -348,10 +343,7 @@ def test_timeline_resurrection_on_attach( Original issue: https://github.com/neondatabase/neon/issues/3560 """ - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_timeline_resurrection_on_attach", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) ##### First start, insert data and upload it to the remote storage env = neon_env_builder.init_start() @@ -411,7 +403,7 @@ def test_timeline_resurrection_on_attach( env.endpoints.stop_all() env.pageserver.stop() - dir_to_clear = Path(env.repo_dir) / "tenants" + dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) @@ -436,10 +428,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild the deletion of the local state. 
""" - neon_env_builder.enable_remote_storage( - remote_storage_kind=RemoteStorageKind.MOCK_S3, - test_name="test_timeline_delete_fail_before_local_delete", - ) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) env = neon_env_builder.init_start() @@ -449,7 +438,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ) # this happens, because the stuck timeline is visible to shutdown env.pageserver.allowed_errors.append( - ".*freeze_and_flush_on_shutdown.+: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited" + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", ) ps_http = env.pageserver.http_client() @@ -465,9 +454,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild "test_timeline_delete_fail_before_local_delete", ) - leaf_timeline_path = ( - env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id) - ) + leaf_timeline_path = env.pageserver.timeline_dir(env.initial_tenant, leaf_timeline_id) ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id) timeline_info = wait_until_timeline_state( @@ -556,10 +543,7 @@ def test_concurrent_timeline_delete_stuck_on( signalling to console that it should retry later. """ - neon_env_builder.enable_remote_storage( - remote_storage_kind=RemoteStorageKind.MOCK_S3, - test_name=f"concurrent_timeline_delete_stuck_on_{stuck_failpoint}", - ) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) env = neon_env_builder.init_start() @@ -634,10 +618,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): This tests cancel safety up to the given failpoint. 
""" - neon_env_builder.enable_remote_storage( - remote_storage_kind=RemoteStorageKind.MOCK_S3, - test_name="test_delete_timeline_client_hangup", - ) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) env = neon_env_builder.init_start() @@ -704,10 +685,7 @@ def test_timeline_delete_works_for_remote_smoke( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_timeline_delete_works_for_remote_smoke", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() @@ -780,7 +758,7 @@ def test_delete_orphaned_objects( pg_bin: PgBin, ): remote_storage_kind = RemoteStorageKind.LOCAL_FS - neon_env_builder.enable_remote_storage(remote_storage_kind, "test_delete_orphaned_objects") + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start( initial_tenant_conf={ @@ -791,7 +769,7 @@ def test_delete_orphaned_objects( } ) - assert isinstance(env.remote_storage, LocalFsStorage) + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) ps_http = env.pageserver.http_client() @@ -802,7 +780,9 @@ def test_delete_orphaned_objects( last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id) # write orphaned file that is missing from the index - remote_timeline_path = env.remote_storage.timeline_path(env.initial_tenant, timeline_id) + remote_timeline_path = env.pageserver_remote_storage.timeline_path( + env.initial_tenant, timeline_id + ) orphans = [remote_timeline_path / f"orphan_{i}" for i in range(3)] for orphan in orphans: orphan.write_text("I shouldnt be there") @@ -833,7 +813,7 @@ def test_delete_orphaned_objects( f"deleting a file not referenced from index_part.json name={orphan.stem}" ) - assert env.remote_storage.index_path(env.initial_tenant, timeline_id).exists() + assert env.pageserver_remote_storage.index_path(env.initial_tenant, timeline_id).exists() @pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) @@ -842,10 +822,7 @@ def test_timeline_delete_resumed_on_attach( remote_storage_kind: RemoteStorageKind, pg_bin: PgBin, ): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_deleted_tenant_ignored_on_attach", - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) @@ -881,7 +858,7 @@ def test_timeline_delete_resumed_on_attach( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", # Polling after attach may fail with this @@ -925,7 +902,7 @@ def test_timeline_delete_resumed_on_attach( env.endpoints.stop_all() env.pageserver.stop() - dir_to_clear = Path(env.repo_dir) / "tenants" + dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) @@ -937,7 +914,7 @@ def 
test_timeline_delete_resumed_on_attach( # delete should be resumed wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations) - tenant_path = env.timeline_dir(tenant_id=tenant_id, timeline_id=timeline_id) + tenant_path = env.pageserver.timeline_dir(tenant_id, timeline_id) assert not tenant_path.exists() if remote_storage_kind in available_s3_storages(): diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index f6e4a667a4..f856b26c6e 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -306,9 +306,7 @@ def test_timeline_physical_size_init( neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] ): if remote_storage_kind is not None: - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_timeline_physical_size_init" - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() @@ -349,9 +347,7 @@ def test_timeline_physical_size_post_checkpoint( neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] ): if remote_storage_kind is not None: - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_timeline_physical_size_init" - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() @@ -371,10 +367,13 @@ def test_timeline_physical_size_post_checkpoint( wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id) pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - assert_physical_size_invariants( - get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), - remote_storage_kind, - ) + def check(): + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), + remote_storage_kind, + ) + + wait_until(10, 1, check) @pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) @@ -382,9 +381,7 @@ def test_timeline_physical_size_post_compaction( neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] ): if remote_storage_kind is not None: - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_timeline_physical_size_init" - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) # Disable background compaction as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed @@ -437,9 +434,7 @@ def test_timeline_physical_size_post_gc( neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind] ): if remote_storage_kind is not None: - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_timeline_physical_size_init" - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed @@ -523,7 +518,7 @@ def test_timeline_size_metrics( ).value # assert that the physical size metric matches the actual physical size on disk - timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id) + timeline_path = env.pageserver.timeline_dir(env.initial_tenant, new_timeline_id) assert tl_physical_size_metric == get_timeline_dir_size(timeline_path) # Check that the logical 
size metric is sane, and matches @@ -572,9 +567,7 @@ def test_tenant_physical_size( random.seed(100) if remote_storage_kind is not None: - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_timeline_physical_size_init" - ) + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() @@ -665,7 +658,7 @@ def get_physical_size_values( ) res.api_current_physical = detail["current_physical_size"] - timeline_path = env.timeline_dir(tenant_id, timeline_id) + timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id) res.python_timelinedir_layerfiles_physical = get_timeline_dir_size(timeline_path) return res diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index d8034b31b0..bc810ceb09 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -19,18 +19,40 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # Install extension containing function needed for test cur.execute("CREATE EXTENSION neon_test_utils") - # Create a test table and freeze it to set the VM bit. + # Create a test table for a few different scenarios and freeze it to set the VM bits. cur.execute("CREATE TABLE vmtest_delete (id integer PRIMARY KEY)") cur.execute("INSERT INTO vmtest_delete VALUES (1)") cur.execute("VACUUM FREEZE vmtest_delete") - cur.execute("CREATE TABLE vmtest_update (id integer PRIMARY KEY)") - cur.execute("INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g") - cur.execute("VACUUM FREEZE vmtest_update") + cur.execute("CREATE TABLE vmtest_hot_update (id integer PRIMARY KEY, filler text)") + cur.execute("INSERT INTO vmtest_hot_update VALUES (1, 'x')") + cur.execute("VACUUM FREEZE vmtest_hot_update") + + cur.execute("CREATE TABLE vmtest_cold_update (id integer PRIMARY KEY)") + cur.execute("INSERT INTO vmtest_cold_update SELECT g FROM generate_series(1, 1000) g") + cur.execute("VACUUM FREEZE vmtest_cold_update") + + cur.execute( + "CREATE TABLE vmtest_cold_update2 (id integer PRIMARY KEY, filler text) WITH (fillfactor=100)" + ) + cur.execute("INSERT INTO vmtest_cold_update2 SELECT g, '' FROM generate_series(1, 1000) g") + cur.execute("VACUUM FREEZE vmtest_cold_update2") # DELETE and UPDATE the rows. cur.execute("DELETE FROM vmtest_delete WHERE id = 1") - cur.execute("UPDATE vmtest_update SET id = 5000 WHERE id = 1") + cur.execute("UPDATE vmtest_hot_update SET filler='x' WHERE id = 1") + cur.execute("UPDATE vmtest_cold_update SET id = 5000 WHERE id = 1") + + # Clear the VM bit on the last page with an INSERT. Then clear the VM bit on + # the page where row 1 is (block 0), by doing an UPDATE. The UPDATE is a + # cold update, and the new tuple goes to the last page, which already had + # its VM bit cleared. The point is that the UPDATE *only* clears the VM bit + # on the page containing the old tuple. We had a bug where we got the old + # and new pages mixed up, and that only shows up when one of the bits is + # cleared, but not the other one. 
+    cur.execute("INSERT INTO vmtest_cold_update2 VALUES (9999, 'x')")
+    # Clears the VM bit on the old page
+    cur.execute("UPDATE vmtest_cold_update2 SET id = 5000, filler=repeat('x', 200) WHERE id = 1")
 
     # Branch at this point, to test that later
     fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "test_vm_bit_clear")
@@ -50,9 +72,13 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
         """
     )
-    cur.execute("SELECT * FROM vmtest_delete WHERE id = 1")
+    cur.execute("SELECT id FROM vmtest_delete WHERE id = 1")
     assert cur.fetchall() == []
-    cur.execute("SELECT * FROM vmtest_update WHERE id = 1")
+    cur.execute("SELECT id FROM vmtest_hot_update WHERE id = 1")
+    assert cur.fetchall() == [(1,)]
+    cur.execute("SELECT id FROM vmtest_cold_update WHERE id = 1")
+    assert cur.fetchall() == []
+    cur.execute("SELECT id FROM vmtest_cold_update2 WHERE id = 1")
     assert cur.fetchall() == []
 
     cur.close()
@@ -77,7 +103,111 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
         """
     )
-    cur_new.execute("SELECT * FROM vmtest_delete WHERE id = 1")
+    cur_new.execute("SELECT id FROM vmtest_delete WHERE id = 1")
     assert cur_new.fetchall() == []
-    cur_new.execute("SELECT * FROM vmtest_update WHERE id = 1")
+    cur_new.execute("SELECT id FROM vmtest_hot_update WHERE id = 1")
+    assert cur_new.fetchall() == [(1,)]
+    cur_new.execute("SELECT id FROM vmtest_cold_update WHERE id = 1")
     assert cur_new.fetchall() == []
+    cur_new.execute("SELECT id FROM vmtest_cold_update2 WHERE id = 1")
+    assert cur_new.fetchall() == []
+
+
+#
+# Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK
+# record.
+#
+def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock", "empty")
+    endpoint = env.endpoints.create_start(
+        "test_vm_bit_clear_on_heap_lock",
+        config_lines=[
+            "log_autovacuum_min_duration = 0",
+            # Perform anti-wraparound vacuuming aggressively
+            "autovacuum_naptime='1 s'",
+            "autovacuum_freeze_max_age = 1000000",
+        ],
+    )
+
+    pg_conn = endpoint.connect()
+    cur = pg_conn.cursor()
+
+    # Install extension containing function needed for test
+    cur.execute("CREATE EXTENSION neon_test_utils")
+
+    cur.execute("SELECT pg_switch_wal()")
+
+    # Create a test table and freeze it to set the all-frozen VM bit on all pages.
+    cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)")
+    cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g")
+    cur.execute("VACUUM FREEZE vmtest_lock")
+
+    # Lock a row. This clears the all-frozen VM bit for that page.
+    cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE")
+
+    # Remember the XID. We will use it later to verify that we have consumed a lot of
+    # XIDs after this.
+    cur.execute("select pg_current_xact_id()")
+    locking_xid = cur.fetchall()[0][0]
+
+    # Stop and restart postgres, to clear the buffer cache.
+    #
+    # NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages
+    # in a "clean" way. Our neon extension will write a full-page image of the VM
+    # page, and we want to avoid that.
+    endpoint.stop()
+    endpoint.start()
+    pg_conn = endpoint.connect()
+    cur = pg_conn.cursor()
+
+    cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ")
+    tup = cur.fetchall()
+    xmax_before = tup[0][1]
+
+    # Consume a lot of XIDs, so that anti-wraparound autovacuum kicks
+    # in and the clog gets truncated. We set autovacuum_freeze_max_age to a very
+    # low value, so it doesn't take all that many XIDs for autovacuum to kick in.
+    for i in range(1000):
+        cur.execute(
+            """
+            CREATE TEMP TABLE othertable (i int) ON COMMIT DROP;
+            do $$
+            begin
+              for i in 1..100000 loop
+                -- Use a begin-exception block to generate a new subtransaction on each iteration
+                begin
+                  insert into othertable values (i);
+                exception when others then
+                  raise 'not expected %', sqlerrm;
+                end;
+              end loop;
+            end;
+            $$;
+            """
+        )
+        cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ")
+        tup = cur.fetchall()
+        log.info(f"tuple = {tup}")
+        xmax = tup[0][1]
+        assert xmax == xmax_before
+
+        if i % 50 == 0:
+            cur.execute("select datfrozenxid from pg_database where datname='postgres'")
+            datfrozenxid = cur.fetchall()[0][0]
+            if datfrozenxid > locking_xid:
+                break
+
+    cur.execute("select pg_current_xact_id()")
+    curr_xid = cur.fetchall()[0][0]
+    assert int(curr_xid) - int(locking_xid) >= 100000
+
+    # Now, if the VM all-frozen bit was not correctly cleared on
+    # replay, we will try to fetch the status of the XID that was
+    # already truncated away.
+    #
+    # ERROR: could not access status of transaction 1027
+    cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 for update")
+    tup = cur.fetchall()
+    log.info(f"tuple = {tup}")
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 8ca93845b2..8199f5777b 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -14,6 +14,8 @@ from pathlib import Path
 from typing import Any, List, Optional
 
 import psycopg2
+import psycopg2.errors
+import psycopg2.extras
 import pytest
 from fixtures.broker import NeonBroker
 from fixtures.log_helper import log
@@ -37,7 +39,6 @@ from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import (
     RemoteStorageKind,
-    RemoteStorageUsers,
     available_remote_storages,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -261,13 +262,13 @@ def test_restarts(neon_env_builder: NeonEnvBuilder):
         else:
             failed_node.start()
             failed_node = None
-    assert query_scalar(cur, "SELECT sum(key) FROM t") == 500500
+    assert query_scalar(cur, "SELECT sum(key) FROM t") == (n_inserts * (n_inserts + 1)) // 2
 
 
 # Test that safekeepers push their info to the broker and learn peer status from it
 def test_broker(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 3
-    neon_env_builder.enable_local_fs_remote_storage()
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     env = neon_env_builder.init_start()
     tenant_id = env.initial_tenant
@@ -313,7 +314,7 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
 def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     neon_env_builder.num_safekeepers = 2
     # to advance remote_consistent_lsn
-    neon_env_builder.enable_local_fs_remote_storage()
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
     neon_env_builder.auth_enabled = auth_enabled
     env = neon_env_builder.init_start()
@@ -436,13 +437,7 @@ def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId,
 @pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
 def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
     neon_env_builder.num_safekeepers = 3
-
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind=remote_storage_kind,
-        test_name="test_safekeepers_wal_backup",
-    )
-
-    neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER
+    neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
 
     env = neon_env_builder.init_start()
@@ -489,12 +484,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot
 def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
     neon_env_builder.num_safekeepers = 3
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind=remote_storage_kind,
-        test_name="test_s3_wal_replay",
-    )
-
-    neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER
+    neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
 
     env = neon_env_builder.init_start()
     tenant_id = env.initial_tenant
diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py
index c97c69db23..a4f03be7a0 100644
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -29,7 +29,7 @@ def test_wal_restore(
     endpoint.safe_psql("create table t as select generate_series(1,300000)")
     tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
     timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
-    env.neon_cli.pageserver_stop()
+    env.pageserver.stop()
     port = port_distributor.get_port()
     data_dir = test_output_dir / "pgsql.restored"
     with VanillaPostgres(
diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
index 4a47898935..13159efbe8 100644
--- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py
+++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
@@ -27,7 +27,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
     env.pageserver.allowed_errors.append(".*NotFound: tenant.*")
     pageserver_http = env.pageserver.http_client()
 
-    pagserver_pid = int((env.repo_dir / "pageserver.pid").read_text())
+    pagserver_pid = int((env.pageserver.workdir / "pageserver.pid").read_text())
     assert_child_processes(pagserver_pid, wal_redo_present=False, defunct_present=False)
@@ -43,7 +43,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
     tenant_id, _ = env.neon_cli.create_tenant()
 
     # assert tenant exists on disk
-    assert (env.repo_dir / "tenants" / str(tenant_id)).exists()
+    assert (env.pageserver.tenant_dir(tenant_id)).exists()
 
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -101,7 +101,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
         pytest.fail(f"could not detach tenant: {last_error}")
 
     # check that nothing is left on disk for deleted tenant
-    assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
+    assert not env.pageserver.tenant_dir(tenant_id).exists()
 
     # Pageserver schedules kill+wait of the WAL redo process to the background runtime,
     # asynchronously to tenant detach. Cut it some slack to complete kill+wait before
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 026d6b093d..74cfe3e681 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 026d6b093d49e25cec44dd04598152329ceac027
+Subproject commit 74cfe3e681836747a31fdbd47bdd14b3d81b0772
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
new file mode 160000
index 0000000000..389ce36b4b
--- /dev/null
+++ b/vendor/postgres-v16
@@ -0,0 +1 @@
+Subproject commit 389ce36b4b3da7aa654a25e1b3f10b641319a87f
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 63b72cf506..d08cb25f43 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,4 +1,5 @@
 {
-  "postgres-v15": "026d6b093d49e25cec44dd04598152329ceac027",
+  "postgres-v16": "389ce36b4b3da7aa654a25e1b3f10b641319a87f",
+  "postgres-v15": "74cfe3e681836747a31fdbd47bdd14b3d81b0772",
   "postgres-v14": "5d5cfee12783f0989a9c9fe13bb40b5585812568"
 }
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 4ec4b01f66..b2303869f2 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -14,6 +14,10 @@ publish = false
 ### BEGIN HAKARI SECTION
 [dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
+aws-config = { version = "0.56", default-features = false, features = ["credentials-sso", "rustls"] }
+aws-runtime = { version = "0.56", default-features = false, features = ["event-stream"] }
+aws-sigv4 = { version = "0.56", features = ["sign-eventstream"] }
+aws-smithy-http = { version = "0.56", default-features = false, features = ["event-stream", "rt-tokio"] }
 axum = { version = "0.6", features = ["ws"] }
 base64 = { version = "0.21", features = ["alloc"] }
 bytes = { version = "1", features = ["serde"] }
@@ -21,7 +25,6 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
-digest = { version = "0.10", features = ["mac", "std"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures = { version = "0.3" }
@@ -30,6 +33,7 @@ futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
 futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
+hex = { version = "0.4", features = ["serde"] }
 hyper = { version = "0.14", features = ["full"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
@@ -51,6 +55,7 @@ serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["raw_value"] }
 smallvec = { version = "1", default-features = false, features = ["write"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
+time = { version = "0.3", features = ["formatting", "macros", "parsing"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.24" }
 tokio-util = { version = "0.7", features = ["codec", "io"] }
@@ -59,7 +64,9 @@ toml_edit = { version = "0.19", features = ["serde"] }
 tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
+tungstenite = { version = "0.20" }
 url = { version = "2", features = ["serde"] }
+uuid = { version = "1", features = ["serde", "v4"] }
 
 [build-dependencies]
 anyhow = { version = "1", features = ["backtrace"] }