diff --git a/.cargo/config.toml b/.cargo/config.toml index 76a2ff549e..c40783bc1b 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -11,3 +11,6 @@ opt-level = 3 [profile.dev] # Turn on a small amount of optimization in Development mode. opt-level = 1 + +[alias] +build_testing = ["build", "--features", "testing"] diff --git a/.dockerignore b/.dockerignore index 4bc8e5fa13..92eb4f24de 100644 --- a/.dockerignore +++ b/.dockerignore @@ -18,3 +18,4 @@ !vendor/postgres-v15/ !workspace_hack/ !neon_local/ +!scripts/ninstall.sh diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index 5aa45164e7..731ef6639d 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -12,6 +12,9 @@ inputs: description: "Allow to skip if file doesn't exist, fail otherwise" default: false required: false + prefix: + description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" + required: false runs: using: "composite" @@ -23,18 +26,18 @@ runs: TARGET: ${{ inputs.path }} ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }} + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} run: | BUCKET=neon-github-public-dev - PREFIX=artifacts/${GITHUB_RUN_ID} FILENAME=$(basename $ARCHIVE) - S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) + S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) if [ -z "${S3_KEY}" ]; then if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then echo '::set-output name=SKIPPED::true' exit 0 else - echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} nor its version from previous attempts exist" + echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" exit 1 fi fi diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index ba81afaaff..2f58ae77ad 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -60,6 +60,7 @@ runs: --header "Authorization: Bearer ${API_KEY}" \ --data "{ \"project\": { + \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", \"platform_id\": \"aws\", \"region_id\": \"${REGION_ID}\", \"settings\": { } diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 4c18641938..cc6ab65b76 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -85,7 +85,8 @@ runs: # PLATFORM will be embedded in the perf test report # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} - export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install/v14} + export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} + export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 @@ -112,10 +113,8 @@ runs: fi if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then - if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then - mkdir -p "$PERF_REPORT_DIR" - EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" - fi + mkdir -p 
"$PERF_REPORT_DIR" + EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" fi if [[ "${{ inputs.build_type }}" == "debug" ]]; then @@ -128,7 +127,7 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" fi # Run the tests. @@ -150,11 +149,9 @@ runs: -rA $TEST_SELECTION $EXTRA_PARAMS if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then - if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then - export REPORT_FROM="$PERF_REPORT_DIR" - export REPORT_TO="$PLATFORM" - scripts/generate_and_push_perf_report.sh - fi + export REPORT_FROM="$PERF_REPORT_DIR" + export REPORT_TO="$PLATFORM" + scripts/generate_and_push_perf_report.sh fi - name: Create Allure report diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index de8df3230f..291a2cf3b0 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -7,6 +7,9 @@ inputs: path: description: "A directory or file to upload" required: true + prefix: + description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" + required: false runs: using: "composite" @@ -42,14 +45,14 @@ runs: env: SOURCE: ${{ inputs.path }} ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} run: | BUCKET=neon-github-public-dev - PREFIX=artifacts/${GITHUB_RUN_ID} FILENAME=$(basename $ARCHIVE) FILESIZE=$(du -sh ${ARCHIVE} | cut -f1) - time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} + time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${FILENAME} # Ref https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#adding-a-job-summary - echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY} + echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY} diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index b47db6a9b5..e206f9d5ba 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -58,7 +58,7 @@ creates: "/storage/pageserver/data/tenants" environment: NEON_REPO_DIR: "/storage/pageserver/data" - LD_LIBRARY_PATH: "/usr/local/lib" + LD_LIBRARY_PATH: "/usr/local/v14/lib" become: true tags: - pageserver @@ -132,7 +132,7 @@ creates: "/storage/safekeeper/data/safekeeper.id" environment: NEON_REPO_DIR: "/storage/safekeeper/data" - LD_LIBRARY_PATH: "/usr/local/lib" + LD_LIBRARY_PATH: "/usr/local/v14/lib" become: true tags: - safekeeper diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh index f44a1ca50a..a484bfb0a0 100755 --- a/.github/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -21,10 +21,14 @@ docker pull --quiet neondatabase/neon:${DOCKER_TAG} ID=$(docker create neondatabase/neon:${DOCKER_TAG}) docker cp ${ID}:/data/postgres_install.tar.gz . 
tar -xzf postgres_install.tar.gz -C neon_install +mkdir neon_install/bin/ docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ -docker cp ${ID}:/usr/local/bin/postgres neon_install/bin/ +docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/ +docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/ +docker cp ${ID}:/usr/local/v14/lib/ neon_install/v14/lib/ +docker cp ${ID}:/usr/local/v15/lib/ neon_install/v15/lib/ docker rm -vf ${ID} # store version to file (for ansible playbooks) and create binaries tarball diff --git a/.github/ansible/staging.hosts b/.github/ansible/staging.hosts index 2bb28f1972..f5accc188a 100644 --- a/.github/ansible/staging.hosts +++ b/.github/ansible/staging.hosts @@ -2,11 +2,16 @@ #zenith-us-stage-ps-1 console_region_id=27 zenith-us-stage-ps-2 console_region_id=27 zenith-us-stage-ps-3 console_region_id=27 +zenith-us-stage-ps-4 console_region_id=27 +zenith-us-stage-test-ps-1 console_region_id=28 [safekeepers] zenith-us-stage-sk-4 console_region_id=27 zenith-us-stage-sk-5 console_region_id=27 zenith-us-stage-sk-6 console_region_id=27 +zenith-us-stage-test-sk-1 console_region_id=28 +zenith-us-stage-test-sk-2 console_region_id=28 +zenith-us-stage-test-sk-3 console_region_id=28 [storage:children] pageservers diff --git a/.github/ansible/systemd/pageserver.service b/.github/ansible/systemd/pageserver.service index bb78054fa3..688c7e7b87 100644 --- a/.github/ansible/systemd/pageserver.service +++ b/.github/ansible/systemd/pageserver.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=pageserver -Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed diff --git a/.github/ansible/systemd/safekeeper.service b/.github/ansible/systemd/safekeeper.service index d5c6d00017..36af414761 100644 --- a/.github/ansible/systemd/safekeeper.service +++ b/.github/ansible/systemd/safekeeper.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=safekeeper -Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}' ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index fab0a9aa04..4d91e9fa74 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,7 +11,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '36 4 * * *' # run once a day, timezone is utc + - cron: '0 3 * * *' # run once a day, timezone is utc 
workflow_dispatch: # adds ability to run this manually inputs: @@ -19,8 +19,12 @@ on: description: 'Environment to run remote tests on (dev or staging)' required: false region_id: - description: 'Use a particular region. If empty the default one will be used' - false: true + description: 'Use a particular region. If not set the default region will be used' + required: false + save_perf_report: + type: boolean + description: 'Publish perf report or not. If not set, the report is published only for the main branch' + required: false defaults: run: @@ -42,7 +46,8 @@ jobs: runs-on: [self-hosted, zenith-benchmarker] env: - POSTGRES_DISTRIB_DIR: "/usr/pgsql-14" + POSTGRES_DISTRIB_DIR: /tmp/pg_install + DEFAULT_PG_VERSION: 14 steps: - name: Checkout zenith repo @@ -67,7 +72,7 @@ jobs: echo Poetry poetry --version echo Pgbench - $POSTGRES_DISTRIB_DIR/bin/pgbench --version + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - name: Create Neon Project id: create-neon-project @@ -136,17 +141,19 @@ jobs: env: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" TEST_PG_BENCH_SCALES_MATRIX: "10gb" - POSTGRES_DISTRIB_DIR: /usr + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: true + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} strategy: fail-fast: false matrix: - # neon-captest: Run pgbench, reusing existing project - # neon-captest-new: Same, but on a freshly created project - platform: [ neon-captest, neon-captest-new, rds-aurora ] + # neon-captest-new: Run pgbench in a freshly created project + # neon-captest-reuse: Same, but reusing existing project + # neon-captest-prefetch: Same, with prefetching enabled (new project) + platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-aurora ] runs-on: dev container: @@ -158,13 +165,20 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Install Deps + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Add Postgres binaries to PATH run: | - sudo apt -y update - sudo apt install -y postgresql-14 + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version + echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - name: Create Neon Project - if: matrix.platform == 'neon-captest-new' + if: matrix.platform != 'neon-captest-reuse' id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -175,17 +189,17 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest) + neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; - neon-captest-new) + neon-captest-new | neon-captest-prefetch) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest', 'neon-captest-new' or 'rds-aurora'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch' or 'rds-aurora'" exit 1 ;; esac @@ -196,6 +210,14 @@ jobs: env: PLATFORM: ${{ matrix.platform }} + - name: Set database options + if: matrix.platform == 'neon-captest-prefetch' + run: | + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET enable_seqscan_prefetch=on" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET seqscan_prefetch_buffers=10" + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + - name: Benchmark init uses: ./.github/actions/run-python-test-set with: @@ -239,13 +261,14 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Create Allure report + if: always() uses: ./.github/actions/allure-report with: action: generate build_type: ${{ env.BUILD_TYPE }} - name: Delete Neon Project - if: ${{ matrix.platform == 'neon-captest-new' && always() }} + if: ${{ matrix.platform != 'neon-captest-reuse' && always() }} uses: ./.github/actions/neon-project-delete with: environment: dev diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d586741d68..4f2f8f0833 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -94,15 +94,17 @@ jobs: # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS, # because "cargo metadata" doesn't accept --release or --debug options # + # We run tests with addtional features, that are turned off by default (e.g. in release builds), see + # corresponding Cargo.toml files for their descriptions. - name: Set env variables run: | if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FEATURES="" - CARGO_FLAGS="--locked --timings" + CARGO_FEATURES="--features testing" + CARGO_FLAGS="--locked --timings $CARGO_FEATURES" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FEATURES="--features profiling" + CARGO_FEATURES="--features testing,profiling" CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV @@ -125,8 +127,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} - v8-${{ runner.os }}-${{ matrix.build_type }}-cargo- + v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + v9-${{ runner.os }}-${{ matrix.build_type }}-cargo- - name: Cache postgres v14 build id: cache_pg_14 @@ -158,7 +160,7 @@ jobs: - name: Run cargo build run: | - ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests + ${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests shell: bash -euxo pipefail {0} - name: Run cargo test @@ -266,6 +268,32 @@ jobs: if: matrix.build_type == 'debug' uses: ./.github/actions/save-coverage-data + upload-latest-artifacts: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ regress-tests ] + if: github.ref_name == 'main' + steps: + - name: Copy Neon artifact to the latest directory + shell: bash -euxo pipefail {0} + env: + BUCKET: neon-github-public-dev + PREFIX: artifacts/${{ github.run_id }} + run: | + for build_type in debug release; do + FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst + + S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' 
| grep ${FILENAME} | sort --version-sort | tail -1 || true) + if [ -z "${S3_KEY}" ]; then + echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" + exit 1 + fi + + time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME} + done + benchmarks: runs-on: dev container: @@ -290,7 +318,7 @@ jobs: build_type: ${{ matrix.build_type }} test_selection: performance run_in_parallel: false - save_perf_report: true + save_perf_report: ${{ github.ref == 'refs/heads/main' }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -322,6 +350,7 @@ jobs: build_type: ${{ matrix.build_type }} - name: Store Allure test stat in the DB + if: ${{ steps.create-allure-report.outputs.report-url }} env: BUILD_TYPE: ${{ matrix.build_type }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} @@ -332,9 +361,6 @@ jobs: curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json ./scripts/pysync - # Workaround for https://github.com/neondatabase/cloud/issues/2188 - psql "$TEST_RESULT_CONNSTR" -c "SELECT 1;" || sleep 10 - DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json coverage-report: @@ -363,7 +389,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download @@ -585,7 +611,16 @@ jobs: - name: Pull rust image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust - - name: Configure docker login + - name: Push images to production ECR + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + run: | + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest + + - name: Configure Docker Hub login run: | # ECR Credential Helper & Docker Hub don't work together in config, hence reset echo "" > /github/home/.docker/config.json @@ -606,7 +641,7 @@ jobs: - name: Push rust image to Docker Hub run: crane push rust neondatabase/rust:pinned - - name: Add latest tag to images + - name: Add latest tag to images in Docker Hub if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' @@ -733,5 +768,5 @@ jobs: - name: Re-deploy proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy 
--namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index 237cf81205..6d39958bab 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -30,7 +30,7 @@ jobs: # this is all we need to install our toolchain later via rust-toolchain.toml # so don't install any toolchain explicitly. os: [ubuntu-latest, macos-latest] - timeout-minutes: 60 + timeout-minutes: 90 name: check codestyle rust and postgres runs-on: ${{ matrix.os }} @@ -106,7 +106,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git target - key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust + key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust - name: Run cargo clippy run: ./run_clippy.sh @@ -114,6 +114,26 @@ jobs: - name: Ensure all project builds run: cargo build --locked --all --all-targets + check-rust-dependencies: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + fetch-depth: 1 + + # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check every project module is covered by Hakari + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + shell: bash -euxo pipefail {0} + check-codestyle-python: runs-on: [ self-hosted, Linux, k8s-runner ] steps: diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index d04d002811..0600f9234f 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -58,12 +58,12 @@ jobs: env: REMOTE_ENV: 1 BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install/v14 + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install shell: bash -euxo pipefail {0} run: | # Test framework expects we have psql binary; # but since we don't really need it in this test, let's mock it - mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql"; + mkdir -p "$POSTGRES_DISTRIB_DIR/v14/bin" && touch "$POSTGRES_DISTRIB_DIR/v14/bin/psql"; ./scripts/pytest \ --junitxml=$TEST_OUTPUT/junit.xml \ --tb=short \ diff --git a/Cargo.lock b/Cargo.lock index ad77b2bdfc..d9694bba9f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -37,6 +37,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "amplify_num" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27d3d00d3d115395a7a8a4dc045feb7aa82b641e485f7e15f4e67ac16f4f56d" + [[package]] name = "ansi_term" version = "0.12.1" @@ -135,6 +141,15 @@ dependencies = [ "syn", ] +[[package]] +name = "atomic-polyfill" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c041a8d9751a520ee19656232a18971f18946a7900f1520ee4400002244dd89" +dependencies = [ + "critical-section", +] + [[package]] name = "atty" version = "0.2.14" @@ -212,6 +227,21 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name 
= "bare-metal" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5deb64efa5bd81e31fcd1938615a6d98c82eafcbcd787162b6f63b91d6bac5b3" +dependencies = [ + "rustc_version 0.2.3", +] + +[[package]] +name = "bare-metal" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fe8f5a8a398345e52358e18ff07cc17a568fbca5c6f73873d3a62056309603" + [[package]] name = "base64" version = "0.13.0" @@ -229,14 +259,14 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.59.2" +version = "0.60.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" +checksum = "062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6" dependencies = [ "bitflags", "cexpr", "clang-sys", - "clap 2.34.0", + "clap 3.2.16", "env_logger", "lazy_static", "lazycell", @@ -250,6 +280,18 @@ dependencies = [ "which", ] +[[package]] +name = "bit_field" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb6dd1c2376d2e096796e234a70e17e94cc2d5d54ff8ce42b28cef1d0d359a4" + +[[package]] +name = "bitfield" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719" + [[package]] name = "bitflags" version = "1.3.2" @@ -377,13 +419,9 @@ version = "2.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" dependencies = [ - "ansi_term", - "atty", "bitflags", - "strsim 0.8.0", "textwrap 0.11.0", "unicode-width", - "vec_map", ] [[package]] @@ -396,7 +434,7 @@ dependencies = [ "bitflags", "clap_lex", "indexmap", - "strsim 0.10.0", + "strsim", "termcolor", "textwrap 0.15.0", ] @@ -459,8 +497,10 @@ dependencies = [ "chrono", "clap 3.2.16", "env_logger", + "futures", "hyper", "log", + "notify", "postgres", "regex", "serde", @@ -502,11 +542,11 @@ dependencies = [ "git-version", "nix", "once_cell", - "pageserver", + "pageserver_api", "postgres", "regex", "reqwest", - "safekeeper", + "safekeeper_api", "serde", "serde_with", "tar", @@ -532,6 +572,18 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +[[package]] +name = "cortex-m" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70858629a458fdfd39f9675c4dc309411f2a3f83bede76988d81bf1a0ecee9e0" +dependencies = [ + "bare-metal 0.2.5", + "bitfield", + "embedded-hal", + "volatile-register", +] + [[package]] name = "cpp_demangle" version = "0.3.5" @@ -556,7 +608,7 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" dependencies = [ - "rustc_version", + "rustc_version 0.4.0", ] [[package]] @@ -604,6 +656,18 @@ dependencies = [ "itertools", ] +[[package]] +name = "critical-section" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95da181745b56d4bd339530ec393508910c909c784e8962d15d722bacf0bcbcd" +dependencies = [ + "bare-metal 1.0.0", + "cfg-if", + "cortex-m", + "riscv", +] + [[package]] name = "crossbeam-channel" version = "0.5.6" @@ -746,7 +810,7 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim 0.10.0", + "strsim", "syn", ] @@ -848,6 +912,16 
@@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be" +[[package]] +name = "embedded-hal" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35949884794ad573cf46071e41c9b60efb0cb311e3ca01f7af807af1debc66ff" +dependencies = [ + "nb 0.1.3", + "void", +] + [[package]] name = "encoding_rs" version = "0.8.31" @@ -1000,6 +1074,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "fsevent-sys" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2" +dependencies = [ + "libc", +] + [[package]] name = "futures" version = "0.3.21" @@ -1169,6 +1252,15 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "hash32" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" +dependencies = [ + "byteorder", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -1178,6 +1270,19 @@ dependencies = [ "ahash", ] +[[package]] +name = "heapless" +version = "0.7.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" +dependencies = [ + "atomic-polyfill", + "hash32", + "rustc_version 0.4.0", + "spin 0.9.4", + "stable_deref_trait", +] + [[package]] name = "heck" version = "0.3.3" @@ -1399,6 +1504,26 @@ dependencies = [ "str_stack", ] +[[package]] +name = "inotify" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +dependencies = [ + "bitflags", + "inotify-sys", + "libc", +] + +[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" +dependencies = [ + "libc", +] + [[package]] name = "instant" version = "0.1.12" @@ -1458,6 +1583,26 @@ dependencies = [ "simple_asn1", ] +[[package]] +name = "kqueue" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6112e8f37b59803ac47a42d14f1f3a59bbf72fc6857ffc5be455e28a691f8e" +dependencies = [ + "kqueue-sys", + "libc", +] + +[[package]] +name = "kqueue-sys" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587" +dependencies = [ + "bitflags", + "libc", +] + [[package]] name = "kstring" version = "1.0.6" @@ -1495,6 +1640,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "libm" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" + [[package]] name = "lock_api" version = "0.4.7" @@ -1653,6 +1804,21 @@ dependencies = [ "tempfile", ] +[[package]] +name = "nb" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "801d31da0513b6ec5214e9bf433a77966320625a37860f910be265be6e18d06f" +dependencies = [ + "nb 1.0.0", +] + +[[package]] +name = "nb" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "546c37ac5d9e56f55e73b677106873d9d9f5190605e41a856503623648488cae" + [[package]] name = "nix" version = "0.23.1" @@ -1682,6 +1848,24 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "notify" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2c66da08abae1c024c01d635253e402341b4060a12e99b31c7594063bf490a" +dependencies = [ + "bitflags", + "crossbeam-channel", + "filetime", + "fsevent-sys", + "inotify", + "kqueue", + "libc", + "mio", + "walkdir", + "winapi", +] + [[package]] name = "num-bigint" version = "0.4.3" @@ -1720,6 +1904,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -1832,6 +2017,7 @@ checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4" name = "pageserver" version = "0.1.0" dependencies = [ + "amplify_num", "anyhow", "async-stream", "async-trait", @@ -1856,7 +2042,9 @@ dependencies = [ "itertools", "metrics", "nix", + "num-traits", "once_cell", + "pageserver_api", "postgres", "postgres-protocol", "postgres-types", @@ -1865,6 +2053,7 @@ dependencies = [ "rand", "regex", "remote_storage", + "rstar", "scopeguard", "serde", "serde_json", @@ -1885,6 +2074,17 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_api" +version = "0.1.0" +dependencies = [ + "const_format", + "serde", + "serde_with", + "utils", + "workspace_hack", +] + [[package]] name = "parking_lot" version = "0.11.2" @@ -2288,6 +2488,7 @@ dependencies = [ "tokio-rustls", "url", "utils", + "uuid", "workspace_hack", "x509-parser", ] @@ -2449,6 +2650,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "utils", "workspace_hack", ] @@ -2518,12 +2720,33 @@ dependencies = [ "cc", "libc", "once_cell", - "spin", + "spin 0.5.2", "untrusted", "web-sys", "winapi", ] +[[package]] +name = "riscv" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6907ccdd7a31012b70faf2af85cd9e5ba97657cc3987c4f13f8e4d2c2a088aba" +dependencies = [ + "bare-metal 1.0.0", + "bit_field", + "riscv-target", +] + +[[package]] +name = "riscv-target" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88aa938cda42a0cf62a20cfe8d139ff1af20c2e681212b5b34adb5a58333f222" +dependencies = [ + "lazy_static", + "regex", +] + [[package]] name = "routerify" version = "3.0.0" @@ -2537,6 +2760,17 @@ dependencies = [ "regex", ] +[[package]] +name = "rstar" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa" +dependencies = [ + "heapless", + "num-traits", + "smallvec", +] + [[package]] name = "rstest" version = "0.12.0" @@ -2546,7 +2780,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "rustc_version", + "rustc_version 0.4.0", "syn", ] @@ -2568,7 +2802,7 @@ dependencies = [ "log", "rusoto_credential", "rusoto_signature", - "rustc_version", + "rustc_version 0.4.0", "serde", "serde_json", "tokio", @@ -2626,7 +2860,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rusoto_credential", - "rustc_version", + "rustc_version 0.4.0", "serde", "sha2 0.9.9", "tokio", @@ -2644,13 +2878,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc_version" 
+version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver 0.9.0", +] + [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver", + "semver 1.0.13", ] [[package]] @@ -2724,16 +2967,19 @@ dependencies = [ "hyper", "metrics", "once_cell", + "parking_lot 0.12.1", "postgres", "postgres-protocol", "postgres_ffi", "regex", "remote_storage", + "safekeeper_api", "serde", "serde_json", "serde_with", "signal-hook", "tempfile", + "thiserror", "tokio", "tokio-postgres", "toml_edit", @@ -2743,6 +2989,17 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "safekeeper_api" +version = "0.1.0" +dependencies = [ + "const_format", + "serde", + "serde_with", + "utils", + "workspace_hack", +] + [[package]] name = "same-file" version = "1.0.6" @@ -2801,12 +3058,27 @@ dependencies = [ "libc", ] +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + [[package]] name = "semver" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93f6841e709003d68bb2deee8c343572bf446003ec20a583e76f7b15cebf3711" +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + [[package]] name = "serde" version = "1.0.142" @@ -3000,6 +3272,15 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +[[package]] +name = "spin" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" +dependencies = [ + "lock_api", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -3022,12 +3303,6 @@ dependencies = [ "unicode-normalization", ] -[[package]] -name = "strsim" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" - [[package]] name = "strsim" version = "0.10.0" @@ -3677,6 +3952,10 @@ name = "uuid" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" +dependencies = [ + "getrandom", + "serde", +] [[package]] name = "valuable" @@ -3684,24 +3963,39 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcell" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77439c1b53d2303b20d9459b1ade71a83c716e3f9c34f3228c00e6f185d6c002" + [[package]] name = "vcpkg" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" -[[package]] -name = "vec_map" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" - [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" + +[[package]] +name = "volatile-register" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ee8f19f9d74293faf70901bc20ad067dc1ad390d2cbf1e3f75f721ffee908b6" +dependencies = [ + "vcell", +] + [[package]] name = "wal_craft" version = "0.1.0" @@ -3714,6 +4008,7 @@ dependencies = [ "postgres", "postgres_ffi", "tempfile", + "workspace_hack", ] [[package]] @@ -3947,16 +4242,10 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 2.34.0", + "crossbeam-utils", "either", "fail", - "futures-channel", - "futures-task", - "futures-util", - "generic-array", "hashbrown", - "hex", - "hyper", "indexmap", "itoa 0.4.8", "libc", @@ -3973,12 +4262,14 @@ dependencies = [ "regex-syntax", "scopeguard", "serde", + "stable_deref_trait", "syn", "time 0.3.12", "tokio", "tokio-util", "tracing", "tracing-core", + "uuid", ] [[package]] diff --git a/Dockerfile b/Dockerfile index eacb88d168..69402919ec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,13 +14,13 @@ COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile +COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh ENV BUILD_TYPE release RUN set -e \ && mold -run make -j $(nproc) -s neon-pg-ext \ - && rm -rf pg_install/v14/build \ - && rm -rf pg_install/v15/build \ - && tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz . + && rm -rf pg_install/build \ + && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz . # Build neon binaries FROM $REPOSITORY/$IMAGE:$TAG AS build @@ -44,7 +44,7 @@ COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ -&& mold -run cargo build --locked --release \ +&& mold -run cargo build --bin pageserver --bin safekeeper --bin proxy --locked --release \ && cachepot -s # Build final image @@ -67,8 +67,8 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin -# v14 is default for now -COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/ +COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ +COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. 
@@ -77,7 +77,7 @@ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ && /usr/local/bin/pageserver -D /data/.neon/ --init \ -c "id=1234" \ -c "broker_endpoints=['http://etcd:2379']" \ - -c "pg_distrib_dir='/usr/local'" \ + -c "pg_distrib_dir='/usr/local/'" \ -c "listen_pg_addr='0.0.0.0:6400'" \ -c "listen_http_addr='0.0.0.0:9898'" diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index 8ddf752191..ed57b29009 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -8,9 +8,12 @@ ARG TAG=pinned # Layer "build-deps" # FROM debian:bullseye-slim AS build-deps +RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ + echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ + apt update RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev + libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev # # Layer "pg-build" @@ -37,7 +40,7 @@ RUN cd postgres && \ FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget + apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ tar xvzf postgis-3.3.0.tar.gz && \ @@ -59,15 +62,13 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ # Build plv8 # FROM build-deps AS plv8-build -COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 # https://github.com/plv8/plv8/issues/475 # Debian bullseye provides binutils 2.35 when >= 2.38 is necessary -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \ - apt update && \ +RUN apt update && \ apt install -y --no-install-recommends -t testing binutils RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ @@ -79,12 +80,46 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +# +# Layer "h3-pg-build" +# Build h3_pg +# +FROM build-deps AS h3-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +# packaged cmake is too old +RUN apt update && \ + apt install -y --no-install-recommends -t testing cmake + +RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ + tar xvzf h3.tgz && \ + cd h3-4.0.1 && \ + mkdir build && \ + cd build && \ + cmake .. 
-DCMAKE_BUILD_TYPE=Release && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/h3 make install && \ + cp -R /h3/usr / && \ + rm -rf build + +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ + tar xvzf h3-pg.tgz && \ + cd h3-pg-4.0.1 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + # # Layer "neon-pg-ext-build" # compile neon extensions # FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +# plv8 still sometimes crashes during the creation +# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /h3/usr / COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -132,8 +167,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ chmod 0750 /var/db/postgres/compute && \ echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig -# TODO: Check if we can make the extension setup more modular versus a linear build -# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc# COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl diff --git a/Makefile b/Makefile index 4ac51ed174..738a45fd5e 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ endif # headers, the mtime of the headers are not changed when there have # been no changes to the files. Changing the mtime triggers an # unnecessary rebuild of 'postgres_ffi'. 
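
The comment above leaves the mechanism implicit: postgres_ffi's build script watches the installed Postgres headers (it generates Rust bindings from them), and cargo decides whether to re-run a build script by comparing the mtimes of the paths it is told to watch, so reinstalling byte-identical headers with fresh timestamps still forces a rebuild. A minimal sketch of that trigger follows; the watched path is illustrative and the real build script may differ. The PG_CONFIGURE_OPTS change right after it swaps install -C for scripts/ninstall.sh -C to keep those header mtimes stable.

// build.rs -- minimal sketch of the rebuild trigger described in the comment above.
fn main() {
    // Cargo re-runs a build script, and therefore rebuilds the crate, whenever
    // the mtime of a watched path changes -- even if the file contents are
    // byte-for-byte identical.
    println!("cargo:rerun-if-changed=../pg_install/v14/include");
}
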
-PG_CONFIGURE_OPTS += INSTALL='install -C' +PG_CONFIGURE_OPTS += INSTALL='$(ROOT_PROJECT_DIR)/scripts/ninstall.sh -C' # Choose whether we should be silent or verbose CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose) diff --git a/README.md b/README.md index 03ed57a0fa..dc469c36b1 100644 --- a/README.md +++ b/README.md @@ -222,7 +222,12 @@ Ensure your dependencies are installed as described [here](https://github.com/ne ```sh git clone --recursive https://github.com/neondatabase/neon.git -make # builds also postgres and installs it to ./pg_install + +# either: +CARGO_BUILD_FLAGS="--features=testing" make +# or: +make debug + ./scripts/pytest ``` diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index b13f7f191d..43cf7ae2dd 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -8,8 +8,10 @@ anyhow = "1.0" chrono = "0.4" clap = "3.0" env_logger = "0.9" +futures = "0.3.13" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } +notify = "5.0.0" postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } regex = "1" serde = { version = "1.0", features = ["derive"] } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 58469b1c97..1e848627e3 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -258,14 +258,7 @@ impl ComputeNode { .spawn() .expect("cannot start postgres process"); - // Try default Postgres port if it is not provided - let port = self - .spec - .cluster - .settings - .find("port") - .unwrap_or_else(|| "5432".to_string()); - wait_for_postgres(&mut pg, &port, pgdata_path)?; + wait_for_postgres(&mut pg, pgdata_path)?; // If connection fails, // it may be the old node with `zenith_admin` superuser. diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index ac065fa60c..769dbfac73 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -1,18 +1,19 @@ use std::fmt::Write; +use std::fs; use std::fs::File; use std::io::{BufRead, BufReader}; -use std::net::{SocketAddr, TcpStream}; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; -use std::str::FromStr; -use std::{fs, thread, time}; +use std::time::{Duration, Instant}; use anyhow::{bail, Result}; use postgres::{Client, Transaction}; use serde::Deserialize; -const POSTGRES_WAIT_TIMEOUT: u64 = 60 * 1000; // milliseconds +use notify::{RecursiveMode, Watcher}; + +const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds /// Rust representation of Postgres role info with only those fields /// that matter for us. @@ -230,52 +231,85 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { Ok(postgres_dbs) } -/// Wait for Postgres to become ready to accept connections: -/// - state should be `ready` in the `pgdata/postmaster.pid` -/// - and we should be able to connect to 127.0.0.1:5432 -pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> { +/// Wait for Postgres to become ready to accept connections. It's ready to +/// accept connections when the state-field in `pgdata/postmaster.pid` says +/// 'ready'. 
+pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); - let mut slept: u64 = 0; // ms - let pause = time::Duration::from_millis(100); - let timeout = time::Duration::from_millis(10); - let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap(); + // PostgreSQL writes line "ready" to the postmaster.pid file, when it has + // completed initialization and is ready to accept connections. We want to + // react quickly and perform the rest of our initialization as soon as + // PostgreSQL starts accepting connections. Use 'notify' to be notified + // whenever the PID file is changed, and whenever it changes, read it to + // check if it's now "ready". + // + // You cannot actually watch a file before it exists, so we first watch the + // data directory, and once the postmaster.pid file appears, we switch to + // watch the file instead. We also wake up every 100 ms to poll, just in + // case we miss some events for some reason. Not strictly necessary, but + // better safe than sorry. + let (tx, rx) = std::sync::mpsc::channel(); + let mut watcher = notify::recommended_watcher(move |res| { + let _ = tx.send(res); + })?; + watcher.watch(pgdata, RecursiveMode::NonRecursive)?; + let started_at = Instant::now(); + let mut postmaster_pid_seen = false; loop { - // Sleep POSTGRES_WAIT_TIMEOUT at max (a bit longer actually if consider a TCP timeout, - // but postgres starts listening almost immediately, even if it is not really - // ready to accept connections). - if slept >= POSTGRES_WAIT_TIMEOUT { - bail!("timed out while waiting for Postgres to start"); - } - if let Ok(Some(status)) = pg.try_wait() { // Postgres exited, that is not what we expected, bail out earlier. let code = status.code().unwrap_or(-1); bail!("Postgres exited unexpectedly with code {}", code); } + let res = rx.recv_timeout(Duration::from_millis(100)); + log::debug!("woken up by notify: {res:?}"); + // If there are multiple events in the channel already, we only need to be + // check once. Swallow the extra events before we go ahead to check the + // pid file. + while let Ok(res) = rx.try_recv() { + log::debug!("swallowing extra event: {res:?}"); + } + // Check that we can open pid file first. if let Ok(file) = File::open(&pid_path) { + if !postmaster_pid_seen { + log::debug!("postmaster.pid appeared"); + watcher + .unwatch(pgdata) + .expect("Failed to remove pgdata dir watch"); + watcher + .watch(&pid_path, RecursiveMode::NonRecursive) + .expect("Failed to add postmaster.pid file watch"); + postmaster_pid_seen = true; + } + let file = BufReader::new(file); let last_line = file.lines().last(); // Pid file could be there and we could read it, but it could be empty, for example. if let Some(Ok(line)) = last_line { let status = line.trim(); - let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok(); + log::debug!("last line of postmaster.pid: {status:?}"); // Now Postgres is ready to accept connections - if status == "ready" && can_connect { + if status == "ready" { break; } } } - thread::sleep(pause); - slept += 100; + // Give up after POSTGRES_WAIT_TIMEOUT. 
+ let duration = started_at.elapsed(); + if duration >= POSTGRES_WAIT_TIMEOUT { + bail!("timed out while waiting for Postgres to start"); + } } + log::info!("PostgreSQL is now running, continuing to configure it"); + Ok(()) } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index ab9df8534c..ee8481e141 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -19,7 +19,9 @@ thiserror = "1" nix = "0.23" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } -pageserver = { path = "../pageserver" } -safekeeper = { path = "../safekeeper" } +# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api +# instead, so that recompile times are better. +pageserver_api = { path = "../libs/pageserver_api" } +safekeeper_api = { path = "../libs/safekeeper_api" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index e16fd8764a..0c26842b34 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -12,12 +12,12 @@ use control_plane::local_env::{EtcdBroker, LocalEnv}; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage::PageServerNode; use control_plane::{etcd, local_env}; -use pageserver::config::defaults::{ +use pageserver_api::models::TimelineInfo; +use pageserver_api::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, }; -use pageserver::http::models::TimelineInfo; -use safekeeper::defaults::{ +use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; @@ -39,6 +39,8 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); +const DEFAULT_PG_VERSION: &str = "14"; + fn default_conf(etcd_binary_path: &Path) -> String { format!( r#" @@ -105,6 +107,13 @@ fn main() -> Result<()> { .takes_value(true) .required(false); + let pg_version_arg = Arg::new("pg-version") + .long("pg-version") + .help("Postgres version to use for the initial tenant") + .required(false) + .takes_value(true) + .default_value(DEFAULT_PG_VERSION); + let port_arg = Arg::new("port") .long("port") .required(false) @@ -146,6 +155,7 @@ fn main() -> Result<()> { .required(false) .value_name("config"), ) + .arg(pg_version_arg.clone()) ) .subcommand( App::new("timeline") @@ -164,7 +174,9 @@ fn main() -> Result<()> { .subcommand(App::new("create") .about("Create a new blank timeline") .arg(tenant_id_arg.clone()) - .arg(branch_name_arg.clone())) + .arg(branch_name_arg.clone()) + .arg(pg_version_arg.clone()) + ) .subcommand(App::new("import") .about("Import timeline from basebackup directory") .arg(tenant_id_arg.clone()) @@ -178,7 +190,9 @@ fn main() -> Result<()> { .arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true) .help("Wal to add after base")) .arg(Arg::new("end-lsn").long("end-lsn").takes_value(true) - .help("Lsn the basebackup ends at"))) + .help("Lsn the basebackup ends at")) + .arg(pg_version_arg.clone()) + ) ).subcommand( App::new("tenant") .setting(AppSettings::ArgRequiredElseHelp) @@ -188,6 +202,7 @@ fn main() -> Result<()> { .arg(tenant_id_arg.clone()) .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) 
.arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false)) + .arg(pg_version_arg.clone()) ) .subcommand(App::new("config") .arg(tenant_id_arg.clone()) @@ -239,8 +254,9 @@ fn main() -> Result<()> { Arg::new("config-only") .help("Don't do basebackup, create compute node with only config files") .long("config-only") - .required(false) - )) + .required(false)) + .arg(pg_version_arg.clone()) + ) .subcommand(App::new("start") .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") .arg(pg_node_arg.clone()) @@ -248,7 +264,9 @@ fn main() -> Result<()> { .arg(branch_name_arg.clone()) .arg(timeline_id_arg.clone()) .arg(lsn_arg.clone()) - .arg(port_arg.clone())) + .arg(port_arg.clone()) + .arg(pg_version_arg.clone()) + ) .subcommand( App::new("stop") .arg(pg_node_arg.clone()) @@ -501,9 +519,16 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { default_conf(&EtcdBroker::locate_etcd()?) }; + let pg_version = init_match + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + let mut env = LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; - env.init().context("Failed to initialize neon repository")?; + env.init(pg_version) + .context("Failed to initialize neon repository")?; let initial_tenant_id = env .default_tenant_id .expect("default_tenant_id should be generated by the `env.init()` call above"); @@ -515,6 +540,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { Some(initial_tenant_id), initial_timeline_id_arg, &pageserver_config_overrides(init_match), + pg_version, ) .unwrap_or_else(|e| { eprintln!("pageserver init failed: {e}"); @@ -557,8 +583,19 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an // Create an initial timeline for the new tenant let new_timeline_id = parse_timeline_id(create_match)?; - let timeline_info = - pageserver.timeline_create(new_tenant_id, new_timeline_id, None, None)?; + let pg_version = create_match + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + + let timeline_info = pageserver.timeline_create( + new_tenant_id, + new_timeline_id, + None, + None, + Some(pg_version), + )?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info .local @@ -607,7 +644,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let new_branch_name = create_match .value_of("branch-name") .ok_or_else(|| anyhow!("No branch name provided"))?; - let timeline_info = pageserver.timeline_create(tenant_id, None, None, None)?; + + let pg_version = create_match + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + + let timeline_info = + pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?; let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info @@ -650,12 +695,19 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - // TODO validate both or none are provided let pg_wal = end_lsn.zip(wal_tarfile); + let pg_version = import_match + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + let mut cplane = ComputeControlPlane::load(env.clone())?; println!("Importing timeline into 
pageserver ..."); - pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal)?; + pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?; println!("Creating node for imported timeline ..."); env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; - cplane.new_node(tenant_id, name, timeline_id, None, None)?; + + cplane.new_node(tenant_id, name, timeline_id, None, None, pg_version)?; println!("Done"); } Some(("branch", branch_match)) => { @@ -682,6 +734,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - None, start_lsn, Some(ancestor_timeline_id), + None, )?; let new_timeline_id = timeline_info.timeline_id; @@ -797,7 +850,14 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { Some(p) => Some(p.parse()?), None => None, }; - cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port)?; + + let pg_version = sub_args + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; + + cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?; } "start" => { let port: Option = match sub_args.value_of("port") { @@ -835,16 +895,23 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { .map(Lsn::from_str) .transpose() .context("Failed to parse Lsn from the request")?; + let pg_version = sub_args + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?; // when used with custom port this results in non obvious behaviour // port is remembered from first start command, i e // start --port X // stop // start <-- will also use port X even without explicit port argument println!( - "Starting new postgres {} on timeline {} ...", - node_name, timeline_id + "Starting new postgres (v{}) {} on timeline {} ...", + pg_version, node_name, timeline_id ); - let node = cplane.new_node(tenant_id, node_name, timeline_id, lsn, port)?; + + let node = + cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?; node.start(&auth_token)?; } } diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index b678d620df..89994c5647 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -18,7 +18,7 @@ use utils::{ postgres_backend::AuthType, }; -use crate::local_env::LocalEnv; +use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION}; use crate::postgresql_conf::PostgresConf; use crate::storage::PageServerNode; @@ -81,6 +81,7 @@ impl ComputeControlPlane { timeline_id: TimelineId, lsn: Option, port: Option, + pg_version: u32, ) -> Result> { let port = port.unwrap_or_else(|| self.get_port()); let node = Arc::new(PostgresNode { @@ -93,6 +94,7 @@ impl ComputeControlPlane { lsn, tenant_id, uses_wal_proposer: false, + pg_version, }); node.create_pgdata()?; @@ -118,6 +120,7 @@ pub struct PostgresNode { pub lsn: Option, // if it's a read-only node. None for primary pub tenant_id: TenantId, uses_wal_proposer: bool, + pg_version: u32, } impl PostgresNode { @@ -152,6 +155,14 @@ impl PostgresNode { let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); + // Read postgres version from PG_VERSION file to determine which postgres version binary to use. + // If it doesn't exist, assume broken data directory and use default pg version. 
+ let pg_version_path = entry.path().join("PG_VERSION"); + + let pg_version_str = + fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string()); + let pg_version = u32::from_str(&pg_version_str)?; + // parse recovery_target_lsn, if any let recovery_target_lsn: Option = conf.parse_field_optional("recovery_target_lsn", &context)?; @@ -167,17 +178,24 @@ impl PostgresNode { lsn: recovery_target_lsn, tenant_id, uses_wal_proposer, + pg_version, }) } - fn sync_safekeepers(&self, auth_token: &Option) -> Result { - let pg_path = self.env.pg_bin_dir().join("postgres"); + fn sync_safekeepers(&self, auth_token: &Option, pg_version: u32) -> Result { + let pg_path = self.env.pg_bin_dir(pg_version).join("postgres"); let mut cmd = Command::new(&pg_path); cmd.arg("--sync-safekeepers") .env_clear() - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) + .env( + "LD_LIBRARY_PATH", + self.env.pg_lib_dir(pg_version).to_str().unwrap(), + ) + .env( + "DYLD_LIBRARY_PATH", + self.env.pg_lib_dir(pg_version).to_str().unwrap(), + ) .env("PGDATA", self.pgdata().to_str().unwrap()) .stdout(Stdio::piped()) // Comment this to avoid capturing stderr (useful if command hangs) @@ -259,8 +277,8 @@ impl PostgresNode { }) } - // Connect to a page server, get base backup, and untar it to initialize a - // new data directory + // Write postgresql.conf with default configuration + // and PG_VERSION file to the data directory of a new node. fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> { let mut conf = PostgresConf::new(); conf.append("max_wal_senders", "10"); @@ -357,6 +375,9 @@ impl PostgresNode { let mut file = File::create(self.pgdata().join("postgresql.conf"))?; file.write_all(conf.to_string().as_bytes())?; + let mut file = File::create(self.pgdata().join("PG_VERSION"))?; + file.write_all(self.pg_version.to_string().as_bytes())?; + Ok(()) } @@ -368,7 +389,7 @@ impl PostgresNode { // latest data from the pageserver. That is a bit clumsy but whole bootstrap // procedure evolves quite actively right now, so let's think about it again // when things would be more stable (TODO). 
- let lsn = self.sync_safekeepers(auth_token)?; + let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; if lsn == Lsn(0) { None } else { @@ -401,7 +422,7 @@ impl PostgresNode { } fn pg_ctl(&self, args: &[&str], auth_token: &Option) -> Result<()> { - let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl"); + let pg_ctl_path = self.env.pg_bin_dir(self.pg_version).join("pg_ctl"); let mut cmd = Command::new(pg_ctl_path); cmd.args( [ @@ -417,8 +438,14 @@ impl PostgresNode { .concat(), ) .env_clear() - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()); + .env( + "LD_LIBRARY_PATH", + self.env.pg_lib_dir(self.pg_version).to_str().unwrap(), + ) + .env( + "DYLD_LIBRARY_PATH", + self.env.pg_lib_dir(self.pg_version).to_str().unwrap(), + ); if let Some(token) = auth_token { cmd.env("ZENITH_AUTH_TOKEN", token); } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 7afaad26dc..f4fbc99420 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -20,6 +20,8 @@ use utils::{ use crate::safekeeper::SafekeeperNode; +pub const DEFAULT_PG_VERSION: u32 = 14; + // // This data structures represents neon_local CLI config // @@ -195,12 +197,33 @@ impl Default for SafekeeperConf { } impl LocalEnv { - // postgres installation paths - pub fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") + pub fn pg_distrib_dir_raw(&self) -> PathBuf { + self.pg_distrib_dir.clone() } - pub fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + + pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { + let path = self.pg_distrib_dir.clone(); + + match pg_version { + 14 => path.join(format!("v{pg_version}")), + 15 => path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), + } + } + + pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("bin"), + 15 => self.pg_distrib_dir(pg_version).join("bin"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } + } + pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("lib"), + 15 => self.pg_distrib_dir(pg_version).join("lib"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } } pub fn pageserver_bin(&self) -> anyhow::Result { @@ -289,13 +312,15 @@ impl LocalEnv { let mut env: LocalEnv = toml::from_str(toml)?; // Find postgres binaries. - // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install/v14". + // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". + // Note that later in the code we assume, that distrib dirs follow the same pattern + // for all postgres versions. if env.pg_distrib_dir == Path::new("") { if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { env.pg_distrib_dir = postgres_bin.into(); } else { let cwd = env::current_dir()?; - env.pg_distrib_dir = cwd.join("pg_install/v14") + env.pg_distrib_dir = cwd.join("pg_install") } } @@ -384,7 +409,7 @@ impl LocalEnv { // // Initialize a new Neon repository // - pub fn init(&mut self) -> anyhow::Result<()> { + pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> { // check if config already exists let base_path = &self.base_data_dir; ensure!( @@ -397,10 +422,10 @@ impl LocalEnv { "directory '{}' already exists. 
Perhaps already initialized?", base_path.display() ); - if !self.pg_distrib_dir.join("bin/postgres").exists() { + if !self.pg_bin_dir(pg_version).join("postgres").exists() { bail!( "Can't find postgres binary at {}", - self.pg_distrib_dir.display() + self.pg_bin_dir(pg_version).display() ); } for binary in ["pageserver", "safekeeper"] { diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 600a9ffe05..34b2f3000a 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -12,7 +12,7 @@ use nix::unistd::Pid; use postgres::Config; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; -use safekeeper::http::models::TimelineCreateRequest; +use safekeeper_api::models::TimelineCreateRequest; use thiserror::Error; use utils::{ connstring::connection_address, diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 6c79860404..933b7b7b65 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -11,7 +11,7 @@ use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use pageserver::http::models::{ +use pageserver_api::models::{ TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo, }; use postgres::{Config, NoTls}; @@ -112,11 +112,15 @@ impl PageServerNode { create_tenant: Option, initial_timeline_id: Option, config_overrides: &[&str], + pg_version: u32, ) -> anyhow::Result { let id = format!("id={}", self.env.pageserver.id); // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. - let pg_distrib_dir_param = - format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display()); + let pg_distrib_dir_param = format!( + "pg_distrib_dir='{}'", + self.env.pg_distrib_dir_raw().display() + ); + let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); let listen_http_addr_param = format!( "listen_http_addr='{}'", @@ -159,7 +163,7 @@ impl PageServerNode { self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?; let init_result = self - .try_init_timeline(create_tenant, initial_timeline_id) + .try_init_timeline(create_tenant, initial_timeline_id, pg_version) .context("Failed to create initial tenant and timeline for pageserver"); match &init_result { Ok(initial_timeline_id) => { @@ -175,12 +179,16 @@ impl PageServerNode { &self, new_tenant_id: Option, new_timeline_id: Option, + pg_version: u32, ) -> anyhow::Result { - let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new()) - .context("failed to create tenant")?; - let initial_timeline_info = - self.timeline_create(initial_tenant_id, new_timeline_id, None, None) - .context("failed to create timeline")?; + let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; + let initial_timeline_info = self.timeline_create( + initial_tenant_id, + new_timeline_id, + None, + None, + Some(pg_version), + )?; Ok(initial_timeline_info.timeline_id) } @@ -504,6 +512,7 @@ impl PageServerNode { new_timeline_id: Option, ancestor_start_lsn: Option, ancestor_timeline_id: Option, + pg_version: Option, ) -> anyhow::Result { self.http_request( Method::POST, @@ -513,6 +522,7 @@ impl PageServerNode { new_timeline_id, ancestor_start_lsn, ancestor_timeline_id, + pg_version, }) .send()? .error_from_body()? 
@@ -542,6 +552,7 @@ impl PageServerNode { timeline_id: TimelineId, base: (Lsn, PathBuf), pg_wal: Option<(Lsn, PathBuf)>, + pg_version: u32, ) -> anyhow::Result<()> { let mut client = self.pg_connection_config.connect(NoTls).unwrap(); @@ -560,8 +571,9 @@ impl PageServerNode { }; // Import base - let import_cmd = - format!("import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn}"); + let import_cmd = format!( + "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" + ); let mut writer = client.copy_in(&import_cmd)?; io::copy(&mut base_reader, &mut writer)?; writer.finish()?; diff --git a/docs/core_changes.md b/docs/core_changes.md index 8f29dd9121..ea219adae9 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -148,31 +148,6 @@ relcache? (I think we do cache nblocks in relcache already, check why that's not Neon) -## Misc change in vacuumlazy.c - -``` -index 8aab6e324e..c684c4fbee 100644 ---- a/src/backend/access/heap/vacuumlazy.c -+++ b/src/backend/access/heap/vacuumlazy.c -@@ -1487,7 +1487,10 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) - else if (all_visible_according_to_vm && !PageIsAllVisible(page) - && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) - { -- elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", -+ /* ZENITH-XXX: all visible hint is not wal-logged -+ * FIXME: Replay visibilitymap changes in pageserver -+ */ -+ elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - vacrel->relname, blkno); - visibilitymap_clear(vacrel->rel, blkno, vmbuffer, - VISIBILITYMAP_VALID_BITS); -``` - - -Is this still needed? If that WARNING happens, it looks like potential corruption that we should -fix! - - ## Use buffer manager when extending VM or FSM ``` diff --git a/docs/settings.md b/docs/settings.md index 30db495dbe..878681fce1 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -155,6 +155,8 @@ for other files and for sockets for incoming connections. #### pg_distrib_dir A directory with Postgres installation to use during pageserver activities. +Since pageserver supports several postgres versions, `pg_distrib_dir` contains +a subdirectory for each version with naming convention `v{PG_MAJOR_VERSION}/`. Inside that dir, a `bin/postgres` binary should be present. The default distrib dir is `./pg_install/`. diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 8043450a55..c468134b81 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -96,7 +96,7 @@ A single virtual environment with all dependencies is described in the single `P sudo apt install python3.9 ``` - Install `poetry` - - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`. + - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation). - Install dependencies via `./scripts/pysync`. - Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile)) so if you have different version some linting tools can yield different result locally vs in the CI. 
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml new file mode 100644 index 0000000000..be8762100c --- /dev/null +++ b/libs/pageserver_api/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "pageserver_api" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_with = "1.12.0" +const_format = "0.2.21" + +utils = { path = "../utils" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs new file mode 100644 index 0000000000..a36c1692a9 --- /dev/null +++ b/libs/pageserver_api/src/lib.rs @@ -0,0 +1,9 @@ +use const_format::formatcp; + +/// Public API types +pub mod models; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); diff --git a/pageserver/src/http/models.rs b/libs/pageserver_api/src/models.rs similarity index 80% rename from pageserver/src/http/models.rs rename to libs/pageserver_api/src/models.rs index d0011c6006..9657ff0ee4 100644 --- a/pageserver/src/http/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -7,7 +7,17 @@ use utils::{ lsn::Lsn, }; -use crate::tenant::TenantState; +/// A state of a tenant in pageserver's memory. +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum TenantState { + /// Tenant is fully operational, its background jobs might be running or not. + Active { background_jobs_running: bool }, + /// A tenant is recognized by pageserver, but not yet ready to operate: + /// e.g. not present locally and being downloaded or being read into memory from the file system. + Paused, + /// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated. + Broken, +} #[serde_as] #[derive(Serialize, Deserialize)] @@ -21,6 +31,7 @@ pub struct TimelineCreateRequest { #[serde(default)] #[serde_as(as = "Option")] pub ancestor_start_lsn: Option, + pub pg_version: Option, } #[serde_as] @@ -138,6 +149,7 @@ pub struct LocalTimelineInfo { pub last_received_msg_lsn: Option, /// the timestamp (in microseconds) of the last received message pub last_received_msg_ts: Option, + pub pg_version: u32, } #[serde_as] @@ -161,3 +173,21 @@ pub struct TimelineInfo { pub local: Option, pub remote: Option, } + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. 
+ pub actions: String, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineGcRequest { + pub gc_horizon: Option, +} diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 2b453fa0dc..60caca76b8 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -25,4 +25,5 @@ postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d05 wal_craft = { path = "wal_craft" } [build-dependencies] -bindgen = "0.59.1" +anyhow = "1.0" +bindgen = "0.60.1" diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 8389ac37fe..25ff398bbd 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -4,6 +4,7 @@ use std::env; use std::path::PathBuf; use std::process::Command; +use anyhow::{anyhow, Context}; use bindgen::callbacks::ParseCallbacks; #[derive(Debug)] @@ -42,7 +43,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { } } -fn main() { +fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=bindgen_deps.h"); @@ -58,7 +59,7 @@ fn main() { for pg_version in &["v14", "v15"] { let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); if pg_install_dir_versioned.is_relative() { - let cwd = env::current_dir().unwrap(); + let cwd = env::current_dir().context("Failed to get current_dir")?; pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned); } @@ -70,21 +71,25 @@ fn main() { let output = Command::new(pg_config_bin) .arg("--includedir-server") .output() - .expect("failed to execute `pg_config --includedir-server`"); + .context("failed to execute `pg_config --includedir-server`")?; if !output.status.success() { panic!("`pg_config --includedir-server` failed") } - String::from_utf8(output.stdout).unwrap().trim_end().into() + String::from_utf8(output.stdout) + .context("pg_config output is not UTF-8")? + .trim_end() + .into() } else { - pg_install_dir_versioned + let server_path = pg_install_dir_versioned .join("include") .join("postgresql") .join("server") - .into_os_string() + .into_os_string(); + server_path .into_string() - .unwrap() + .map_err(|s| anyhow!("Bad postgres server path {s:?}"))? }; // The bindgen::Builder is the main entry point @@ -132,14 +137,18 @@ fn main() { // Finish the builder and generate the bindings. // .generate() - .expect("Unable to generate bindings"); + .context("Unable to generate bindings")?; // Write the bindings to the $OUT_DIR/bindings_$pg_version.rs file. - let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + let out_path: PathBuf = env::var("OUT_DIR") + .context("Couldn't read OUT_DIR environment variable var")? + .into(); let filename = format!("bindings_{pg_version}.rs"); bindings .write_to_file(out_path.join(filename)) - .expect("Couldn't write bindings!"); + .context("Couldn't write bindings")?; } + + Ok(()) } diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index f43232ed0c..f3dad159be 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -3,10 +3,14 @@ #![allow(non_snake_case)] // bindgen creates some unsafe code with no doc comments. #![allow(clippy::missing_safety_doc)] -// suppress warnings on rust 1.53 due to bindgen unit tests. -// https://github.com/rust-lang/rust-bindgen/issues/1651 -#![allow(deref_nullptr)] +// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code. 
+#![allow(clippy::useless_transmute)] +// modules included with the postgres_ffi macro depend on the types of the specific version's +// types, and trigger a too eager lint. +#![allow(clippy::duplicate_mod)] +use bytes::Bytes; +use utils::bin_ser::SerializeError; use utils::lsn::Lsn; macro_rules! postgres_ffi { @@ -24,12 +28,12 @@ macro_rules! postgres_ffi { stringify!($version), ".rs" )); + + include!(concat!("pg_constants_", stringify!($version), ".rs")); } pub mod controlfile_utils; pub mod nonrelfile_utils; - pub mod pg_constants; - pub mod relfile_utils; - pub mod waldecoder; + pub mod waldecoder_handler; pub mod xlog_utils; pub const PG_MAJORVERSION: &str = stringify!($version); @@ -44,6 +48,9 @@ macro_rules! postgres_ffi { postgres_ffi!(v14); postgres_ffi!(v15); +pub mod pg_constants; +pub mod relfile_utils; + // Export some widely used datatypes that are unlikely to change across Postgres versions pub use v14::bindings::{uint32, uint64, Oid}; pub use v14::bindings::{BlockNumber, OffsetNumber}; @@ -52,8 +59,11 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; +pub use v14::bindings::{PageHeaderData, XLogRecord}; pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +pub use v14::bindings::{CheckPoint, ControlFileData}; + // from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and // --with-segsize=SEGSIZE, but assume the defaults for now. pub const BLCKSZ: u16 = 8192; @@ -63,6 +73,49 @@ pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; +// Export some version independent functions that are used outside of this mod +pub use v14::xlog_utils::encode_logical_message; +pub use v14::xlog_utils::get_current_timestamp; +pub use v14::xlog_utils::to_pg_timestamp; +pub use v14::xlog_utils::XLogFileName; + +pub use v14::bindings::DBState_DB_SHUTDOWNED; + +pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result { + match version { + 14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0), + 15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0 + || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0 + || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0), + _ => anyhow::bail!("Unknown version {}", version), + } +} + +pub fn generate_wal_segment( + segno: u64, + system_id: u64, + pg_version: u32, +) -> Result { + match pg_version { + 14 => v14::xlog_utils::generate_wal_segment(segno, system_id), + 15 => v15::xlog_utils::generate_wal_segment(segno, system_id), + _ => Err(SerializeError::BadInput), + } +} + +pub fn generate_pg_control( + pg_control_bytes: &[u8], + checkpoint_bytes: &[u8], + lsn: Lsn, + pg_version: u32, +) -> anyhow::Result<(Bytes, u64)> { + match pg_version { + 14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), + 15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), + _ => anyhow::bail!("Unknown version {}", pg_version), + } +} + // PG timeline is always 1, changing it doesn't have any useful meaning in Neon. // // NOTE: this is not to be confused with Neon timelines; different concept! 
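The split into `pg_constants_v14.rs`/`pg_constants_v15.rs` above exists because the same `bimg_info` bit means different things in different Postgres versions, so helpers such as `bkpimage_is_compressed` have to dispatch on `pg_version`. Below is a self-contained sketch of that dispatch pattern; the constant values are copied from the new files in this patch, while the module and function names are local to the example:

```rust
// Constant values as added in pg_constants_v14.rs / pg_constants_v15.rs.
mod v14 {
    pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02;
}
mod v15 {
    pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04;
    pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08;
    pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10;
}

/// Version-dispatching check, same shape as the `bkpimage_is_compressed`
/// helper added to postgres_ffi in this patch.
fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> Result<bool, String> {
    match version {
        14 => Ok(bimg_info & v14::BKPIMAGE_IS_COMPRESSED != 0),
        15 => Ok(bimg_info
            & (v15::BKPIMAGE_COMPRESS_PGLZ
                | v15::BKPIMAGE_COMPRESS_LZ4
                | v15::BKPIMAGE_COMPRESS_ZSTD)
            != 0),
        _ => Err(format!("Unknown version {version}")),
    }
}

fn main() {
    // 0x04 is BKPIMAGE_APPLY on v14 but BKPIMAGE_COMPRESS_PGLZ on v15,
    // which is exactly why the check cannot share one constant across versions.
    assert!(!bkpimage_is_compressed(0x04, 14).unwrap());
    assert!(bkpimage_is_compressed(0x04, 15).unwrap());
    assert!(bkpimage_is_compressed(0x02, 14).unwrap());
}
```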
@@ -74,7 +127,7 @@ pub const PG_TLI: u32 = 1; // See TransactionIdIsNormal in transam.h pub const fn transaction_id_is_normal(id: TransactionId) -> bool { - id > v14::pg_constants::FIRST_NORMAL_TRANSACTION_ID + id > pg_constants::FIRST_NORMAL_TRANSACTION_ID } // See TransactionIdPrecedes in transam.c @@ -109,3 +162,76 @@ pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) { pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); } + +pub mod waldecoder { + + use crate::{v14, v15}; + use bytes::{Buf, Bytes, BytesMut}; + use std::num::NonZeroU32; + use thiserror::Error; + use utils::lsn::Lsn; + + pub enum State { + WaitingForRecord, + ReassemblingRecord { + recordbuf: BytesMut, + contlen: NonZeroU32, + }, + SkippingEverything { + skip_until_lsn: Lsn, + }, + } + + pub struct WalStreamDecoder { + pub lsn: Lsn, + pub pg_version: u32, + pub inputbuf: BytesMut, + pub state: State, + } + + #[derive(Error, Debug, Clone)] + #[error("{msg} at {lsn}")] + pub struct WalDecodeError { + pub msg: String, + pub lsn: Lsn, + } + + impl WalStreamDecoder { + pub fn new(lsn: Lsn, pg_version: u32) -> WalStreamDecoder { + WalStreamDecoder { + lsn, + pg_version, + inputbuf: BytesMut::new(), + state: State::WaitingForRecord, + } + } + + // The latest LSN position fed to the decoder. + pub fn available(&self) -> Lsn { + self.lsn + self.inputbuf.remaining() as u64 + } + + pub fn feed_bytes(&mut self, buf: &[u8]) { + self.inputbuf.extend_from_slice(buf); + } + + pub fn poll_decode(&mut self) -> Result, WalDecodeError> { + match self.pg_version { + // This is a trick to support both versions simultaneously. + // See WalStreamDecoderHandler comments. + 14 => { + use self::v14::waldecoder_handler::WalStreamDecoderHandler; + self.poll_decode_internal() + } + 15 => { + use self::v15::waldecoder_handler::WalStreamDecoderHandler; + self.poll_decode_internal() + } + _ => Err(WalDecodeError { + msg: format!("Unknown version {}", self.pg_version), + lsn: self.lsn, + }), + } + } + } +} diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs index 1de1d367e0..01e5554b8a 100644 --- a/libs/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -1,7 +1,7 @@ //! //! Common utilities for dealing with PostgreSQL non-relation files. //! -use super::pg_constants; +use crate::pg_constants; use crate::transaction_id_precedes; use bytes::BytesMut; use log::*; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 8cc9fa7af6..6aaa739a69 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -1,14 +1,16 @@ //! //! Misc constants, copied from PostgreSQL headers. //! +//! Only place version-independent constants here. +//! //! TODO: These probably should be auto-generated using bindgen, //! rather than copied by hand. Although on the other hand, it's nice //! to have them all here in one place, and have the ability to add //! comments on them. //! 
-use super::bindings::{PageHeaderData, XLogRecord}; use crate::BLCKSZ; +use crate::{PageHeaderData, XLogRecord}; // // From pg_tablespace_d.h @@ -16,14 +18,6 @@ use crate::BLCKSZ; pub const DEFAULTTABLESPACE_OID: u32 = 1663; pub const GLOBALTABLESPACE_OID: u32 = 1664; -// -// Fork numbers, from relpath.h -// -pub const MAIN_FORKNUM: u8 = 0; -pub const FSM_FORKNUM: u8 = 1; -pub const VISIBILITYMAP_FORKNUM: u8 = 2; -pub const INIT_FORKNUM: u8 = 3; - // From storage_xlog.h pub const XLOG_SMGR_CREATE: u8 = 0x10; pub const XLOG_SMGR_TRUNCATE: u8 = 0x20; @@ -114,7 +108,6 @@ pub const XLOG_NEXTOID: u8 = 0x30; pub const XLOG_SWITCH: u8 = 0x40; pub const XLOG_FPI_FOR_HINT: u8 = 0xA0; pub const XLOG_FPI: u8 = 0xB0; -pub const DB_SHUTDOWNED: u32 = 1; // From multixact.h pub const FIRST_MULTIXACT_ID: u32 = 1; @@ -169,10 +162,6 @@ pub const RM_HEAP_ID: u8 = 10; pub const XLR_INFO_MASK: u8 = 0x0F; pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; -// from dbcommands_xlog.h -pub const XLOG_DBASE_CREATE: u8 = 0x00; -pub const XLOG_DBASE_DROP: u8 = 0x10; - pub const XLOG_TBLSPC_CREATE: u8 = 0x00; pub const XLOG_TBLSPC_DROP: u8 = 0x10; @@ -197,8 +186,6 @@ pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous /* Information stored in bimg_info */ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */ -pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ -pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ /* From transam.h */ pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3; diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs new file mode 100644 index 0000000000..810898ee80 --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -0,0 +1,5 @@ +pub const XLOG_DBASE_CREATE: u8 = 0x00; +pub const XLOG_DBASE_DROP: u8 = 0x10; + +pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ +pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs new file mode 100644 index 0000000000..6fa5eb008c --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -0,0 +1,10 @@ +pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; + +pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00; +pub const XLOG_DBASE_DROP: u8 = 0x20; + +pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ +pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ diff --git a/libs/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs index f3476acc9c..1dc9f367ff 100644 --- a/libs/postgres_ffi/src/relfile_utils.rs +++ b/libs/postgres_ffi/src/relfile_utils.rs @@ -1,10 +1,17 @@ //! //! Common utilities for dealing with PostgreSQL relation files. //! 
-use super::pg_constants; use once_cell::sync::OnceCell; use regex::Regex; +// +// Fork numbers, from relpath.h +// +pub const MAIN_FORKNUM: u8 = 0; +pub const FSM_FORKNUM: u8 = 1; +pub const VISIBILITYMAP_FORKNUM: u8 = 2; +pub const INIT_FORKNUM: u8 = 3; + #[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] pub enum FilePathError { #[error("invalid relation fork name")] @@ -23,10 +30,10 @@ impl From for FilePathError { pub fn forkname_to_number(forkname: Option<&str>) -> Result { match forkname { // "main" is not in filenames, it's implicit if the fork name is not present - None => Ok(pg_constants::MAIN_FORKNUM), - Some("fsm") => Ok(pg_constants::FSM_FORKNUM), - Some("vm") => Ok(pg_constants::VISIBILITYMAP_FORKNUM), - Some("init") => Ok(pg_constants::INIT_FORKNUM), + None => Ok(MAIN_FORKNUM), + Some("fsm") => Ok(FSM_FORKNUM), + Some("vm") => Ok(VISIBILITYMAP_FORKNUM), + Some("init") => Ok(INIT_FORKNUM), Some(_) => Err(FilePathError::InvalidForkName), } } @@ -34,10 +41,10 @@ pub fn forkname_to_number(forkname: Option<&str>) -> Result { /// Convert Postgres fork number to the right suffix of the relation data file. pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> { match forknum { - pg_constants::MAIN_FORKNUM => None, - pg_constants::FSM_FORKNUM => Some("fsm"), - pg_constants::VISIBILITYMAP_FORKNUM => Some("vm"), - pg_constants::INIT_FORKNUM => Some("init"), + MAIN_FORKNUM => None, + FSM_FORKNUM => Some("fsm"), + VISIBILITYMAP_FORKNUM => Some("vm"), + INIT_FORKNUM => Some("init"), _ => Some("UNKNOWN FORKNUM"), } } diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder_handler.rs similarity index 90% rename from libs/postgres_ffi/src/waldecoder.rs rename to libs/postgres_ffi/src/waldecoder_handler.rs index 4d79e4b1d1..b4d50375bd 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder_handler.rs @@ -8,6 +8,7 @@ //! to look deeper into the WAL records to also understand which blocks they modify, the code //! for that is in pageserver/src/walrecord.rs //! +use super::super::waldecoder::{State, WalDecodeError, WalStreamDecoder}; use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; use super::xlog_utils::*; use crate::WAL_SEGMENT_SIZE; @@ -16,55 +17,26 @@ use crc32c::*; use log::*; use std::cmp::min; use std::num::NonZeroU32; -use thiserror::Error; use utils::lsn::Lsn; -enum State { - WaitingForRecord, - ReassemblingRecord { - recordbuf: BytesMut, - contlen: NonZeroU32, - }, - SkippingEverything { - skip_until_lsn: Lsn, - }, -} - -pub struct WalStreamDecoder { - lsn: Lsn, - inputbuf: BytesMut, - state: State, -} - -#[derive(Error, Debug, Clone)] -#[error("{msg} at {lsn}")] -pub struct WalDecodeError { - msg: String, - lsn: Lsn, +pub trait WalStreamDecoderHandler { + fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError>; + fn poll_decode_internal(&mut self) -> Result, WalDecodeError>; + fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError>; } // -// WalRecordStream is a Stream that returns a stream of WAL records -// FIXME: This isn't a proper rust stream +// This is a trick to support several postgres versions simultaneously. // -impl WalStreamDecoder { - pub fn new(lsn: Lsn) -> WalStreamDecoder { - WalStreamDecoder { - lsn, - inputbuf: BytesMut::new(), - state: State::WaitingForRecord, - } - } - - // The latest LSN position fed to the decoder. 
- pub fn available(&self) -> Lsn { - self.lsn + self.inputbuf.remaining() as u64 - } - - pub fn feed_bytes(&mut self, buf: &[u8]) { - self.inputbuf.extend_from_slice(buf); - } - +// Page decoding code depends on postgres bindings, so it is compiled for each version. +// Thus WalStreamDecoder implements several WalStreamDecoderHandler traits. +// WalStreamDecoder poll_decode() method dispatches to the right handler based on the postgres version. +// Other methods are internal and are not dispatched. +// +// It is similar to having several impl blocks for the same struct, +// but the impls here are in different modules, so need to use a trait. +// +impl WalStreamDecoderHandler for WalStreamDecoder { fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> { let validate_impl = || { if hdr.xlp_magic != XLOG_PAGE_MAGIC as u16 { @@ -125,7 +97,7 @@ impl WalStreamDecoder { /// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function /// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid. /// - pub fn poll_decode(&mut self) -> Result, WalDecodeError> { + fn poll_decode_internal(&mut self) -> Result, WalDecodeError> { // Run state machine that validates page headers, and reassembles records // that cross page boundaries. loop { diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index f8606b6e47..953723a8f0 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -9,12 +9,13 @@ use crc32c::crc32c_append; +use super::super::waldecoder::WalStreamDecoder; use super::bindings::{ - CheckPoint, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData, - XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, + CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz, + XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, }; -use super::pg_constants; -use super::waldecoder::WalStreamDecoder; +use super::PG_MAJORVERSION; +use crate::pg_constants; use crate::PG_TLI; use crate::{uint32, uint64, Oid}; use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; @@ -56,12 +57,10 @@ pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; /// in order to let CLOG_TRUNCATE mechanism correctly extend CLOG. 
const XID_CHECKPOINT_INTERVAL: u32 = 1024; -#[allow(non_snake_case)] pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo { (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo } -#[allow(non_snake_case)] pub fn XLogSegNoOffsetToRecPtr( segno: XLogSegNo, offset: u32, @@ -70,7 +69,6 @@ pub fn XLogSegNoOffsetToRecPtr( segno * (wal_segsz_bytes as u64) + (offset as u64) } -#[allow(non_snake_case)] pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { format!( "{:>08X}{:>08X}{:>08X}", @@ -80,7 +78,6 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize ) } -#[allow(non_snake_case)] pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) { let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; @@ -88,12 +85,10 @@ pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLin (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli) } -#[allow(non_snake_case)] pub fn IsXLogFileName(fname: &str) -> bool { return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()); } -#[allow(non_snake_case)] pub fn IsPartialXLogFileName(fname: &str) -> bool { fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]) } @@ -113,6 +108,30 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { } } +pub fn generate_pg_control( + pg_control_bytes: &[u8], + checkpoint_bytes: &[u8], + lsn: Lsn, +) -> anyhow::Result<(Bytes, u64)> { + let mut pg_control = ControlFileData::decode(pg_control_bytes)?; + let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?; + + // Generate new pg_control needed for bootstrap + checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; + + //reset some fields we don't want to preserve + //TODO Check this. + //We may need to determine the value from twophase data. + checkpoint.oldestActiveXid = 0; + + //save new values in pg_control + pg_control.checkPoint = 0; + pg_control.checkPointCopy = checkpoint; + pg_control.state = DBState_DB_SHUTDOWNED; + + Ok((pg_control.encode(), pg_control.system_identifier)) +} + pub fn get_current_timestamp() -> TimestampTz { to_pg_timestamp(SystemTime::now()) } @@ -144,7 +163,10 @@ pub fn find_end_of_wal( let mut result = start_lsn; let mut curr_lsn = start_lsn; let mut buf = [0u8; XLOG_BLCKSZ]; - let mut decoder = WalStreamDecoder::new(start_lsn); + let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + debug!("find_end_of_wal PG_VERSION: {}", pg_version); + + let mut decoder = WalStreamDecoder::new(start_lsn, pg_version); // loop over segments loop { @@ -154,7 +176,7 @@ pub fn find_end_of_wal( match open_wal_segment(&seg_file_path)? 
{ None => { // no more segments - info!( + debug!( "find_end_of_wal reached end at {:?}, segment {:?} doesn't exist", result, seg_file_path ); @@ -177,7 +199,7 @@ pub fn find_end_of_wal( match decoder.poll_decode() { Ok(Some(record)) => result = record.0, Err(e) => { - info!( + debug!( "find_end_of_wal reached end at {:?}, decode error: {:?}", result, e ); @@ -438,12 +460,15 @@ mod tests { fn test_end_of_wal(test_name: &str) { use wal_craft::*; + let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + // Craft some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("..") .join(".."); let cfg = Conf { - pg_distrib_dir: top_path.join(format!("pg_install/{PG_MAJORVERSION}")), + pg_version, + pg_distrib_dir: top_path.join("pg_install"), datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), }; if cfg.datadir.exists() { diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index f848ac1273..88466737ed 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -14,3 +14,4 @@ once_cell = "1.13.0" postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } postgres_ffi = { path = "../" } tempfile = "3.2" +workspace_hack = { version = "0.1", path = "../../../workspace_hack" } diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 2a607db6dc..9563298cd8 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -37,9 +37,16 @@ fn main() -> Result<()> { Arg::new("pg-distrib-dir") .long("pg-distrib-dir") .takes_value(true) - .help("Directory with Postgres distribution (bin and lib directories, e.g. pg_install/v14)") + .help("Directory with Postgres distributions (bin and lib directories, e.g. 
pg_install containing subpath `v14/bin/postgresql`)") .default_value("/usr/local") ) + .arg( + Arg::new("pg-version") + .long("pg-version") + .help("Postgres version to use for the initial tenant") + .required(true) + .takes_value(true) + ) ) .subcommand( App::new("in-existing") @@ -82,8 +89,14 @@ fn main() -> Result<()> { } Ok(()) } + Some(("with-initdb", arg_matches)) => { let cfg = Conf { + pg_version: arg_matches + .value_of("pg-version") + .unwrap() + .parse::() + .context("Failed to parse postgres version from the argument string")?, pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(), datadir: arg_matches.value_of("datadir").unwrap().into(), }; diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 2ad92d776d..7ffe19e209 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -15,6 +15,7 @@ use tempfile::{tempdir, TempDir}; #[derive(Debug, Clone, PartialEq, Eq)] pub struct Conf { + pub pg_version: u32, pub pg_distrib_dir: PathBuf, pub datadir: PathBuf, } @@ -36,12 +37,22 @@ pub static REQUIRED_POSTGRES_CONFIG: Lazy> = Lazy::new(|| { }); impl Conf { + pub fn pg_distrib_dir(&self) -> PathBuf { + let path = self.pg_distrib_dir.clone(); + + match self.pg_version { + 14 => path.join(format!("v{}", self.pg_version)), + 15 => path.join(format!("v{}", self.pg_version)), + _ => panic!("Unsupported postgres version: {}", self.pg_version), + } + } + fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") + self.pg_distrib_dir().join("bin") } fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + self.pg_distrib_dir().join("lib") } pub fn wal_dir(&self) -> PathBuf { diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index b3485f274a..cec344a4ad 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" anyhow = { version = "1.0", features = ["backtrace"] } async-trait = "0.1" metrics = { version = "0.1", path = "../metrics" } +utils = { version = "0.1", path = "../utils" } once_cell = "1.13.0" rusoto_core = "0.48" rusoto_s3 = "0.48" diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 6b3fd29a0e..4bdd2b9608 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -9,9 +9,7 @@ mod local_fs; mod s3_bucket; use std::{ - borrow::Cow, collections::HashMap, - ffi::OsStr, fmt::{Debug, Display}, num::{NonZeroU32, NonZeroUsize}, ops::Deref, @@ -344,22 +342,6 @@ impl Debug for S3Config { } } -/// Adds a suffix to the file(directory) name, either appending the suffux to the end of its extension, -/// or if there's no extension, creates one and puts a suffix there. 
-pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { - let new_extension = match original_path - .as_ref() - .extension() - .map(OsStr::to_string_lossy) - { - Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), - None => Cow::Borrowed(suffix), - }; - original_path - .as_ref() - .with_extension(new_extension.as_ref()) -} - impl RemoteStorageConfig { pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { let local_path = toml.get("local_path"); @@ -448,35 +430,6 @@ fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result { mod tests { use super::*; - #[test] - fn test_path_with_suffix_extension() { - let p = PathBuf::from("/foo/bar"); - assert_eq!( - &path_with_suffix_extension(&p, "temp").to_string_lossy(), - "/foo/bar.temp" - ); - let p = PathBuf::from("/foo/bar"); - assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), - "/foo/bar.temp.temp" - ); - let p = PathBuf::from("/foo/bar.baz"); - assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), - "/foo/bar.baz.temp.temp" - ); - let p = PathBuf::from("/foo/bar.baz"); - assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), - "/foo/bar.baz..temp" - ); - let p = PathBuf::from("/foo/bar/dir/"); - assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), - "/foo/bar/dir..temp" - ); - } - #[test] fn object_name() { let k = RemoteObjectId("a/b/c".to_owned()); diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 3ffbf3cb39..5723a512f6 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -16,8 +16,9 @@ use tokio::{ io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; use tracing::*; +use utils::crashsafe_dir::path_with_suffix_extension; -use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectId}; +use crate::{Download, DownloadError, RemoteObjectId}; use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml new file mode 100644 index 0000000000..852d643f30 --- /dev/null +++ b/libs/safekeeper_api/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "safekeeper_api" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_with = "1.12.0" +const_format = "0.2.21" + +utils = { path = "../utils" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs new file mode 100644 index 0000000000..0a391478da --- /dev/null +++ b/libs/safekeeper_api/src/lib.rs @@ -0,0 +1,10 @@ +use const_format::formatcp; + +/// Public API types +pub mod models; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); + +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); diff --git a/safekeeper/src/http/models.rs b/libs/safekeeper_api/src/models.rs similarity index 100% rename from safekeeper/src/http/models.rs rename to libs/safekeeper_api/src/models.rs diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index badcb5774e..98d839ca55 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -3,20 +3,20 @@ use criterion::{criterion_group, criterion_main, Criterion}; use utils::id; -pub 
fn bench_zid_stringify(c: &mut Criterion) { +pub fn bench_id_stringify(c: &mut Criterion) { // Can only use public methods. - let ztl = id::TenantTimelineId::generate(); + let ttid = id::TenantTimelineId::generate(); - c.bench_function("zid.to_string", |b| { + c.bench_function("id.to_string", |b| { b.iter(|| { // FIXME measurement overhead? //for _ in 0..1000 { - // ztl.tenant_id.to_string(); + // ttid.tenant_id.to_string(); //} - ztl.tenant_id.to_string(); + ttid.tenant_id.to_string(); }) }); } -criterion_group!(benches, bench_zid_stringify); +criterion_group!(benches, bench_id_stringify); criterion_main!(benches); diff --git a/libs/utils/src/crashsafe_dir.rs b/libs/utils/src/crashsafe_dir.rs index a7eab73a43..032ab0a916 100644 --- a/libs/utils/src/crashsafe_dir.rs +++ b/libs/utils/src/crashsafe_dir.rs @@ -1,7 +1,9 @@ use std::{ + borrow::Cow, + ffi::OsStr, fs::{self, File}, io, - path::Path, + path::{Path, PathBuf}, }; /// Similar to [`std::fs::create_dir`], except we fsync the @@ -74,6 +76,22 @@ pub fn create_dir_all(path: impl AsRef) -> io::Result<()> { Ok(()) } +/// Adds a suffix to the file(directory) name, either appending the suffix to the end of its extension, +/// or if there's no extension, creates one and puts a suffix there. +pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { + let new_extension = match original_path + .as_ref() + .extension() + .map(OsStr::to_string_lossy) + { + Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), + None => Cow::Borrowed(suffix), + }; + original_path + .as_ref() + .with_extension(new_extension.as_ref()) +} + #[cfg(test)] mod tests { use tempfile::tempdir; @@ -122,4 +140,33 @@ mod tests { let invalid_dir_path = file_path.join("folder"); create_dir_all(&invalid_dir_path).unwrap_err(); } + + #[test] + fn test_path_with_suffix_extension() { + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp").to_string_lossy(), + "/foo/bar.temp" + ); + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.baz.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar.baz..temp" + ); + let p = PathBuf::from("/foo/bar/dir/"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar/dir..temp" + ); + } } diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index b3bbec0f1c..b0ecb746d9 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -1,12 +1,11 @@ -use anyhow::anyhow; use hyper::{header, Body, Response, StatusCode}; use serde::{Deserialize, Serialize}; use thiserror::Error; #[derive(Debug, Error)] pub enum ApiError { - #[error("Bad request: {0}")] - BadRequest(String), + #[error("Bad request: {0:#?}")] + BadRequest(anyhow::Error), #[error("Forbidden: {0}")] Forbidden(String), @@ -15,24 +14,20 @@ pub enum ApiError { Unauthorized(String), #[error("NotFound: {0}")] - NotFound(String), + NotFound(anyhow::Error), #[error("Conflict: {0}")] Conflict(String), #[error(transparent)] - InternalServerError(#[from] anyhow::Error), + InternalServerError(anyhow::Error), } impl ApiError { - pub fn from_err>(err: E) -> Self { - Self::InternalServerError(anyhow!(err)) - } - pub fn into_response(self) -> Response { 
match self { - ApiError::BadRequest(_) => HttpErrorBody::response_from_msg_and_status( - self.to_string(), + ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause StatusCode::BAD_REQUEST, ), ApiError::Forbidden(_) => { diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 08f2ac4205..8981fdd1dd 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -1,3 +1,4 @@ +use anyhow::Context; use bytes::Buf; use hyper::{header, Body, Request, Response, StatusCode}; use serde::{Deserialize, Serialize}; @@ -9,20 +10,24 @@ pub async fn json_request Deserialize<'de>>( ) -> Result { let whole_body = hyper::body::aggregate(request.body_mut()) .await - .map_err(ApiError::from_err)?; + .context("Failed to read request body") + .map_err(ApiError::BadRequest)?; serde_json::from_reader(whole_body.reader()) - .map_err(|err| ApiError::BadRequest(format!("Failed to parse json request {}", err))) + .context("Failed to parse json request") + .map_err(ApiError::BadRequest) } pub fn json_response( status: StatusCode, data: T, ) -> Result, ApiError> { - let json = serde_json::to_string(&data).map_err(ApiError::from_err)?; + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; let response = Response::builder() .status(status) .header(header::CONTENT_TYPE, "application/json") .body(Body::from(json)) - .map_err(ApiError::from_err)?; + .map_err(|e| ApiError::InternalServerError(e.into()))?; Ok(response) } diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 4984d695fd..7b96ccd584 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -1,6 +1,7 @@ use std::str::FromStr; use super::error::ApiError; +use anyhow::anyhow; use hyper::{body::HttpBody, Body, Request}; use routerify::ext::RequestExt; @@ -10,9 +11,8 @@ pub fn get_request_param<'a>( ) -> Result<&'a str, ApiError> { match request.param(param_name) { Some(arg) => Ok(arg), - None => Err(ApiError::BadRequest(format!( - "no {} specified in path param", - param_name + None => Err(ApiError::BadRequest(anyhow!( + "no {param_name} specified in path param", ))), } } @@ -23,16 +23,15 @@ pub fn parse_request_param( ) -> Result { match get_request_param(request, param_name)?.parse() { Ok(v) => Ok(v), - Err(_) => Err(ApiError::BadRequest(format!( - "failed to parse {}", - param_name + Err(_) => Err(ApiError::BadRequest(anyhow!( + "failed to parse {param_name}", ))), } } pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> { match request.body_mut().data().await { - Some(_) => Err(ApiError::BadRequest("Unexpected request body".into())), + Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))), None => Ok(()), } } diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 0498e0887b..adee46c2dd 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -429,8 +429,22 @@ impl PostgresBackend { // full cause of the error, not just the top-level context + its trace. // We don't want to send that in the ErrorResponse though, // because it's not relevant to the compute node logs. 
- error!("query handler for '{}' failed: {:?}", query_string, e); - self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?; + // + // We also don't want to log full stacktrace when the error is primitive, + // such as usual connection closed. + let short_error = format!("{:#}", e); + let root_cause = e.root_cause().to_string(); + if root_cause.contains("connection closed unexpectedly") + || root_cause.contains("Broken pipe (os error 32)") + { + error!( + "query handler for '{}' failed: {}", + query_string, short_error + ); + } else { + error!("query handler for '{}' failed: {:?}", query_string, e); + } + self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?; // TODO: untangle convoluted control flow if e.to_string().contains("failed to run") { return Ok(ProcessMsgResult::Break); diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index dde76039d7..21952ab87e 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -931,7 +931,7 @@ impl ReplicationFeedback { // Deserialize ReplicationFeedback message pub fn parse(mut buf: Bytes) -> ReplicationFeedback { - let mut zf = ReplicationFeedback::empty(); + let mut rf = ReplicationFeedback::empty(); let nfields = buf.get_u8(); for _ in 0..nfields { let key = read_cstr(&mut buf).unwrap(); @@ -939,31 +939,31 @@ impl ReplicationFeedback { b"current_timeline_size" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.current_timeline_size = buf.get_u64(); + rf.current_timeline_size = buf.get_u64(); } b"ps_writelsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_writelsn = buf.get_u64(); + rf.ps_writelsn = buf.get_u64(); } b"ps_flushlsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_flushlsn = buf.get_u64(); + rf.ps_flushlsn = buf.get_u64(); } b"ps_applylsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_applylsn = buf.get_u64(); + rf.ps_applylsn = buf.get_u64(); } b"ps_replytime" => { let len = buf.get_i32(); assert_eq!(len, 8); let raw_time = buf.get_i64(); if raw_time > 0 { - zf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); + rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); } else { - zf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); + rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); } } _ => { @@ -976,8 +976,8 @@ impl ReplicationFeedback { } } } - trace!("ReplicationFeedback parsed is {:?}", zf); - zf + trace!("ReplicationFeedback parsed is {:?}", rf); + rf } } @@ -987,29 +987,29 @@ mod tests { #[test] fn test_replication_feedback_serialization() { - let mut zf = ReplicationFeedback::empty(); - // Fill zf with some values - zf.current_timeline_size = 12345678; + let mut rf = ReplicationFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. 
- zf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); - zf.serialize(&mut data).unwrap(); + rf.serialize(&mut data).unwrap(); - let zf_parsed = ReplicationFeedback::parse(data.freeze()); - assert_eq!(zf, zf_parsed); + let rf_parsed = ReplicationFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); } #[test] fn test_replication_feedback_unknown_key() { - let mut zf = ReplicationFeedback::empty(); - // Fill zf with some values - zf.current_timeline_size = 12345678; + let mut rf = ReplicationFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. - zf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); - zf.serialize(&mut data).unwrap(); + rf.serialize(&mut data).unwrap(); // Add an extra field to the buffer and adjust number of keys if let Some(first) = data.first_mut() { @@ -1021,8 +1021,8 @@ mod tests { data.put_u64(42); // Parse serialized data and check that new field is not parsed - let zf_parsed = ReplicationFeedback::parse(data.freeze()); - assert_eq!(zf, zf_parsed); + let rf_parsed = ReplicationFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); } #[test] diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index 467b900a13..bf330a482c 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -240,7 +240,6 @@ where mod tests { use super::*; use std::sync::Arc; - use std::thread::sleep; use std::time::Duration; impl MonotonicCounter for i32 { @@ -258,17 +257,19 @@ mod tests { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); let seq3 = Arc::clone(&seq); - tokio::task::spawn(async move { + let jh1 = tokio::task::spawn(async move { seq2.wait_for(42).await.expect("wait_for 42"); let old = seq2.advance(100); assert_eq!(old, 99); - seq2.wait_for(999).await.expect_err("no 999"); + seq2.wait_for_timeout(999, Duration::from_millis(100)) + .await + .expect_err("no 999"); }); - tokio::task::spawn(async move { + let jh2 = tokio::task::spawn(async move { seq3.wait_for(42).await.expect("wait_for 42"); seq3.wait_for(0).await.expect("wait_for 0"); }); - sleep(Duration::from_secs(1)); + tokio::time::sleep(Duration::from_millis(200)).await; let old = seq.advance(99); assert_eq!(old, 0); seq.wait_for(100).await.expect("wait_for 100"); @@ -277,6 +278,9 @@ mod tests { assert_eq!(seq.advance(98), 100); assert_eq!(seq.load(), 100); + jh1.await.unwrap(); + jh2.await.unwrap(); + seq.shutdown(); } @@ -284,15 +288,18 @@ mod tests { async fn seqwait_timeout() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); - tokio::task::spawn(async move { + let jh = tokio::task::spawn(async move { let timeout = Duration::from_millis(1); let res = seq2.wait_for_timeout(42, timeout).await; assert_eq!(res, Err(SeqWaitError::Timeout)); }); - tokio::time::sleep(Duration::from_secs(1)).await; + tokio::time::sleep(Duration::from_millis(200)).await; // This will attempt to wake, but nothing will happen // because the waiter already dropped its Receiver. 
let old = seq.advance(99);
- assert_eq!(old, 0)
+ assert_eq!(old, 0);
+ jh.await.unwrap();
+
+ seq.shutdown();
}
}
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index aa7ed507dd..3729b942be 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -4,12 +4,12 @@ version = "0.1.0"
edition = "2021"
[features]
-# It is simpler infra-wise to have failpoints enabled by default
-# It shouldn't affect performance in any way because failpoints
-# are not placed in hot code paths
-default = ["failpoints"]
+default = []
+# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
+# which adds some runtime cost and is used to run tests that simulate outage conditions
+testing = ["fail/failpoints"]
+
profiling = ["pprof"]
-failpoints = ["fail/failpoints"]
[dependencies]
async-stream = "0.3"
@@ -54,7 +54,11 @@ once_cell = "1.13.0"
crossbeam-utils = "0.8.5"
fail = "0.5.0"
git-version = "0.3.5"
+rstar = "0.9.3"
+num-traits = "0.2.15"
+amplify_num = "0.4.1"
+pageserver_api = { path = "../libs/pageserver_api" }
postgres_ffi = { path = "../libs/postgres_ffi" }
etcd_broker = { path = "../libs/etcd_broker" }
metrics = { path = "../libs/metrics" }
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index eca6a3c87f..d0a57a473b 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -25,10 +25,10 @@ use tracing::*;
use crate::reltag::{RelTag, SlruKind};
use crate::tenant::Timeline;
-use postgres_ffi::v14::pg_constants;
-use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName};
-use postgres_ffi::v14::{CheckPoint, ControlFileData};
+use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
+use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
use postgres_ffi::TransactionId;
+use postgres_ffi::XLogFileName;
use postgres_ffi::PG_TLI;
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
use utils::lsn::Lsn;
@@ -129,15 +129,15 @@ where
// TODO include checksum
// Create pgdata subdirs structure
- for dir in pg_constants::PGDATA_SUBDIRS.iter() {
+ for dir in PGDATA_SUBDIRS.iter() {
let header = new_tar_header_dir(*dir)?;
self.ar.append(&header, &mut io::empty())?;
}
// Send empty config files.
- for filepath in pg_constants::PGDATA_SPECIAL_FILES.iter() { + for filepath in PGDATA_SPECIAL_FILES.iter() { if *filepath == "pg_hba.conf" { - let data = pg_constants::PG_HBA.as_bytes(); + let data = PG_HBA.as_bytes(); let header = new_tar_header(filepath, data.len() as u64)?; self.ar.append(&header, data)?; } else { @@ -267,16 +267,12 @@ where None }; - // TODO pass this as a parameter - let pg_version = "14"; + if spcnode == GLOBALTABLESPACE_OID { + let pg_version_str = self.timeline.pg_version.to_string(); + let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; + self.ar.append(&header, pg_version_str.as_bytes())?; - if spcnode == pg_constants::GLOBALTABLESPACE_OID { - let version_bytes = pg_version.as_bytes(); - let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; - - let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; + info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace @@ -305,7 +301,7 @@ where return Ok(()); } // User defined tablespaces are not supported - ensure!(spcnode == pg_constants::DEFAULTTABLESPACE_OID); + ensure!(spcnode == DEFAULTTABLESPACE_OID); // Append dir path for each database let path = format!("base/{}", dbnode); @@ -314,9 +310,10 @@ where if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); - let version_bytes = pg_version.as_bytes(); - let header = new_tar_header(&dst_path, version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; + + let pg_version_str = self.timeline.pg_version.to_string(); + let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; + self.ar.append(&header, pg_version_str.as_bytes())?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; @@ -348,30 +345,6 @@ where // Also send zenith.signal file with extra bootstrap data. // fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn) - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn) - .context("failed get control bytes")?; - let mut pg_control = ControlFileData::decode(&pg_control_bytes)?; - let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - - // Generate new pg_control needed for bootstrap - checkpoint.redo = normalize_lsn(self.lsn, WAL_SEGMENT_SIZE).0; - - //reset some fields we don't want to preserve - //TODO Check this. - //We may need to determine the value from twophase data. 
- checkpoint.oldestActiveXid = 0; - - //save new values in pg_control - pg_control.checkPoint = 0; - pg_control.checkPointCopy = checkpoint; - pg_control.state = pg_constants::DB_SHUTDOWNED; - // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -388,8 +361,23 @@ where zenith_signal.as_bytes(), )?; + let checkpoint_bytes = self + .timeline + .get_checkpoint(self.lsn) + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = self + .timeline + .get_control_file(self.lsn) + .context("failed get control bytes")?; + + let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( + &pg_control_bytes, + &checkpoint_bytes, + self.lsn, + self.timeline.pg_version, + )?; + //send pg_control - let pg_control_bytes = pg_control.encode(); let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; self.ar.append(&header, &pg_control_bytes[..])?; @@ -398,8 +386,10 @@ where let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE); let wal_file_path = format!("pg_wal/{}", wal_file_name); let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?; - let wal_seg = generate_wal_segment(segno, pg_control.system_identifier) - .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; + + let wal_seg = + postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version) + .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..])?; Ok(()) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 92d5eab379..fb79ad3945 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -87,8 +87,8 @@ fn main() -> anyhow::Result<()> { if arg_matches.is_present("enabled-features") { let features: &[&str] = &[ - #[cfg(feature = "failpoints")] - "failpoints", + #[cfg(feature = "testing")] + "testing", #[cfg(feature = "profiling")] "profiling", ]; diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs index 16359c2532..e66049c457 100644 --- a/pageserver/src/bin/update_metadata.rs +++ b/pageserver/src/bin/update_metadata.rs @@ -50,6 +50,7 @@ fn main() -> Result<()> { meta.ancestor_lsn(), meta.latest_gc_cutoff_lsn(), meta.initdb_lsn(), + meta.pg_version(), ); update_meta = true; } @@ -62,6 +63,7 @@ fn main() -> Result<()> { meta.ancestor_lsn(), meta.latest_gc_cutoff_lsn(), meta.initdb_lsn(), + meta.pg_version(), ); update_meta = true; } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 0d1c064290..fe8fee597f 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -22,14 +22,18 @@ use utils::{ use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant_config::{TenantConf, TenantConfOpt}; +/// The name of the metadata file pageserver creates per timeline. 
+pub const METADATA_FILE_NAME: &str = "metadata"; +const TENANT_CONFIG_NAME: &str = "config"; + pub mod defaults { use crate::tenant_config::defaults::*; use const_format::formatcp; - pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; - pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); - pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; - pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); + pub use pageserver_api::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PG_LISTEN_PORT, + }; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; @@ -205,7 +209,7 @@ impl Default for PageServerConfigBuilder { workdir: Set(PathBuf::new()), pg_distrib_dir: Set(env::current_dir() .expect("cannot access current directory") - .join("pg_install/v14")), + .join("pg_install")), auth_type: Set(AuthType::Trust), auth_validation_public_key_path: Set(None), remote_storage_config: Set(None), @@ -346,6 +350,12 @@ impl PageServerConf { self.tenants_path().join(tenant_id.to_string()) } + /// Points to a place in pageserver's local directory, + /// where certain tenant's tenantconf file should be located. + pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf { + self.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME) + } + pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf { self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME) } @@ -368,18 +378,42 @@ impl PageServerConf { .join(tenant_id.to_string()) .join(timeline_id.to_string()) .join(connection_id.to_string()) + + } + + /// Points to a place in pageserver's local directory, + /// where certain timeline's metadata file should be located. 
+ pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf { + self.timeline_path(&timeline_id, &tenant_id) + .join(METADATA_FILE_NAME) } // // Postgres distribution paths // + pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf { + let path = self.pg_distrib_dir.clone(); - pub fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") + match pg_version { + 14 => path.join(format!("v{pg_version}")), + 15 => path.join(format!("v{pg_version}")), + _ => panic!("Unsupported postgres version: {}", pg_version), + } } - pub fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("bin"), + 15 => self.pg_distrib_dir(pg_version).join("bin"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } + } + pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf { + match pg_version { + 14 => self.pg_distrib_dir(pg_version).join("lib"), + 15 => self.pg_distrib_dir(pg_version).join("lib"), + _ => panic!("Unsupported postgres version: {}", pg_version), + } } /// Parse a configuration file (pageserver.toml) into a PageServerConf struct, @@ -448,13 +482,6 @@ impl PageServerConf { ); } - if !conf.pg_distrib_dir.join("bin/postgres").exists() { - bail!( - "Can't find postgres binary at {}", - conf.pg_distrib_dir.display() - ); - } - conf.default_tenant_conf = t_conf.merge(TenantConf::default()); Ok(conf) @@ -624,6 +651,7 @@ mod tests { use tempfile::{tempdir, TempDir}; use super::*; + use crate::DEFAULT_PG_VERSION; const ALL_BASE_VALUES_TOML: &str = r#" # Initial configuration file created by 'pageserver --init' @@ -863,8 +891,9 @@ broker_endpoints = ['{broker_endpoint}'] fs::create_dir_all(&workdir)?; let pg_distrib_dir = tempdir_path.join("pg_distrib"); - fs::create_dir_all(&pg_distrib_dir)?; - let postgres_bin_dir = pg_distrib_dir.join("bin"); + let pg_distrib_dir_versioned = pg_distrib_dir.join(format!("v{DEFAULT_PG_VERSION}")); + fs::create_dir_all(&pg_distrib_dir_versioned)?; + let postgres_bin_dir = pg_distrib_dir_versioned.join("bin"); fs::create_dir_all(&postgres_bin_dir)?; fs::write(postgres_bin_dir.join("postgres"), "I'm postgres, trust me")?; diff --git a/pageserver/src/http/mod.rs b/pageserver/src/http/mod.rs index 4c0be17ecd..1c083bd382 100644 --- a/pageserver/src/http/mod.rs +++ b/pageserver/src/http/mod.rs @@ -1,3 +1,4 @@ -pub mod models; pub mod routes; pub use routes::make_router; + +pub use pageserver_api::models; diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 1f2eba05ec..4e748207c8 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -307,6 +307,7 @@ paths: description: | Create a timeline. Returns new timeline id on success.\ If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline. + If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver. 
requestBody: content: application/json: @@ -322,6 +323,8 @@ paths: ancestor_start_lsn: type: string format: hex + pg_version: + type: integer responses: "201": description: TimelineInfo diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 236762ec7d..d0b75c013a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,9 +1,10 @@ use std::sync::Arc; -use anyhow::{Context, Result}; +use anyhow::{anyhow, Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use remote_storage::GenericRemoteStorage; +use tokio::task::JoinError; use tracing::*; use super::models::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; @@ -15,7 +16,7 @@ use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant::{TenantState, Timeline}; use crate::tenant_config::TenantConfOpt; -use crate::{config::PageServerConf, tenant_mgr, timelines}; +use crate::{config::PageServerConf, tenant_mgr}; use utils::{ auth::JwtAuth, http::{ @@ -29,6 +30,12 @@ use utils::{ lsn::Lsn, }; +// Imports only used for testing APIs +#[cfg(feature = "testing")] +use super::models::{ConfigureFailpointsRequest, TimelineGcRequest}; +#[cfg(feature = "testing")] +use crate::CheckpointConfig; + struct State { conf: &'static PageServerConf, auth: Option>, @@ -123,6 +130,7 @@ fn local_timeline_info_from_timeline( wal_source_connstr, last_received_msg_lsn, last_received_msg_ts, + pg_version: timeline.pg_version, }; Ok(info) } @@ -160,17 +168,18 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. - let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)?; + let local_info = local_timeline_info_from_timeline(&new_timeline, false, false) + .map_err(ApiError::InternalServerError)?; Ok(Some(TimelineInfo { tenant_id, timeline_id: new_timeline.timeline_id, @@ -179,12 +188,11 @@ async fn timeline_create_handler(mut request: Request) -> Result Ok(None), // timeline already exists - Err(err) => Err(err), + Err(err) => Err(ApiError::InternalServerError(err)), } } - .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn)) - .await - .map_err(ApiError::from_err)?; + .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) + .await?; Ok(match new_timeline_info { Some(info) => json_response(StatusCode::CREATED, info)?, @@ -202,10 +210,11 @@ async fn timeline_list_handler(request: Request) -> Result, let timelines = tokio::task::spawn_blocking(move || { let _enter = info_span!("timeline_list", tenant = %tenant_id).entered(); - Ok::<_, anyhow::Error>(tenant_mgr::get_tenant(tenant_id, true)?.list_timelines()) + let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; + Ok(tenant.list_timelines()) }) .await - .map_err(ApiError::from_err)??; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; let mut response_data = Vec::with_capacity(timelines.len()); for (timeline_id, timeline) in timelines { @@ -270,7 +279,7 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result((local_timeline_info, remote_timeline_info)) + Ok::<_, ApiError>((local_timeline_info, remote_timeline_info)) } .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id)) .await?; if 
local_timeline_info.is_none() && remote_timeline_info.is_none() { - Err(ApiError::NotFound(format!( + Err(ApiError::NotFound(anyhow!( "Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely" ))) } else { @@ -327,14 +336,21 @@ async fn tenant_attach_handler(request: Request) -> Result, info!("Handling tenant attach {tenant_id}"); - tokio::task::spawn_blocking(move || { - if tenant_mgr::get_tenant(tenant_id, false).is_ok() { - anyhow::bail!("Tenant is already present locally") - }; - Ok(()) + tokio::task::spawn_blocking(move || match tenant_mgr::get_tenant(tenant_id, false) { + Ok(tenant) => { + if tenant.list_timelines().is_empty() { + info!("Attaching to tenant {tenant_id} with zero timelines"); + Ok(()) + } else { + Err(ApiError::Conflict( + "Tenant is already present locally".to_owned(), + )) + } + } + Err(_) => Ok(()), }) .await - .map_err(ApiError::from_err)??; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; let state = get_state(&request); let remote_index = &state.remote_index; @@ -359,12 +375,12 @@ async fn tenant_attach_handler(request: Request) -> Result, // download index parts for every tenant timeline let remote_timelines = match gather_tenant_timelines_index_parts(state, tenant_id).await { Ok(Some(remote_timelines)) => remote_timelines, - Ok(None) => return Err(ApiError::NotFound("Unknown remote tenant".to_string())), + Ok(None) => return Err(ApiError::NotFound(anyhow!("Unknown remote tenant"))), Err(e) => { error!("Failed to retrieve remote tenant data: {:?}", e); - return Err(ApiError::NotFound( - "Failed to retrieve remote tenant".to_string(), - )); + return Err(ApiError::NotFound(anyhow!( + "Failed to retrieve remote tenant" + ))); } }; @@ -387,7 +403,8 @@ async fn tenant_attach_handler(request: Request) -> Result, for (timeline_id, mut remote_timeline) in remote_timelines { tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id)) .await - .context("Failed to create new timeline directory")?; + .context("Failed to create new timeline directory") + .map_err(ApiError::InternalServerError)?; remote_timeline.awaits_download = true; tenant_entry.insert(timeline_id, remote_timeline); @@ -433,7 +450,10 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, tenant_mgr::detach_tenant(conf, tenant_id) .instrument(info_span!("tenant_detach", tenant = %tenant_id)) .await - .map_err(ApiError::from_err)?; + // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors. + // Replace this with better handling once the error type permits it. 
+ .map_err(ApiError::InternalServerError)?; let mut remote_index = state.remote_index.write().await; remote_index.remove_tenant_entry(&tenant_id); @@ -473,7 +495,7 @@ async fn tenant_list_handler(request: Request) -> Result, A crate::tenant_mgr::list_tenant_info(&remote_index) }) .await - .map_err(ApiError::from_err)?; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?; json_response(StatusCode::OK, response_data) } @@ -485,7 +507,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro // if tenant is in progress of downloading it can be absent in global tenant map let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false)) .await - .map_err(ApiError::from_err)?; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?; let state = get_state(&request); let remote_index = &state.remote_index; @@ -514,7 +536,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro let current_physical_size = match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false)) .await - .map_err(ApiError::from_err)? + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))? { Err(err) => { // Getting local timelines can fail when no local tenant directory is on disk (e.g, when tenant data is being downloaded). @@ -540,6 +562,16 @@ async fn tenant_status(request: Request) -> Result, ApiErro ) } +// Helper function to standardize the error messages we produce on bad durations +// +// Intended to be used with anyhow's `with_context`, e.g.: +// +// let value = result.with_context(bad_duration("name", &value))?; +// +fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String { + move || format!("Cannot parse `{field_name}` duration {value:?}") +} + async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -548,25 +580,39 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result) -> Result json_response(StatusCode::CREATED, TenantCreateResponse(id))?, @@ -616,24 +671,38 @@ async fn tenant_config_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest(anyhow!( + "Cannot manage failpoints because pageserver was compiled without failpoints support" + ))); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = if fp.actions == "exit" { + fail::cfg_callback(fp.name, || { + info!("Exit requested by failpoint"); + std::process::exit(1); + }) + } else { + fail::cfg(fp.name, &fp.actions) + }; + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(anyhow!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} + +// Run GC immediately on given timeline. +// FIXME: This is just for tests. See test_runner/regress/test_gc.py. +// This probably should require special authentication or a global flag to +// enable, I don't think we want to or need to allow regular clients to invoke +// GC. 
+// @hllinnaka in commits ec44f4b29, 3aca717f3 +#[cfg(feature = "testing")] +async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + // FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX + let repo = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?; + let gc_req: TimelineGcRequest = json_request(&mut request).await?; + + let _span_guard = + info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered(); + let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| repo.get_gc_horizon()); + + // Use tenant's pitr setting + let pitr = repo.get_pitr_interval(); + let result = repo + .gc_iteration(Some(timeline_id), gc_horizon, pitr, true) + // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it + // better once the types support it. + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, result) +} + +// Run compaction immediately on given timeline. +// FIXME This is just for tests. Don't expect this to be exposed to +// the users or the api. +// @dhammika in commit a0781f229 +#[cfg(feature = "testing")] +async fn timeline_compact_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; + let timeline = repo + .get_timeline(timeline_id) + .with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}")) + .map_err(ApiError::NotFound)?; + timeline.compact().map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + +// Run checkpoint immediately on given timeline. +#[cfg(feature = "testing")] +async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; + let timeline = repo + .get_timeline(timeline_id) + .with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}")) + .map_err(ApiError::NotFound)?; + timeline + .checkpoint(CheckpointConfig::Forced) + .map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, ()) } @@ -690,12 +870,35 @@ pub fn make_router( })) } + macro_rules! testing_api { + ($handler_desc:literal, $handler:path $(,)?) 
=> {{ + #[cfg(not(feature = "testing"))] + async fn cfg_disabled(_req: Request) -> Result, ApiError> { + Err(ApiError::BadRequest(anyhow!(concat!( + "Cannot ", + $handler_desc, + " because pageserver was compiled without testing APIs", + )))) + } + + #[cfg(feature = "testing")] + let handler = $handler; + #[cfg(not(feature = "testing"))] + let handler = cfg_disabled; + handler + }}; + } + Ok(router .data(Arc::new( State::new(conf, auth, remote_index, remote_storage) .context("Failed to initialize router state")?, )) .get("/v1/status", status_handler) + .put( + "/v1/failpoints", + testing_api!("manage failpoints", failpoints_handler), + ) .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) .get("/v1/tenant/:tenant_id", tenant_status) @@ -708,6 +911,18 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler, ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", + testing_api!("run timeline GC", timeline_gc_handler), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", + testing_api!("run timeline compaction", timeline_compact_handler), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", + testing_api!("run timeline checkpoint", timeline_checkpoint_handler), + ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler, diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index c1e736d552..23c4351b4e 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -16,11 +16,13 @@ use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; -use postgres_ffi::v14::relfile_utils::*; -use postgres_ffi::v14::waldecoder::*; -use postgres_ffi::v14::xlog_utils::*; -use postgres_ffi::v14::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::*; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::ControlFileData; +use postgres_ffi::DBState_DB_SHUTDOWNED; use postgres_ffi::Oid; +use postgres_ffi::XLogFileName; use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; @@ -236,7 +238,7 @@ fn import_slru( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. 
fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> { - let mut waldecoder = WalStreamDecoder::new(startpoint); + let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); @@ -354,7 +356,7 @@ pub fn import_wal_from_tar( end_lsn: Lsn, ) -> Result<()> { // Set up walingest mutable state - let mut waldecoder = WalStreamDecoder::new(start_lsn); + let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version); let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; @@ -439,7 +441,7 @@ fn import_file( len: usize, ) -> Result> { if file_path.starts_with("global") { - let spcnode = pg_constants::GLOBALTABLESPACE_OID; + let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; let dbnode = 0; match file_path @@ -467,7 +469,7 @@ fn import_file( debug!("imported relmap file") } "PG_VERSION" => { - debug!("ignored"); + debug!("ignored PG_VERSION file"); } _ => { import_rel(modification, file_path, spcnode, dbnode, reader, len)?; @@ -495,7 +497,7 @@ fn import_file( debug!("imported relmap file") } "PG_VERSION" => { - debug!("ignored"); + debug!("ignored PG_VERSION file"); } _ => { import_rel(modification, file_path, spcnode, dbnode, reader, len)?; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 1392c204e8..9f31907a0b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -33,11 +33,15 @@ use crate::task_mgr::TaskKind; /// Current storage format version /// -/// This is embedded in the metadata file, and also in the header of all the -/// layer files. If you make any backwards-incompatible changes to the storage +/// This is embedded in the header of all the layer files. +/// If you make any backwards-incompatible changes to the storage /// format, bump this! +/// Note that TimelineMetadata uses its own version number to track +/// backwards-compatible changes to the metadata format. pub const STORAGE_FORMAT_VERSION: u16 = 3; +pub const DEFAULT_PG_VERSION: u32 = 14; + // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; pub const DELTA_FILE_MAGIC: u16 = 0x5A61; @@ -106,7 +110,7 @@ fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds } /// A newtype to store arbitrary data grouped by tenant and timeline ids. -/// One could use [`utils::zid::TenantTimelineId`] for grouping, but that would +/// One could use [`utils::id::TenantTimelineId`] for grouping, but that would /// not include the cases where a certain tenant has zero timelines. /// This is sometimes important: a tenant could be registered during initial load from FS, /// even if he has no timelines on disk. 
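The pieces above fit one theme: the pageserver no longer hardcodes Postgres v14. `DEFAULT_PG_VERSION` in lib.rs supplies the fallback, `WalStreamDecoder::new` now takes the timeline's `pg_version`, and the config changes earlier in this patch resolve a versioned `pg_install/v<N>` layout. A minimal, self-contained sketch of that directory resolution (the base path, function names, and `main` here are illustrative only, not code from this patch):

use std::path::{Path, PathBuf};

// Mirrors `pub const DEFAULT_PG_VERSION: u32 = 14;` from lib.rs above.
const DEFAULT_PG_VERSION: u32 = 14;

// Per-version distribution layout: <pg_distrib_dir>/v14, <pg_distrib_dir>/v15, ...
fn pg_distrib_dir(base: &Path, pg_version: u32) -> PathBuf {
    match pg_version {
        14 | 15 => base.join(format!("v{pg_version}")),
        other => panic!("Unsupported postgres version: {other}"),
    }
}

fn pg_bin_dir(base: &Path, pg_version: u32) -> PathBuf {
    pg_distrib_dir(base, pg_version).join("bin")
}

fn pg_lib_dir(base: &Path, pg_version: u32) -> PathBuf {
    pg_distrib_dir(base, pg_version).join("lib")
}

fn main() {
    let base = Path::new("pg_install");
    // A timeline created without an explicit pg_version falls back to the default.
    println!("{}", pg_bin_dir(base, DEFAULT_PG_VERSION).display()); // pg_install/v14/bin
    println!("{}", pg_lib_dir(base, 15).display()); // pg_install/v15/lib
}

The real accessors in config.rs spell out the 14 and 15 match arms separately, but the behavior is the same: any other version panics as unsupported.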
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 2f03943429..5c2f81d731 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,8 +1,9 @@ use metrics::core::{AtomicU64, GenericCounter}; use metrics::{ - register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec, - register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec, - IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, + GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, + UIntGaugeVec, }; use once_cell::sync::Lazy; use utils::id::{TenantId, TimelineId}; @@ -204,12 +205,34 @@ pub static REMAINING_SYNC_ITEMS: Lazy = Lazy::new(|| { .expect("failed to register pageserver remote storage remaining sync items int gauge") }); -pub static IMAGE_SYNC_TIME: Lazy = Lazy::new(|| { +pub static IMAGE_SYNC_TIME: Lazy = Lazy::new(|| { + register_gauge_vec!( + "pageserver_remote_storage_image_sync_duration", + "Time spent to synchronize (up/download) a whole pageserver image", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register per-timeline pageserver image sync time vec") +}); + +pub static IMAGE_SYNC_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"]; +pub static IMAGE_SYNC_STATUS: &[&str] = &["success", "failure", "abort"]; + +pub static IMAGE_SYNC_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_storage_image_sync_count", + "Number of synchronization operations executed for pageserver images. \ + Grouped by tenant, timeline, operation_kind and status", + &["tenant_id", "timeline_id", "operation_kind", "status"] + ) + .expect("failed to register pageserver image sync count vec") +}); + +pub static IMAGE_SYNC_TIME_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_remote_storage_image_sync_seconds", "Time took to synchronize (download or upload) a whole pageserver image. \ - Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)", - &["tenant_id", "timeline_id", "operation_kind", "status"], + Grouped by operation_kind and status", + &["operation_kind", "status"], vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0] ) .expect("failed to register pageserver image sync time histogram vec") @@ -256,7 +279,7 @@ macro_rules! 
redo_histogram_time_buckets { () => { vec![ 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000, - 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, + 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, ] }; } @@ -411,6 +434,14 @@ impl Drop for TimelineMetrics { for op in SMGR_QUERY_TIME_OPERATIONS { let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]); } + + for op in IMAGE_SYNC_OPERATION_KINDS { + for status in IMAGE_SYNC_STATUS { + let _ = IMAGE_SYNC_COUNT.remove_label_values(&[tenant_id, timeline_id, op, status]); + } + } + + let _ = IMAGE_SYNC_TIME.remove_label_values(&[tenant_id, timeline_id]); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b0b26b4f44..6dd1d0099f 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -29,7 +29,7 @@ use utils::{ lsn::Lsn, postgres_backend::AuthType, postgres_backend_async::{self, PostgresBackend}, - pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}, + pq_proto::{BeMessage, FeMessage, RowDescriptor}, simple_rcu::RcuReadGuard, }; @@ -46,9 +46,9 @@ use crate::tenant::Timeline; use crate::tenant_mgr; use crate::trace::Tracer; use crate::CheckpointConfig; -use postgres_ffi::v14::xlog_utils::to_pg_timestamp; -use postgres_ffi::v14::pg_constants::DEFAULTTABLESPACE_OID; +use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; +use postgres_ffi::to_pg_timestamp; use postgres_ffi::BLCKSZ; // Wrapped in libpq CopyData @@ -569,12 +569,16 @@ impl PageServerHandler { timeline_id: TimelineId, base_lsn: Lsn, _end_lsn: Lsn, + pg_version: u32, ) -> anyhow::Result<()> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let timeline = tenant_mgr::get_tenant(tenant_id, true)? - .create_empty_timeline(timeline_id, base_lsn)?; + let timeline = tenant_mgr::get_tenant(tenant_id, true)?.create_empty_timeline( + timeline_id, + base_lsn, + pg_version, + )?; // TODO mark timeline as not ready until it reaches end_lsn. // We might have some wal to import as well, and we should prevent compute @@ -734,7 +738,7 @@ impl PageServerHandler { Ok(lsn) } - #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_rel_exists_request( &self, timeline: &Timeline, @@ -751,7 +755,7 @@ impl PageServerHandler { })) } - #[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_nblocks_request( &self, timeline: &Timeline, @@ -768,7 +772,7 @@ impl PageServerHandler { })) } - #[instrument(skip(timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] async fn handle_db_size_request( &self, timeline: &Timeline, @@ -788,7 +792,7 @@ impl PageServerHandler { })) } - #[instrument(skip(timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, @@ -1026,19 +1030,27 @@ impl postgres_backend_async::Handler for PageServerHandler { // 1. Get start/end LSN from backup_manifest file // 2. 
Run: // cat my_backup/base.tar | psql -h $PAGESERVER \ - // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN" + // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" let (_, params_raw) = query_string.split_at("import basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); - ensure!(params.len() == 4); + ensure!(params.len() == 5); let tenant_id = TenantId::from_str(params[0])?; let timeline_id = TimelineId::from_str(params[1])?; let base_lsn = Lsn::from_str(params[2])?; let end_lsn = Lsn::from_str(params[3])?; + let pg_version = u32::from_str(params[4])?; self.check_permission(Some(tenant_id))?; match self - .handle_import_basebackup(pgb, tenant_id, timeline_id, base_lsn, end_lsn) + .handle_import_basebackup( + pgb, + tenant_id, + timeline_id, + base_lsn, + end_lsn, + pg_version, + ) .await { Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, @@ -1076,37 +1088,15 @@ impl postgres_backend_async::Handler for PageServerHandler { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("failpoints ") { - ensure!(fail::has_failpoints(), "Cannot manage failpoints because pageserver was compiled without failpoints support"); - - let (_, failpoints) = query_string.split_at("failpoints ".len()); - - for failpoint in failpoints.split(';') { - if let Some((name, actions)) = failpoint.split_once('=') { - info!("cfg failpoint: {} {}", name, actions); - - // We recognize one extra "action" that's not natively recognized - // by the failpoints crate: exit, to immediately kill the process - if actions == "exit" { - fail::cfg_callback(name, || { - info!("Exit requested by failpoint"); - std::process::exit(1); - }) - .unwrap(); - } else { - fail::cfg(name, actions).unwrap(); - } - } else { - bail!("Invalid failpoints format"); - } - } - pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("show ") { // show let (_, params_raw) = query_string.split_at("show ".len()); let params = params_raw.split(' ').collect::>(); ensure!(params.len() == 1, "invalid param number for config command"); let tenant_id = TenantId::from_str(params[0])?; + + self.check_permission(Some(tenant_id))?; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), @@ -1143,91 +1133,6 @@ impl postgres_backend_async::Handler for PageServerHandler { Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("do_gc ") { - // Run GC immediately on given timeline. - // FIXME: This is just for tests. See test_runner/regress/test_gc.py. - // This probably should require special authentication or a global flag to - // enable, I don't think we want to or need to allow regular clients to invoke - // GC. 
- - // do_gc - let re = Regex::new(r"^do_gc ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)([[:digit:]]+)?") - .unwrap(); - - let caps = re - .captures(query_string) - .with_context(|| format!("invalid do_gc: '{}'", query_string))?; - - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - - let tenant = tenant_mgr::get_tenant(tenant_id, true)?; - - let gc_horizon: u64 = caps - .get(4) - .map(|h| h.as_str().parse()) - .unwrap_or_else(|| Ok(tenant.get_gc_horizon()))?; - - // Use tenant's pitr setting - let pitr = tenant.get_pitr_interval(); - let result = tenant.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?; - pgb.write_message(&BeMessage::RowDescription(&[ - RowDescriptor::int8_col(b"layers_total"), - RowDescriptor::int8_col(b"layers_needed_by_cutoff"), - RowDescriptor::int8_col(b"layers_needed_by_pitr"), - RowDescriptor::int8_col(b"layers_needed_by_branches"), - RowDescriptor::int8_col(b"layers_not_updated"), - RowDescriptor::int8_col(b"layers_removed"), - RowDescriptor::int8_col(b"elapsed"), - ]))? - .write_message(&BeMessage::DataRow(&[ - Some(result.layers_total.to_string().as_bytes()), - Some(result.layers_needed_by_cutoff.to_string().as_bytes()), - Some(result.layers_needed_by_pitr.to_string().as_bytes()), - Some(result.layers_needed_by_branches.to_string().as_bytes()), - Some(result.layers_not_updated.to_string().as_bytes()), - Some(result.layers_removed.to_string().as_bytes()), - Some(result.elapsed.as_millis().to_string().as_bytes()), - ]))? - .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("compact ") { - // Run compaction immediately on given timeline. - // FIXME This is just for tests. Don't expect this to be exposed to - // the users or the api. - - // compact - let re = Regex::new(r"^compact ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap(); - - let caps = re - .captures(query_string) - .with_context(|| format!("Invalid compact: '{}'", query_string))?; - - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = get_local_timeline(tenant_id, timeline_id)?; - timeline.compact()?; - - pgb.write_message(&SINGLE_COL_ROWDESC)? - .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("checkpoint ") { - // Run checkpoint immediately on given timeline. - - // checkpoint - let re = Regex::new(r"^checkpoint ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap(); - - let caps = re - .captures(query_string) - .with_context(|| format!("invalid checkpoint command: '{}'", query_string))?; - - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; - let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = get_local_timeline(tenant_id, timeline_id)?; - - // Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`). - timeline.checkpoint(CheckpointConfig::Forced)?; - - pgb.write_message(&SINGLE_COL_ROWDESC)? 
- .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("get_lsn_by_timestamp ") { // Locate LSN of last transaction with timestamp less or equal than sppecified // TODO lazy static @@ -1236,14 +1141,14 @@ impl postgres_backend_async::Handler for PageServerHandler { let caps = re .captures(query_string) .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?; - let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?; let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?; - let timeline = get_local_timeline(tenant_id, timeline_id)?; - let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?; let timestamp_pg = to_pg_timestamp(timestamp); + self.check_permission(Some(tenant_id))?; + + let timeline = get_local_timeline(tenant_id, timeline_id)?; pgb.write_message(&BeMessage::RowDescription(&[RowDescriptor::text_col( b"lsn", )]))?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 9d4b438dc4..fc9867dc05 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -13,7 +13,7 @@ use crate::tenant::Timeline; use crate::walrecord::NeonWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; -use postgres_ffi::v14::pg_constants; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; @@ -125,8 +125,7 @@ impl Timeline { return Ok(nblocks); } - if (tag.forknum == pg_constants::FSM_FORKNUM - || tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM) + if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) && !self.get_rel_exists(tag, lsn, latest)? 
{ // FIXME: Postgres sometimes calls smgrcreate() to create @@ -1090,6 +1089,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); // 03 misc // controlfile // checkpoint +// pg_version // // Below is a full list of the keyspace allocation: // @@ -1128,7 +1128,6 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); // // Checkpoint: // 03 00000000 00000000 00000000 00 00000001 - //-- Section 01: relation data and metadata const DBDIR_KEY: Key = Key { @@ -1402,8 +1401,9 @@ fn is_slru_block_key(key: Key) -> bool { pub fn create_test_timeline( tenant: &crate::tenant::Tenant, timeline_id: utils::id::TimelineId, + pg_version: u32, ) -> Result> { - let tline = tenant.create_empty_timeline(timeline_id, Lsn(8))?; + let tline = tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version)?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; diff --git a/pageserver/src/reltag.rs b/pageserver/src/reltag.rs index e3d08f8b3d..43d38bd986 100644 --- a/pageserver/src/reltag.rs +++ b/pageserver/src/reltag.rs @@ -2,8 +2,8 @@ use serde::{Deserialize, Serialize}; use std::cmp::Ordering; use std::fmt; -use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::relfile_utils::forknumber_to_name; +use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; +use postgres_ffi::relfile_utils::forknumber_to_name; use postgres_ffi::Oid; /// @@ -78,7 +78,7 @@ impl fmt::Display for RelTag { impl RelTag { pub fn to_segfile_name(&self, segno: u32) -> String { - let mut name = if self.spcnode == pg_constants::GLOBALTABLESPACE_OID { + let mut name = if self.spcnode == GLOBALTABLESPACE_OID { "global/".to_string() } else { format!("base/{}/", self.dbnode) diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index f6ea9d8c5d..0c2fedd7d5 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -24,6 +24,19 @@ pub struct Key { pub const KEY_SIZE: usize = 18; impl Key { + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. + /// As long as Neon does not support tablespace (because of lack of access to local file system), + /// we can assume that only some predefined namespace OIDs are used which can fit in u16 + pub fn to_i128(&self) -> i128 { + assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + (((self.field1 & 0xf) as i128) << 120) + | (((self.field2 & 0xFFFF) as i128) << 104) + | ((self.field3 as i128) << 72) + | ((self.field4 as i128) << 40) + | ((self.field5 as i128) << 32) + | self.field6 as i128 + } + pub fn next(&self) -> Key { self.add(1) } @@ -176,7 +189,7 @@ impl Value { /// /// Result of performing GC /// -#[derive(Default)] +#[derive(Default, Serialize)] pub struct GcResult { pub layers_total: u64, pub layers_needed_by_cutoff: u64, @@ -185,9 +198,18 @@ pub struct GcResult { pub layers_not_updated: u64, pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. 
+ #[serde(serialize_with = "serialize_duration_as_millis")] pub elapsed: Duration, } +// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds +fn serialize_duration_as_millis(d: &Duration, serializer: S) -> Result +where + S: serde::Serializer, +{ + d.as_millis().serialize(serializer) +} + impl AddAssign for GcResult { fn add_assign(&mut self, other: Self) { self.layers_total += other.layers_total; diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index 9d259bf1e2..bee460d173 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -169,13 +169,8 @@ use self::{ upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, }; use crate::{ - config::PageServerConf, - exponential_backoff, - storage_sync::index::RemoteIndex, - task_mgr, - task_mgr::TaskKind, - task_mgr::BACKGROUND_RUNTIME, - tenant::metadata::{metadata_path, TimelineMetadata}, + config::PageServerConf, exponential_backoff, storage_sync::index::RemoteIndex, task_mgr, + task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata, tenant_mgr::attach_local_tenants, }; use crate::{ @@ -183,6 +178,7 @@ use crate::{ TenantTimelineValues, }; +use crate::metrics::{IMAGE_SYNC_COUNT, IMAGE_SYNC_TIME_HISTOGRAM}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use self::download::download_index_parts; @@ -601,6 +597,7 @@ pub fn spawn_storage_sync_task( for (tenant_id, timeline_data) in local_timeline_files.0 { if timeline_data.is_empty() { + info!("got empty tenant {}", tenant_id); let _ = empty_tenants.0.entry(tenant_id).or_default(); } else { for (timeline_id, timeline_data) in timeline_data { @@ -642,6 +639,7 @@ pub fn spawn_storage_sync_task( (storage, remote_index_clone, sync_queue), max_sync_errors, ) + .instrument(info_span!("storage_sync_loop")) .await; Ok(()) }, @@ -839,7 +837,6 @@ async fn process_sync_task_batch( sync_id, upload_data, sync_start, - "upload", ) .await } @@ -883,7 +880,6 @@ async fn process_sync_task_batch( sync_id, download_data, sync_start, - "download", ) .await; } @@ -915,7 +911,6 @@ async fn process_sync_task_batch( sync_id, delete_data, sync_start, - "delete", ) .instrument(info_span!("delete_timeline_data")) .await; @@ -952,8 +947,9 @@ async fn download_timeline_data( sync_id: TenantTimelineId, new_download_data: SyncData, sync_start: Instant, - task_name: &str, ) -> DownloadStatus { + static TASK_NAME: &str = "download"; + match download_timeline_layers( conf, storage, @@ -965,19 +961,19 @@ async fn download_timeline_data( .await { DownloadedTimeline::Abort => { - register_sync_status(sync_id, sync_start, task_name, None); + register_sync_status(sync_id, sync_start, TASK_NAME, None); if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) { error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}"); } } DownloadedTimeline::FailedAndRescheduled => { - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); } DownloadedTimeline::Successful(mut download_data) => { match update_local_metadata(conf, sync_id, current_remote_timeline).await { Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) { Ok(()) => { - register_sync_status(sync_id, sync_start, task_name, Some(true)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(true)); return DownloadStatus::Downloaded; } Err(e) => { @@ -988,7 +984,7 
@@ async fn download_timeline_data( error!("Failed to update local timeline metadata: {e:?}"); download_data.retries += 1; sync_queue.push(sync_id, SyncTask::Download(download_data)); - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); } } } @@ -1011,7 +1007,7 @@ async fn update_local_metadata( }; let remote_lsn = remote_metadata.disk_consistent_lsn(); - let local_metadata_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id); + let local_metadata_path = conf.metadata_path(sync_id.timeline_id, sync_id.tenant_id); let local_lsn = if local_metadata_path.exists() { let local_metadata = read_metadata_file(&local_metadata_path) .await @@ -1064,8 +1060,9 @@ async fn delete_timeline_data( sync_id: TenantTimelineId, mut new_delete_data: SyncData, sync_start: Instant, - task_name: &str, ) { + static TASK_NAME: &str = "delete"; + let timeline_delete = &mut new_delete_data.data; if !timeline_delete.deletion_registered { @@ -1081,14 +1078,14 @@ async fn delete_timeline_data( error!("Failed to update remote timeline {sync_id}: {e:?}"); new_delete_data.retries += 1; sync_queue.push(sync_id, SyncTask::Delete(new_delete_data)); - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); return; } } timeline_delete.deletion_registered = true; let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await; - register_sync_status(sync_id, sync_start, task_name, Some(sync_status)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(sync_status)); } async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result { @@ -1107,8 +1104,8 @@ async fn upload_timeline_data( sync_id: TenantTimelineId, new_upload_data: SyncData, sync_start: Instant, - task_name: &str, ) -> UploadStatus { + static TASK_NAME: &str = "upload"; let mut uploaded_data = match upload_timeline_layers( storage, sync_queue, @@ -1119,7 +1116,7 @@ async fn upload_timeline_data( .await { UploadedTimeline::FailedAndRescheduled(e) => { - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); return UploadStatus::Failed(e); } UploadedTimeline::Successful(upload_data) => upload_data, @@ -1138,14 +1135,14 @@ async fn upload_timeline_data( .await { Ok(()) => { - register_sync_status(sync_id, sync_start, task_name, Some(true)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(true)); UploadStatus::Uploaded } Err(e) => { error!("Failed to update remote timeline {sync_id}: {e:?}"); uploaded_data.retries += 1; sync_queue.push(sync_id, SyncTask::Upload(uploaded_data)); - register_sync_status(sync_id, sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); UploadStatus::Failed(e) } } @@ -1303,6 +1300,10 @@ fn schedule_first_sync_tasks( None => { // TODO (rodionov) does this mean that we've crashed during tenant creation? // is it safe to upload this checkpoint? could it be half broken? 
+ warn!( + "marking {} as locally complete, while it doesnt exist in remote index", + sync_id + ); new_sync_tasks.push_back(( sync_id, SyncTask::upload(LayersUpload { @@ -1337,6 +1338,8 @@ fn compare_local_and_remote_timeline( local_files: HashSet, remote_entry: &RemoteTimeline, ) -> (LocalTimelineInitStatus, bool) { + let _entered = info_span!("compare_local_and_remote_timeline", sync_id = %sync_id).entered(); + let remote_files = remote_entry.stored_files(); let number_of_layers_to_download = remote_files.difference(&local_files).count(); @@ -1347,10 +1350,12 @@ fn compare_local_and_remote_timeline( layers_to_skip: local_files.clone(), }), )); + info!("NeedsSync"); (LocalTimelineInitStatus::NeedsSync, true) // we do not need to manipulate with remote consistent lsn here // because it will be updated when sync will be completed } else { + info!("LocallyComplete"); ( LocalTimelineInitStatus::LocallyComplete(local_metadata.clone()), false, @@ -1387,16 +1392,22 @@ fn register_sync_status( let tenant_id = sync_id.tenant_id.to_string(); let timeline_id = sync_id.timeline_id.to_string(); - match sync_status { - Some(true) => { - IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "success"]) - } - Some(false) => { - IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "failure"]) - } - None => return, - } - .observe(secs_elapsed) + + let sync_status = match sync_status { + Some(true) => "success", + Some(false) => "failure", + None => "abort", + }; + + IMAGE_SYNC_TIME_HISTOGRAM + .with_label_values(&[sync_name, sync_status]) + .observe(secs_elapsed); + IMAGE_SYNC_TIME + .with_label_values(&[&tenant_id, &timeline_id]) + .add(secs_elapsed); + IMAGE_SYNC_COUNT + .with_label_values(&[&tenant_id, &timeline_id, sync_name, sync_status]) + .inc(); } #[cfg(test)] @@ -1424,7 +1435,7 @@ mod test_utils { } fs::write( - metadata_path(harness.conf, timeline_id, harness.tenant_id), + harness.conf.metadata_path(timeline_id, harness.tenant_id), metadata.to_bytes()?, ) .await?; @@ -1441,7 +1452,17 @@ mod test_utils { } pub(super) fn dummy_metadata(disk_consistent_lsn: Lsn) -> TimelineMetadata { - TimelineMetadata::new(disk_consistent_lsn, None, None, Lsn(0), Lsn(0), Lsn(0)) + TimelineMetadata::new( + disk_consistent_lsn, + None, + None, + Lsn(0), + Lsn(0), + Lsn(0), + // Any version will do + // but it should be consistent with the one in the tests + crate::DEFAULT_PG_VERSION, + ) } } diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index 80d5ca5994..3e850443d8 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -9,18 +9,18 @@ use std::{ use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use remote_storage::{path_with_suffix_extension, DownloadError, GenericRemoteStorage}; +use remote_storage::{DownloadError, GenericRemoteStorage}; use tokio::{ fs, io::{self, AsyncWriteExt}, }; use tracing::{debug, error, info, warn}; -use crate::{ - config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path, - TEMP_FILE_SUFFIX, +use crate::{config::PageServerConf, storage_sync::SyncTask, TEMP_FILE_SUFFIX}; +use utils::{ + crashsafe_dir::path_with_suffix_extension, + id::{TenantId, TenantTimelineId, TimelineId}, }; -use utils::id::{TenantId, TenantTimelineId, TimelineId}; use super::{ index::{IndexPart, RemoteTimeline}, @@ -137,7 +137,8 @@ async fn download_index_part( storage: &GenericRemoteStorage, sync_id: TenantTimelineId, ) -> Result { - let 
index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) + let index_part_path = conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); let mut index_part_download = storage .download_storage_object(None, &index_part_path) @@ -620,9 +621,10 @@ mod tests { metadata.to_bytes()?, ); - let local_index_part_path = - metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id) - .with_file_name(IndexPart::FILE_NAME); + let local_index_part_path = harness + .conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) + .with_file_name(IndexPart::FILE_NAME); let index_part_remote_id = local_storage.remote_object_id(&local_index_part_path)?; let index_part_local_path = PathBuf::from(index_part_remote_id.to_string()); fs::create_dir_all(index_part_local_path.parent().unwrap()).await?; diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 13495ffefe..db37c7b411 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -341,13 +341,21 @@ mod tests { use super::*; use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::DEFAULT_PG_VERSION; #[test] fn index_part_conversion() { let harness = TenantHarness::create("index_part_conversion").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); - let metadata = - TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); + let metadata = TimelineMetadata::new( + Lsn(5).align(), + Some(Lsn(4)), + None, + Lsn(3), + Lsn(2), + Lsn(1), + DEFAULT_PG_VERSION, + ); let remote_timeline = RemoteTimeline { timeline_layers: HashSet::from([ timeline_path.join("layer_1"), @@ -464,8 +472,15 @@ mod tests { fn index_part_conversion_negatives() { let harness = TenantHarness::create("index_part_conversion_negatives").unwrap(); let timeline_path = harness.timeline_path(&TIMELINE_ID); - let metadata = - TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1)); + let metadata = TimelineMetadata::new( + Lsn(5).align(), + Some(Lsn(4)), + None, + Lsn(3), + Lsn(2), + Lsn(1), + DEFAULT_PG_VERSION, + ); let conversion_result = IndexPart::from_remote_timeline( &timeline_path, diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index aa5a2232cf..75657915c0 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -15,7 +15,7 @@ use super::{ LayersUpload, SyncData, SyncQueue, }; use crate::metrics::NO_LAYERS_UPLOAD; -use crate::{config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path}; +use crate::{config::PageServerConf, storage_sync::SyncTask}; /// Serializes and uploads the given index part data to the remote storage. 
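Both the download and upload paths shown here derive the index_part location by taking the timeline's metadata path and swapping only the file name. A small illustration of that derivation with std::path; the directory layout and file name below are made up for the example.

use std::path::PathBuf;

fn main() {
    // Stand-in for `conf.metadata_path(timeline_id, tenant_id)`.
    let metadata_path =
        PathBuf::from("tenants/some-tenant/timelines/some-timeline/metadata");

    // Mirrors `.with_file_name(IndexPart::FILE_NAME)`: only the final
    // component changes, the timeline directory stays the same.
    let index_part_path = metadata_path.with_file_name("index_part.json");

    assert_eq!(index_part_path.parent(), metadata_path.parent());
    assert!(index_part_path.ends_with("index_part.json"));
}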
pub(super) async fn upload_index_part( @@ -29,7 +29,8 @@ pub(super) async fn upload_index_part( let index_part_size = index_part_bytes.len(); let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes)); - let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id) + let index_part_path = conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) .with_file_name(IndexPart::FILE_NAME); storage .upload_storage_object( diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 48f0547934..573251f6c9 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -14,9 +14,9 @@ use anyhow::{bail, ensure, Context, Result}; use tokio::sync::watch; use tracing::*; +use utils::crashsafe_dir::path_with_suffix_extension; use std::cmp::min; -use std::collections::hash_map; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; @@ -27,6 +27,8 @@ use std::io::Write; use std::num::NonZeroU64; use std::ops::Bound::Included; use std::path::Path; +use std::process::Command; +use std::process::Stdio; use std::sync::Arc; use std::sync::MutexGuard; use std::sync::{Mutex, RwLock}; @@ -34,16 +36,16 @@ use std::time::{Duration, Instant}; use self::metadata::TimelineMetadata; use crate::config::PageServerConf; -use crate::metrics::remove_tenant_metrics; -use crate::storage_sync::index::RemoteIndex; -use crate::tenant_config::{TenantConf, TenantConfOpt}; - -use crate::metrics::STORAGE_TIME; +use crate::import_datadir; +use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; use crate::repository::GcResult; +use crate::storage_sync::index::RemoteIndex; use crate::task_mgr; +use crate::tenant_config::TenantConfOpt; use crate::virtual_file::VirtualFile; use crate::walredo::WalRedoManager; -use crate::CheckpointConfig; +use crate::{CheckpointConfig, TEMP_FILE_SUFFIX}; +pub use pageserver_api::models::TenantState; use toml_edit; use utils::{ @@ -117,22 +119,14 @@ pub struct Tenant { upload_layers: bool, } -/// A state of a tenant in pageserver's memory. -#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -pub enum TenantState { - /// Tenant is fully operational, its background jobs might be running or not. - Active { background_jobs_running: bool }, - /// A tenant is recognized by pageserver, but not yet ready to operate: - /// e.g. not present locally and being downloaded or being read into memory from the file system. - Paused, - /// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated. - Broken, -} - /// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. impl Tenant { - /// Get Timeline handle for given zenith timeline ID. + pub fn tenant_id(&self) -> TenantId { + self.tenant_id + } + + /// Get Timeline handle for given Neon timeline ID. /// This function is idempotent. It doesn't change internal state in any way. 
pub fn get_timeline(&self, timeline_id: TimelineId) -> anyhow::Result> { self.timelines @@ -142,8 +136,7 @@ impl Tenant { .with_context(|| { format!( "Timeline {} was not found for tenant {}", - timeline_id, - self.tenant_id() + timeline_id, self.tenant_id ) }) .map(Arc::clone) @@ -166,6 +159,7 @@ impl Tenant { &self, new_timeline_id: TimelineId, initdb_lsn: Lsn, + pg_version: u32, ) -> Result> { // XXX: keep the lock to avoid races during timeline creation let mut timelines = self.timelines.lock().unwrap(); @@ -180,122 +174,84 @@ impl Tenant { bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.") } - // Create the timeline directory, and write initial metadata to file. - crashsafe_dir::create_dir_all(timeline_path)?; - - let new_metadata = - TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - save_metadata( - self.conf, - new_timeline_id, - self.tenant_id, - &new_metadata, - true, - )?; - + let new_metadata = TimelineMetadata::new( + Lsn(0), + None, + None, + Lsn(0), + initdb_lsn, + initdb_lsn, + pg_version, + ); let new_timeline = - self.initialize_new_timeline(new_timeline_id, new_metadata, &mut timelines)?; + self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?; new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); - if let hash_map::Entry::Vacant(v) = timelines.entry(new_timeline_id) { - v.insert(Arc::clone(&new_timeline)); - } - Ok(new_timeline) } - /// Branch a timeline - pub fn branch_timeline( + /// Create a new timeline. + /// + /// Returns the new timeline ID and reference to its Timeline object. + /// + /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with + /// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, + /// a new unique ID is generated. + pub async fn create_timeline( &self, - src: TimelineId, - dst: TimelineId, - start_lsn: Option, - ) -> Result> { - // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn - // about timelines, so otherwise a race condition is possible, where we create new timeline and GC - // concurrently removes data that is needed by the new timeline. - let _gc_cs = self.gc_cs.lock().unwrap(); + new_timeline_id: Option, + ancestor_timeline_id: Option, + mut ancestor_start_lsn: Option, + pg_version: u32, + ) -> Result>> { + let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); - // In order for the branch creation task to not wait for GC/compaction, - // we need to make sure that the starting LSN of the child branch is not out of scope midway by - // - // 1. holding the GC lock to prevent overwritting timeline's GC data - // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline - // - // Step 2 is to avoid initializing the new branch using data removed by past GC iterations - // or in-queue GC iterations. 
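The comments above explain why a branch's start LSN must not fall behind either the already-applied GC cutoff or the planned one (the minimum of the PITR and horizon cutoffs). A compact sketch of that validation, with plain u64 values standing in for Lsn.

// Plain u64 LSNs stand in for utils::lsn::Lsn here.
fn check_branch_start(
    start_lsn: u64,
    latest_gc_cutoff: u64,
    pitr_cutoff: u64,
    horizon_cutoff: u64,
) -> Result<(), String> {
    if start_lsn < latest_gc_cutoff {
        return Err(format!(
            "invalid branch start lsn: less than latest GC cutoff {latest_gc_cutoff}"
        ));
    }
    let planned_cutoff = pitr_cutoff.min(horizon_cutoff);
    if start_lsn < planned_cutoff {
        return Err(format!(
            "invalid branch start lsn: less than planned GC cutoff {planned_cutoff}"
        ));
    }
    Ok(())
}

fn main() {
    assert!(check_branch_start(0x60, 0x40, 0x50, 0x55).is_ok());
    assert!(check_branch_start(0x45, 0x40, 0x50, 0x55).is_err()); // below planned cutoff
}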
- - // XXX: keep the lock to avoid races during timeline creation - let mut timelines = self.timelines.lock().unwrap(); - let src_timeline = timelines - .get(&src) - // message about timeline being remote is one .context up in the stack - .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?; - - let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); - - // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN - let start_lsn = start_lsn.unwrap_or_else(|| { - let lsn = src_timeline.get_last_record_lsn(); - info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); - lsn - }); - - // Check if the starting LSN is out of scope because it is less than - // 1. the latest GC cutoff LSN or - // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. - src_timeline - .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) - .context(format!( - "invalid branch start lsn: less than latest GC cutoff {}", - *latest_gc_cutoff_lsn, - ))?; + if self + .conf + .timeline_path(&new_timeline_id, &self.tenant_id) + .exists() { - let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); - if start_lsn < cutoff { - bail!(format!( - "invalid branch start lsn: less than planned GC cutoff {cutoff}" - )); - } + debug!("timeline {new_timeline_id} already exists"); + return Ok(None); } - // Determine prev-LSN for the new timeline. We can only determine it if - // the timeline was branched at the current end of the source timeline. - let RecordLsn { - last: src_last, - prev: src_prev, - } = src_timeline.get_last_record_rlsn(); - let dst_prev = if src_last == start_lsn { - Some(src_prev) - } else { - None + let loaded_timeline = match ancestor_timeline_id { + Some(ancestor_timeline_id) => { + let ancestor_timeline = self + .get_timeline(ancestor_timeline_id) + .context("Cannot branch off the timeline that's not present in pageserver")?; + + if let Some(lsn) = ancestor_start_lsn.as_mut() { + // Wait for the WAL to arrive and be processed on the parent branch up + // to the requested branch point. The repository code itself doesn't + // require it, but if we start to receive WAL on the new timeline, + // decoding the new WAL might need to look up previous pages, relation + // sizes etc. and that would get confused if the previous page versions + // are not in the repository yet. + *lsn = lsn.align(); + ancestor_timeline.wait_lsn(*lsn).await?; + + let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); + if ancestor_ancestor_lsn > *lsn { + // can we safely just branch from the ancestor instead? + bail!( + "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", + lsn, + ancestor_timeline_id, + ancestor_ancestor_lsn, + ); + } + } + + self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? + } + None => self.bootstrap_timeline(new_timeline_id, pg_version)?, }; - // create a new timeline directory - let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id); - crashsafe_dir::create_dir(&timelinedir)?; + // Have added new timeline into the tenant, now its background tasks are needed. + self.activate(true); - // Create the metadata file, noting the ancestor of the new timeline. - // There is initially no data in it, but all the read-calls know to look - // into the ancestor. 
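In the new create_timeline above, a caller-supplied ancestor start LSN is aligned, waited for, and then rejected if it predates the ancestor timeline's own ancestor LSN, since the branch would otherwise need history the ancestor never had. A small sketch of that guard, again with u64 in place of Lsn.

// u64 stands in for Lsn; `ancestor_ancestor_lsn` is the ancestor timeline's
// own `get_ancestor_lsn()`, i.e. where *its* history begins.
fn validate_branch_point(start_lsn: u64, ancestor_ancestor_lsn: u64) -> Result<(), String> {
    if ancestor_ancestor_lsn > start_lsn {
        return Err(format!(
            "invalid start lsn {start_lsn}: less than timeline ancestor lsn {ancestor_ancestor_lsn}"
        ));
    }
    Ok(())
}

fn main() {
    assert!(validate_branch_point(0x80, 0x40).is_ok());
    assert!(validate_branch_point(0x20, 0x40).is_err());
}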
- let metadata = TimelineMetadata::new( - start_lsn, - dst_prev, - Some(src), - start_lsn, - *src_timeline.latest_gc_cutoff_lsn.read(), - src_timeline.initdb_lsn, - ); - crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?; - save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?; - - let new_timeline = self.initialize_new_timeline(dst, metadata, &mut timelines)?; - timelines.insert(dst, Arc::clone(&new_timeline)); - - info!("branched timeline {dst} from {src} at {start_lsn}"); - - Ok(new_timeline) + Ok(Some(loaded_timeline)) } /// perform one garbage collection iteration, removing old data files from disk. @@ -342,8 +298,7 @@ impl Tenant { drop(timelines); for (timeline_id, timeline) in &timelines_to_compact { - let _entered = - info_span!("compact", timeline = %timeline_id, tenant = %self.tenant_id).entered(); + let _entered = info_span!("compact_timeline", timeline = %timeline_id).entered(); timeline.compact()?; } @@ -429,16 +384,24 @@ impl Tenant { let mut timelines_accessor = self.timelines.lock().unwrap(); for (timeline_id, metadata) in sorted_timelines { - let timeline = self - .initialize_new_timeline(timeline_id, metadata, &mut timelines_accessor) - .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; - - match timelines_accessor.entry(timeline.timeline_id) { - hash_map::Entry::Occupied(_) => anyhow::bail!( - "Found freshly initialized timeline {} in the tenant map", - timeline.timeline_id + info!( + "Attaching timeline {} pg_version {}", + timeline_id, + metadata.pg_version() + ); + let ancestor = metadata + .ancestor_timeline() + .and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id)) + .cloned(); + match timelines_accessor.entry(timeline_id) { + Entry::Occupied(_) => warn!( + "Timeline {}/{} already exists in the tenant map, skipping its initialization", + self.tenant_id, timeline_id ), - hash_map::Entry::Vacant(v) => { + Entry::Vacant(v) => { + let timeline = self + .initialize_new_timeline(timeline_id, metadata, ancestor) + .with_context(|| format!("Failed to initialize timeline {timeline_id}"))?; v.insert(timeline); } } @@ -646,24 +609,17 @@ impl Tenant { &self, new_timeline_id: TimelineId, new_metadata: TimelineMetadata, - timelines: &mut MutexGuard>>, + ancestor: Option>, ) -> anyhow::Result> { - let ancestor = match new_metadata.ancestor_timeline() { - Some(ancestor_timeline_id) => Some( - timelines - .get(&ancestor_timeline_id) - .cloned() - .with_context(|| { - format!( - "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" - ) - })?, - ), - None => None, - }; + if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { + anyhow::ensure!( + ancestor.is_some(), + "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" + ) + } let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn(); - + let pg_version = new_metadata.pg_version(); let new_timeline = Arc::new(Timeline::new( self.conf, Arc::clone(&self.tenant_conf), @@ -673,6 +629,7 @@ impl Tenant { self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, + pg_version, )); new_timeline @@ -711,7 +668,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_id: TenantId, ) -> anyhow::Result { - let target_config_path = TenantConf::path(conf, tenant_id); + let target_config_path = conf.tenant_config_path(tenant_id); let target_config_display = target_config_path.display(); info!("loading tenantconf from {target_config_display}"); @@ -803,7 +760,7 @@ impl Tenant { }) 
.with_context(|| { format!( - "Failed to fsync on firts save for config {}", + "Failed to fsync on first save for config {}", target_config_path.display() ) })?; @@ -843,9 +800,6 @@ impl Tenant { pitr: Duration, checkpoint_before_gc: bool, ) -> Result { - let _span_guard = - info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timeline_id) - .entered(); let mut totals: GcResult = Default::default(); let now = Instant::now(); @@ -960,9 +914,220 @@ impl Tenant { Ok(totals) } - pub fn tenant_id(&self) -> TenantId { - self.tenant_id + fn branch_timeline( + &self, + src: TimelineId, + dst: TimelineId, + start_lsn: Option, + ) -> Result> { + // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn + // about timelines, so otherwise a race condition is possible, where we create new timeline and GC + // concurrently removes data that is needed by the new timeline. + let _gc_cs = self.gc_cs.lock().unwrap(); + + // In order for the branch creation task to not wait for GC/compaction, + // we need to make sure that the starting LSN of the child branch is not out of scope midway by + // + // 1. holding the GC lock to prevent overwritting timeline's GC data + // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline + // + // Step 2 is to avoid initializing the new branch using data removed by past GC iterations + // or in-queue GC iterations. + + // XXX: keep the lock to avoid races during timeline creation + let mut timelines = self.timelines.lock().unwrap(); + let src_timeline = timelines + .get(&src) + // message about timeline being remote is one .context up in the stack + .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?; + + let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); + + // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN + let start_lsn = start_lsn.unwrap_or_else(|| { + let lsn = src_timeline.get_last_record_lsn(); + info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); + lsn + }); + + // Check if the starting LSN is out of scope because it is less than + // 1. the latest GC cutoff LSN or + // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. + src_timeline + .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) + .context(format!( + "invalid branch start lsn: less than latest GC cutoff {}", + *latest_gc_cutoff_lsn, + ))?; + { + let gc_info = src_timeline.gc_info.read().unwrap(); + let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + if start_lsn < cutoff { + bail!(format!( + "invalid branch start lsn: less than planned GC cutoff {cutoff}" + )); + } + } + + // Determine prev-LSN for the new timeline. We can only determine it if + // the timeline was branched at the current end of the source timeline. + let RecordLsn { + last: src_last, + prev: src_prev, + } = src_timeline.get_last_record_rlsn(); + let dst_prev = if src_last == start_lsn { + Some(src_prev) + } else { + None + }; + + // Create the metadata file, noting the ancestor of the new timeline. + // There is initially no data in it, but all the read-calls know to look + // into the ancestor. 
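The branch metadata assembled just below records a prev-record LSN only when the branch point coincides with the source timeline's current end; otherwise it is left unknown. A sketch of that decision, with RecordLsn reduced to a pair of plain u64 LSNs.

// Reduced form of `RecordLsn { last, prev }` using plain u64 LSNs.
fn prev_lsn_for_branch(src_last: u64, src_prev: u64, start_lsn: u64) -> Option<u64> {
    // Only when branching exactly at the source's last record do we know
    // what the record before the branch point was.
    if src_last == start_lsn {
        Some(src_prev)
    } else {
        None
    }
}

fn main() {
    assert_eq!(prev_lsn_for_branch(0x100, 0xF0, 0x100), Some(0xF0));
    assert_eq!(prev_lsn_for_branch(0x100, 0xF0, 0x80), None);
}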
+ let metadata = TimelineMetadata::new( + start_lsn, + dst_prev, + Some(src), + start_lsn, + *src_timeline.latest_gc_cutoff_lsn.read(), + src_timeline.initdb_lsn, + src_timeline.pg_version, + ); + let new_timeline = self.create_initialized_timeline(dst, metadata, &mut timelines)?; + info!("branched timeline {dst} from {src} at {start_lsn}"); + + Ok(new_timeline) } + + /// - run initdb to init temporary instance and get bootstrap data + /// - after initialization complete, remove the temp dir. + fn bootstrap_timeline( + &self, + timeline_id: TimelineId, + pg_version: u32, + ) -> Result> { + // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` + // temporary directory for basebackup files for the given timeline. + let initdb_path = path_with_suffix_extension( + self.conf + .timelines_path(&self.tenant_id) + .join(format!("basebackup-{timeline_id}")), + TEMP_FILE_SUFFIX, + ); + + // Init temporarily repo to get bootstrap data + run_initdb(self.conf, &initdb_path, pg_version)?; + let pgdata_path = initdb_path; + + let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); + + // Import the contents of the data directory at the initial checkpoint + // LSN, and any WAL after that. + // Initdb lsn will be equal to last_record_lsn which will be set after import. + // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. + let timeline = self.create_empty_timeline(timeline_id, lsn, pg_version)?; + import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + bail!("failpoint before-checkpoint-new-timeline"); + }); + + timeline.checkpoint(CheckpointConfig::Forced)?; + + info!( + "created root timeline {} timeline.lsn {}", + timeline_id, + timeline.get_last_record_lsn() + ); + + // Remove temp dir. We don't need it anymore + fs::remove_dir_all(pgdata_path)?; + + Ok(timeline) + } + + fn create_initialized_timeline( + &self, + new_timeline_id: TimelineId, + new_metadata: TimelineMetadata, + timelines: &mut MutexGuard>>, + ) -> Result> { + crashsafe_dir::create_dir_all(self.conf.timeline_path(&new_timeline_id, &self.tenant_id)) + .with_context(|| { + format!( + "Failed to create timeline {}/{} directory", + new_timeline_id, self.tenant_id + ) + })?; + save_metadata( + self.conf, + new_timeline_id, + self.tenant_id, + &new_metadata, + true, + ) + .with_context(|| { + format!( + "Failed to create timeline {}/{} metadata", + new_timeline_id, self.tenant_id + ) + })?; + + let ancestor = new_metadata + .ancestor_timeline() + .and_then(|ancestor_timeline_id| timelines.get(&ancestor_timeline_id)) + .cloned(); + let new_timeline = self + .initialize_new_timeline(new_timeline_id, new_metadata, ancestor) + .with_context(|| { + format!( + "Failed to initialize timeline {}/{}", + new_timeline_id, self.tenant_id + ) + })?; + + match timelines.entry(new_timeline_id) { + Entry::Occupied(_) => bail!( + "Found freshly initialized timeline {} in the tenant map", + new_timeline_id + ), + Entry::Vacant(v) => { + v.insert(Arc::clone(&new_timeline)); + } + } + + Ok(new_timeline) + } +} + +/// Create the cluster temporarily in 'initdbpath' directory inside the repository +/// to get bootstrap data for timeline initialization. +fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path, pg_version: u32) -> Result<()> { + info!("running initdb in {}... 
", initdbpath.display()); + + let initdb_path = conf.pg_bin_dir(pg_version).join("initdb"); + let initdb_output = Command::new(initdb_path) + .args(&["-D", &initdbpath.to_string_lossy()]) + .args(&["-U", &conf.superuser]) + .args(&["-E", "utf8"]) + .arg("--no-instructions") + // This is only used for a temporary installation that is deleted shortly after, + // so no need to fsync it + .arg("--no-sync") + .env_clear() + .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .stdout(Stdio::null()) + .output() + .context("failed to execute initdb")?; + if !initdb_output.status.success() { + bail!( + "initdb failed: '{}'", + String::from_utf8_lossy(&initdb_output.stderr) + ); + } + + Ok(()) } impl Drop for Tenant { @@ -1010,7 +1175,6 @@ pub mod harness { walredo::{WalRedoError, WalRedoManager}, }; - use super::metadata::metadata_path; use super::*; use crate::tenant_config::{TenantConf, TenantConfOpt}; use hex_literal::hex; @@ -1146,7 +1310,7 @@ pub mod harness { timeline_id: TimelineId, tenant_id: TenantId, ) -> anyhow::Result { - let metadata_path = metadata_path(conf, timeline_id, tenant_id); + let metadata_path = conf.metadata_path(timeline_id, tenant_id); let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { format!( "Failed to read metadata bytes from path {}", @@ -1171,6 +1335,7 @@ pub mod harness { lsn: Lsn, base_img: Option, records: Vec<(Lsn, NeonWalRecord)>, + _pg_version: u32, ) -> Result { let s = format!( "redo for {} to get to {}, with {} and {} records", @@ -1192,11 +1357,12 @@ pub mod harness { #[cfg(test)] mod tests { - use super::metadata::METADATA_FILE_NAME; use super::*; + use crate::config::METADATA_FILE_NAME; use crate::keyspace::KeySpaceAccum; use crate::repository::{Key, Value}; use crate::tenant::harness::*; + use crate::DEFAULT_PG_VERSION; use bytes::BytesMut; use hex_literal::hex; use once_cell::sync::Lazy; @@ -1208,7 +1374,7 @@ mod tests { #[test] fn test_basic() -> Result<()> { let tenant = TenantHarness::create("test_basic")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1230,9 +1396,9 @@ mod tests { #[test] fn no_duplicate_timelines() -> Result<()> { let tenant = TenantHarness::create("no_duplicate_timelines")?.load(); - let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; - match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0)) { + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -1256,7 +1422,7 @@ mod tests { #[test] fn test_branch() -> Result<()> { let tenant = TenantHarness::create("test_branch")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let writer = tline.writer(); use std::str::from_utf8; @@ -1351,7 +1517,7 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? 
.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -1381,7 +1547,7 @@ mod tests { let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), @@ -1407,7 +1573,7 @@ mod tests { RepoHarness::create("test_prohibit_get_for_garbage_collected_data")? .load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; @@ -1425,7 +1591,7 @@ mod tests { fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1442,7 +1608,7 @@ mod tests { fn test_parent_keeps_data_forever_after_branching() -> Result<()> { let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; @@ -1470,7 +1636,8 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; { let tenant = harness.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x8000))?; tline.checkpoint(CheckpointConfig::Forced)?; } @@ -1490,7 +1657,7 @@ mod tests { // create two timelines { let tenant = harness.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20))?; tline.checkpoint(CheckpointConfig::Forced)?; @@ -1526,7 +1693,7 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; let tenant = harness.load(); - tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -1563,7 +1730,7 @@ mod tests { #[test] fn test_images() -> Result<()> { let tenant = TenantHarness::create("test_images")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -1613,7 +1780,7 @@ mod 
tests { #[test] fn test_bulk_insert() -> Result<()> { let tenant = TenantHarness::create("test_bulk_insert")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; let mut lsn = Lsn(0x10); @@ -1653,7 +1820,7 @@ mod tests { #[test] fn test_random_updates() -> Result<()> { let tenant = TenantHarness::create("test_random_updates")?.load(); - let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; const NUM_KEYS: usize = 1000; @@ -1723,7 +1890,7 @@ mod tests { #[test] fn test_traverse_branches() -> Result<()> { let tenant = TenantHarness::create("test_traverse_branches")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; const NUM_KEYS: usize = 1000; @@ -1802,7 +1969,7 @@ mod tests { #[test] fn test_traverse_ancestors() -> Result<()> { let tenant = TenantHarness::create("test_traverse_ancestors")?.load(); - let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs index 892000c20b..57c5be91a4 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/delta_layer.rs @@ -713,7 +713,7 @@ impl DeltaLayerWriter { for buf in block_buf.blocks { file.write_all(buf.as_ref())?; } - + assert!(self.lsn_range.start < self.lsn_range.end); // Fill in the summary on blk 0 let summary = Summary { magic: DELTA_FILE_MAGIC, diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 8abeebf54c..495833e3ae 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -15,9 +15,15 @@ use crate::repository::Key; use crate::tenant::inmemory_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; use crate::tenant::storage_layer::{range_eq, range_overlaps}; +use amplify_num::i256; use anyhow::Result; +use num_traits::identities::{One, Zero}; +use num_traits::{Bounded, Num, Signed}; +use rstar::{RTree, RTreeObject, AABB}; +use std::cmp::Ordering; use std::collections::VecDeque; use std::ops::Range; +use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; use std::sync::Arc; use tracing::*; use utils::lsn::Lsn; @@ -47,14 +53,163 @@ pub struct LayerMap { pub frozen_layers: VecDeque>, /// All the historic layers are kept here + historic_layers: RTree, - /// TODO: This is a placeholder implementation of a data structure - /// to hold information about all the layer files on disk and in - /// S3. Currently, it's just a vector and all operations perform a - /// linear scan over it. That obviously becomes slow as the - /// number of layers grows. I'm imagining that an R-tree or some - /// other 2D data structure would be the long-term solution here. - historic_layers: Vec>, + /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. + /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. + l0_delta_layers: Vec>, +} + +struct LayerRTreeObject { + layer: Arc, +} + +// Representation of Key as numeric type. 
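The reworked LayerMap above indexes historic layers in an rstar R-tree keyed by (key, lsn) rectangles and keeps only the full-range L0 deltas in a side vector. A self-contained sketch of the same lookup idea using the rstar crate directly, with i64 coordinates and a toy Rect type; the patch itself needs a much wider integer type, for the reasons given in the comment that continues below.

// Toy version of LayerRTreeObject: a rectangle in (key, lsn) space.
// Assumes `rstar` as a dependency (the patch uses rstar's RTree/AABB too).
use rstar::{RTree, RTreeObject, AABB};

struct Rect {
    key: std::ops::Range<i64>,
    lsn: std::ops::Range<i64>,
}

impl RTreeObject for Rect {
    type Envelope = AABB<[i64; 2]>;
    fn envelope(&self) -> Self::Envelope {
        // AABB corners are inclusive while Rust ranges are end-exclusive,
        // hence the `- 1` (the same trick as in the patch).
        AABB::from_corners(
            [self.key.start, self.lsn.start],
            [self.key.end - 1, self.lsn.end - 1],
        )
    }
}

fn main() {
    let mut tree = RTree::new();
    tree.insert(Rect { key: 0..100, lsn: 10..20 });
    tree.insert(Rect { key: 100..200, lsn: 10..20 });

    // "Which layers can contain key 150 at or below lsn 15?"
    let probe = AABB::from_corners([150, 0], [150, 15]);
    let hits: Vec<_> = tree.locate_in_envelope_intersecting(&probe).collect();
    assert_eq!(hits.len(), 1);
}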
+// We can not use native implementation of i128, because rstar::RTree +// doesn't handle properly integer overflow during area calculation: sum(Xi*Yi). +// Overflow will cause panic in debug mode and incorrect area calculation in release mode, +// which leads to non-optimally balanced R-Tree (but doesn't fit correctness of R-Tree work). +// By using i256 as the type, even though all the actual values would fit in i128, we can be +// sure that multiplication doesn't overflow. +// + +#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)] +struct IntKey(i256); + +impl Copy for IntKey {} + +impl IntKey { + fn from(i: i128) -> Self { + IntKey(i256::from(i)) + } +} + +impl Bounded for IntKey { + fn min_value() -> Self { + IntKey(i256::MIN) + } + fn max_value() -> Self { + IntKey(i256::MAX) + } +} + +impl Signed for IntKey { + fn is_positive(&self) -> bool { + self.0 > i256::ZERO + } + fn is_negative(&self) -> bool { + self.0 < i256::ZERO + } + fn signum(&self) -> Self { + match self.0.cmp(&i256::ZERO) { + Ordering::Greater => IntKey(i256::ONE), + Ordering::Less => IntKey(-i256::ONE), + Ordering::Equal => IntKey(i256::ZERO), + } + } + fn abs(&self) -> Self { + IntKey(self.0.abs()) + } + fn abs_sub(&self, other: &Self) -> Self { + if self.0 <= other.0 { + IntKey(i256::ZERO) + } else { + IntKey(self.0 - other.0) + } + } +} + +impl Neg for IntKey { + type Output = Self; + fn neg(self) -> Self::Output { + IntKey(-self.0) + } +} + +impl Rem for IntKey { + type Output = Self; + fn rem(self, rhs: Self) -> Self::Output { + IntKey(self.0 % rhs.0) + } +} + +impl Div for IntKey { + type Output = Self; + fn div(self, rhs: Self) -> Self::Output { + IntKey(self.0 / rhs.0) + } +} + +impl Add for IntKey { + type Output = Self; + fn add(self, rhs: Self) -> Self::Output { + IntKey(self.0 + rhs.0) + } +} + +impl Sub for IntKey { + type Output = Self; + fn sub(self, rhs: Self) -> Self::Output { + IntKey(self.0 - rhs.0) + } +} + +impl Mul for IntKey { + type Output = Self; + fn mul(self, rhs: Self) -> Self::Output { + IntKey(self.0 * rhs.0) + } +} + +impl One for IntKey { + fn one() -> Self { + IntKey(i256::ONE) + } +} + +impl Zero for IntKey { + fn zero() -> Self { + IntKey(i256::ZERO) + } + fn is_zero(&self) -> bool { + self.0 == i256::ZERO + } +} + +impl Num for IntKey { + type FromStrRadixErr = ::FromStrRadixErr; + fn from_str_radix(str: &str, radix: u32) -> Result { + Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?))) + } +} + +impl PartialEq for LayerRTreeObject { + fn eq(&self, other: &Self) -> bool { + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. 
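The comment above is the motivation for the custom IntKey: the R-tree's area calculation multiplies coordinate spans, and with keys that already use most of i128's range those products can exceed i128. A short std-only illustration of the overflow that the 256-bit type avoids.

fn main() {
    // A key span close to what Key::to_i128() can produce, times even a tiny
    // LSN span, no longer fits in i128: checked_mul reports the overflow.
    let key_span: i128 = i128::MAX / 2;
    let lsn_span: i128 = 4;
    assert!(key_span.checked_mul(lsn_span).is_none());

    // Widening to 256 bits (amplify_num::i256 in the patch) keeps such
    // products representable, so the tree's area math stays exact.
}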
+ #[allow(clippy::vtable_address_comparisons)] + Arc::ptr_eq(&self.layer, &other.layer) + } +} + +impl RTreeObject for LayerRTreeObject { + type Envelope = AABB<[IntKey; 2]>; + fn envelope(&self) -> Self::Envelope { + let key_range = self.layer.get_key_range(); + let lsn_range = self.layer.get_lsn_range(); + AABB::from_corners( + [ + IntKey::from(key_range.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(key_range.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive + ) + } } /// Return value of LayerMap::search @@ -80,19 +235,24 @@ impl LayerMap { // Find the latest image layer that covers the given key let mut latest_img: Option> = None; let mut latest_img_lsn: Option = None; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [IntKey::from(key.to_i128()), IntKey::from(0i128)], + [ + IntKey::from(key.to_i128()), + IntKey::from(end_lsn.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } + assert!(l.get_key_range().contains(&key)); let img_lsn = l.get_lsn_range().start; - - if img_lsn >= end_lsn { - // too new - continue; - } + assert!(img_lsn < end_lsn); if Lsn(img_lsn.0 + 1) == end_lsn { // found exact match return Ok(Some(SearchResult { @@ -108,19 +268,24 @@ impl LayerMap { // Search the delta layers let mut latest_delta: Option> = None; - for l in self.historic_layers.iter() { + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if !l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } - + assert!(l.get_key_range().contains(&key)); if l.get_lsn_range().start >= end_lsn { - // too new - continue; + info!( + "Candidate delta layer {}..{} is too new for lsn {}", + l.get_lsn_range().start, + l.get_lsn_range().end, + end_lsn + ); } - + assert!(l.get_lsn_range().start < end_lsn); if l.get_lsn_range().end >= end_lsn { // this layer contains the requested point in the key/lsn space. // No need to search any further @@ -170,7 +335,10 @@ impl LayerMap { /// Insert an on-disk layer /// pub fn insert_historic(&mut self, layer: Arc) { - self.historic_layers.push(layer); + if layer.get_key_range() == (Key::MIN..Key::MAX) { + self.l0_delta_layers.push(layer.clone()); + } + self.historic_layers.insert(LayerRTreeObject { layer }); NUM_ONDISK_LAYERS.inc(); } @@ -180,17 +348,22 @@ impl LayerMap { /// This should be called when the corresponding file on disk has been deleted. /// pub fn remove_historic(&mut self, layer: Arc) { - let len_before = self.historic_layers.len(); + if layer.get_key_range() == (Key::MIN..Key::MAX) { + let len_before = self.l0_delta_layers.len(); - // FIXME: ptr_eq might fail to return true for 'dyn' - // references. Clippy complains about this. In practice it - // seems to work, the assertion below would be triggered - // otherwise but this ought to be fixed. - #[allow(clippy::vtable_address_comparisons)] - self.historic_layers - .retain(|other| !Arc::ptr_eq(other, &layer)); - - assert_eq!(self.historic_layers.len(), len_before - 1); + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. 
+ #[allow(clippy::vtable_address_comparisons)] + self.l0_delta_layers + .retain(|other| !Arc::ptr_eq(other, &layer)); + assert_eq!(self.l0_delta_layers.len(), len_before - 1); + } + assert!(self + .historic_layers + .remove(&LayerRTreeObject { layer }) + .is_some()); NUM_ONDISK_LAYERS.dec(); } @@ -207,15 +380,26 @@ impl LayerMap { loop { let mut made_progress = false; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [ + IntKey::from(range_remain.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(range_remain.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } let img_lsn = l.get_lsn_range().start; - if !l.is_incremental() - && l.get_key_range().contains(&range_remain.start) - && lsn_range.contains(&img_lsn) - { + if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) { made_progress = true; let img_key_end = l.get_key_range().end; @@ -232,8 +416,8 @@ impl LayerMap { } } - pub fn iter_historic_layers(&self) -> impl Iterator> { - self.historic_layers.iter() + pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { + self.historic_layers.iter().map(|e| e.layer.clone()) } /// Find the last image layer that covers 'key', ignoring any image layers @@ -241,19 +425,22 @@ impl LayerMap { fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { let mut candidate_lsn = Lsn(0); let mut candidate = None; - for l in self.historic_layers.iter() { + let envelope = AABB::from_corners( + [IntKey::from(key.to_i128()), IntKey::from(0)], + [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if l.is_incremental() { continue; } - if !l.get_key_range().contains(&key) { - continue; - } - + assert!(l.get_key_range().contains(&key)); let this_lsn = l.get_lsn_range().start; - if this_lsn > lsn { - continue; - } + assert!(this_lsn <= lsn); if this_lsn < candidate_lsn { // our previous candidate was better continue; @@ -279,10 +466,19 @@ impl LayerMap { lsn: Lsn, ) -> Result, Option>)>> { let mut points = vec![key_range.start]; - for l in self.historic_layers.iter() { - if l.get_lsn_range().start > lsn { - continue; - } + let envelope = AABB::from_corners( + [IntKey::from(key_range.start.to_i128()), IntKey::from(0)], + [ + IntKey::from(key_range.end.to_i128()), + IntKey::from(lsn.0 as i128), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; + assert!(l.get_lsn_range().start <= lsn); let range = l.get_key_range(); if key_range.contains(&range.start) { points.push(l.get_key_range().start); @@ -315,16 +511,29 @@ impl LayerMap { /// given key and LSN range. 
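count_deltas below narrows candidates with an R-tree envelope query and then double-checks with range_overlaps. For half-open ranges that predicate is simply that each range starts before the other one ends; the helper below is a stand-in written to match that expected behaviour, not the actual storage_layer::range_overlaps.

use std::ops::Range;

// Hypothetical stand-in for `storage_layer::range_overlaps`: two half-open
// ranges overlap iff each one starts before the other one ends.
fn ranges_overlap<T: PartialOrd>(a: &Range<T>, b: &Range<T>) -> bool {
    a.start < b.end && b.start < a.end
}

fn main() {
    assert!(ranges_overlap(&(10..20), &(15..30)));
    assert!(!ranges_overlap(&(10..20), &(20..30))); // touching ends do not overlap
}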
pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { let mut result = 0; - for l in self.historic_layers.iter() { + if lsn_range.start >= lsn_range.end { + return Ok(0); + } + let envelope = AABB::from_corners( + [ + IntKey::from(key_range.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(key_range.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; if !l.is_incremental() { continue; } - if !range_overlaps(&l.get_lsn_range(), lsn_range) { - continue; - } - if !range_overlaps(&l.get_key_range(), key_range) { - continue; - } + assert!(range_overlaps(&l.get_lsn_range(), lsn_range)); + assert!(range_overlaps(&l.get_key_range(), key_range)); // We ignore level0 delta layers. Unless the whole keyspace fits // into one partition @@ -341,17 +550,7 @@ impl LayerMap { /// Return all L0 delta layers pub fn get_level0_deltas(&self) -> Result>> { - let mut deltas = Vec::new(); - for l in self.historic_layers.iter() { - if !l.is_incremental() { - continue; - } - if l.get_key_range() != (Key::MIN..Key::MAX) { - continue; - } - deltas.push(Arc::clone(l)); - } - Ok(deltas) + Ok(self.l0_delta_layers.clone()) } /// debugging function to print out the contents of the layer map @@ -370,8 +569,8 @@ impl LayerMap { } println!("historic_layers:"); - for layer in self.historic_layers.iter() { - layer.dump(verbose)?; + for e in self.historic_layers.iter() { + e.layer.dump(verbose)?; } println!("End dump LayerMap"); Ok(()) diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index ace4dc91e9..3fb9ccd936 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -8,7 +8,6 @@ use std::fs::{File, OpenOptions}; use std::io::Write; -use std::path::PathBuf; use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; @@ -21,7 +20,12 @@ use utils::{ use crate::config::PageServerConf; use crate::virtual_file::VirtualFile; -use crate::STORAGE_FORMAT_VERSION; + +/// Use special format number to enable backward compatibility. +const METADATA_FORMAT_VERSION: u16 = 4; + +/// Previous supported format versions. +const METADATA_OLD_FORMAT_VERSION: u16 = 3; /// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic. /// @@ -29,28 +33,46 @@ use crate::STORAGE_FORMAT_VERSION; /// see PG_CONTROL_MAX_SAFE_SIZE const METADATA_MAX_SIZE: usize = 512; -/// The name of the metadata file pageserver creates per timeline. -pub const METADATA_FILE_NAME: &str = "metadata"; - /// Metadata stored on disk for each timeline /// /// The fields correspond to the values we hold in memory, in Timeline. #[derive(Debug, Clone, PartialEq, Eq)] pub struct TimelineMetadata { hdr: TimelineMetadataHeader, - body: TimelineMetadataBody, + body: TimelineMetadataBodyV2, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] struct TimelineMetadataHeader { checksum: u32, // CRC of serialized metadata body size: u16, // size of serialized metadata - format_version: u16, // storage format version (used for compatibility checks) + format_version: u16, // metadata format version (used for compatibility checks) } const METADATA_HDR_SIZE: usize = std::mem::size_of::(); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -struct TimelineMetadataBody { +struct TimelineMetadataBodyV2 { + disk_consistent_lsn: Lsn, + // This is only set if we know it. 
We track it in memory when the page + // server is running, but we only track the value corresponding to + // 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a + // lot. We only store it in the metadata file when we flush *all* the + // in-memory data so that 'last_record_lsn' is the same as + // 'disk_consistent_lsn'. That's OK, because after page server restart, as + // soon as we reprocess at least one record, we will have a valid + // 'prev_record_lsn' value in memory again. This is only really needed when + // doing a clean shutdown, so that there is no more WAL beyond + // 'disk_consistent_lsn' + prev_record_lsn: Option, + ancestor_timeline: Option, + ancestor_lsn: Lsn, + latest_gc_cutoff_lsn: Lsn, + initdb_lsn: Lsn, + pg_version: u32, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +struct TimelineMetadataBodyV1 { disk_consistent_lsn: Lsn, // This is only set if we know it. We track it in memory when the page // server is running, but we only track the value corresponding to @@ -77,34 +99,63 @@ impl TimelineMetadata { ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, + pg_version: u32, ) -> Self { Self { hdr: TimelineMetadataHeader { checksum: 0, size: 0, - format_version: STORAGE_FORMAT_VERSION, + format_version: METADATA_FORMAT_VERSION, }, - body: TimelineMetadataBody { + body: TimelineMetadataBodyV2 { disk_consistent_lsn, prev_record_lsn, ancestor_timeline, ancestor_lsn, latest_gc_cutoff_lsn, initdb_lsn, + pg_version, }, } } + fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result { + let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; + + // backward compatible only up to this version + ensure!( + hdr.format_version == METADATA_OLD_FORMAT_VERSION, + "unsupported metadata format version {}", + hdr.format_version + ); + + let metadata_size = hdr.size as usize; + + let body: TimelineMetadataBodyV1 = + TimelineMetadataBodyV1::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; + + let body = TimelineMetadataBodyV2 { + disk_consistent_lsn: body.disk_consistent_lsn, + prev_record_lsn: body.prev_record_lsn, + ancestor_timeline: body.ancestor_timeline, + ancestor_lsn: body.ancestor_lsn, + latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn, + initdb_lsn: body.initdb_lsn, + pg_version: 14, // All timelines created before this version had pg_version 14 + }; + + hdr.format_version = METADATA_FORMAT_VERSION; + + Ok(Self { hdr, body }) + } + pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result { ensure!( metadata_bytes.len() == METADATA_MAX_SIZE, "metadata bytes size is wrong" ); let hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; - ensure!( - hdr.format_version == STORAGE_FORMAT_VERSION, - "format version mismatch" - ); + let metadata_size = hdr.size as usize; ensure!( metadata_size <= METADATA_MAX_SIZE, @@ -115,13 +166,20 @@ impl TimelineMetadata { hdr.checksum == calculated_checksum, "metadata checksum mismatch" ); - let body = TimelineMetadataBody::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; - ensure!( - body.disk_consistent_lsn.is_aligned(), - "disk_consistent_lsn is not aligned" - ); - Ok(TimelineMetadata { hdr, body }) + if hdr.format_version != METADATA_FORMAT_VERSION { + // If metadata has the old format, + // upgrade it and return the result + TimelineMetadata::upgrade_timeline_metadata(metadata_bytes) + } else { + let body = + TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; + ensure!( + 
body.disk_consistent_lsn.is_aligned(), + "disk_consistent_lsn is not aligned" + ); + Ok(TimelineMetadata { hdr, body }) + } } pub fn to_bytes(&self) -> anyhow::Result> { @@ -129,7 +187,7 @@ impl TimelineMetadata { let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); let hdr = TimelineMetadataHeader { size: metadata_size as u16, - format_version: STORAGE_FORMAT_VERSION, + format_version: METADATA_FORMAT_VERSION, checksum: crc32c::crc32c(&body_bytes), }; let hdr_bytes = hdr.ser()?; @@ -164,17 +222,10 @@ impl TimelineMetadata { pub fn initdb_lsn(&self) -> Lsn { self.body.initdb_lsn } -} -/// Points to a place in pageserver's local directory, -/// where certain timeline's metadata file should be located. -pub fn metadata_path( - conf: &'static PageServerConf, - timeline_id: TimelineId, - tenant_id: TenantId, -) -> PathBuf { - conf.timeline_path(&timeline_id, &tenant_id) - .join(METADATA_FILE_NAME) + pub fn pg_version(&self) -> u32 { + self.body.pg_version + } } /// Save timeline metadata to file @@ -186,7 +237,7 @@ pub fn save_metadata( first_save: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving metadata").entered(); - let path = metadata_path(conf, timeline_id, tenant_id); + let path = conf.metadata_path(timeline_id, tenant_id); // use OpenOptions to ensure file presence is consistent with first_save let mut file = VirtualFile::open_with_options( &path, @@ -227,6 +278,8 @@ mod tests { Lsn(0), Lsn(0), Lsn(0), + // Any version will do here, so use the default + crate::DEFAULT_PG_VERSION, ); let metadata_bytes = original_metadata @@ -241,4 +294,72 @@ mod tests { "Metadata that was serialized to bytes and deserialized back should not change" ); } + + // Generate old version metadata and read it with current code. + // Ensure that it is upgraded correctly + #[test] + fn test_metadata_upgrade() { + #[derive(Debug, Clone, PartialEq, Eq)] + struct TimelineMetadataV1 { + hdr: TimelineMetadataHeader, + body: TimelineMetadataBodyV1, + } + + let metadata_v1 = TimelineMetadataV1 { + hdr: TimelineMetadataHeader { + checksum: 0, + size: 0, + format_version: METADATA_OLD_FORMAT_VERSION, + }, + body: TimelineMetadataBodyV1 { + disk_consistent_lsn: Lsn(0x200), + prev_record_lsn: Some(Lsn(0x100)), + ancestor_timeline: Some(TIMELINE_ID), + ancestor_lsn: Lsn(0), + latest_gc_cutoff_lsn: Lsn(0), + initdb_lsn: Lsn(0), + }, + }; + + impl TimelineMetadataV1 { + pub fn to_bytes(&self) -> anyhow::Result> { + let body_bytes = self.body.ser()?; + let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); + let hdr = TimelineMetadataHeader { + size: metadata_size as u16, + format_version: METADATA_OLD_FORMAT_VERSION, + checksum: crc32c::crc32c(&body_bytes), + }; + let hdr_bytes = hdr.ser()?; + let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE]; + metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes); + metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes); + Ok(metadata_bytes) + } + } + + let metadata_bytes = metadata_v1 + .to_bytes() + .expect("Should serialize correct metadata to bytes"); + + // This should deserialize to the latest version format + let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes) + .expect("Should deserialize its own bytes"); + + let expected_metadata = TimelineMetadata::new( + Lsn(0x200), + Some(Lsn(0x100)), + Some(TIMELINE_ID), + Lsn(0), + Lsn(0), + Lsn(0), + 14, // All timelines created before this version had pg_version 14 + ); + + assert_eq!( + deserialized_metadata.body, expected_metadata.body, + "Metadata of the 
old version {} should be upgraded to the latest version {}", + METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION + ); + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e821ef1b9a..247e076230 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -24,12 +24,12 @@ use crate::tenant::{ image_layer::{ImageLayer, ImageLayerWriter}, inmemory_layer::InMemoryLayer, layer_map::{LayerMap, SearchResult}, - metadata::{save_metadata, TimelineMetadata, METADATA_FILE_NAME}, + metadata::{save_metadata, TimelineMetadata}, par_fsync, storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}, }; -use crate::config::PageServerConf; +use crate::config::{PageServerConf, METADATA_FILE_NAME}; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::BlockNumber; @@ -37,7 +37,7 @@ use crate::pgdatadir_mapping::LsnForTimestamp; use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; -use postgres_ffi::v14::xlog_utils::to_pg_timestamp; +use postgres_ffi::to_pg_timestamp; use utils::{ id::{TenantId, TimelineId}, lsn::{AtomicLsn, Lsn, RecordLsn}, @@ -61,6 +61,8 @@ pub struct Timeline { pub tenant_id: TenantId, pub timeline_id: TimelineId, + pub pg_version: u32, + pub layers: RwLock, last_freeze_at: AtomicLsn, @@ -232,14 +234,16 @@ impl LogicalSize { } fn current_size(&self) -> anyhow::Result { - let size_increment = self.size_added_after_initial.load(AtomicOrdering::Acquire); + let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire); + // ^^^ keep this type explicit so that the casts in this function break if + // we change the type. match self.initial_logical_size.get() { Some(initial_size) => { let absolute_size_increment = u64::try_from( size_increment .checked_abs() .with_context(|| format!("Size added after initial {size_increment} is not expected to be i64::MIN"))?, - ).with_context(|| format!("Failed to convert size increment {size_increment} to u64"))?; + ).expect("casting nonnegative i64 to u64 should not fail"); if size_increment < 0 { initial_size.checked_sub(absolute_size_increment) @@ -249,11 +253,7 @@ impl LogicalSize { .map(CurrentLogicalSize::Exact) } None => { - let non_negative_size_increment = if size_increment < 0 { - 0 - } else { - u64::try_from(size_increment).expect("not negative, cannot fail") - }; + let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0); Ok(CurrentLogicalSize::Approximate(non_negative_size_increment)) } } @@ -343,7 +343,9 @@ impl Timeline { match cached_lsn.cmp(&lsn) { Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image - Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn + Ordering::Greater => { + unreachable!("the returned lsn should never be after the requested lsn") + } } Some((cached_lsn, cached_img)) } @@ -535,6 +537,7 @@ impl Timeline { tenant_id: TenantId, walredo_mgr: Arc, upload_layers: bool, + pg_version: u32, ) -> Timeline { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -543,6 +546,7 @@ impl Timeline { tenant_conf, timeline_id, tenant_id, + pg_version, layers: RwLock::new(LayerMap::default()), walredo_mgr, @@ -623,7 +627,7 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); drop(tenant_conf_guard); let self_clone = Arc::clone(self); - let _ = 
spawn_connection_manager_task( + spawn_connection_manager_task( self.conf.broker_etcd_prefix.clone(), self_clone, walreceiver_connect_timeout, @@ -724,10 +728,10 @@ impl Timeline { Ok(()) } - pub fn layer_removal_guard(&self) -> Result, anyhow::Error> { + pub fn layer_removal_guard(&self) -> anyhow::Result> { self.layer_removal_cs .try_lock() - .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) + .map_err(|e| anyhow!("cannot lock compaction critical section {e}")) } /// Retrieve current logical size of the timeline. @@ -1262,6 +1266,7 @@ impl Timeline { self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), self.initdb_lsn, + self.pg_version, ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -1918,18 +1923,19 @@ impl Timeline { let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let _enter = + info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff) + .entered(); + // Nothing to GC. Return early. let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); if latest_gc_cutoff >= new_gc_cutoff { info!( - "Nothing to GC for timeline {}: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", - self.timeline_id + "Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", ); return Ok(result); } - let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered(); - // We need to ensure that no one tries to read page versions or create // branches at a point before latest_gc_cutoff_lsn. See branch_timeline() // for details. This will block until the old value is no longer in use. @@ -2051,7 +2057,7 @@ impl Timeline { l.filename().display(), l.is_incremental(), ); - layers_to_remove.push(Arc::clone(l)); + layers_to_remove.push(Arc::clone(&l)); } // Actually delete the layers from disk and remove them from the map. @@ -2134,9 +2140,13 @@ impl Timeline { let last_rec_lsn = data.records.last().unwrap().0; - let img = - self.walredo_mgr - .request_redo(key, request_lsn, base_img, data.records)?; + let img = self.walredo_mgr.request_redo( + key, + request_lsn, + base_img, + data.records, + self.pg_version, + )?; if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 3b9e6778be..c6636b5936 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -8,14 +8,9 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! -use crate::config::PageServerConf; use serde::{Deserialize, Serialize}; use std::num::NonZeroU64; -use std::path::PathBuf; use std::time::Duration; -use utils::id::TenantId; - -pub const TENANT_CONFIG_NAME: &str = "config"; pub mod defaults { // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB @@ -224,12 +219,6 @@ impl TenantConf { } } - /// Points to a place in pageserver's local directory, - /// where certain tenant's tenantconf file should be located. 
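
Both free path helpers (this one and `metadata_path`, removed earlier in the diff) give way to accessors on `PageServerConf`, as the call sites later in this patch show. A minimal sketch of the new call shape, assuming a `conf: &'static PageServerConf` and the relevant ids in scope:

    // Per-tenant config file path, formerly TenantConf::path(conf, tenant_id).
    let tenant_config_path = conf.tenant_config_path(tenant_id);
    // Per-timeline metadata file path, formerly metadata::metadata_path(conf, timeline_id, tenant_id).
    let metadata_path = conf.metadata_path(timeline_id, tenant_id);
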
- pub fn path(conf: &'static PageServerConf, tenant_id: TenantId) -> PathBuf { - conf.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME) - } - #[cfg(test)] pub fn dummy_conf() -> Self { TenantConf { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index d6fa843305..0e8ee8c067 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -10,23 +10,21 @@ use std::sync::Arc; use anyhow::Context; use tracing::*; -use remote_storage::{path_with_suffix_extension, GenericRemoteStorage}; +use remote_storage::GenericRemoteStorage; -use crate::config::PageServerConf; +use crate::config::{PageServerConf, METADATA_FILE_NAME}; use crate::http::models::TenantInfo; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::{ - ephemeral_file::is_ephemeral_file, - metadata::{TimelineMetadata, METADATA_FILE_NAME}, - Tenant, TenantState, + ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState, }; -use crate::tenant_config::{TenantConf, TenantConfOpt}; +use crate::tenant_config::TenantConfOpt; use crate::walredo::PostgresRedoManager; use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX}; -use utils::crashsafe_dir; +use utils::crashsafe_dir::{self, path_with_suffix_extension}; use utils::id::{TenantId, TimelineId}; mod tenants_state { @@ -109,6 +107,13 @@ pub fn init_tenant_mgr( /// Ignores other timelines that might be present for tenant, but were not passed as a parameter. /// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", /// and the load continues. +/// +/// For a successful tenant attach, the tenant first has to have a `timelines/` subdirectory and a tenant config file that loads into memory successfully. +/// If either condition fails, the tenant is added to memory in the [`TenantState::Broken`] state; otherwise we start loading its timelines. +/// Alternatively, a tenant is considered loaded successfully if it's already in the pageserver's memory (i.e. it was loaded before). +/// +/// Attach happens on startup and on successful timeline downloads +/// (some subset of timeline files, always including its metadata, after which the new one needs to be registered).
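
The attach behaviour described above amounts to a get-or-insert on the in-memory tenant map, reusing an existing entry instead of treating it as an error. A small, self-contained sketch of that pattern with toy types (the real code keeps `Arc<Tenant>` values behind the `tenants_state` lock):

    use std::collections::{hash_map, HashMap};
    use std::sync::Arc;

    fn get_or_load(tenants: &mut HashMap<u32, Arc<String>>, tenant_id: u32) -> Arc<String> {
        match tenants.entry(tenant_id) {
            // Tenant already attached earlier: reuse the existing entry.
            hash_map::Entry::Occupied(o) => Arc::clone(o.get()),
            // Not in memory yet: load it and register it under the same lock.
            hash_map::Entry::Vacant(v) => {
                let tenant = Arc::new(format!("tenant {tenant_id}"));
                v.insert(Arc::clone(&tenant));
                tenant
            }
        }
    }
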
pub fn attach_local_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, @@ -124,18 +129,20 @@ pub fn attach_local_tenants( ); debug!("Timelines to attach: {local_timelines:?}"); - let tenant = load_local_tenant(conf, tenant_id, remote_index); - { - match tenants_state::write_tenants().entry(tenant_id) { - hash_map::Entry::Occupied(_) => { - error!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state"); - continue; - } - hash_map::Entry::Vacant(v) => { - v.insert(Arc::clone(&tenant)); - } + let mut tenants_accessor = tenants_state::write_tenants(); + let tenant = match tenants_accessor.entry(tenant_id) { + hash_map::Entry::Occupied(o) => { + info!("Tenant {tenant_id} was found in pageserver's memory"); + Arc::clone(o.get()) } - } + hash_map::Entry::Vacant(v) => { + info!("Tenant {tenant_id} was not found in pageserver's memory, loading it"); + let tenant = load_local_tenant(conf, tenant_id, remote_index); + v.insert(Arc::clone(&tenant)); + tenant + } + }; + drop(tenants_accessor); if tenant.current_state() == TenantState::Broken { warn!("Skipping timeline load for broken tenant {tenant_id}") @@ -170,16 +177,28 @@ fn load_local_tenant( remote_index.clone(), conf.remote_storage_config.is_some(), )); - match Tenant::load_tenant_config(conf, tenant_id) { - Ok(tenant_conf) => { - tenant.update_tenant_config(tenant_conf); - tenant.activate(false); - } - Err(e) => { - error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); - tenant.set_state(TenantState::Broken); + + let tenant_timelines_dir = conf.timelines_path(&tenant_id); + if !tenant_timelines_dir.is_dir() { + error!( + "Tenant {} has no timelines directory at {}", + tenant_id, + tenant_timelines_dir.display() + ); + tenant.set_state(TenantState::Broken); + } else { + match Tenant::load_tenant_config(conf, tenant_id) { + Ok(tenant_conf) => { + tenant.update_tenant_config(tenant_conf); + tenant.activate(false); + } + Err(e) => { + error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); + tenant.set_state(TenantState::Broken); + } } } + tenant } @@ -246,7 +265,7 @@ fn create_tenant_files( &temporary_tenant_dir, )?; let temporary_tenant_config_path = rebase_directory( - &TenantConf::path(conf, tenant_id), + &conf.tenant_config_path(tenant_id), &target_tenant_directory, &temporary_tenant_dir, )?; @@ -343,7 +362,7 @@ pub fn update_tenant_config( ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf); - Tenant::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?; + Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?; Ok(()) } @@ -627,14 +646,10 @@ fn collect_timelines_for_tenant( } if tenant_timelines.is_empty() { - match remove_if_empty(&timelines_dir) { - Ok(true) => info!( - "Removed empty tenant timelines directory {}", - timelines_dir.display() - ), - Ok(false) => (), - Err(e) => error!("Failed to remove empty tenant timelines directory: {e:?}"), - } + // this is normal, we've removed all broken, empty and temporary timeline dirs + // but should allow the tenant to stay functional and allow creating new timelines + // on a restart, we require tenants to have the timelines dir, so leave it on disk + debug!("Tenant {tenant_id} has no timelines loaded"); } Ok((tenant_id, tenant_timelines)) diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index c543a0ecb1..8329b15c08 100644 --- 
a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -21,7 +21,9 @@ pub fn start_background_loops(tenant_id: TenantId) { &format!("compactor for tenant {tenant_id}"), false, async move { - compaction_loop(tenant_id).await; + compaction_loop(tenant_id) + .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) + .await; Ok(()) }, ); @@ -33,7 +35,9 @@ pub fn start_background_loops(tenant_id: TenantId) { &format!("garbage collector for tenant {tenant_id}"), false, async move { - gc_loop(tenant_id).await; + gc_loop(tenant_id) + .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) + .await; Ok(()) }, ); @@ -44,7 +48,7 @@ pub fn start_background_loops(tenant_id: TenantId) { /// async fn compaction_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting compaction loop for {tenant_id}"); + info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { loop { @@ -52,7 +56,7 @@ async fn compaction_loop(tenant_id: TenantId) { let tenant = tokio::select! { _ = task_mgr::shutdown_watcher() => { - info!("received compaction cancellation request"); + info!("received cancellation request"); return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { @@ -73,7 +77,7 @@ async fn compaction_loop(tenant_id: TenantId) { // Sleep tokio::select! { _ = task_mgr::shutdown_watcher() => { - info!("received compaction cancellation request during idling"); + info!("received cancellation request during idling"); break ; }, _ = tokio::time::sleep(sleep_duration) => {}, @@ -91,7 +95,7 @@ async fn compaction_loop(tenant_id: TenantId) { /// async fn gc_loop(tenant_id: TenantId) { let wait_duration = Duration::from_secs(2); - info!("starting gc loop for {tenant_id}"); + info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { loop { @@ -99,7 +103,7 @@ async fn gc_loop(tenant_id: TenantId) { let tenant = tokio::select! { _ = task_mgr::shutdown_watcher() => { - info!("received GC cancellation request"); + info!("received cancellation request"); return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { @@ -123,7 +127,7 @@ async fn gc_loop(tenant_id: TenantId) { // Sleep tokio::select! { _ = task_mgr::shutdown_watcher() => { - info!("received GC cancellation request during idling"); + info!("received cancellation request during idling"); break; }, _ = tokio::time::sleep(sleep_duration) => {}, diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs deleted file mode 100644 index 88b26e18f4..0000000000 --- a/pageserver/src/timelines.rs +++ /dev/null @@ -1,168 +0,0 @@ -//! -//! Timeline management code -// - -use std::{ - fs, - path::Path, - process::{Command, Stdio}, - sync::Arc, -}; - -use anyhow::{bail, Context, Result}; -use tracing::*; - -use remote_storage::path_with_suffix_extension; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -use crate::config::PageServerConf; -use crate::tenant::{Tenant, Timeline}; -use crate::tenant_mgr; -use crate::CheckpointConfig; -use crate::{import_datadir, TEMP_FILE_SUFFIX}; - -// Create the cluster temporarily in 'initdbpath' directory inside the repository -// to get bootstrap data for timeline initialization. -// -fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { - info!("running initdb in {}... 
", initdbpath.display()); - - let initdb_path = conf.pg_bin_dir().join("initdb"); - let initdb_output = Command::new(initdb_path) - .args(&["-D", &initdbpath.to_string_lossy()]) - .args(&["-U", &conf.superuser]) - .args(&["-E", "utf8"]) - .arg("--no-instructions") - // This is only used for a temporary installation that is deleted shortly after, - // so no need to fsync it - .arg("--no-sync") - .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) - .stdout(Stdio::null()) - .output() - .context("failed to execute initdb")?; - if !initdb_output.status.success() { - bail!( - "initdb failed: '{}'", - String::from_utf8_lossy(&initdb_output.stderr) - ); - } - - Ok(()) -} - -// -// - run initdb to init temporary instance and get bootstrap data -// - after initialization complete, remove the temp dir. -// -fn bootstrap_timeline( - conf: &'static PageServerConf, - tenant_id: TenantId, - timeline_id: TimelineId, - tenant: &Tenant, -) -> Result> { - // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` - // temporary directory for basebackup files for the given timeline. - let initdb_path = path_with_suffix_extension( - conf.timelines_path(&tenant_id) - .join(format!("basebackup-{timeline_id}")), - TEMP_FILE_SUFFIX, - ); - - // Init temporarily repo to get bootstrap data - run_initdb(conf, &initdb_path)?; - let pgdata_path = initdb_path; - - let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); - - // Import the contents of the data directory at the initial checkpoint - // LSN, and any WAL after that. - // Initdb lsn will be equal to last_record_lsn which will be set after import. - // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. - let timeline = tenant.create_empty_timeline(timeline_id, lsn)?; - import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; - - fail::fail_point!("before-checkpoint-new-timeline", |_| { - bail!("failpoint before-checkpoint-new-timeline"); - }); - - timeline.checkpoint(CheckpointConfig::Forced)?; - - info!( - "created root timeline {} timeline.lsn {}", - timeline_id, - timeline.get_last_record_lsn() - ); - - // Remove temp dir. We don't need it anymore - fs::remove_dir_all(pgdata_path)?; - - Ok(timeline) -} - -/// -/// Create a new timeline. -/// -/// Returns the new timeline ID and reference to its Timeline object. -/// -/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with -/// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, -/// a new unique ID is generated. 
-/// -pub(crate) async fn create_timeline( - conf: &'static PageServerConf, - tenant_id: TenantId, - new_timeline_id: Option, - ancestor_timeline_id: Option, - mut ancestor_start_lsn: Option, -) -> Result>> { - let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); - let tenant = tenant_mgr::get_tenant(tenant_id, true)?; - - if conf.timeline_path(&new_timeline_id, &tenant_id).exists() { - debug!("timeline {new_timeline_id} already exists"); - return Ok(None); - } - - let loaded_timeline = match ancestor_timeline_id { - Some(ancestor_timeline_id) => { - let ancestor_timeline = tenant - .get_timeline(ancestor_timeline_id) - .context("Cannot branch off the timeline that's not present in pageserver")?; - - if let Some(lsn) = ancestor_start_lsn.as_mut() { - // Wait for the WAL to arrive and be processed on the parent branch up - // to the requested branch point. The repository code itself doesn't - // require it, but if we start to receive WAL on the new timeline, - // decoding the new WAL might need to look up previous pages, relation - // sizes etc. and that would get confused if the previous page versions - // are not in the repository yet. - *lsn = lsn.align(); - ancestor_timeline.wait_lsn(*lsn).await?; - - let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); - if ancestor_ancestor_lsn > *lsn { - // can we safely just branch from the ancestor instead? - anyhow::bail!( - "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", - lsn, - ancestor_timeline_id, - ancestor_ancestor_lsn, - ); - } - } - - tenant.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? - } - None => bootstrap_timeline(conf, tenant_id, new_timeline_id, &tenant)?, - }; - - // Have added new timeline into the tenant, now its background tasks are needed. 
- tenant.activate(true); - - Ok(Some(loaded_timeline)) -} diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index bede4ac13e..d3d2c6d9b2 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -34,8 +34,9 @@ use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walrecord::*; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; -use postgres_ffi::v14::pg_constants; use postgres_ffi::v14::xlog_utils::*; use postgres_ffi::v14::CheckPoint; use postgres_ffi::TransactionId; @@ -82,7 +83,8 @@ impl<'a> WalIngest<'a> { decoded: &mut DecodedWALRecord, ) -> Result<()> { modification.lsn = lsn; - decode_wal_record(recdata, decoded).context("failed decoding wal record")?; + decode_wal_record(recdata, decoded, self.timeline.pg_version) + .context("failed decoding wal record")?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -113,18 +115,49 @@ impl<'a> WalIngest<'a> { let truncate = XlSmgrTruncate::decode(&mut buf); self.ingest_xlog_smgr_truncate(modification, &truncate)?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { - if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == pg_constants::XLOG_DBASE_CREATE - { - let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb)?; - } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == pg_constants::XLOG_DBASE_DROP - { - let dropdb = XlDropDatabase::decode(&mut buf); - for tablespace_id in dropdb.tablespace_ids { - trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + debug!( + "handle RM_DBASE_ID for Postgres version {:?}", + self.timeline.pg_version + ); + if self.timeline.pg_version == 14 { + if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE + { + let createdb = XlCreateDatabase::decode(&mut buf); + debug!("XLOG_DBASE_CREATE v14"); + + self.ingest_xlog_dbase_create(modification, &createdb)?; + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v14::bindings::XLOG_DBASE_DROP + { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + } + } + } else if self.timeline.pg_version == 15 { + if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG + { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY + { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. 
+ debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb)?; + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_DROP + { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + } } } } else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID { @@ -291,7 +324,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0 + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? { // Extract page image from FPI record let img_len = blk.bimg_len as usize; @@ -392,7 +425,7 @@ impl<'a> WalIngest<'a> { // Clear the VM bits if required. if new_heap_blkno.is_some() || old_heap_blkno.is_some() { let vm_rel = RelTag { - forknum: pg_constants::VISIBILITYMAP_FORKNUM, + forknum: VISIBILITYMAP_FORKNUM, spcnode: decoded.blocks[0].rnode_spcnode, dbnode: decoded.blocks[0].rnode_dbnode, relnode: decoded.blocks[0].rnode_relnode, @@ -568,7 +601,7 @@ impl<'a> WalIngest<'a> { spcnode, dbnode, relnode, - forknum: pg_constants::MAIN_FORKNUM, + forknum: MAIN_FORKNUM, }; self.put_rel_truncation(modification, rel, rec.blkno)?; } @@ -577,7 +610,7 @@ impl<'a> WalIngest<'a> { spcnode, dbnode, relnode, - forknum: pg_constants::FSM_FORKNUM, + forknum: FSM_FORKNUM, }; // FIXME: 'blkno' stored in the WAL record is the new size of the @@ -600,7 +633,7 @@ impl<'a> WalIngest<'a> { spcnode, dbnode, relnode, - forknum: pg_constants::VISIBILITYMAP_FORKNUM, + forknum: VISIBILITYMAP_FORKNUM, }; // FIXME: Like with the FSM above, the logic to truncate the VM @@ -672,7 +705,7 @@ impl<'a> WalIngest<'a> { )?; for xnode in &parsed.xnodes { - for forknum in pg_constants::MAIN_FORKNUM..=pg_constants::VISIBILITYMAP_FORKNUM { + for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM { let rel = RelTag { forknum, spcnode: xnode.spcnode, @@ -1032,6 +1065,8 @@ mod tests { use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; + use crate::DEFAULT_PG_VERSION; + /// Arbitrary relation tag, for testing. 
const TESTREL_A: RelTag = RelTag { spcnode: 0, @@ -1059,7 +1094,7 @@ mod tests { #[test] fn test_relsize() -> Result<()> { let tenant = TenantHarness::create("test_relsize")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1187,7 +1222,7 @@ mod tests { #[test] fn test_drop_extend() -> Result<()> { let tenant = TenantHarness::create("test_drop_extend")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; let mut m = tline.begin_modification(Lsn(0x20)); @@ -1227,7 +1262,7 @@ mod tests { #[test] fn test_truncate_extend() -> Result<()> { let tenant = TenantHarness::create("test_truncate_extend")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; // Create a 20 MB relation (the size is arbitrary) @@ -1315,7 +1350,7 @@ mod tests { #[test] fn test_large_rel() -> Result<()> { let tenant = TenantHarness::create("test_large_rel")?.load(); - let tline = create_test_timeline(&tenant, TIMELINE_ID)?; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index deac299747..c7de24080a 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -31,7 +31,6 @@ use etcd_broker::Client; use itertools::Itertools; use once_cell::sync::OnceCell; use std::future::Future; -use std::sync::Arc; use tokio::sync::watch; use tracing::*; use url::Url; @@ -88,37 +87,44 @@ pub fn is_etcd_client_initialized() -> bool { /// That may lead to certain events not being observed by the listener. #[derive(Debug)] pub struct TaskHandle { - events_receiver: watch::Receiver>, + join_handle: Option>>, + events_receiver: watch::Receiver>, cancellation: watch::Sender<()>, } -#[derive(Debug, Clone)] pub enum TaskEvent { + Update(TaskStateUpdate), + End(anyhow::Result<()>), +} + +#[derive(Debug, Clone)] +pub enum TaskStateUpdate { + Init, Started, - NewEvent(E), - End, + Progress(E), } impl TaskHandle { /// Initializes the task, starting it immediately after the creation. 
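
A brief usage sketch of the reworked handle, assuming the `TaskHandle` and `TaskStateUpdate` items from this module and a tokio runtime; the task closure now receives the events sender by value rather than through an `Arc`:

    let handle = TaskHandle::spawn(|events, mut cancellation| async move {
        // Report progress through the watch channel handed to the task.
        events.send(TaskStateUpdate::Progress(1u32)).ok();
        // Finish either on cancellation or once the work is done.
        tokio::select! {
            _ = cancellation.changed() => {}
            _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {}
        }
        Ok(())
    });
    // Cancels the task and waits for it, logging the shutdown outcome.
    handle.shutdown().await;

Keeping the `JoinHandle` inside `TaskHandle` is what lets `shutdown` and `next_task_event` surface the task's actual result instead of merely waiting for the sender to be dropped.
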
pub fn spawn( - task: impl FnOnce(Arc>>, watch::Receiver<()>) -> Fut + Send + 'static, + task: impl FnOnce(watch::Sender>, watch::Receiver<()>) -> Fut + + Send + + 'static, ) -> Self where - Fut: Future> + Send, - E: Sync + Send + 'static, + Fut: Future> + Send, + E: Send + Sync + 'static, { let (cancellation, cancellation_receiver) = watch::channel(()); - let (events_sender, events_receiver) = watch::channel(TaskEvent::Started); - let events_sender = Arc::new(events_sender); + let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started); - let sender = Arc::clone(&events_sender); - let _ = WALRECEIVER_RUNTIME.spawn(async move { - events_sender.send(TaskEvent::Started).ok(); - task(sender, cancellation_receiver).await + let join_handle = WALRECEIVER_RUNTIME.spawn(async move { + events_sender.send(TaskStateUpdate::Started).ok(); + task(events_sender, cancellation_receiver).await }); TaskHandle { + join_handle: Some(join_handle), events_receiver, cancellation, } @@ -126,15 +132,45 @@ impl TaskHandle { async fn next_task_event(&mut self) -> TaskEvent { match self.events_receiver.changed().await { - Ok(()) => self.events_receiver.borrow().clone(), - Err(_task_channel_part_dropped) => TaskEvent::End, + Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()), + Err(_task_channel_part_dropped) => { + TaskEvent::End(match self.join_handle.take() { + Some(jh) => { + if !jh.is_finished() { + warn!("sender is dropped while join handle is still alive"); + } + + jh.await + .map_err(|e| anyhow::anyhow!("Failed to join task: {e}")) + .and_then(|x| x) + } + None => { + // Another option is to have an enum, join handle or result and give away the reference to it + Err(anyhow::anyhow!("Task was joined more than once")) + } + }) + } } } /// Aborts current task, waiting for it to finish. 
- pub async fn shutdown(mut self) { - self.cancellation.send(()).ok(); - // wait until the sender is dropped - while self.events_receiver.changed().await.is_ok() {} + pub async fn shutdown(self) { + match self.join_handle { + Some(jh) => { + self.cancellation.send(()).ok(); + match jh.await { + Ok(Ok(())) => debug!("Shutdown success"), + Ok(Err(e)) => error!("Shutdown task error: {e:?}"), + Err(join_error) => { + if join_error.is_cancelled() { + error!("Shutdown task was cancelled"); + } else { + error!("Shutdown task join error: {join_error}") + } + } + } + } + None => {} + } } } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 1e4b4e7d52..29179e9871 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -16,10 +16,10 @@ use std::{ time::Duration, }; -use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::task_mgr::WALRECEIVER_RUNTIME; use crate::tenant::Timeline; +use crate::{task_mgr, walreceiver::TaskStateUpdate}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -58,7 +58,10 @@ pub fn spawn_connection_manager_task( TaskKind::WalReceiverManager, Some(tenant_id), Some(timeline_id), - &format!("walreceiver for tenant {} timeline {}", timeline.tenant_id, timeline.timeline_id), + &format!( + "walreceiver for tenant {} timeline {}", + timeline.tenant_id, timeline.timeline_id + ), false, async move { info!("WAL receiver broker started, connecting to etcd"); @@ -88,7 +91,9 @@ pub fn spawn_connection_manager_task( } } } - .instrument(info_span!("wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) + .instrument( + info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id), + ), ); Ok(()) } @@ -140,19 +145,26 @@ async fn connection_manager_loop_step( let wal_connection = walreceiver_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! guard"); match wal_connection_update { - TaskEvent::Started => {}, - TaskEvent::NewEvent(status) => { - if status.has_processed_wal { - // We have advanced last_record_lsn by processing the WAL received - // from this safekeeper. This is good enough to clean unsuccessful - // retries history and allow reconnecting to this safekeeper without - // sleeping for a long time. - walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); + TaskEvent::Update(c) => { + match c { + TaskStateUpdate::Init | TaskStateUpdate::Started => {}, + TaskStateUpdate::Progress(status) => { + if status.has_processed_wal { + // We have advanced last_record_lsn by processing the WAL received + // from this safekeeper. This is good enough to clean unsuccessful + // retries history and allow reconnecting to this safekeeper without + // sleeping for a long time. 
+ walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); + } + wal_connection.status = status.to_owned(); + } } - wal_connection.status = status; }, - TaskEvent::End => { - debug!("WAL receiving task finished"); + TaskEvent::End(walreceiver_task_result) => { + match walreceiver_task_result { + Ok(()) => debug!("WAL receiving task finished"), + Err(e) => error!("wal receiver task finished with an error: {e:?}"), + } walreceiver_state.drop_old_connection(false).await; }, } @@ -358,13 +370,13 @@ impl WalreceiverState { async move { super::walreceiver_connection::handle_walreceiver_connection( timeline, - &new_wal_source_connstr, - events_sender.as_ref(), + new_wal_source_connstr, + events_sender, cancellation, connect_timeout, ) .await - .map_err(|e| format!("walreceiver connection handling failure: {e:#}")) + .context("walreceiver connection handling failure") } .instrument(info_span!("walreceiver_connection", id = %id)) }); @@ -880,7 +892,7 @@ mod tests { status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status.clone())) .ok(); Ok(()) }), @@ -1140,7 +1152,7 @@ mod tests { status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status.clone())) .ok(); Ok(()) }), @@ -1228,7 +1240,7 @@ mod tests { status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskEvent::NewEvent(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status.clone())) .ok(); Ok(()) }), @@ -1353,7 +1365,7 @@ mod tests { const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr"; - fn dummy_state(harness: &TenantHarness) -> WalreceiverState { + fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, @@ -1361,7 +1373,7 @@ mod tests { }, timeline: harness .load() - .create_empty_timeline(TIMELINE_ID, Lsn(0)) + .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION) .expect("Failed to create an empty timeline for dummy wal connection manager"), wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 29c4cea882..ef5baeb570 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -16,10 +16,9 @@ use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; -use tracing::{debug, error, info, info_span, trace, warn, Instrument}; +use tracing::{debug, error, info, trace, warn}; -use super::TaskEvent; -use crate::metrics::LIVE_CONNECTIONS_COUNT; +use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; use crate::{ task_mgr, task_mgr::TaskKind, @@ -29,7 +28,7 @@ use crate::{ walingest::WalIngest, walrecord::DecodedWALRecord, }; -use postgres_ffi::v14::waldecoder::WalStreamDecoder; +use postgres_ffi::waldecoder::WalStreamDecoder; use utils::id::TenantTimelineId; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback}; @@ -55,8 +54,8 @@ pub struct 
WalConnectionStatus { /// messages as we go. pub async fn handle_walreceiver_connection( timeline: Arc, - wal_source_connstr: &str, - events_sender: &watch::Sender>, + wal_source_connstr: String, + events_sender: watch::Sender>, mut cancellation: watch::Receiver<()>, connect_timeout: Duration, ) -> anyhow::Result<()> { @@ -81,7 +80,7 @@ pub async fn handle_walreceiver_connection( streaming_lsn: None, commit_lsn: None, }; - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); return Ok(()); } @@ -112,8 +111,7 @@ pub async fn handle_walreceiver_connection( _ = connection_cancellation.changed() => info!("Connection cancelled"), } Ok(()) - } - .instrument(info_span!("walreceiver connection")), + }, ); // Immediately increment the gauge, then create a job to decrement it on task exit. @@ -134,7 +132,7 @@ pub async fn handle_walreceiver_connection( connection_status.latest_connection_update = Utc::now().naive_utc(); connection_status.latest_wal_update = Utc::now().naive_utc(); connection_status.commit_lsn = Some(end_of_wal); - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); return Ok(()); } @@ -166,7 +164,7 @@ pub async fn handle_walreceiver_connection( let physical_stream = ReplicationStream::new(copy_stream); pin!(physical_stream); - let mut waldecoder = WalStreamDecoder::new(startpoint); + let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?; @@ -202,7 +200,7 @@ pub async fn handle_walreceiver_connection( } &_ => {} }; - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } @@ -268,7 +266,8 @@ pub async fn handle_walreceiver_connection( if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg { // We have successfully processed at least one WAL record. connection_status.has_processed_wal = true; - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) + { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index dbf9bf9d33..38fb9a4247 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -3,12 +3,11 @@ //! 
use anyhow::Result; use bytes::{Buf, Bytes}; -use postgres_ffi::v14::pg_constants; -use postgres_ffi::v14::xlog_utils::XLOG_SIZE_OF_XLOG_RECORD; -use postgres_ffi::v14::XLogRecord; +use postgres_ffi::pg_constants; use postgres_ffi::BLCKSZ; use postgres_ffi::{BlockNumber, OffsetNumber, TimestampTz}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; +use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; use serde::{Deserialize, Serialize}; use tracing::*; use utils::bin_ser::DeserializeError; @@ -390,6 +389,16 @@ impl XlXactParsedRecord { xid = buf.get_u32_le(); trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE"); } + + if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 { + let nitems = buf.get_i32_le(); + debug!( + "XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}", + nitems + ); + //FIXME: do we need to handle dropped stats here? + } + XlXactParsedRecord { xid, info, @@ -517,7 +526,8 @@ impl XlMultiXactTruncate { pub fn decode_wal_record( record: Bytes, decoded: &mut DecodedWALRecord, -) -> Result<(), DeserializeError> { + pg_version: u32, +) -> Result<()> { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; @@ -610,9 +620,21 @@ pub fn decode_wal_record( blk.hole_offset = buf.get_u16_le(); blk.bimg_info = buf.get_u8(); - blk.apply_image = (blk.bimg_info & pg_constants::BKPIMAGE_APPLY) != 0; + blk.apply_image = if pg_version == 14 { + (blk.bimg_info & postgres_ffi::v14::bindings::BKPIMAGE_APPLY) != 0 + } else { + assert_eq!(pg_version, 15); + (blk.bimg_info & postgres_ffi::v15::bindings::BKPIMAGE_APPLY) != 0 + }; - if blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED != 0 { + let blk_img_is_compressed = + postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?; + + if blk_img_is_compressed { + debug!("compressed block image , pg_version = {}", pg_version); + } + + if blk_img_is_compressed { if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 { blk.hole_length = buf.get_u16_le(); } else { @@ -665,9 +687,7 @@ pub fn decode_wal_record( * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED * flag is set. */ - if (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0) - && blk.bimg_len == BLCKSZ - { + if !blk_img_is_compressed && blk.bimg_len == BLCKSZ { // TODO /* report_invalid_record(state, @@ -683,7 +703,7 @@ pub fn decode_wal_record( * IS_COMPRESSED flag is set. 
*/ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 - && blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0 + && !blk_img_is_compressed && blk.bimg_len != BLCKSZ { // TODO diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 9faabfebda..15a9408dc9 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -21,7 +21,6 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; -use remote_storage::path_with_suffix_extension; use serde::Serialize; use std::fs; use std::fs::OpenOptions; @@ -36,6 +35,7 @@ use std::sync::Mutex; use std::time::Duration; use std::time::Instant; use tracing::*; +use utils::crashsafe_dir::path_with_suffix_extension; use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; use crate::metrics::{ @@ -46,11 +46,12 @@ use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::NeonWalRecord; use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, transaction_id_set_status, }; -use postgres_ffi::v14::pg_constants; use postgres_ffi::BLCKSZ; /// @@ -82,6 +83,7 @@ pub trait WalRedoManager: Send + Sync { lsn: Lsn, base_img: Option, records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, ) -> Result; } @@ -144,6 +146,7 @@ impl WalRedoManager for PostgresRedoManager { lsn: Lsn, base_img: Option, records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, ) -> Result { if records.is_empty() { error!("invalid WAL redo request with no records"); @@ -166,6 +169,7 @@ impl WalRedoManager for PostgresRedoManager { img, &records[batch_start..i], self.conf.wal_redo_timeout, + pg_version, ) }; img = Some(result?); @@ -184,6 +188,7 @@ impl WalRedoManager for PostgresRedoManager { img, &records[batch_start..], self.conf.wal_redo_timeout, + pg_version, ) } } @@ -212,6 +217,7 @@ impl PostgresRedoManager { base_img: Option, records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, + pg_version: u32, ) -> Result { let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; @@ -222,7 +228,7 @@ impl PostgresRedoManager { // launch the WAL redo process on first use if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id)?; + let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?; *process_guard = Some(p); } let process = process_guard.as_mut().unwrap(); @@ -326,7 +332,7 @@ impl PostgresRedoManager { // sanity check that this is modifying the correct relation let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; assert!( - rel.forknum == pg_constants::VISIBILITYMAP_FORKNUM, + rel.forknum == VISIBILITYMAP_FORKNUM, "ClearVisibilityMapFlags record on unexpected rel {}", rel ); @@ -570,7 +576,11 @@ impl PostgresRedoProcess { // // Start postgres binary in special WAL redo mode. // - fn launch(conf: &PageServerConf, tenant_id: &TenantId) -> Result { + fn launch( + conf: &PageServerConf, + tenant_id: &TenantId, + pg_version: u32, + ) -> Result { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. 
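
`PostgresRedoProcess::launch` now takes the cluster's `pg_version` and resolves `initdb`, `postgres`, and the shared libraries through `conf.pg_bin_dir(pg_version)` and `conf.pg_lib_dir(pg_version)`. One plausible shape for such version-aware accessors, shown purely as an illustration (the `PgDistrib` type, its `root` field, and the `v{N}` directory naming are assumptions rather than the actual `PageServerConf` code):

    use std::path::PathBuf;

    struct PgDistrib {
        root: PathBuf, // root of the per-version Postgres installs
    }

    impl PgDistrib {
        fn pg_dir(&self, pg_version: u32) -> PathBuf {
            self.root.join(format!("v{pg_version}"))
        }
        fn pg_bin_dir(&self, pg_version: u32) -> PathBuf {
            self.pg_dir(pg_version).join("bin")
        }
        fn pg_lib_dir(&self, pg_version: u32) -> PathBuf {
            self.pg_dir(pg_version).join("lib")
        }
    }
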
@@ -588,12 +598,12 @@ impl PostgresRedoProcess { fs::remove_dir_all(&datadir)?; } info!("running initdb in {}", datadir.display()); - let initdb = Command::new(conf.pg_bin_dir().join("initdb")) + let initdb = Command::new(conf.pg_bin_dir(pg_version).join("initdb")) .args(&["-D", &datadir.to_string_lossy()]) .arg("-N") .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) .close_fds() .output() .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?; @@ -619,14 +629,14 @@ impl PostgresRedoProcess { } // Start postgres itself - let mut child = Command::new(conf.pg_bin_dir().join("postgres")) + let mut child = Command::new(conf.pg_bin_dir(pg_version).join("postgres")) .arg("--wal-redo") .stdin(Stdio::piped()) .stderr(Stdio::piped()) .stdout(Stdio::piped()) .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) + .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version)) .env("PGDATA", &datadir) // The redo process is not trusted, so it runs in seccomp mode // (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 3f74fd7234..6b6a042402 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -183,7 +183,7 @@ pageserver_send(NeonRequest * request) if (!connected) pageserver_connect(); - req_buff = zm_pack_request(request); + req_buff = nm_pack_request(request); /* * Send request. @@ -204,7 +204,7 @@ pageserver_send(NeonRequest * request) if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((NeonMessage *) request); + char *msg = nm_to_string((NeonMessage *) request); neon_log(PageStoreTrace, "sent request: %s", msg); pfree(msg); @@ -230,12 +230,12 @@ pageserver_receive(void) else if (resp_buff.len == -2) neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); } - resp = zm_unpack_response(&resp_buff); + resp = nm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); if (message_level_is_interesting(PageStoreTrace)) { - char *msg = zm_to_string((NeonMessage *) resp); + char *msg = nm_to_string((NeonMessage *) resp); neon_log(PageStoreTrace, "got response: %s", msg); pfree(msg); @@ -341,9 +341,9 @@ page_server_api api = { static bool check_neon_id(char **newval, void **extra, GucSource source) { - uint8 zid[16]; + uint8 id[16]; - return **newval == '\0' || HexDecodeString(zid, *newval, 16); + return **newval == '\0' || HexDecodeString(id, *newval, 16); } static char * diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 633c7b465c..e0cda11b63 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -128,9 +128,9 @@ typedef struct * message */ } NeonErrorResponse; -extern StringInfoData zm_pack_request(NeonRequest * msg); -extern NeonResponse * zm_unpack_response(StringInfo s); -extern char *zm_to_string(NeonMessage * msg); +extern StringInfoData nm_pack_request(NeonRequest * msg); +extern NeonResponse * nm_unpack_response(StringInfo s); +extern char *nm_to_string(NeonMessage * msg); /* * API diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 24adee019f..1187550f2a 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -160,7 +160,7 @@ page_server_request(void const *req) 
StringInfoData -zm_pack_request(NeonRequest * msg) +nm_pack_request(NeonRequest * msg) { StringInfoData s; @@ -235,7 +235,7 @@ zm_pack_request(NeonRequest * msg) } NeonResponse * -zm_unpack_response(StringInfo s) +nm_unpack_response(StringInfo s) { NeonMessageTag tag = pq_getmsgbyte(s); NeonResponse *resp = NULL; @@ -329,7 +329,7 @@ zm_unpack_response(StringInfo s) /* dump to json for debugging / error reporting purposes */ char * -zm_to_string(NeonMessage * msg) +nm_to_string(NeonMessage * msg) { StringInfoData s; @@ -632,7 +632,7 @@ neon_init(void) * It may cause problems with XLogFlush. So return pointer backward to the origin of the page. */ static XLogRecPtr -zm_adjust_lsn(XLogRecPtr lsn) +nm_adjust_lsn(XLogRecPtr lsn) { /* * If lsn points to the beging of first record on page or segment, then @@ -685,7 +685,7 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); - lsn = zm_adjust_lsn(lsn); + lsn = nm_adjust_lsn(lsn); /* * Is it possible that the last-written LSN is ahead of last flush @@ -959,7 +959,17 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (IS_LOCAL_REL(reln)) mdextend(reln, forkNum, blkno, buffer, skipFsync); #endif - + /* + * smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr. + * An smgr_write() call will come for the buffer later, after it has been initialized + * with the real page contents, and it is eventually evicted from the buffer cache. + * But we need a valid LSN to the relation metadata update now. + */ + if (lsn == InvalidXLogRecPtr) + { + lsn = GetXLogInsertRecPtr(); + SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forkNum, blkno); + } SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum); } @@ -1559,7 +1569,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ lsn = GetXLogInsertRecPtr(); - lsn = zm_adjust_lsn(lsn); + lsn = nm_adjust_lsn(lsn); /* * Flush it, too. We don't actually care about it here, but let's uphold diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5417f4f2b3..7d0449cd1a 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -11,13 +11,14 @@ bstr = "0.2.17" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" futures = "0.3.13" +git-version = "0.3.5" hashbrown = "0.12" hex = "0.4.3" hmac = "0.12.1" hyper = "0.14" itertools = "0.10.3" -once_cell = "1.13.0" md5 = "0.7.0" +once_cell = "1.13.0" parking_lot = "0.12" pin-project-lite = "0.2.7" rand = "0.8.3" @@ -35,14 +36,13 @@ tokio = { version = "1.17", features = ["macros"] } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-rustls = "0.23.0" url = "2.2.2" -git-version = "0.3.5" +uuid = { version = "0.8.2", features = ["v4", "serde"]} +x509-parser = "0.13.2" utils = { path = "../libs/utils" } metrics = { path = "../libs/metrics" } workspace_hack = { version = "0.1", path = "../workspace_hack" } -x509-parser = "0.13.2" - [dev-dependencies] rcgen = "0.8.14" rstest = "0.12" diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index a50d23e351..2df4f9d920 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,7 +1,7 @@ //! Client authentication mechanisms. 
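
The auth backends now receive a `ConsoleReqExtra` so every console request carries a session id and, when available, the client's `application_name`. A minimal sketch of constructing one (field names follow the `ConsoleReqExtra` definition in `backend.rs` below; threading it into `authenticate` is omitted):

    let extra = ConsoleReqExtra {
        // One id per client connection, forwarded as the `session_id` query param.
        session_id: uuid::Uuid::new_v4(),
        // Optional, forwarded as `application_name` when the client sets it.
        application_name: Some("psql"),
    };
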
pub mod backend; -pub use backend::{BackendType, DatabaseInfo}; +pub use backend::{BackendType, ConsoleReqExtra, DatabaseInfo}; mod credentials; pub use credentials::ClientCredentials; diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index de0719a196..7e93a32950 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -8,13 +8,12 @@ pub use console::{GetAuthInfoError, WakeComputeError}; use crate::{ auth::{self, AuthFlow, ClientCredentials}, - compute, config, mgmt, - stream::PqStream, + compute, http, mgmt, stream, url, waiters::{self, Waiter, Waiters}, }; - use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; +use std::borrow::Cow; use tokio::io::{AsyncRead, AsyncWrite}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); @@ -75,6 +74,14 @@ impl From for tokio_postgres::Config { } } +/// Extra query params we'd like to pass to the console. +pub struct ConsoleReqExtra<'a> { + /// A unique identifier for a connection. + pub session_id: uuid::Uuid, + /// Name of client application, if set. + pub application_name: Option<&'a str>, +} + /// This type serves two purposes: /// /// * When `T` is `()`, it's just a regular auth backend selector @@ -83,53 +90,83 @@ impl From for tokio_postgres::Config { /// * However, when we substitute `T` with [`ClientCredentials`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum BackendType { +#[derive(Debug)] +pub enum BackendType<'a, T> { /// Current Cloud API (V2). - Console(T), + Console(Cow<'a, http::Endpoint>, T), /// Local mock of Cloud API (V2). - Postgres(T), + Postgres(Cow<'a, url::ApiUrl>, T), /// Authentication via a web browser. - Link, + Link(Cow<'a, url::ApiUrl>), } -impl BackendType { +impl std::fmt::Display for BackendType<'_, ()> { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use BackendType::*; + match self { + Console(endpoint, _) => fmt + .debug_tuple("Console") + .field(&endpoint.url().as_str()) + .finish(), + Postgres(endpoint, _) => fmt + .debug_tuple("Postgres") + .field(&endpoint.as_str()) + .finish(), + Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + } + } +} + +impl BackendType<'_, T> { + /// Very similar to [`std::option::Option::as_ref`]. + /// This helps us pass structured config to async tasks. + pub fn as_ref(&self) -> BackendType<'_, &T> { + use BackendType::*; + match self { + Console(c, x) => Console(Cow::Borrowed(c), x), + Postgres(c, x) => Postgres(Cow::Borrowed(c), x), + Link(c) => Link(Cow::Borrowed(c)), + } + } +} + +impl<'a, T> BackendType<'a, T> { /// Very similar to [`std::option::Option::map`]. /// Maps [`BackendType`] to [`BackendType`] by applying /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType { + pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R> { use BackendType::*; match self { - Console(x) => Console(f(x)), - Postgres(x) => Postgres(f(x)), - Link => Link, + Console(c, x) => Console(c, f(x)), + Postgres(c, x) => Postgres(c, f(x)), + Link(c) => Link(c), } } } -impl BackendType> { +impl<'a, T, E> BackendType<'a, Result> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. 
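
A short sketch of how the two combinators compose for error handling, assuming a `BackendType<'_, &str>` obtained elsewhere in the proxy; `parse` here is a stand-in for real credential handling:

    fn parse(raw: &str) -> Result<u32, std::num::ParseIntError> {
        raw.parse()
    }

    fn example<'a>(
        backend: BackendType<'a, &str>,
    ) -> Result<BackendType<'a, u32>, std::num::ParseIntError> {
        // `map` transforms the payload of Console/Postgres and leaves Link alone;
        // `transpose` then hoists the Result out of the enum.
        backend.map(parse).transpose()
    }
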
- pub fn transpose(self) -> Result, E> { + pub fn transpose(self) -> Result, E> { use BackendType::*; match self { - Console(x) => x.map(Console), - Postgres(x) => x.map(Postgres), - Link => Ok(Link), + Console(c, x) => x.map(|x| Console(c, x)), + Postgres(c, x) => x.map(|x| Postgres(c, x)), + Link(c) => Ok(Link(c)), } } } -impl BackendType> { +impl BackendType<'_, ClientCredentials<'_>> { /// Authenticate the client via the requested backend, possibly using credentials. pub async fn authenticate( mut self, - urls: &config::AuthUrls, - client: &mut PqStream, + extra: &ConsoleReqExtra<'_>, + client: &mut stream::PqStream, ) -> super::Result { use BackendType::*; - if let Console(creds) | Postgres(creds) = &mut self { + if let Console(_, creds) | Postgres(_, creds) = &mut self { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the project name. // We now expect to see a very specific payload in the place of password. @@ -145,15 +182,13 @@ impl BackendType> { creds.project = Some(payload.project.into()); let mut config = match &self { - Console(creds) => { - console::Api::new(&urls.auth_endpoint, creds) + Console(endpoint, creds) => { + console::Api::new(endpoint, extra, creds) .wake_compute() .await? } - Postgres(creds) => { - postgres::Api::new(&urls.auth_endpoint, creds) - .wake_compute() - .await? + Postgres(endpoint, creds) => { + postgres::Api::new(endpoint, creds).wake_compute().await? } _ => unreachable!("see the patterns above"), }; @@ -169,49 +204,18 @@ impl BackendType> { } match self { - Console(creds) => { - console::Api::new(&urls.auth_endpoint, &creds) + Console(endpoint, creds) => { + console::Api::new(&endpoint, extra, &creds) .handle_user(client) .await } - Postgres(creds) => { - postgres::Api::new(&urls.auth_endpoint, &creds) + Postgres(endpoint, creds) => { + postgres::Api::new(&endpoint, &creds) .handle_user(client) .await } // NOTE: this auth backend doesn't use client credentials. - Link => link::handle_user(&urls.auth_link_uri, client).await, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_backend_type_map() { - let values = [ - BackendType::Console(0), - BackendType::Postgres(0), - BackendType::Link, - ]; - - for value in values { - assert_eq!(value.map(|x| x), value); - } - } - - #[test] - fn test_backend_type_transpose() { - let values = [ - BackendType::Console(Ok::<_, ()>(0)), - BackendType::Postgres(Ok(0)), - BackendType::Link, - ]; - - for value in values { - assert_eq!(value.map(Result::unwrap), value.transpose().unwrap()); + Link(url) => link::handle_user(&url, client).await, } } } diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index e239320e9b..a351b82c6a 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -1,12 +1,12 @@ //! Cloud API V2. +use super::ConsoleReqExtra; use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute::{self, ComputeConnCfg}, error::{io_error, UserFacingError}, - scram, + http, scram, stream::PqStream, - url::ApiUrl, }; use serde::{Deserialize, Serialize}; use std::future::Future; @@ -120,14 +120,23 @@ pub enum AuthInfo { #[must_use] pub(super) struct Api<'a> { - endpoint: &'a ApiUrl, + endpoint: &'a http::Endpoint, + extra: &'a ConsoleReqExtra<'a>, creds: &'a ClientCredentials<'a>, } impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. 
- pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self { - Self { endpoint, creds } + pub(super) fn new( + endpoint: &'a http::Endpoint, + extra: &'a ConsoleReqExtra<'a>, + creds: &'a ClientCredentials, + ) -> Self { + Self { + endpoint, + extra, + creds, + } } /// Authenticate the existing user or throw an error. @@ -139,16 +148,22 @@ impl<'a> Api<'a> { } async fn get_auth_info(&self) -> Result { - let mut url = self.endpoint.clone(); - url.path_segments_mut().push("proxy_get_role_secret"); - url.query_pairs_mut() - .append_pair("project", self.creds.project().expect("impossible")) - .append_pair("role", self.creds.user); + let req = self + .endpoint + .get("proxy_get_role_secret") + .header("X-Request-ID", uuid::Uuid::new_v4().to_string()) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ("role", Some(self.creds.user)), + ]) + .build()?; // TODO: use a proper logger - println!("cplane request: {url}"); + println!("cplane request: {}", req.url()); - let resp = reqwest::get(url.into_inner()).await?; + let resp = self.endpoint.execute(req).await?; if !resp.status().is_success() { return Err(TransportError::HttpStatus(resp.status()).into()); } @@ -162,15 +177,21 @@ impl<'a> Api<'a> { /// Wake up the compute node and return the corresponding connection info. pub(super) async fn wake_compute(&self) -> Result { - let mut url = self.endpoint.clone(); - url.path_segments_mut().push("proxy_wake_compute"); - url.query_pairs_mut() - .append_pair("project", self.creds.project().expect("impossible")); + let req = self + .endpoint + .get("proxy_wake_compute") + .header("X-Request-ID", uuid::Uuid::new_v4().to_string()) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ]) + .build()?; // TODO: use a proper logger - println!("cplane request: {url}"); + println!("cplane request: {}", req.url()); - let resp = reqwest::get(url.into_inner()).await?; + let resp = self.endpoint.execute(req).await?; if !resp.status().is_success() { return Err(TransportError::HttpStatus(resp.status()).into()); } @@ -238,3 +259,15 @@ fn parse_host_port(input: &str) -> Option<(&str, u16)> { let (host, port) = input.split_once(':')?; Some((host, port.parse().ok()?)) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_host_port() { + let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); + assert_eq!(host, "127.0.0.1"); + assert_eq!(port, 5432); + } +} diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index d740a4c5c4..eefa246eba 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -29,7 +29,7 @@ impl UserFacingError for LinkAuthError { } } -fn hello_message(redirect_uri: &str, session_id: &str) -> String { +fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String { format!( concat![ "Welcome to Neon!\n", @@ -46,11 +46,11 @@ pub fn new_psql_session_id() -> String { } pub async fn handle_user( - redirect_uri: &reqwest::Url, + link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { let psql_session_id = new_psql_session_id(); - let greeting = hello_message(redirect_uri.as_str(), &psql_session_id); + let greeting = hello_message(link_uri, &psql_session_id); let db_info = super::with_waiter(psql_session_id, |waiter| 
async { // Give user a URL to spawn a new database diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index ea71eba010..e43bcf8791 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -54,13 +54,10 @@ impl<'a> ClientCredentials<'a> { let dbname = get_param("database")?; // Project name might be passed via PG's command-line options. - let project_a = params.options_raw().and_then(|options| { - for opt in options { - if let Some(value) = opt.strip_prefix("project=") { - return Some(Cow::Borrowed(value)); - } - } - None + let project_a = params.options_raw().and_then(|mut options| { + options + .find_map(|opt| opt.strip_prefix("project=")) + .map(Cow::Borrowed) }); // Alternative project name is in fact a subdomain from SNI. diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index b7412b6f5b..eb9312e6bb 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -52,6 +52,16 @@ impl CancelMap { let session = Session::new(key, self); f(session).await } + + #[cfg(test)] + fn contains(&self, session: &Session) -> bool { + self.0.lock().contains_key(&session.key) + } + + #[cfg(test)] + fn is_empty(&self) -> bool { + self.0.lock().is_empty() + } } /// This should've been a [`std::future::Future`], but @@ -104,3 +114,39 @@ impl<'a> Session<'a> { self.key } } + +#[cfg(test)] +mod tests { + use super::*; + use once_cell::sync::Lazy; + + #[tokio::test] + async fn check_session_drop() -> anyhow::Result<()> { + static CANCEL_MAP: Lazy = Lazy::new(Default::default); + + let (tx, rx) = tokio::sync::oneshot::channel(); + let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move { + assert!(CANCEL_MAP.contains(&session)); + + tx.send(()).expect("failed to send"); + futures::future::pending::<()>().await; // sleep forever + + Ok(()) + })); + + // Wait until the task has been spawned. + rx.await.context("failed to hear from the task")?; + + // Drop the session's entry by cancelling the task. + task.abort(); + let error = task.await.expect_err("task should have failed"); + if !error.is_cancelled() { + anyhow::bail!(error); + } + + // Check that the session has been dropped. + assert!(CANCEL_MAP.is_empty()); + + Ok(()) + } +} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 8835d660d5..031fa84509 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,16 +1,10 @@ -use crate::{auth, url::ApiUrl}; +use crate::auth; use anyhow::{ensure, Context}; use std::sync::Arc; pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: auth::BackendType<()>, - pub auth_urls: AuthUrls, -} - -pub struct AuthUrls { - pub auth_endpoint: ApiUrl, - pub auth_link_uri: ApiUrl, + pub auth_backend: auth::BackendType<'static, ()>, } pub struct TlsConfig { diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 5a75718742..dbeb3dc784 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -1,27 +1,81 @@ -use anyhow::anyhow; -use hyper::{Body, Request, Response, StatusCode}; -use std::net::TcpListener; -use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; +pub mod server; -async fn status_handler(_: Request) -> Result, ApiError> { - json_response(StatusCode::OK, "") +use crate::url::ApiUrl; + +/// Thin convenience wrapper for an API provided by an http endpoint. +#[derive(Debug, Clone)] +pub struct Endpoint { + /// API's base URL. + endpoint: ApiUrl, + /// Connection manager with built-in pooling. 
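The options parsing above, reduced to the iterator pattern it relies on: find_map stops at the first option that strip_prefix recognizes. A sketch over a plain iterator of &str rather than the real StartupMessageParams:

fn project_from_options<'a>(mut options: impl Iterator<Item = &'a str>) -> Option<&'a str> {
    options.find_map(|opt| opt.strip_prefix("project="))
}

fn main() {
    let opts = ["-c", "search_path=public", "project=my-project"];
    assert_eq!(project_from_options(opts.into_iter()), Some("my-project"));
}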
+ client: reqwest::Client, } -fn make_router() -> RouterBuilder { - let router = endpoint::make_router(); - router.get("/v1/status", status_handler) -} - -pub async fn thread_main(http_listener: TcpListener) -> anyhow::Result<()> { - scopeguard::defer! { - println!("http has shut down"); +impl Endpoint { + /// Construct a new HTTP endpoint wrapper. + pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self { + Self { endpoint, client } } - let service = || RouterService::new(make_router().build()?); + pub fn url(&self) -> &ApiUrl { + &self.endpoint + } - hyper::Server::from_tcp(http_listener)? - .serve(service().map_err(|e| anyhow!(e))?) - .await?; + /// Return a [builder](reqwest::RequestBuilder) for a `GET` request, + /// appending a single `path` segment to the base endpoint URL. + pub fn get(&self, path: &str) -> reqwest::RequestBuilder { + let mut url = self.endpoint.clone(); + url.path_segments_mut().push(path); + self.client.get(url.into_inner()) + } - Ok(()) + /// Execute a [request](reqwest::Request). + pub async fn execute( + &self, + request: reqwest::Request, + ) -> Result { + self.client.execute(request).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn optional_query_params() -> anyhow::Result<()> { + let url = "http://example.com".parse()?; + let endpoint = Endpoint::new(url, reqwest::Client::new()); + + // Validate that this pattern makes sense. + let req = endpoint + .get("frobnicate") + .query(&[ + ("foo", Some("10")), // should be just `foo=10` + ("bar", None), // shouldn't be passed at all + ]) + .build()?; + + assert_eq!(req.url().as_str(), "http://example.com/frobnicate?foo=10"); + + Ok(()) + } + + #[test] + fn uuid_params() -> anyhow::Result<()> { + let url = "http://example.com".parse()?; + let endpoint = Endpoint::new(url, reqwest::Client::new()); + + let req = endpoint + .get("frobnicate") + .query(&[("session_id", uuid::Uuid::nil())]) + .build()?; + + assert_eq!( + req.url().as_str(), + "http://example.com/frobnicate?session_id=00000000-0000-0000-0000-000000000000" + ); + + Ok(()) + } } diff --git a/proxy/src/http/server.rs b/proxy/src/http/server.rs new file mode 100644 index 0000000000..5a75718742 --- /dev/null +++ b/proxy/src/http/server.rs @@ -0,0 +1,27 @@ +use anyhow::anyhow; +use hyper::{Body, Request, Response, StatusCode}; +use std::net::TcpListener; +use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; + +async fn status_handler(_: Request) -> Result, ApiError> { + json_response(StatusCode::OK, "") +} + +fn make_router() -> RouterBuilder { + let router = endpoint::make_router(); + router.get("/v1/status", status_handler) +} + +pub async fn thread_main(http_listener: TcpListener) -> anyhow::Result<()> { + scopeguard::defer! { + println!("http has shut down"); + } + + let service = || RouterService::new(make_router().build()?); + + hyper::Server::from_tcp(http_listener)? + .serve(service().map_err(|e| anyhow!(e))?) 
+ .await?; + + Ok(()) +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index efe45f6386..f2dc7425ba 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -23,7 +23,7 @@ use anyhow::{bail, Context}; use clap::{self, Arg}; use config::ProxyConfig; use futures::FutureExt; -use std::{future::Future, net::SocketAddr}; +use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use utils::project_git_version; @@ -36,23 +36,6 @@ async fn flatten_err( f.map(|r| r.context("join error").and_then(|x| x)).await } -/// A proper parser for auth backend parameter. -impl clap::ValueEnum for auth::BackendType<()> { - fn value_variants<'a>() -> &'a [Self] { - use auth::BackendType::*; - &[Console(()), Postgres(()), Link] - } - - fn to_possible_value<'a>(&self) -> Option> { - use auth::BackendType::*; - Some(clap::PossibleValue::new(match self { - Console(_) => "console", - Postgres(_) => "postgres", - Link => "link", - })) - } -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let arg_matches = clap::App::new("Neon proxy/router") @@ -69,7 +52,7 @@ async fn main() -> anyhow::Result<()> { Arg::new("auth-backend") .long("auth-backend") .takes_value(true) - .value_parser(clap::builder::EnumValueParser::>::new()) + .possible_values(["console", "postgres", "link"]) .default_value("link"), ) .arg( @@ -135,23 +118,30 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?; let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?; - let auth_backend = *arg_matches - .try_get_one::>("auth-backend")? - .unwrap(); - - let auth_urls = config::AuthUrls { - auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, - auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?, + let auth_backend = match arg_matches.value_of("auth-backend").unwrap() { + "console" => { + let url = arg_matches.value_of("auth-endpoint").unwrap().parse()?; + let endpoint = http::Endpoint::new(url, reqwest::Client::new()); + auth::BackendType::Console(Cow::Owned(endpoint), ()) + } + "postgres" => { + let url = arg_matches.value_of("auth-endpoint").unwrap().parse()?; + auth::BackendType::Postgres(Cow::Owned(url), ()) + } + "link" => { + let url = arg_matches.value_of("uri").unwrap().parse()?; + auth::BackendType::Link(Cow::Owned(url)) + } + other => bail!("unsupported auth backend: {other}"), }; let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, - auth_urls, })); println!("Version: {GIT_VERSION}"); - println!("Authentication backend: {:?}", config.auth_backend); + println!("Authentication backend: {}", config.auth_backend); // Check that we can bind to address before further initialization println!("Starting http on {}", http_address); @@ -164,7 +154,7 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = TcpListener::bind(proxy_address).await?; let tasks = [ - tokio::spawn(http::thread_main(http_listener)), + tokio::spawn(http::server::thread_main(http_listener)), tokio::spawn(proxy::thread_main(config, proxy_listener)), tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)), ] diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs index 8a05ff9c82..cbd48d91e9 100644 --- a/proxy/src/parse.rs +++ b/proxy/src/parse.rs @@ -1,6 +1,5 @@ //! Small parsing helpers. 
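The Box::leak trick used when building the ProxyConfig above, in miniature: leaking a fully built, immutable config yields a &'static reference that every spawned task can share without cloning (the Config type below is illustrative):

struct Config {
    greeting: String,
}

fn main() {
    // Build once at startup, then leak to obtain a 'static reference.
    let config: &'static Config = Box::leak(Box::new(Config {
        greeting: "hello".to_owned(),
    }));

    // 'static data can be moved into any number of threads or tasks.
    let handle = std::thread::spawn(move || println!("{}", config.greeting));
    handle.join().unwrap();
}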
-use std::convert::TryInto; use std::ffi::CStr; pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { @@ -10,9 +9,36 @@ pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { Some((unsafe { CStr::from_bytes_with_nul_unchecked(cstr) }, other)) } +/// See . pub fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { (bytes.len() >= N).then(|| { let (head, tail) = bytes.split_at(N); (head.try_into().unwrap(), tail) }) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_split_cstr() { + assert!(split_cstr(b"").is_none()); + assert!(split_cstr(b"foo").is_none()); + + let (cstr, rest) = split_cstr(b"\0").expect("uh-oh"); + assert_eq!(cstr.to_bytes(), b""); + assert_eq!(rest, b""); + + let (cstr, rest) = split_cstr(b"foo\0bar").expect("uh-oh"); + assert_eq!(cstr.to_bytes(), b"foo"); + assert_eq!(rest, b"bar"); + } + + #[test] + fn test_split_at_const() { + assert!(split_at_const::<0>(b"").is_some()); + assert!(split_at_const::<1>(b"").is_none()); + assert!(matches!(split_at_const::<1>(b"ok"), Some((b"o", b"k")))); + } +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 72cb822910..efb1b6f358 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,6 +1,6 @@ use crate::auth; use crate::cancellation::{self, CancelMap}; -use crate::config::{AuthUrls, ProxyConfig, TlsConfig}; +use crate::config::{ProxyConfig, TlsConfig}; use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; use futures::TryFutureExt; @@ -99,6 +99,7 @@ async fn handle_client( let common_name = tls.and_then(|tls| tls.common_name.as_deref()); let result = config .auth_backend + .as_ref() .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name)) .transpose(); @@ -107,7 +108,7 @@ async fn handle_client( let client = Client::new(stream, creds, ¶ms); cancel_map - .with_session(|session| client.connect_to_db(&config.auth_urls, session)) + .with_session(|session| client.connect_to_db(session)) .await } @@ -179,7 +180,7 @@ struct Client<'a, S> { /// The underlying libpq protocol stream. stream: PqStream, /// Client credentials that we care about. - creds: auth::BackendType>, + creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, /// KV-dictionary with PostgreSQL connection params. params: &'a StartupMessageParams, } @@ -188,7 +189,7 @@ impl<'a, S> Client<'a, S> { /// Construct a new connection context. fn new( stream: PqStream, - creds: auth::BackendType>, + creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, params: &'a StartupMessageParams, ) -> Self { Self { @@ -201,19 +202,22 @@ impl<'a, S> Client<'a, S> { impl Client<'_, S> { /// Let the client authenticate and connect to the designated compute node. - async fn connect_to_db( - self, - urls: &AuthUrls, - session: cancellation::Session<'_>, - ) -> anyhow::Result<()> { + async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> { let Self { mut stream, creds, params, } = self; + let extra = auth::ConsoleReqExtra { + // Currently it's OK to generate a new UUID **here**, but + // it might be better to move this to `cancellation::Session`. + session_id: uuid::Uuid::new_v4(), + application_name: params.get("application_name"), + }; + // Authenticate and connect to a compute node. 
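A typical use of split_at_const above when parsing protocol frames: peel off a fixed-size header and keep the remainder. The 4-byte length prefix below is a made-up frame format, not a real libpq message:

fn split_at_const<const N: usize>(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> {
    (bytes.len() >= N).then(|| {
        let (head, tail) = bytes.split_at(N);
        (head.try_into().unwrap(), tail)
    })
}

fn main() {
    // A made-up frame: 4-byte big-endian length prefix followed by a payload.
    let frame = [0u8, 0, 0, 3, b'f', b'o', b'o'];
    let (len, payload) = split_at_const::<4>(&frame).expect("frame too short");
    assert_eq!(u32::from_be_bytes(*len), 3);
    assert_eq!(payload, b"foo");
}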
- let auth = creds.authenticate(urls, &mut stream).await; + let auth = creds.authenticate(&extra, &mut stream).await; let node = async { auth }.or_else(|e| stream.throw_error(e)).await?; let reported_auth_ok = node.reported_auth_ok; diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 76d6ad0e66..92c64bb8ad 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -1,8 +1,8 @@ use anyhow::bail; -use url::form_urlencoded::Serializer; /// A [url](url::Url) type with additional guarantees. -#[derive(Debug, Clone)] +#[repr(transparent)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct ApiUrl(url::Url); impl ApiUrl { @@ -11,11 +11,6 @@ impl ApiUrl { self.0 } - /// See [`url::Url::query_pairs_mut`]. - pub fn query_pairs_mut(&mut self) -> Serializer<'_, url::UrlQuery<'_>> { - self.0.query_pairs_mut() - } - /// See [`url::Url::path_segments_mut`]. pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut { // We've already verified that it works during construction. @@ -72,10 +67,7 @@ mod tests { let mut b = url.parse::().expect("unexpected parsing failure"); a.path_segments_mut().unwrap().push("method"); - a.query_pairs_mut().append_pair("key", "value"); - b.path_segments_mut().push("method"); - b.query_pairs_mut().append_pair("key", "value"); assert_eq!(a, b.into_inner()); } diff --git a/pytest.ini b/pytest.ini index bfa07e520b..7197b078c6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -5,6 +5,7 @@ filterwarnings = ignore:record_property is incompatible with junit_family:pytest.PytestWarning addopts = -m 'not remote_cluster' + --ignore=test_runner/performance markers = remote_cluster testpaths = diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 1a27e92fec..5aa0f8d4e5 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,11 +1,10 @@ [toolchain] # We try to stick to a toolchain version that is widely available on popular distributions, so that most people # can use the toolchain that comes with their operating system. But if there's a feature we miss badly from a later -# version, we can consider updating. As of this writing, 1.60 is available on Debian 'experimental' but not yet on -# 'testing' or even 'unstable', which is a bit more cutting-edge than we'd like. Hopefully the 1.60 packages reach -# 'testing' soon (and similarly for the other distributions). -# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package. -channel = "1.60" # do update GitHub CI cache values for rust builds, when changing this value +# version, we can consider updating. +# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package, +# we use "unstable" version number as the highest version used in the project by default. +channel = "1.61" # do update GitHub CI cache values for rust builds, when changing this value profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index cae095c3c2..cb1cecade9 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -30,7 +30,10 @@ git-version = "0.3.5" async-trait = "0.1" once_cell = "1.13.0" toml_edit = { version = "0.13", features = ["easy"] } +thiserror = "1" +parking_lot = "0.12.1" +safekeeper_api = { path = "../libs/safekeeper_api" } postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } utils = { path = "../libs/utils" } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index d518ac01cc..7726f25a2d 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -24,9 +24,9 @@ use safekeeper::defaults::{ }; use safekeeper::http; use safekeeper::remove_wal; -use safekeeper::timeline::GlobalTimelines; use safekeeper::wal_backup; use safekeeper::wal_service; +use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; use utils::auth::JwtAuth; use utils::{ @@ -298,7 +298,9 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); - GlobalTimelines::init(wal_backup_launcher_tx); + + // Load all timelines from disk to memory. + GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?; let conf_ = conf.clone(); threads.push( diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index f276fad613..6a2456ecda 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -10,6 +10,7 @@ use etcd_broker::LeaseKeeper; use std::collections::hash_map::Entry; use std::collections::HashMap; +use std::collections::HashSet; use std::time::Duration; use tokio::spawn; use tokio::task::JoinHandle; @@ -17,7 +18,8 @@ use tokio::{runtime, time::sleep}; use tracing::*; use url::Url; -use crate::{timeline::GlobalTimelines, SafeKeeperConf}; +use crate::GlobalTimelines; +use crate::SafeKeeperConf; use etcd_broker::{ subscription_key::{OperationKind, SkOperationKind, SubscriptionKey}, Client, PutOptions, @@ -45,12 +47,12 @@ pub fn thread_main(conf: SafeKeeperConf) { /// Key to per timeline per safekeeper data. fn timeline_safekeeper_path( broker_etcd_prefix: String, - zttid: TenantTimelineId, + ttid: TenantTimelineId, sk_id: NodeId, ) -> String { format!( "{}/{sk_id}", - SubscriptionKey::sk_timeline_info(broker_etcd_prefix, zttid).watch_key() + SubscriptionKey::sk_timeline_info(broker_etcd_prefix, ttid).watch_key() ) } @@ -162,7 +164,7 @@ pub fn get_candiate_name(system_id: NodeId) -> String { } async fn push_sk_info( - zttid: TenantTimelineId, + ttid: TenantTimelineId, mut client: Client, key: String, sk_info: SkTimelineInfo, @@ -190,7 +192,7 @@ async fn push_sk_info( .await .context("failed to receive LeaseKeepAliveResponse")?; - Ok((zttid, lease)) + Ok((ttid, lease)) } struct Lease { @@ -210,11 +212,15 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // is under plain mutex. That's ok, all this code is not performance // sensitive and there is no risk of deadlock as we don't await while // lock is held. 
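The lease bookkeeping in push_loop just below boils down to "keep one lease per active timeline, drop the rest"; here is that retain step in miniature, with plain integers standing in for TenantTimelineId and Lease:

use std::collections::{HashMap, HashSet};

fn main() {
    let active: HashSet<u32> = [1, 2].into_iter().collect();
    let mut leases: HashMap<u32, &str> = HashMap::from([(1, "lease-1"), (3, "lease-3")]);

    // Keep only leases whose timeline is still active; timeline 3 is gone.
    leases.retain(|ttid, _| active.contains(ttid));
    assert_eq!(leases.len(), 1);
    assert!(leases.contains_key(&1));
}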
- let active_tlis = GlobalTimelines::get_active_timelines(); + let mut active_tlis = GlobalTimelines::get_all(); + active_tlis.retain(|tli| tli.is_active()); + + let active_tlis_set: HashSet = + active_tlis.iter().map(|tli| tli.ttid).collect(); // // Get and maintain (if not yet) per timeline lease to automatically delete obsolete data. - for zttid in active_tlis.iter() { - if let Entry::Vacant(v) = leases.entry(*zttid) { + for tli in &active_tlis { + if let Entry::Vacant(v) = leases.entry(tli.ttid) { let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?; v.insert(Lease { @@ -224,30 +230,26 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { }); } } - leases.retain(|zttid, _| active_tlis.contains(zttid)); + leases.retain(|ttid, _| active_tlis_set.contains(ttid)); // Push data concurrently to not suffer from latency, with many timelines it can be slow. let handles = active_tlis .iter() - .filter_map(|zttid| GlobalTimelines::get_loaded(*zttid)) .map(|tli| { let sk_info = tli.get_public_info(&conf); - let key = timeline_safekeeper_path( - conf.broker_etcd_prefix.clone(), - tli.zttid, - conf.my_id, - ); - let lease = leases.remove(&tli.zttid).unwrap(); - tokio::spawn(push_sk_info(tli.zttid, client.clone(), key, sk_info, lease)) + let key = + timeline_safekeeper_path(conf.broker_etcd_prefix.clone(), tli.ttid, conf.my_id); + let lease = leases.remove(&tli.ttid).unwrap(); + tokio::spawn(push_sk_info(tli.ttid, client.clone(), key, sk_info, lease)) }) .collect::>(); for h in handles { - let (zttid, lease) = h.await??; + let (ttid, lease) = h.await??; // It is ugly to pull leases from hash and then put it back, but // otherwise we have to resort to long living per tli tasks (which // would generate a lot of errors when etcd is down) as task wants to // have 'static objects, we can't borrow to it. - leases.insert(zttid, lease); + leases.insert(ttid, lease); } sleep(push_interval).await; @@ -279,7 +281,7 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { match subscription.value_updates.recv().await { Some(new_info) => { // note: there are blocking operations below, but it's considered fine for now - if let Ok(tli) = GlobalTimelines::get(&conf, new_info.key.id, false) { + if let Ok(tli) = GlobalTimelines::get(new_info.key.id) { tli.record_safekeeper_info(&new_info.value, new_info.key.node_id) .await? 
} diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index ff23f0360f..6be3f9abb2 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -2,18 +2,15 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use once_cell::sync::Lazy; use std::fs::{self, File, OpenOptions}; use std::io::{Read, Write}; use std::ops::Deref; use std::path::{Path, PathBuf}; -use tracing::*; - use crate::control_file_upgrade::upgrade_control_file; +use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::safekeeper::{SafeKeeperState, SK_FORMAT_VERSION, SK_MAGIC}; -use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; @@ -26,16 +23,6 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control"; const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); -static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_persist_control_file_seconds", - "Seconds to persist and sync control file, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") -}); - /// Storage should keep actual state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. pub trait Storage: Deref { @@ -48,51 +35,43 @@ pub struct FileStorage { // save timeline dir to avoid reconstructing it every time timeline_dir: PathBuf, conf: SafeKeeperConf, - persist_control_file_seconds: Histogram, /// Last state persisted to disk. state: SafeKeeperState, } impl FileStorage { - pub fn restore_new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { - let timeline_dir = conf.timeline_dir(zttid); - let tenant_id = zttid.tenant_id.to_string(); - let timeline_id = zttid.timeline_id.to_string(); + /// Initialize storage by loading state from disk. + pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { + let timeline_dir = conf.timeline_dir(ttid); - let state = Self::load_control_file_conf(conf, zttid)?; + let state = Self::load_control_file_conf(conf, ttid)?; Ok(FileStorage { timeline_dir, conf: conf.clone(), - persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS - .with_label_values(&[&tenant_id, &timeline_id]), state, }) } + /// Create file storage for a new timeline, but don't persist it yet. pub fn create_new( - zttid: &TenantTimelineId, + ttid: &TenantTimelineId, conf: &SafeKeeperConf, state: SafeKeeperState, ) -> Result { - let timeline_dir = conf.timeline_dir(zttid); - let tenant_id = zttid.tenant_id.to_string(); - let timeline_id = zttid.timeline_id.to_string(); + let timeline_dir = conf.timeline_dir(ttid); - let mut store = FileStorage { + let store = FileStorage { timeline_dir, conf: conf.clone(), - persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS - .with_label_values(&[&tenant_id, &timeline_id]), - state: state.clone(), + state, }; - store.persist(&state)?; Ok(store) } - // Check the magic/version in the on-disk data and deserialize it, if possible. + /// Check the magic/version in the on-disk data and deserialize it, if possible. 
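The persist() path below follows the classic crash-safe sequence: write a .partial file, fsync it, rename it over the old file, then fsync the directory. A self-contained sketch of that sequence with illustrative paths (Unix semantics assumed):

use std::fs::{self, File};
use std::io::Write;
use std::path::Path;

fn persist_atomically(dir: &Path, name: &str, bytes: &[u8]) -> std::io::Result<()> {
    let partial = dir.join(format!("{name}.partial"));
    let target = dir.join(name);

    // 1. Write the new contents to a scratch file and fsync it.
    let mut f = File::create(&partial)?;
    f.write_all(bytes)?;
    f.sync_all()?;

    // 2. Atomically replace the old file (rename is atomic on POSIX).
    fs::rename(&partial, &target)?;

    // 3. fsync the directory so the rename itself survives a crash.
    File::open(dir)?.sync_all()?;
    Ok(())
}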
fn deser_sk_state(buf: &mut &[u8]) -> Result { // Read the version independent part let magic = buf.read_u32::()?; @@ -112,23 +91,17 @@ impl FileStorage { upgrade_control_file(buf, version) } - // Load control file for given zttid at path specified by conf. + /// Load control file for given ttid at path specified by conf. pub fn load_control_file_conf( conf: &SafeKeeperConf, - zttid: &TenantTimelineId, + ttid: &TenantTimelineId, ) -> Result { - let path = conf.timeline_dir(zttid).join(CONTROL_FILE_NAME); + let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME); Self::load_control_file(path) } /// Read in the control file. - /// If create=false and file doesn't exist, bails out. pub fn load_control_file>(control_file_path: P) -> Result { - info!( - "loading control file {}", - control_file_path.as_ref().display(), - ); - let mut control_file = OpenOptions::new() .read(true) .write(true) @@ -179,10 +152,10 @@ impl Deref for FileStorage { } impl Storage for FileStorage { - // persists state durably to underlying storage - // for description see https://lwn.net/Articles/457667/ + /// persists state durably to underlying storage + /// for description see https://lwn.net/Articles/457667/ fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { - let _timer = &self.persist_control_file_seconds.start_timer(); + let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer(); // write data to safekeeper.control.partial let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL); @@ -264,57 +237,57 @@ mod test { fn load_from_control_file( conf: &SafeKeeperConf, - zttid: &TenantTimelineId, + ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); + fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); Ok(( - FileStorage::restore_new(zttid, conf)?, - FileStorage::load_control_file_conf(conf, zttid)?, + FileStorage::restore_new(ttid, conf)?, + FileStorage::load_control_file_conf(conf, ttid)?, )) } fn create( conf: &SafeKeeperConf, - zttid: &TenantTimelineId, + ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); + fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); - let storage = FileStorage::create_new(zttid, conf, state.clone())?; + let storage = FileStorage::create_new(ttid, conf, state.clone())?; Ok((storage, state)) } #[test] fn test_read_write_safekeeper_state() { let conf = stub_conf(); - let zttid = TenantTimelineId::generate(); + let ttid = TenantTimelineId::generate(); { - let (mut storage, mut state) = create(&conf, &zttid).expect("failed to create state"); + let (mut storage, mut state) = create(&conf, &ttid).expect("failed to create state"); // change something state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } - let (_, state) = load_from_control_file(&conf, &zttid).expect("failed to read state"); + let (_, state) = load_from_control_file(&conf, &ttid).expect("failed to read state"); assert_eq!(state.commit_lsn, Lsn(42)); } #[test] fn test_safekeeper_state_checksum_mismatch() { let conf = stub_conf(); - let zttid = TenantTimelineId::generate(); + let ttid = TenantTimelineId::generate(); { - let (mut storage, mut state) = create(&conf, &zttid).expect("failed to read state"); + let (mut storage, mut state) = create(&conf, 
&ttid).expect("failed to read state"); // change something state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } - let control_path = conf.timeline_dir(&zttid).join(CONTROL_FILE_NAME); + let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME); let mut data = fs::read(&control_path).unwrap(); data[0] += 1; // change the first byte of the file to fail checksum validation fs::write(&control_path, &data).expect("failed to write control file"); - match load_from_control_file(&conf, &zttid) { + match load_from_control_file(&conf, &ttid) { Err(err) => assert!(err .to_string() .contains("safekeeper control file checksum mismatch")), diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 87204d6b49..1ce9186085 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -167,7 +167,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), }); - // migrate to hexing some zids + // migrate to hexing some ids } else if version == 2 { info!("reading safekeeper control file version {}", version); let oldstate = SafeKeeperStateV2::des(&buf[..buf.len()])?; @@ -248,6 +248,18 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result oldstate.timeline_start_lsn = Lsn(1); oldstate.local_start_lsn = Lsn(1); + return Ok(oldstate); + } else if version == 6 { + info!("reading safekeeper control file version {}", version); + let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?; + if oldstate.server.pg_version != 0 { + return Ok(oldstate); + } + + // set pg_version to the default v14 + info!("setting pg_version to 140005"); + oldstate.server.pg_version = 140005; + return Ok(oldstate); } bail!("unsupported safekeeper control file version {}", version) diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index ad2c0ec8bf..ca887399e1 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -3,15 +3,15 @@ use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; use crate::receive_wal::ReceiveWalConn; -use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; + use crate::send_wal::ReplicationConn; -use crate::timeline::{Timeline, TimelineTools}; -use crate::SafeKeeperConf; + +use crate::{GlobalTimelines, SafeKeeperConf}; use anyhow::{bail, Context, Result}; use postgres_ffi::PG_TLI; use regex::Regex; -use std::sync::Arc; + use tracing::info; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, @@ -27,7 +27,7 @@ pub struct SafekeeperPostgresHandler { pub appname: Option, pub tenant_id: Option, pub timeline_id: Option, - pub timeline: Option>, + pub ttid: TenantTimelineId, } /// Parsed Postgres command. @@ -101,30 +101,21 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { query_string, self.timeline_id ); - let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. 
}) - || matches!(cmd, SafekeeperPostgresCommand::IdentifySystem)); - - let tenant_id = self.tenant_id.context("tenant_id is required")?; - let timeline_id = self.timeline_id.context("timeline_id is required")?; - if self.timeline.is_none() { - self.timeline.set( - &self.conf, - TenantTimelineId::new(tenant_id, timeline_id), - create, - )?; - } + let tenant_id = self.tenant_id.context("tenantid is required")?; + let timeline_id = self.timeline_id.context("timelineid is required")?; + self.ttid = TenantTimelineId::new(tenant_id, timeline_id); match cmd { - SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb) - .run(self) - .context("failed to run ReceiveWalConn"), - SafekeeperPostgresCommand::StartReplication { start_lsn } => ReplicationConn::new(pgb) - .run(self, pgb, start_lsn) - .context("failed to run ReplicationConn"), + SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self), + SafekeeperPostgresCommand::StartReplication { start_lsn } => { + ReplicationConn::new(pgb).run(self, pgb, start_lsn) + } SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), } - .context(format!("timeline {timeline_id}"))?; + .context(format!( + "Failed to process query for timeline {timeline_id}" + ))?; Ok(()) } @@ -137,42 +128,26 @@ impl SafekeeperPostgresHandler { appname: None, tenant_id: None, timeline_id: None, - timeline: None, + ttid: TenantTimelineId::empty(), } } - /// Shortcut for calling `process_msg` in the timeline. - pub fn process_safekeeper_msg( - &self, - msg: &ProposerAcceptorMessage, - ) -> Result> { - self.timeline - .get() - .process_msg(msg) - .context("failed to process ProposerAcceptorMessage") - } - /// /// Handle IDENTIFY_SYSTEM replication command /// fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> { + let tli = GlobalTimelines::get(self.ttid)?; + let lsn = if self.is_walproposer_recovery() { // walproposer should get all local WAL until flush_lsn - self.timeline.get().get_end_of_wal() + tli.get_flush_lsn() } else { // other clients shouldn't get any uncommitted WAL - self.timeline.get().get_state().0.commit_lsn + tli.get_state().0.commit_lsn } .to_string(); - let sysid = self - .timeline - .get() - .get_state() - .1 - .server - .system_id - .to_string(); + let sysid = tli.get_state().1.server.system_id.to_string(); let lsn_bytes = lsn.as_bytes(); let tli = PG_TLI.to_string(); let tli_bytes = tli.as_bytes(); diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 4c0be17ecd..1831470007 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,3 +1,4 @@ -pub mod models; pub mod routes; pub use routes::make_router; + +pub use safekeeper_api::models; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 14c9414c09..43c0a17f84 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,15 +1,20 @@ +use anyhow::anyhow; use hyper::{Body, Request, Response, StatusCode, Uri}; +use anyhow::Context; use once_cell::sync::Lazy; use serde::Serialize; use serde::Serializer; use std::collections::{HashMap, HashSet}; use std::fmt::Display; use std::sync::Arc; +use tokio::task::JoinError; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; -use crate::timeline::{GlobalTimelines, TimelineDeleteForceResult}; + +use crate::timelines_global_map::TimelineDeleteForceResult; +use crate::GlobalTimelines; use crate::SafeKeeperConf; 
use etcd_broker::subscription_value::SkTimelineInfo; use utils::{ @@ -90,15 +95,20 @@ struct TimelineStatus { /// Report info about timeline. async fn timeline_status_handler(request: Request) -> Result, ApiError> { - let zttid = TenantTimelineId::new( + let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - check_permission(&request, Some(zttid.tenant_id))?; + check_permission(&request, Some(ttid.tenant_id))?; - let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?; + let tli = GlobalTimelines::get(ttid) + // FIXME: Currently, the only errors from `GlobalTimelines::get` will be client errors + // because the provided timeline isn't there. However, the method can in theory change and + // fail from internal errors later. Remove this comment once it the method returns + // something other than `anyhow::Result`. + .map_err(ApiError::InternalServerError)?; let (inmem, state) = tli.get_state(); - let flush_lsn = tli.get_end_of_wal(); + let flush_lsn = tli.get_flush_lsn(); let acc_state = AcceptorStateStatus { term: state.acceptor_state.term, @@ -108,8 +118,8 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { let request_data: TimelineCreateRequest = json_request(&mut request).await?; - let zttid = TenantTimelineId { + let ttid = TenantTimelineId { tenant_id: parse_request_param(&request, "tenant_id")?, timeline_id: request_data.timeline_id, }; - check_permission(&request, Some(zttid.tenant_id))?; - GlobalTimelines::create(get_conf(&request), zttid, request_data.peer_ids) - .map_err(ApiError::from_err)?; + check_permission(&request, Some(ttid.tenant_id))?; - json_response(StatusCode::CREATED, ()) + Err(ApiError::BadRequest(anyhow!("not implemented"))) } /// Deactivates the timeline and removes its data directory. -/// -/// It does not try to stop any processing of the timeline; there is no such code at the time of writing. -/// However, it tries to check whether the timeline was active and report it to caller just in case. -/// Note that this information is inaccurate: -/// 1. There is a race condition between checking the timeline for activity and actual directory deletion. -/// 2. At the time of writing Safekeeper rarely marks a timeline inactive. E.g. disconnecting the compute node does nothing. async fn timeline_delete_force_handler( mut request: Request, ) -> Result, ApiError> { - let zttid = TenantTimelineId::new( + let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - check_permission(&request, Some(zttid.tenant_id))?; + check_permission(&request, Some(ttid.tenant_id))?; ensure_no_body(&mut request).await?; - json_response( - StatusCode::OK, - GlobalTimelines::delete_force(get_conf(&request), &zttid) - .await - .map_err(ApiError::from_err)?, - ) + let resp = tokio::task::spawn_blocking(move || { + // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better + // error handling here when we're able to. + GlobalTimelines::delete_force(&ttid).map_err(ApiError::InternalServerError) + }) + .await + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; + json_response(StatusCode::OK, resp) } /// Deactivates all timelines for the tenant and removes its data directory. 
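The spawn_blocking wrapping used by the delete handlers here, reduced to its essentials: the first ? handles the JoinError from the blocking task, the second one the handler's own error. A sketch, not the real handler (which maps both into ApiError):

async fn delete_blocking() -> anyhow::Result<u64> {
    let removed = tokio::task::spawn_blocking(|| -> anyhow::Result<u64> {
        // Blocking filesystem work (directory removal, etc.) goes here.
        Ok(3)
    })
    .await??; // JoinError first, then the closure's own Result
    Ok(removed)
}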
@@ -168,29 +172,44 @@ async fn tenant_delete_force_handler( let tenant_id = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; ensure_no_body(&mut request).await?; + let delete_info = tokio::task::spawn_blocking(move || { + // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons; + // Using an `InternalServerError` should be fixed when the types support it + GlobalTimelines::delete_force_all_for_tenant(&tenant_id) + .map_err(ApiError::InternalServerError) + }) + .await + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; json_response( StatusCode::OK, - GlobalTimelines::delete_force_all_for_tenant(get_conf(&request), &tenant_id) - .await - .map_err(ApiError::from_err)? + delete_info .iter() - .map(|(zttid, resp)| (format!("{}", zttid.timeline_id), *resp)) + .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp)) .collect::>(), ) } /// Used only in tests to hand craft required data. async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { - let zttid = TenantTimelineId::new( + let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - check_permission(&request, Some(zttid.tenant_id))?; + check_permission(&request, Some(ttid.tenant_id))?; let safekeeper_info: SkTimelineInfo = json_request(&mut request).await?; - let tli = GlobalTimelines::get(get_conf(&request), zttid, false).map_err(ApiError::from_err)?; + let tli = GlobalTimelines::get(ttid) + // `GlobalTimelines::get` returns an error when it can't find the timeline. + .with_context(|| { + format!( + "Couldn't get timeline {} for tenant {}", + ttid.timeline_id, ttid.tenant_id + ) + }) + .map_err(ApiError::NotFound)?; tli.record_safekeeper_info(&safekeeper_info, NodeId(1)) - .await?; + .await + .map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, ()) } diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 00fc43521b..3de410d117 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -6,19 +6,23 @@ //! modifications in tests. //! +use std::sync::Arc; + use anyhow::Result; use bytes::Bytes; use serde::{Deserialize, Serialize}; use tracing::*; +use utils::id::TenantTimelineId; use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{AcceptorProposerMessage, AppendResponse}; +use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; use crate::safekeeper::{ - AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, ProposerGreeting, + AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, }; use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; -use crate::timeline::TimelineTools; -use postgres_ffi::v14::xlog_utils; +use crate::timeline::Timeline; +use crate::GlobalTimelines; +use postgres_ffi::encode_logical_message; use postgres_ffi::WAL_SEGMENT_SIZE; use utils::{ lsn::Lsn, @@ -43,6 +47,7 @@ pub struct AppendLogicalMessage { epoch_start_lsn: Lsn, begin_lsn: Lsn, truncate_lsn: Lsn, + pg_version: u32, } #[derive(Serialize, Deserialize)] @@ -57,23 +62,23 @@ struct AppendResult { /// content, and then append it with specified term and lsn. This /// function is used to test safekeepers in different scenarios. 
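The error shaping in record_safekeeper_info above, in isolation: attach context with anyhow, then pick the HTTP-facing error variant explicitly. ApiError here is a stand-in enum, not the real utils type:

use anyhow::Context;

#[derive(Debug)]
enum ApiError {
    NotFound(anyhow::Error),
}

fn lookup(found: bool) -> anyhow::Result<u32> {
    anyhow::ensure!(found, "no such timeline");
    Ok(7)
}

fn handler(found: bool) -> Result<u32, ApiError> {
    lookup(found)
        .with_context(|| "couldn't get timeline for tenant".to_string())
        .map_err(ApiError::NotFound)
}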
pub fn handle_json_ctrl( - spg: &mut SafekeeperPostgresHandler, + spg: &SafekeeperPostgresHandler, pgb: &mut PostgresBackend, append_request: &AppendLogicalMessage, ) -> Result<()> { info!("JSON_CTRL request: {:?}", append_request); // need to init safekeeper state before AppendRequest - prepare_safekeeper(spg)?; + let tli = prepare_safekeeper(spg.ttid, append_request.pg_version)?; // if send_proposer_elected is true, we need to update local history if append_request.send_proposer_elected { - send_proposer_elected(spg, append_request.term, append_request.epoch_start_lsn)?; + send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn)?; } - let inserted_wal = append_logical_message(spg, append_request)?; + let inserted_wal = append_logical_message(&tli, append_request)?; let response = AppendResult { - state: spg.timeline.get().get_state().1, + state: tli.get_state().1, inserted_wal, }; let response_data = serde_json::to_vec(&response)?; @@ -91,28 +96,20 @@ pub fn handle_json_ctrl( /// Prepare safekeeper to process append requests without crashes, /// by sending ProposerGreeting with default server.wal_seg_size. -fn prepare_safekeeper(spg: &mut SafekeeperPostgresHandler) -> Result<()> { - let greeting_request = ProposerAcceptorMessage::Greeting(ProposerGreeting { - protocol_version: 2, // current protocol - pg_version: 0, // unknown - proposer_id: [0u8; 16], - system_id: 0, - timeline_id: spg.timeline_id.unwrap(), - tenant_id: spg.tenant_id.unwrap(), - tli: 0, - wal_seg_size: WAL_SEGMENT_SIZE as u32, // 16MB, default for tests - }); - - let response = spg.timeline.get().process_msg(&greeting_request)?; - match response { - Some(AcceptorProposerMessage::Greeting(_)) => Ok(()), - _ => anyhow::bail!("not GreetingResponse"), - } +fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result> { + GlobalTimelines::create( + ttid, + ServerInfo { + pg_version, + wal_seg_size: WAL_SEGMENT_SIZE as u32, + system_id: 0, + }, + ) } -fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: Lsn) -> Result<()> { +fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> Result<()> { // add new term to existing history - let history = spg.timeline.get().get_state().1.acceptor_state.term_history; + let history = tli.get_state().1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); let mut history_entries = history.0; history_entries.push(TermSwitchEntry { term, lsn }); @@ -125,7 +122,7 @@ fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: L timeline_start_lsn: lsn, }); - spg.timeline.get().process_msg(&proposer_elected_request)?; + tli.process_msg(&proposer_elected_request)?; Ok(()) } @@ -138,12 +135,9 @@ struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. 
-fn append_logical_message( - spg: &mut SafekeeperPostgresHandler, - msg: &AppendLogicalMessage, -) -> Result { - let wal_data = xlog_utils::encode_logical_message(&msg.lm_prefix, &msg.lm_message); - let sk_state = spg.timeline.get().get_state().1; +fn append_logical_message(tli: &Arc, msg: &AppendLogicalMessage) -> Result { + let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); + let sk_state = tli.get_state().1; let begin_lsn = msg.begin_lsn; let end_lsn = begin_lsn + wal_data.len() as u64; @@ -167,7 +161,7 @@ fn append_logical_message( wal_data: Bytes::from(wal_data), }); - let response = spg.timeline.get().process_msg(&append_request)?; + let response = tli.process_msg(&append_request)?; let append_response = match response { Some(AcceptorProposerMessage::AppendResponse(resp)) => resp, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index b466d5aab5..e38a5a4633 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -23,15 +23,17 @@ pub mod wal_backup; pub mod wal_service; pub mod wal_storage; +mod timelines_global_map; +pub use timelines_global_map::GlobalTimelines; + pub mod defaults { - use const_format::formatcp; use std::time::Duration; - pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; - pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); + pub use safekeeper_api::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PG_LISTEN_PORT, + }; - pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; - pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(10); pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8; } @@ -65,9 +67,9 @@ impl SafeKeeperConf { self.workdir.join(tenant_id.to_string()) } - pub fn timeline_dir(&self, zttid: &TenantTimelineId) -> PathBuf { - self.tenant_dir(&zttid.tenant_id) - .join(zttid.timeline_id.to_string()) + pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> PathBuf { + self.tenant_dir(&ttid.tenant_id) + .join(ttid.timeline_id.to_string()) } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 3fa3916266..095d80623a 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -1,22 +1,105 @@ -//! This module exports metrics for all active timelines. +//! Global safekeeper mertics and per-timeline safekeeper metrics. use std::time::{Instant, SystemTime}; +use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; +use anyhow::Result; use metrics::{ core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts}, proto::MetricFamily, Gauge, IntGaugeVec, }; +use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ safekeeper::{SafeKeeperState, SafekeeperMemState}, - timeline::{GlobalTimelines, ReplicaState}, + timeline::ReplicaState, + GlobalTimelines, }; +// Global metrics across all timelines. 
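For context on the statics defined below: each metric is registered once through Lazy and then observed from the hot path. A minimal usage sketch with a made-up metric name:

use metrics::{register_histogram, Histogram};
use once_cell::sync::Lazy;

static EXAMPLE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "example_operation_seconds",
        "Seconds spent in an example operation"
    )
    .expect("failed to register example histogram")
});

fn do_work() {
    // The timer observes the elapsed time when the guard is dropped.
    let _timer = EXAMPLE_SECONDS.start_timer();
    // ... the timed work ...
}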
+pub static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_write_wal_bytes", + "Bytes written to WAL in a single request", + vec![ + 1.0, + 10.0, + 100.0, + 1024.0, + 8192.0, + 128.0 * 1024.0, + 1024.0 * 1024.0, + 10.0 * 1024.0 * 1024.0 + ] + ) + .expect("Failed to register safekeeper_write_wal_bytes histogram") +}); +pub static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_write_wal_seconds", + "Seconds spent writing and syncing WAL to a disk in a single request", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_write_wal_seconds histogram") +}); +pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_flush_wal_seconds", + "Seconds spent syncing WAL to a disk", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_flush_wal_seconds histogram") +}); +pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_persist_control_file_seconds", + "Seconds to persist and sync control file", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") +}); + +/// Metrics for WalStorage in a single timeline. +#[derive(Clone, Default)] +pub struct WalStorageMetrics { + /// How much bytes were written in total. + write_wal_bytes: u64, + /// How much time spent writing WAL to disk, waiting for write(2). + write_wal_seconds: f64, + /// How much time spent syncing WAL to disk, waiting for fsync(2). + flush_wal_seconds: f64, +} + +impl WalStorageMetrics { + pub fn observe_write_bytes(&mut self, bytes: usize) { + self.write_wal_bytes += bytes as u64; + WRITE_WAL_BYTES.observe(bytes as f64); + } + + pub fn observe_write_seconds(&mut self, seconds: f64) { + self.write_wal_seconds += seconds; + WRITE_WAL_SECONDS.observe(seconds); + } + + pub fn observe_flush_seconds(&mut self, seconds: f64) { + self.flush_wal_seconds += seconds; + FLUSH_WAL_SECONDS.observe(seconds); + } +} + +/// Accepts a closure that returns a result, and returns the duration of the closure. +pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result { + let start = std::time::Instant::now(); + closure()?; + Ok(start.elapsed().as_secs_f64()) +} + +/// Metrics for a single timeline. pub struct FullTimelineInfo { - pub zttid: TenantTimelineId, + pub ttid: TenantTimelineId, pub replicas: Vec, pub wal_backup_active: bool, pub timeline_is_active: bool, @@ -28,8 +111,11 @@ pub struct FullTimelineInfo { pub persisted_state: SafeKeeperState, pub flush_lsn: Lsn, + + pub wal_storage: WalStorageMetrics, } +/// Collects metrics for all active timelines. 
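How WAL storage code is expected to combine time_io_closure with WalStorageMetrics above: time the fsync, record it on the per-timeline totals, and let the global histogram see it too. A hedged sketch; the file handling is illustrative:

fn flush_segment(metrics: &mut WalStorageMetrics, file: &std::fs::File) -> anyhow::Result<()> {
    let seconds = time_io_closure(|| {
        file.sync_all()?; // fsync the open WAL segment
        Ok(())
    })?;
    // Updates the per-timeline totals and the global FLUSH_WAL_SECONDS histogram.
    metrics.observe_flush_seconds(seconds);
    Ok(())
}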
pub struct TimelineCollector { descs: Vec, commit_lsn: GenericGaugeVec, @@ -45,7 +131,11 @@ pub struct TimelineCollector { connected_computes: IntGaugeVec, disk_usage: GenericGaugeVec, acceptor_term: GenericGaugeVec, + written_wal_bytes: GenericGaugeVec, + written_wal_seconds: GaugeVec, + flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, + timelines_count: IntGauge, } impl Default for TimelineCollector { @@ -185,6 +275,36 @@ impl TimelineCollector { .unwrap(); descs.extend(acceptor_term.desc().into_iter().cloned()); + let written_wal_bytes = GenericGaugeVec::new( + Opts::new( + "safekeeper_written_wal_bytes_total", + "Number of WAL bytes written to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(written_wal_bytes.desc().into_iter().cloned()); + + let written_wal_seconds = GaugeVec::new( + Opts::new( + "safekeeper_written_wal_seconds_total", + "Total time spent in write(2) writing WAL to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(written_wal_seconds.desc().into_iter().cloned()); + + let flushed_wal_seconds = GaugeVec::new( + Opts::new( + "safekeeper_flushed_wal_seconds_total", + "Total time spent in fsync(2) flushing WAL to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(flushed_wal_seconds.desc().into_iter().cloned()); + let collect_timeline_metrics = Gauge::new( "safekeeper_collect_timeline_metrics_seconds", "Time spent collecting timeline metrics, including obtaining mutex lock for all timelines", @@ -192,6 +312,13 @@ impl TimelineCollector { .unwrap(); descs.extend(collect_timeline_metrics.desc().into_iter().cloned()); + let timelines_count = IntGauge::new( + "safekeeper_timelines", + "Total number of timelines loaded in-memory", + ) + .unwrap(); + descs.extend(timelines_count.desc().into_iter().cloned()); + TimelineCollector { descs, commit_lsn, @@ -207,7 +334,11 @@ impl TimelineCollector { connected_computes, disk_usage, acceptor_term, + written_wal_bytes, + written_wal_seconds, + flushed_wal_seconds, collect_timeline_metrics, + timelines_count, } } } @@ -234,12 +365,22 @@ impl Collector for TimelineCollector { self.connected_computes.reset(); self.disk_usage.reset(); self.acceptor_term.reset(); + self.written_wal_bytes.reset(); + self.written_wal_seconds.reset(); + self.flushed_wal_seconds.reset(); - let timelines = GlobalTimelines::active_timelines_metrics(); + let timelines = GlobalTimelines::get_all(); + let timelines_count = timelines.len(); - for tli in timelines { - let tenant_id = tli.zttid.tenant_id.to_string(); - let timeline_id = tli.zttid.timeline_id.to_string(); + for arc_tli in timelines { + let tli = arc_tli.info_for_metrics(); + if tli.is_none() { + continue; + } + let tli = tli.unwrap(); + + let tenant_id = tli.ttid.tenant_id.to_string(); + let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; let mut most_advanced: Option = None; @@ -285,6 +426,15 @@ impl Collector for TimelineCollector { self.acceptor_term .with_label_values(labels) .set(tli.persisted_state.acceptor_state.term as u64); + self.written_wal_bytes + .with_label_values(labels) + .set(tli.wal_storage.write_wal_bytes); + self.written_wal_seconds + .with_label_values(labels) + .set(tli.wal_storage.write_wal_seconds); + self.flushed_wal_seconds + .with_label_values(labels) + .set(tli.wal_storage.flush_wal_seconds); if let Some(feedback) = most_advanced { self.feedback_ps_write_lsn @@ 
-325,12 +475,19 @@ impl Collector for TimelineCollector { mfs.extend(self.connected_computes.collect()); mfs.extend(self.disk_usage.collect()); mfs.extend(self.acceptor_term.collect()); + mfs.extend(self.written_wal_bytes.collect()); + mfs.extend(self.written_wal_seconds.collect()); + mfs.extend(self.flushed_wal_seconds.collect()); // report time it took to collect all info let elapsed = start_collecting.elapsed().as_secs_f64(); self.collect_timeline_metrics.set(elapsed); mfs.extend(self.collect_timeline_metrics.collect()); + // report total number of timelines + self.timelines_count.set(timelines_count as i64); + mfs.extend(self.timelines_count.collect()); + mfs } } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index b0b6a73621..e28caa2f19 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -7,7 +7,9 @@ use anyhow::{anyhow, bail, Result}; use bytes::BytesMut; use tracing::*; +use crate::safekeeper::ServerInfo; use crate::timeline::Timeline; +use crate::GlobalTimelines; use std::net::SocketAddr; use std::sync::mpsc::channel; @@ -20,7 +22,6 @@ use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::handler::SafekeeperPostgresHandler; -use crate::timeline::TimelineTools; use utils::{ postgres_backend::PostgresBackend, pq_proto::{BeMessage, FeMessage}, @@ -67,15 +68,21 @@ impl<'pg> ReceiveWalConn<'pg> { // Receive information about server let next_msg = poll_reader.recv_msg()?; - match next_msg { + let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( "start handshake with wal proposer {} sysid {} timeline {}", self.peer_addr, greeting.system_id, greeting.tli, ); + let server_info = ServerInfo { + pg_version: greeting.pg_version, + system_id: greeting.system_id, + wal_seg_size: greeting.wal_seg_size, + }; + GlobalTimelines::create(spg.ttid, server_info)? } _ => bail!("unexpected message {:?} instead of greeting", next_msg), - } + }; let mut next_msg = Some(next_msg); @@ -88,7 +95,7 @@ impl<'pg> ReceiveWalConn<'pg> { while let Some(ProposerAcceptorMessage::AppendRequest(append_request)) = next_msg { let msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); - let reply = spg.process_safekeeper_msg(&msg)?; + let reply = tli.process_msg(&msg)?; if let Some(reply) = reply { self.write_msg(&reply)?; } @@ -97,13 +104,13 @@ impl<'pg> ReceiveWalConn<'pg> { } // flush all written WAL to the disk - let reply = spg.process_safekeeper_msg(&ProposerAcceptorMessage::FlushWAL)?; + let reply = tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?; if let Some(reply) = reply { self.write_msg(&reply)?; } } else if let Some(msg) = next_msg.take() { // process other message - let reply = spg.process_safekeeper_msg(&msg)?; + let reply = tli.process_msg(&msg)?; if let Some(reply) = reply { self.write_msg(&reply)?; } @@ -112,9 +119,9 @@ impl<'pg> ReceiveWalConn<'pg> { // Register the connection and defer unregister. Do that only // after processing first message, as it sets wal_seg_size, // wanted by many. 
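The connection guard registered just below (and its Drop impl) follows one rule: do the unregistration in Drop and log failures instead of unwrapping, since panicking in a destructor can abort the process. A self-contained miniature:

struct CleanupGuard<F: FnMut() -> anyhow::Result<()>> {
    cleanup: F,
}

impl<F: FnMut() -> anyhow::Result<()>> Drop for CleanupGuard<F> {
    fn drop(&mut self) {
        if let Err(e) = (self.cleanup)() {
            // Never panic in Drop; just report the failure.
            eprintln!("failed to unregister compute connection: {e}");
        }
    }
}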
- spg.timeline.get().on_compute_connect()?; + tli.on_compute_connect()?; _guard = Some(ComputeConnectionGuard { - timeline: Arc::clone(spg.timeline.get()), + timeline: Arc::clone(&tli), }); first_time_through = false; } @@ -190,6 +197,8 @@ struct ComputeConnectionGuard { impl Drop for ComputeConnectionGuard { fn drop(&mut self) { - self.timeline.on_compute_disconnect().unwrap(); + if let Err(e) = self.timeline.on_compute_disconnect() { + error!("failed to unregister compute connection: {}", e); + } } } diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 004c0243f9..b6d497f34e 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -4,20 +4,21 @@ use std::{thread, time::Duration}; use tracing::*; -use crate::{timeline::GlobalTimelines, SafeKeeperConf}; +use crate::{GlobalTimelines, SafeKeeperConf}; pub fn thread_main(conf: SafeKeeperConf) { let wal_removal_interval = Duration::from_millis(5000); loop { - let active_tlis = GlobalTimelines::get_active_timelines(); - for zttid in &active_tlis { - if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { - if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { - warn!( - "failed to remove WAL for tenant {} timeline {}: {}", - tli.zttid.tenant_id, tli.zttid.timeline_id, e - ); - } + let tlis = GlobalTimelines::get_all(); + for tli in &tlis { + if !tli.is_active() { + continue; + } + let ttid = tli.ttid; + let _enter = + info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id).entered(); + if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { + warn!("failed to remove WAL: {}", e); } } thread::sleep(wal_removal_interval) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index fa045eed90..7869aa8b3a 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -25,9 +25,9 @@ use utils::{ }; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 6; +pub const SK_FORMAT_VERSION: u32 = 7; const SK_PROTOCOL_VERSION: u32 = 2; -const UNKNOWN_SERVER_VERSION: u32 = 0; +pub const UNKNOWN_SERVER_VERSION: u32 = 0; /// Consensus logical timestamp. pub type Term = u64; @@ -218,19 +218,19 @@ pub struct SafekeeperMemState { } impl SafeKeeperState { - pub fn new(zttid: &TenantTimelineId, peers: Vec) -> SafeKeeperState { + pub fn new( + ttid: &TenantTimelineId, + server_info: ServerInfo, + peers: Vec, + ) -> SafeKeeperState { SafeKeeperState { - tenant_id: zttid.tenant_id, - timeline_id: zttid.timeline_id, + tenant_id: ttid.tenant_id, + timeline_id: ttid.timeline_id, acceptor_state: AcceptorState { term: 0, term_history: TermHistory::empty(), }, - server: ServerInfo { - pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ - system_id: 0, /* Postgres system identifier */ - wal_seg_size: 0, - }, + server: server_info, proposer_uuid: [0; 16], timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), @@ -244,7 +244,15 @@ impl SafeKeeperState { #[cfg(test)] pub fn empty() -> Self { - SafeKeeperState::new(&TenantTimelineId::empty(), vec![]) + SafeKeeperState::new( + &TenantTimelineId::empty(), + ServerInfo { + pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ + system_id: 0, /* Postgres system identifier */ + wal_seg_size: 0, + }, + vec![], + ) } } @@ -479,8 +487,12 @@ impl AcceptorProposerMessage { } } -/// SafeKeeper which consumes events (messages from compute) and provides -/// replies. +/// Safekeeper implements consensus to reliably persist WAL across nodes. 
+/// It controls all WAL disk writes and updates of control file. +/// +/// Currently safekeeper processes: +/// - messages from compute (proposers) and provides replies +/// - messages from broker peers pub struct SafeKeeper { /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. /// Note: be careful to set only if we are sure our WAL (term history) matches @@ -503,20 +515,20 @@ where CTRL: control_file::Storage, WAL: wal_storage::Storage, { - // constructor - pub fn new( - timeline_id: TimelineId, - state: CTRL, - mut wal_store: WAL, - node_id: NodeId, - ) -> Result> { - if state.timeline_id != TimelineId::from([0u8; 16]) && timeline_id != state.timeline_id { - bail!("Calling SafeKeeper::new with inconsistent timeline_id ({}) and SafeKeeperState.server.timeline_id ({})", timeline_id, state.timeline_id); + /// Accepts a control file storage containing the safekeeper state. + /// State must be initialized, i.e. contain filled `tenant_id`, `timeline_id` + /// and `server` (`wal_seg_size` inside it) fields. + pub fn new(state: CTRL, wal_store: WAL, node_id: NodeId) -> Result> { + if state.tenant_id == TenantId::from([0u8; 16]) + || state.timeline_id == TimelineId::from([0u8; 16]) + { + bail!( + "Calling SafeKeeper::new with empty tenant_id ({}) or timeline_id ({})", + state.tenant_id, + state.timeline_id + ); } - // initialize wal_store, if state is already initialized - wal_store.init_storage(&state)?; - Ok(SafeKeeper { global_commit_lsn: state.commit_lsn, epoch_start_lsn: Lsn(0), @@ -574,7 +586,7 @@ where &mut self, msg: &ProposerGreeting, ) -> Result> { - /* Check protocol compatibility */ + // Check protocol compatibility if msg.protocol_version != SK_PROTOCOL_VERSION { bail!( "incompatible protocol version {}, expected {}", @@ -582,15 +594,20 @@ where SK_PROTOCOL_VERSION ); } - /* Postgres upgrade is not treated as fatal error */ - if msg.pg_version != self.state.server.pg_version + /* Postgres major version mismatch is treated as fatal error + * because safekeepers parse WAL headers and the format + * may change between versions. 
+ */ + if msg.pg_version / 10000 != self.state.server.pg_version / 10000 && self.state.server.pg_version != UNKNOWN_SERVER_VERSION { - info!( + bail!( "incompatible server version {}, expected {}", - msg.pg_version, self.state.server.pg_version + msg.pg_version, + self.state.server.pg_version ); } + if msg.tenant_id != self.state.tenant_id { bail!( "invalid tenant ID, got {}, expected {}", @@ -605,17 +622,28 @@ where self.state.timeline_id ); } - - // set basic info about server, if not yet - // TODO: verify that is doesn't change after - { - let mut state = self.state.clone(); - state.server.system_id = msg.system_id; - state.server.wal_seg_size = msg.wal_seg_size; - self.state.persist(&state)?; + if self.state.server.wal_seg_size != msg.wal_seg_size { + bail!( + "invalid wal_seg_size, got {}, expected {}", + msg.wal_seg_size, + self.state.server.wal_seg_size + ); } - self.wal_store.init_storage(&self.state)?; + // system_id will be updated on mismatch + if self.state.server.system_id != msg.system_id { + warn!( + "unexpected system ID arrived, got {}, expected {}", + msg.system_id, self.state.server.system_id + ); + + let mut state = self.state.clone(); + state.server.system_id = msg.system_id; + if msg.pg_version != UNKNOWN_SERVER_VERSION { + state.server.pg_version = msg.pg_version; + } + self.state.persist(&state)?; + } info!( "processed greeting from proposer {:?}, sending term {:?}", @@ -665,16 +693,6 @@ where Ok(Some(AcceptorProposerMessage::VoteResponse(resp))) } - /// Bump our term if received a note from elected proposer with higher one - fn bump_if_higher(&mut self, term: Term) -> Result<()> { - if self.state.acceptor_state.term < term { - let mut state = self.state.clone(); - state.acceptor_state.term = term; - self.state.persist(&state)?; - } - Ok(()) - } - /// Form AppendResponse from current state. fn append_response(&self) -> AppendResponse { let ar = AppendResponse { @@ -691,7 +709,12 @@ where fn handle_elected(&mut self, msg: &ProposerElected) -> Result> { info!("received ProposerElected {:?}", msg); - self.bump_if_higher(msg.term)?; + if self.state.acceptor_state.term < msg.term { + let mut state = self.state.clone(); + state.acceptor_state.term = msg.term; + self.state.persist(&state)?; + } + // If our term is higher, ignore the message (next feedback will inform the compute) if self.state.acceptor_state.term > msg.term { return Ok(None); @@ -748,7 +771,7 @@ where } /// Advance commit_lsn taking into account what we have locally - pub fn update_commit_lsn(&mut self) -> Result<()> { + fn update_commit_lsn(&mut self) -> Result<()> { let commit_lsn = min(self.global_commit_lsn, self.flush_lsn()); assert!(commit_lsn >= self.inmem.commit_lsn); @@ -768,6 +791,11 @@ where Ok(()) } + /// Persist control file to disk, called only after timeline creation (bootstrap). + pub fn persist(&mut self) -> Result<()> { + self.persist_control_file(self.state.clone()) + } + /// Persist in-memory state to the disk, taking other data from state. 
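A quick worked example of the arithmetic behind the new fatal check above: assuming pg_version carries PostgreSQL's numeric encoding (major * 10000 + minor, e.g. 140005 for 14.5), integer division by 10000 isolates the major version, so minor upgrades stay compatible while a major jump is rejected because the WAL layout may have changed.

/// Sketch of the compatibility rule, assuming the numeric version encoding described above.
fn same_major(a: u32, b: u32) -> bool {
    a / 10000 == b / 10000
}

fn main() {
    assert!(same_major(140005, 140002));  // both 14.x: minor difference is tolerated
    assert!(!same_major(150001, 140005)); // 15.x vs 14.x: rejected, WAL format may differ
}

When the stored version is still UNKNOWN_SERVER_VERSION the check is skipped, so control files written before the version was recorded remain usable.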
fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { state.commit_lsn = self.inmem.commit_lsn; @@ -918,6 +946,8 @@ where #[cfg(test)] mod tests { + use postgres_ffi::WAL_SEGMENT_SIZE; + use super::*; use crate::wal_storage::Storage; use std::ops::Deref; @@ -942,6 +972,14 @@ mod tests { } } + fn test_sk_state() -> SafeKeeperState { + let mut state = SafeKeeperState::empty(); + state.server.wal_seg_size = WAL_SEGMENT_SIZE as u32; + state.tenant_id = TenantId::from([1u8; 16]); + state.timeline_id = TimelineId::from([1u8; 16]); + state + } + struct DummyWalStore { lsn: Lsn, } @@ -951,10 +989,6 @@ mod tests { self.lsn } - fn init_storage(&mut self, _state: &SafeKeeperState) -> Result<()> { - Ok(()) - } - fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { self.lsn = startpos + buf.len() as u64; Ok(()) @@ -972,17 +1006,19 @@ mod tests { fn remove_up_to(&self) -> Box Result<()>> { Box::new(move |_segno_up_to: XLogSegNo| Ok(())) } + + fn get_metrics(&self) -> crate::metrics::WalStorageMetrics { + crate::metrics::WalStorageMetrics::default() + } } #[test] fn test_voting() { let storage = InMemoryState { - persisted_state: SafeKeeperState::empty(), + persisted_state: test_sk_state(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let timeline_id = TimelineId::from([0u8; 16]); - - let mut sk = SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -998,7 +1034,7 @@ mod tests { persisted_state: state, }; - sk = SafeKeeper::new(timeline_id, storage, sk.wal_store, NodeId(0)).unwrap(); + sk = SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -1011,12 +1047,11 @@ mod tests { #[test] fn test_epoch_switch() { let storage = InMemoryState { - persisted_state: SafeKeeperState::empty(), + persisted_state: test_sk_state(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; - let timeline_id = TimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(timeline_id, storage, wal_store, NodeId(0)).unwrap(); + let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 375b6eea18..2829c875ed 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -2,12 +2,13 @@ //! with the "START_REPLICATION" message. use crate::handler::SafekeeperPostgresHandler; -use crate::timeline::{ReplicaState, Timeline, TimelineTools}; +use crate::timeline::{ReplicaState, Timeline}; use crate::wal_storage::WalReader; +use crate::GlobalTimelines; use anyhow::{bail, Context, Result}; use bytes::Bytes; -use postgres_ffi::v14::xlog_utils::get_current_timestamp; +use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::min; @@ -167,8 +168,10 @@ impl ReplicationConn { ) -> Result<()> { let _enter = info_span!("WAL sender", timeline = %spg.timeline_id.unwrap()).entered(); + let tli = GlobalTimelines::get(spg.ttid)?; + // spawn the background thread which receives HotStandbyFeedback messages. 
- let bg_timeline = Arc::clone(spg.timeline.get()); + let bg_timeline = Arc::clone(&tli); let bg_stream_in = self.stream_in.take().unwrap(); let bg_timeline_id = spg.timeline_id.unwrap(); @@ -201,11 +204,8 @@ impl ReplicationConn { .build()?; runtime.block_on(async move { - let (inmem_state, persisted_state) = spg.timeline.get().get_state(); + let (inmem_state, persisted_state) = tli.get_state(); // add persisted_state.timeline_start_lsn == Lsn(0) check - if persisted_state.server.wal_seg_size == 0 { - bail!("Cannot start replication before connecting to walproposer"); - } // Walproposer gets special handling: safekeeper must give proposer all // local WAL till the end, whether committed or not (walproposer will @@ -217,7 +217,7 @@ impl ReplicationConn { // on this safekeeper itself. That's ok as (old) proposer will never be // able to commit such WAL. let stop_pos: Option = if spg.is_walproposer_recovery() { - let wal_end = spg.timeline.get().get_end_of_wal(); + let wal_end = tli.get_flush_lsn(); Some(wal_end) } else { None @@ -231,7 +231,7 @@ impl ReplicationConn { let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn); let mut wal_reader = WalReader::new( - spg.conf.timeline_dir(&spg.timeline.get().zttid), + spg.conf.timeline_dir(&tli.ttid), &persisted_state, start_pos, spg.conf.wal_backup_enabled, @@ -241,7 +241,7 @@ impl ReplicationConn { let mut send_buf = vec![0u8; MAX_SEND_SIZE]; // watcher for commit_lsn updates - let mut commit_lsn_watch_rx = spg.timeline.get().get_commit_lsn_watch_rx(); + let mut commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx(); loop { if let Some(stop_pos) = stop_pos { @@ -258,7 +258,7 @@ impl ReplicationConn { } else { // TODO: also check once in a while whether we are walsender // to right pageserver. - if spg.timeline.get().stop_walsender(replica_id)? { + if tli.should_walsender_stop(replica_id) { // Shut down, timeline is suspended. // TODO create proper error type for this bail!("end streaming to {:?}", spg.appname); diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index cf317c41c3..dc7503af65 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -1,37 +1,35 @@ -//! This module contains timeline id -> safekeeper state map with file-backed -//! persistence and support for interaction between sending and receiving wal. +//! This module implements Timeline lifecycle management and has all necessary code +//! to glue together SafeKeeper and all other background services.
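The timeline.rs hunk that follows swaps std::sync::Mutex for parking_lot::Mutex, which is why the new write_shared_state() helper further down can hand out a guard without unwrap(): parking_lot has no lock poisoning, so lock() returns the guard directly. A minimal comparison (counter names are illustrative):

use std::sync::Mutex as StdMutex;
use parking_lot::Mutex as PlMutex;

fn main() {
    let std_counter = StdMutex::new(0u64);
    // std::sync::Mutex returns Result<Guard, PoisonError>, so callers unwrap or handle poisoning.
    *std_counter.lock().unwrap() += 1;

    let pl_counter = PlMutex::new(0u64);
    // parking_lot::Mutex has no poisoning: lock() hands back the guard directly.
    *pl_counter.lock() += 1;

    assert_eq!(*pl_counter.lock(), 1);
}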
-use anyhow::{bail, Context, Result}; +use anyhow::{bail, Result}; use etcd_broker::subscription_value::SkTimelineInfo; -use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; -use serde::Serialize; use tokio::sync::watch; use std::cmp::{max, min}; -use std::collections::{HashMap, HashSet}; -use std::fs::{self}; -use std::sync::{Arc, Mutex, MutexGuard}; +use parking_lot::{Mutex, MutexGuard}; + +use std::path::PathBuf; use tokio::sync::mpsc::Sender; use tracing::*; use utils::{ - id::{NodeId, TenantId, TenantTimelineId}, + id::{NodeId, TenantTimelineId}, lsn::Lsn, pq_proto::ReplicationFeedback, }; -use crate::control_file; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, - SafekeeperMemState, + SafekeeperMemState, ServerInfo, }; use crate::send_wal::HotStandbyFeedback; +use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; use crate::wal_storage; @@ -73,7 +71,7 @@ impl ReplicaState { } /// Shared state associated with database instance -struct SharedState { +pub struct SharedState { /// Safekeeper object sk: SafeKeeper, /// State of replicas @@ -95,17 +93,25 @@ struct SharedState { } impl SharedState { - /// Initialize timeline state, creating control file - fn create( + /// Initialize fresh timeline state without persisting anything to disk. + fn create_new( conf: &SafeKeeperConf, - zttid: &TenantTimelineId, - peer_ids: Vec, + ttid: &TenantTimelineId, + state: SafeKeeperState, ) -> Result { - let state = SafeKeeperState::new(zttid, peer_ids); - let control_store = control_file::FileStorage::create_new(zttid, conf, state)?; + if state.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(*ttid)); + } - let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); - let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?; + if state.server.pg_version == UNKNOWN_SERVER_VERSION { + bail!(TimelineError::UninitialinzedPgVersion(*ttid)); + } + + // We don't want to write anything to disk, because we may have existing timeline there. + // These functions should not change anything on disk. + let control_store = control_file::FileStorage::create_new(ttid, conf, state)?; + let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?; + let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; Ok(Self { sk, @@ -117,16 +123,17 @@ impl SharedState { }) } - /// Restore SharedState from control file. - /// If file doesn't exist, bails out. - fn restore(conf: &SafeKeeperConf, zttid: &TenantTimelineId) -> Result { - let control_store = control_file::FileStorage::restore_new(zttid, conf)?; - let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); + /// Restore SharedState from control file. If file doesn't exist, bails out. 
+ fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { + let control_store = control_file::FileStorage::restore_new(ttid, conf)?; + if control_store.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(*ttid)); + } - info!("timeline {} restored", zttid.timeline_id); + let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?; Ok(Self { - sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?, + sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, replicas: Vec::new(), wal_backup_active: false, active: false, @@ -134,6 +141,7 @@ impl SharedState { last_removed_segno: 0, }) } + fn is_active(&self) -> bool { self.is_wal_backup_required() // FIXME: add tracking of relevant pageservers and check them here individually, @@ -254,148 +262,294 @@ impl SharedState { } } -/// Database instance (tenant) +#[derive(Debug, thiserror::Error)] +pub enum TimelineError { + #[error("Timeline {0} was cancelled and cannot be used anymore")] + Cancelled(TenantTimelineId), + #[error("Timeline {0} was not found in global map")] + NotFound(TenantTimelineId), + #[error("Timeline {0} exists on disk, but wasn't loaded on startup")] + Invalid(TenantTimelineId), + #[error("Timeline {0} is already exists")] + AlreadyExists(TenantTimelineId), + #[error("Timeline {0} is not initialized, wal_seg_size is zero")] + UninitializedWalSegSize(TenantTimelineId), + #[error("Timeline {0} is not initialized, pg_version is unknown")] + UninitialinzedPgVersion(TenantTimelineId), +} + +/// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline. +/// It also holds SharedState and provides mutually exclusive access to it. pub struct Timeline { - pub zttid: TenantTimelineId, + pub ttid: TenantTimelineId, + /// Sending here asks for wal backup launcher attention (start/stop - /// offloading). Sending zttid instead of concrete command allows to do + /// offloading). Sending ttid instead of concrete command allows to do /// sending without timeline lock. wal_backup_launcher_tx: Sender, + + /// Used to broadcast commit_lsn updates to all background jobs. commit_lsn_watch_tx: watch::Sender, - /// For breeding receivers. commit_lsn_watch_rx: watch::Receiver, + + /// Safekeeper and other state, that should remain consistent and synchronized + /// with the disk. mutex: Mutex, + + /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal. + cancellation_tx: watch::Sender, + + /// Timeline should not be used after cancellation. Background tasks should + /// monitor this channel and stop eventually after receiving `true` from this channel. + cancellation_rx: watch::Receiver, + + /// Directory where timeline state is stored. + timeline_dir: PathBuf, } impl Timeline { - fn new( - zttid: TenantTimelineId, + /// Load existing timeline from disk. 
+ pub fn load_timeline( + conf: SafeKeeperConf, + ttid: TenantTimelineId, wal_backup_launcher_tx: Sender, - shared_state: SharedState, - ) -> Timeline { + ) -> Result { + let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); + + let shared_state = SharedState::restore(&conf, &ttid)?; let (commit_lsn_watch_tx, commit_lsn_watch_rx) = - watch::channel(shared_state.sk.inmem.commit_lsn); - Timeline { - zttid, + watch::channel(shared_state.sk.state.commit_lsn); + let (cancellation_tx, cancellation_rx) = watch::channel(false); + + Ok(Timeline { + ttid, wal_backup_launcher_tx, commit_lsn_watch_tx, commit_lsn_watch_rx, mutex: Mutex::new(shared_state), + cancellation_rx, + cancellation_tx, + timeline_dir: conf.timeline_dir(&ttid), + }) + } + + /// Create a new timeline, which is not yet persisted to disk. + pub fn create_empty( + conf: SafeKeeperConf, + ttid: TenantTimelineId, + wal_backup_launcher_tx: Sender, + server_info: ServerInfo, + ) -> Result { + let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID); + let (cancellation_tx, cancellation_rx) = watch::channel(false); + let state = SafeKeeperState::new(&ttid, server_info, vec![]); + + Ok(Timeline { + ttid, + wal_backup_launcher_tx, + commit_lsn_watch_tx, + commit_lsn_watch_rx, + mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?), + cancellation_rx, + cancellation_tx, + timeline_dir: conf.timeline_dir(&ttid), + }) + } + + /// Initialize fresh timeline on disk and start background tasks. If bootstrap + /// fails, timeline is cancelled and cannot be used anymore. + /// + /// Bootstrap is transactional, so if it fails, created files will be deleted, + /// and state on disk should remain unchanged. + pub fn bootstrap(&self, shared_state: &mut MutexGuard) -> Result<()> { + match std::fs::metadata(&self.timeline_dir) { + Ok(_) => { + // Timeline directory exists on disk, we should leave state unchanged + // and return error. + bail!(TimelineError::Invalid(self.ttid)); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => { + return Err(e.into()); + } } + + // Create timeline directory. + std::fs::create_dir_all(&self.timeline_dir)?; + + // Write timeline to disk and TODO: start background tasks. + match || -> Result<()> { + shared_state.sk.persist()?; + // TODO: add more initialization steps here + Ok(()) + }() { + Ok(_) => Ok(()), + Err(e) => { + // Bootstrap failed, cancel timeline and remove timeline directory. + self.cancel(); + + if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) { + warn!( + "failed to remove timeline {} directory after bootstrap failure: {}", + self.ttid, fs_err + ); + } + + Err(e) + } + } + } + + /// Delete timeline from disk completely, by removing timeline directory. Background + /// timeline activities will stop eventually. + pub fn delete_from_disk( + &self, + shared_state: &mut MutexGuard, + ) -> Result<(bool, bool)> { + let was_active = shared_state.active; + self.cancel(); + let dir_existed = delete_dir(&self.timeline_dir)?; + Ok((dir_existed, was_active)) + } + + /// Cancel timeline to prevent further usage. Background tasks will stop + /// eventually after receiving cancellation signal. + fn cancel(&self) { + info!("Timeline {} is cancelled", self.ttid); + let _ = self.cancellation_tx.send(true); + let res = self.wal_backup_launcher_tx.blocking_send(self.ttid); + if let Err(e) = res { + error!("Failed to send stop signal to wal_backup_launcher: {}", e); + } + } + + /// Returns if timeline is cancelled. 
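The cancellation_tx/cancellation_rx pair introduced above, together with the is_cancelled() accessor that follows, is the usual tokio watch-channel shutdown signal: cancel() broadcasts true once, synchronous callers poll *rx.borrow(), and async background tasks can await rx.changed(). A self-contained sketch of that flow (assumes tokio with the rt, macros and sync features; task names are illustrative):

use tokio::sync::watch;

#[tokio::main]
async fn main() {
    let (cancellation_tx, cancellation_rx) = watch::channel(false);

    // A background task that stops once the cancellation flag flips to true.
    let mut task_rx = cancellation_rx.clone();
    let worker = tokio::spawn(async move {
        while !*task_rx.borrow() {
            // ... do one unit of work, then wait for the flag to change ...
            if task_rx.changed().await.is_err() {
                break; // sender dropped, treat as cancellation
            }
        }
        println!("worker observed cancellation");
    });

    // Synchronous callers can check the latest value without awaiting.
    assert!(!*cancellation_rx.borrow());

    // Equivalent of Timeline::cancel(): broadcast `true` to every receiver.
    cancellation_tx.send(true).unwrap();
    worker.await.unwrap();
}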
+ pub fn is_cancelled(&self) -> bool { + *self.cancellation_rx.borrow() + } + + /// Take a writing mutual exclusive lock on timeline shared_state. + pub fn write_shared_state(&self) -> MutexGuard { + self.mutex.lock() } /// Register compute connection, starting timeline-related activity if it is /// not running yet. - /// Can fail only if channel to a static thread got closed, which is not normal at all. pub fn on_compute_connect(&self) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let is_wal_backup_action_pending: bool; { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); shared_state.num_computes += 1; - is_wal_backup_action_pending = shared_state.update_status(self.zttid); + is_wal_backup_action_pending = shared_state.update_status(self.ttid); } // Wake up wal backup launcher, if offloading not started yet. if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.blocking_send(self.zttid)?; + // Can fail only if channel to a static thread got closed, which is not normal at all. + self.wal_backup_launcher_tx.blocking_send(self.ttid)?; } Ok(()) } /// De-register compute connection, shutting down timeline activity if /// pageserver doesn't need catchup. - /// Can fail only if channel to a static thread got closed, which is not normal at all. pub fn on_compute_disconnect(&self) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let is_wal_backup_action_pending: bool; { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); shared_state.num_computes -= 1; - is_wal_backup_action_pending = shared_state.update_status(self.zttid); + is_wal_backup_action_pending = shared_state.update_status(self.ttid); } // Wake up wal backup launcher, if it is time to stop the offloading. if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.blocking_send(self.zttid)?; + // Can fail only if channel to a static thread got closed, which is not normal at all. + self.wal_backup_launcher_tx.blocking_send(self.ttid)?; } Ok(()) } - /// Whether we still need this walsender running? + /// Returns true if walsender should stop sending WAL to pageserver. /// TODO: check this pageserver is actually interested in this timeline. - pub fn stop_walsender(&self, replica_id: usize) -> Result { - let mut shared_state = self.mutex.lock().unwrap(); + pub fn should_walsender_stop(&self, replica_id: usize) -> bool { + if self.is_cancelled() { + return true; + } + + let mut shared_state = self.write_shared_state(); if shared_state.num_computes == 0 { let replica_state = shared_state.replicas[replica_id].unwrap(); let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); if stop { - shared_state.update_status(self.zttid); - return Ok(true); + shared_state.update_status(self.ttid); + return true; } } - Ok(false) + false } /// Returns whether s3 offloading is required and sets current status as /// matching it. pub fn wal_backup_attend(&self) -> bool { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.wal_backup_attend() - } - - // Can this safekeeper offload to s3? Recently joined safekeepers might not - // have necessary WAL. 
- pub fn can_wal_backup(&self) -> bool { - self.mutex.lock().unwrap().can_wal_backup() - } - - /// Deactivates the timeline, assuming it is being deleted. - /// Returns whether the timeline was already active. - /// - /// We assume all threads will stop by themselves eventually (possibly with errors, but no panics). - /// There should be no compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but - /// we're deleting the timeline anyway. - pub async fn deactivate_for_delete(&self) -> Result { - let was_active: bool; - { - let shared_state = self.mutex.lock().unwrap(); - was_active = shared_state.active; + if self.is_cancelled() { + return false; } - self.wal_backup_launcher_tx.send(self.zttid).await?; - Ok(was_active) + + self.write_shared_state().wal_backup_attend() } - fn is_active(&self) -> bool { - let shared_state = self.mutex.lock().unwrap(); - shared_state.active + /// Can this safekeeper offload to s3? Recently joined safekeepers might not + /// have necessary WAL. + pub fn can_wal_backup(&self) -> bool { + if self.is_cancelled() { + return false; + } + + let shared_state = self.write_shared_state(); + shared_state.can_wal_backup() } - /// Returns full timeline info, required for the metrics. - /// If the timeline is not active, returns None instead. + /// Returns full timeline info, required for the metrics. If the timeline is + /// not active, returns None instead. pub fn info_for_metrics(&self) -> Option { - let shared_state = self.mutex.lock().unwrap(); - if !shared_state.active { + if self.is_cancelled() { return None; } - Some(FullTimelineInfo { - zttid: self.zttid, - replicas: shared_state - .replicas - .iter() - .filter_map(|r| r.as_ref()) - .copied() - .collect(), - wal_backup_active: shared_state.wal_backup_active, - timeline_is_active: shared_state.active, - num_computes: shared_state.num_computes, - last_removed_segno: shared_state.last_removed_segno, - epoch_start_lsn: shared_state.sk.epoch_start_lsn, - mem_state: shared_state.sk.inmem.clone(), - persisted_state: shared_state.sk.state.clone(), - flush_lsn: shared_state.sk.wal_store.flush_lsn(), - }) + let state = self.write_shared_state(); + if state.active { + Some(FullTimelineInfo { + ttid: self.ttid, + replicas: state + .replicas + .iter() + .filter_map(|r| r.as_ref()) + .copied() + .collect(), + wal_backup_active: state.wal_backup_active, + timeline_is_active: state.active, + num_computes: state.num_computes, + last_removed_segno: state.last_removed_segno, + epoch_start_lsn: state.sk.epoch_start_lsn, + mem_state: state.sk.inmem.clone(), + persisted_state: state.sk.state.clone(), + flush_lsn: state.sk.wal_store.flush_lsn(), + wal_storage: state.sk.wal_store.get_metrics(), + }) + } else { + None + } } + /// Returns commit_lsn watch channel. pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { self.commit_lsn_watch_rx.clone() } @@ -405,10 +559,14 @@ impl Timeline { &self, msg: &ProposerAcceptorMessage, ) -> Result> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let mut rmsg: Option; let commit_lsn: Lsn; { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); rmsg = shared_state.sk.process_msg(msg)?; // if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn @@ -426,28 +584,46 @@ impl Timeline { Ok(rmsg) } + /// Returns wal_seg_size. 
pub fn get_wal_seg_size(&self) -> usize { - self.mutex.lock().unwrap().get_wal_seg_size() + self.write_shared_state().get_wal_seg_size() } + /// Returns true only if the timeline is loaded and active. + pub fn is_active(&self) -> bool { + if self.is_cancelled() { + return false; + } + + self.write_shared_state().active + } + + /// Returns state of the timeline. pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { - let shared_state = self.mutex.lock().unwrap(); - (shared_state.sk.inmem.clone(), shared_state.sk.state.clone()) + let state = self.write_shared_state(); + (state.sk.inmem.clone(), state.sk.state.clone()) } + /// Returns latest backup_lsn. pub fn get_wal_backup_lsn(&self) -> Lsn { - self.mutex.lock().unwrap().sk.inmem.backup_lsn + self.write_shared_state().sk.inmem.backup_lsn } - pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) { - self.mutex.lock().unwrap().sk.inmem.backup_lsn = backup_lsn; + /// Sets backup_lsn to the given value. + pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + self.write_shared_state().sk.inmem.backup_lsn = backup_lsn; // we should check whether to shut down offloader, but this will be done // soon by peer communication anyway. + Ok(()) } - /// Prepare public safekeeper info for reporting. + /// Return public safekeeper info for broadcasting to broker and other peers. pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { - let shared_state = self.mutex.lock().unwrap(); + let shared_state = self.write_shared_state(); SkTimelineInfo { last_log_term: Some(shared_state.sk.get_epoch()), flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()), @@ -473,54 +649,53 @@ impl Timeline { let is_wal_backup_action_pending: bool; let commit_lsn: Lsn; { - let mut shared_state = self.mutex.lock().unwrap(); - // WAL seg size not initialized yet (no message from compute ever - // received), can't do much without it. - if shared_state.get_wal_seg_size() == 0 { - return Ok(()); - } + let mut shared_state = self.write_shared_state(); shared_state.sk.record_safekeeper_info(sk_info)?; - is_wal_backup_action_pending = shared_state.update_status(self.zttid); + is_wal_backup_action_pending = shared_state.update_status(self.ttid); commit_lsn = shared_state.sk.inmem.commit_lsn; } self.commit_lsn_watch_tx.send(commit_lsn)?; // Wake up wal backup launcher, if it is time to stop the offloading. if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.send(self.zttid).await?; + self.wal_backup_launcher_tx.send(self.ttid).await?; } Ok(()) } + /// Add send_wal replica to the in-memory vector of replicas. pub fn add_replica(&self, state: ReplicaState) -> usize { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.add_replica(state) + self.write_shared_state().add_replica(state) } + /// Update replication replica state. pub fn update_replica_state(&self, id: usize, state: ReplicaState) { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); shared_state.replicas[id] = Some(state); } + /// Remove send_wal replica from the in-memory vector of replicas. 
pub fn remove_replica(&self, id: usize) { - let mut shared_state = self.mutex.lock().unwrap(); + let mut shared_state = self.write_shared_state(); assert!(shared_state.replicas[id].is_some()); shared_state.replicas[id] = None; } - pub fn get_end_of_wal(&self) -> Lsn { - let shared_state = self.mutex.lock().unwrap(); - shared_state.sk.wal_store.flush_lsn() + /// Returns flush_lsn. + pub fn get_flush_lsn(&self) -> Lsn { + self.write_shared_state().sk.wal_store.flush_lsn() } + /// Delete WAL segments from disk that are no longer needed. This is determined + /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + let horizon_segno: XLogSegNo; let remover: Box Result<(), anyhow::Error>>; { - let shared_state = self.mutex.lock().unwrap(); - // WAL seg size not initialized yet, no WAL exists. - if shared_state.get_wal_seg_size() == 0 { - return Ok(()); - } + let shared_state = self.write_shared_state(); horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { @@ -528,243 +703,22 @@ impl Timeline { } // release the lock before removing } - let _enter = - info_span!("", tenant = %self.zttid.tenant_id, timeline = %self.zttid.timeline_id) - .entered(); + + // delete old WAL files remover(horizon_segno - 1)?; - self.mutex.lock().unwrap().last_removed_segno = horizon_segno; + + // update last_removed_segno + let mut shared_state = self.write_shared_state(); + shared_state.last_removed_segno = horizon_segno; Ok(()) } } -// Utilities needed by various Connection-like objects -pub trait TimelineTools { - fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()>; - - fn get(&self) -> &Arc; -} - -impl TimelineTools for Option> { - fn set(&mut self, conf: &SafeKeeperConf, zttid: TenantTimelineId, create: bool) -> Result<()> { - *self = Some(GlobalTimelines::get(conf, zttid, create)?); - Ok(()) - } - - fn get(&self) -> &Arc { - self.as_ref().unwrap() - } -} - -struct GlobalTimelinesState { - timelines: HashMap>, - wal_backup_launcher_tx: Option>, -} - -static TIMELINES_STATE: Lazy> = Lazy::new(|| { - Mutex::new(GlobalTimelinesState { - timelines: HashMap::new(), - wal_backup_launcher_tx: None, - }) -}); - -#[derive(Clone, Copy, Serialize)] -pub struct TimelineDeleteForceResult { - pub dir_existed: bool, - pub was_active: bool, -} - -/// A zero-sized struct used to manage access to the global timelines map. 
-pub struct GlobalTimelines; - -impl GlobalTimelines { - pub fn init(wal_backup_launcher_tx: Sender) { - let mut state = TIMELINES_STATE.lock().unwrap(); - assert!(state.wal_backup_launcher_tx.is_none()); - state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); - } - - fn create_internal( - mut state: MutexGuard, - conf: &SafeKeeperConf, - zttid: TenantTimelineId, - peer_ids: Vec, - ) -> Result> { - match state.timelines.get(&zttid) { - Some(_) => bail!("timeline {} already exists", zttid), - None => { - // TODO: check directory existence - let dir = conf.timeline_dir(&zttid); - fs::create_dir_all(dir)?; - - let shared_state = SharedState::create(conf, &zttid, peer_ids) - .context("failed to create shared state")?; - - let new_tli = Arc::new(Timeline::new( - zttid, - state.wal_backup_launcher_tx.as_ref().unwrap().clone(), - shared_state, - )); - state.timelines.insert(zttid, Arc::clone(&new_tli)); - Ok(new_tli) - } - } - } - - pub fn create( - conf: &SafeKeeperConf, - zttid: TenantTimelineId, - peer_ids: Vec, - ) -> Result> { - let state = TIMELINES_STATE.lock().unwrap(); - GlobalTimelines::create_internal(state, conf, zttid, peer_ids) - } - - /// Get a timeline with control file loaded from the global TIMELINES_STATE.timelines map. - /// If control file doesn't exist and create=false, bails out. - pub fn get( - conf: &SafeKeeperConf, - zttid: TenantTimelineId, - create: bool, - ) -> Result> { - let _enter = info_span!("", timeline = %zttid.timeline_id).entered(); - - let mut state = TIMELINES_STATE.lock().unwrap(); - - match state.timelines.get(&zttid) { - Some(result) => Ok(Arc::clone(result)), - None => { - let shared_state = SharedState::restore(conf, &zttid); - - let shared_state = match shared_state { - Ok(shared_state) => shared_state, - Err(error) => { - // TODO: always create timeline explicitly - if error - .root_cause() - .to_string() - .contains("No such file or directory") - && create - { - return GlobalTimelines::create_internal(state, conf, zttid, vec![]); - } else { - return Err(error); - } - } - }; - - let new_tli = Arc::new(Timeline::new( - zttid, - state.wal_backup_launcher_tx.as_ref().unwrap().clone(), - shared_state, - )); - state.timelines.insert(zttid, Arc::clone(&new_tli)); - Ok(new_tli) - } - } - } - - /// Get loaded timeline, if it exists. - pub fn get_loaded(zttid: TenantTimelineId) -> Option> { - let state = TIMELINES_STATE.lock().unwrap(); - state.timelines.get(&zttid).map(Arc::clone) - } - - pub fn get_active_timelines() -> HashSet { - let state = TIMELINES_STATE.lock().unwrap(); - state - .timelines - .iter() - .filter(|&(_, tli)| tli.is_active()) - .map(|(zttid, _)| *zttid) - .collect() - } - - /// Return FullTimelineInfo for all active timelines. - pub fn active_timelines_metrics() -> Vec { - let state = TIMELINES_STATE.lock().unwrap(); - state - .timelines - .iter() - .filter_map(|(_, tli)| tli.info_for_metrics()) - .collect() - } - - fn delete_force_internal( - conf: &SafeKeeperConf, - zttid: &TenantTimelineId, - was_active: bool, - ) -> Result { - match std::fs::remove_dir_all(conf.timeline_dir(zttid)) { - Ok(_) => Ok(TimelineDeleteForceResult { - dir_existed: true, - was_active, - }), - Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(TimelineDeleteForceResult { - dir_existed: false, - was_active, - }), - Err(e) => Err(e.into()), - } - } - - /// Deactivates and deletes the timeline, see `Timeline::deactivate_for_delete()`, the deletes - /// the corresponding data directory. 
- /// We assume all timeline threads do not care about `GlobalTimelines` not containing the timeline - /// anymore, and they will eventually terminate without panics. - /// - /// There are multiple ways the timeline may be accidentally "re-created" (so we end up with two - /// `Timeline` objects in memory): - /// a) a compute node connects after this method is called, or - /// b) an HTTP GET request about the timeline is made and it's able to restore the current state, or - /// c) an HTTP POST request for timeline creation is made after the timeline is already deleted. - /// TODO: ensure all of the above never happens. - pub async fn delete_force( - conf: &SafeKeeperConf, - zttid: &TenantTimelineId, - ) -> Result { - info!("deleting timeline {}", zttid); - let timeline = TIMELINES_STATE.lock().unwrap().timelines.remove(zttid); - let mut was_active = false; - if let Some(tli) = timeline { - was_active = tli.deactivate_for_delete().await?; - } - GlobalTimelines::delete_force_internal(conf, zttid, was_active) - } - - /// Deactivates and deletes all timelines for the tenant, see `delete()`. - /// Returns map of all timelines which the tenant had, `true` if a timeline was active. - /// There may be a race if new timelines are created simultaneously. - pub async fn delete_force_all_for_tenant( - conf: &SafeKeeperConf, - tenant_id: &TenantId, - ) -> Result> { - info!("deleting all timelines for tenant {}", tenant_id); - let mut to_delete = HashMap::new(); - { - // Keep mutex in this scope. - let timelines = &mut TIMELINES_STATE.lock().unwrap().timelines; - for (&zttid, tli) in timelines.iter() { - if zttid.tenant_id == *tenant_id { - to_delete.insert(zttid, tli.clone()); - } - } - // TODO: test that the correct subset of timelines is removed. It's complicated because they are implicitly created currently. - timelines.retain(|zttid, _| !to_delete.contains_key(zttid)); - } - let mut deleted = HashMap::new(); - for (zttid, timeline) in to_delete { - let was_active = timeline.deactivate_for_delete().await?; - deleted.insert( - zttid, - GlobalTimelines::delete_force_internal(conf, &zttid, was_active)?, - ); - } - // There may be inactive timelines, so delete the whole tenant dir as well. - match std::fs::remove_dir_all(conf.tenant_dir(tenant_id)) { - Ok(_) => (), - Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), - e => e?, - }; - Ok(deleted) +/// Deletes directory and it's contents. Returns false if directory does not exist. +fn delete_dir(path: &PathBuf) -> Result { + match std::fs::remove_dir_all(path) { + Ok(_) => Ok(true), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(e) => Err(e.into()), } } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs new file mode 100644 index 0000000000..cf99a243d7 --- /dev/null +++ b/safekeeper/src/timelines_global_map.rs @@ -0,0 +1,348 @@ +//! This module contains global (tenant_id, timeline_id) -> Arc mapping. +//! All timelines should always be present in this map, this is done by loading them +//! all from the disk on startup and keeping them in memory. 
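Beneath the module doc above, the new global map follows the once_cell Lazy + Mutex + HashMap pattern, handing out Arc clones so callers never hold the map lock while working with a timeline. A stripped-down sketch of that shape (String keys and a stub Timeline stand in for TenantTimelineId and the real struct):

use once_cell::sync::Lazy;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

// Stand-ins for the real TenantTimelineId / Timeline types.
type TimelineId = String;
struct Timeline { id: TimelineId }

static TIMELINES: Lazy<Mutex<HashMap<TimelineId, Arc<Timeline>>>> =
    Lazy::new(|| Mutex::new(HashMap::new()));

/// Insert-if-absent, mirroring GlobalTimelinesState::try_insert.
fn try_insert(timeline: Arc<Timeline>) -> Result<(), String> {
    let mut map = TIMELINES.lock().unwrap();
    if map.contains_key(&timeline.id) {
        return Err(format!("timeline {} already exists", timeline.id));
    }
    map.insert(timeline.id.clone(), timeline);
    Ok(())
}

/// Lookup returning a cheap Arc clone, as GlobalTimelines::get does.
fn get(id: &str) -> Option<Arc<Timeline>> {
    TIMELINES.lock().unwrap().get(id).cloned()
}

fn main() {
    try_insert(Arc::new(Timeline { id: "t1".into() })).unwrap();
    assert!(get("t1").is_some());
    assert!(try_insert(Arc::new(Timeline { id: "t1".into() })).is_err());
}

Returning Arc clones keeps the critical section tiny; the diff follows the same rule by dropping the TIMELINES_STATE lock before bootstrap does any disk IO.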
+ +use crate::safekeeper::ServerInfo; +use crate::timeline::{Timeline, TimelineError}; +use crate::SafeKeeperConf; +use anyhow::{anyhow, bail, Context, Result}; +use once_cell::sync::Lazy; +use serde::Serialize; +use std::collections::HashMap; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::{Arc, Mutex, MutexGuard}; +use tokio::sync::mpsc::Sender; +use tracing::*; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; + +struct GlobalTimelinesState { + timelines: HashMap>, + wal_backup_launcher_tx: Option>, + conf: SafeKeeperConf, +} + +impl GlobalTimelinesState { + /// Get dependencies for a timeline constructor. + fn get_dependencies(&self) -> (SafeKeeperConf, Sender) { + ( + self.conf.clone(), + self.wal_backup_launcher_tx.as_ref().unwrap().clone(), + ) + } + + /// Insert timeline into the map. Returns error if timeline with the same id already exists. + fn try_insert(&mut self, timeline: Arc) -> Result<()> { + let ttid = timeline.ttid; + if self.timelines.contains_key(&ttid) { + bail!(TimelineError::AlreadyExists(ttid)); + } + self.timelines.insert(ttid, timeline); + Ok(()) + } + + /// Get timeline from the map. Returns error if timeline doesn't exist. + fn get(&self, ttid: &TenantTimelineId) -> Result> { + self.timelines + .get(ttid) + .cloned() + .ok_or_else(|| anyhow!(TimelineError::NotFound(*ttid))) + } +} + +static TIMELINES_STATE: Lazy> = Lazy::new(|| { + Mutex::new(GlobalTimelinesState { + timelines: HashMap::new(), + wal_backup_launcher_tx: None, + conf: SafeKeeperConf::default(), + }) +}); + +/// A zero-sized struct used to manage access to the global timelines map. +pub struct GlobalTimelines; + +impl GlobalTimelines { + /// Inject dependencies needed for the timeline constructors and load all timelines to memory. + pub fn init( + conf: SafeKeeperConf, + wal_backup_launcher_tx: Sender, + ) -> Result<()> { + let mut state = TIMELINES_STATE.lock().unwrap(); + assert!(state.wal_backup_launcher_tx.is_none()); + state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); + state.conf = conf; + + // Iterate through all directories and load tenants for all directories + // named as a valid tenant_id. + let mut tenant_count = 0; + let tenants_dir = state.conf.workdir.clone(); + for tenants_dir_entry in std::fs::read_dir(&tenants_dir) + .with_context(|| format!("failed to list tenants dir {}", tenants_dir.display()))? + { + match &tenants_dir_entry { + Ok(tenants_dir_entry) => { + if let Ok(tenant_id) = + TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or("")) + { + tenant_count += 1; + GlobalTimelines::load_tenant_timelines(&mut state, tenant_id)?; + } + } + Err(e) => error!( + "failed to list tenants dir entry {:?} in directory {}, reason: {:?}", + tenants_dir_entry, + tenants_dir.display(), + e + ), + } + } + + info!( + "found {} tenants directories, successfully loaded {} timelines", + tenant_count, + state.timelines.len() + ); + Ok(()) + } + + /// Loads all timelines for the given tenant to memory. Returns fs::read_dir errors if any. + fn load_tenant_timelines( + state: &mut MutexGuard, + tenant_id: TenantId, + ) -> Result<()> { + let timelines_dir = state.conf.tenant_dir(&tenant_id); + for timelines_dir_entry in std::fs::read_dir(&timelines_dir) + .with_context(|| format!("failed to list timelines dir {}", timelines_dir.display()))? 
+ { + match &timelines_dir_entry { + Ok(timeline_dir_entry) => { + if let Ok(timeline_id) = + TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or("")) + { + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + match Timeline::load_timeline( + state.conf.clone(), + ttid, + state.wal_backup_launcher_tx.as_ref().unwrap().clone(), + ) { + Ok(timeline) => { + state.timelines.insert(ttid, Arc::new(timeline)); + } + // If we can't load a timeline, it's most likely because of a corrupted + // directory. We will log an error and won't allow to delete/recreate + // this timeline. The only way to fix this timeline is to repair manually + // and restart the safekeeper. + Err(e) => error!( + "failed to load timeline {} for tenant {}, reason: {:?}", + timeline_id, tenant_id, e + ), + } + } + } + Err(e) => error!( + "failed to list timelines dir entry {:?} in directory {}, reason: {:?}", + timelines_dir_entry, + timelines_dir.display(), + e + ), + } + } + + Ok(()) + } + + /// Create a new timeline with the given id. If the timeline already exists, returns + /// an existing timeline. + pub fn create(ttid: TenantTimelineId, server_info: ServerInfo) -> Result> { + let (conf, wal_backup_launcher_tx) = { + let state = TIMELINES_STATE.lock().unwrap(); + if let Ok(timeline) = state.get(&ttid) { + // Timeline already exists, return it. + return Ok(timeline); + } + state.get_dependencies() + }; + + info!("creating new timeline {}", ttid); + + let timeline = Arc::new(Timeline::create_empty( + conf, + ttid, + wal_backup_launcher_tx, + server_info, + )?); + + // Take a lock and finish the initialization holding this mutex. No other threads + // can interfere with creation after we will insert timeline into the map. + let mut shared_state = timeline.write_shared_state(); + + // We can get a race condition here in case of concurrent create calls, but only + // in theory. create() will return valid timeline on the next try. + TIMELINES_STATE + .lock() + .unwrap() + .try_insert(timeline.clone())?; + + // Write the new timeline to the disk and start background workers. + // Bootstrap is transactional, so if it fails, the timeline will be deleted, + // and the state on disk should remain unchanged. + match timeline.bootstrap(&mut shared_state) { + Ok(_) => { + // We are done with bootstrap, release the lock, return the timeline. + drop(shared_state); + Ok(timeline) + } + Err(e) => { + // Note: the most likely reason for bootstrap failure is that the timeline + // directory already exists on disk. This happens when timeline is corrupted + // and wasn't loaded from disk on startup because of that. We want to preserve + // the timeline directory in this case, for further inspection. + + // TODO: this is an unusual error, perhaps we should send it to sentry + // TODO: compute will try to create timeline every second, we should add backoff + error!("failed to bootstrap timeline {}: {}", ttid, e); + + // Timeline failed to bootstrap, it cannot be used. Remove it from the map. + TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid); + Err(e) + } + } + } + + /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, + /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid, + /// i.e. loaded in memory and not cancelled. 
+ pub fn get(ttid: TenantTimelineId) -> Result> { + let res = TIMELINES_STATE.lock().unwrap().get(&ttid); + + match res { + Ok(tli) => { + if tli.is_cancelled() { + anyhow::bail!(TimelineError::Cancelled(ttid)); + } + Ok(tli) + } + Err(e) => Err(e), + } + } + + /// Returns all timelines. This is used for background timeline proccesses. + pub fn get_all() -> Vec> { + let global_lock = TIMELINES_STATE.lock().unwrap(); + global_lock + .timelines + .values() + .cloned() + .filter(|t| !t.is_cancelled()) + .collect() + } + + /// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant, + /// and that's why it can return cancelled timelines, to retry deleting them. + fn get_all_for_tenant(tenant_id: TenantId) -> Vec> { + let global_lock = TIMELINES_STATE.lock().unwrap(); + global_lock + .timelines + .values() + .filter(|t| t.ttid.tenant_id == tenant_id) + .cloned() + .collect() + } + + /// Cancels timeline, then deletes the corresponding data directory. + pub fn delete_force(ttid: &TenantTimelineId) -> Result { + let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); + match tli_res { + Ok(timeline) => { + // Take a lock and finish the deletion holding this mutex. + let mut shared_state = timeline.write_shared_state(); + + info!("deleting timeline {}", ttid); + let (dir_existed, was_active) = timeline.delete_from_disk(&mut shared_state)?; + + // Remove timeline from the map. + TIMELINES_STATE.lock().unwrap().timelines.remove(ttid); + + Ok(TimelineDeleteForceResult { + dir_existed, + was_active, + }) + } + Err(_) => { + // Timeline is not memory, but it may still exist on disk in broken state. + let dir_path = TIMELINES_STATE.lock().unwrap().conf.timeline_dir(ttid); + let dir_existed = delete_dir(dir_path)?; + + Ok(TimelineDeleteForceResult { + dir_existed, + was_active: false, + }) + } + } + } + + /// Deactivates and deletes all timelines for the tenant. Returns map of all timelines which + /// the tenant had, `true` if a timeline was active. There may be a race if new timelines are + /// created simultaneously. In that case the function will return error and the caller should + /// retry tenant deletion again later. + pub fn delete_force_all_for_tenant( + tenant_id: &TenantId, + ) -> Result> { + info!("deleting all timelines for tenant {}", tenant_id); + let to_delete = Self::get_all_for_tenant(*tenant_id); + + let mut err = None; + + let mut deleted = HashMap::new(); + for tli in &to_delete { + match Self::delete_force(&tli.ttid) { + Ok(result) => { + deleted.insert(tli.ttid, result); + } + Err(e) => { + error!("failed to delete timeline {}: {}", tli.ttid, e); + // Save error to return later. + err = Some(e); + } + } + } + + // If there was an error, return it. + if let Some(e) = err { + return Err(e); + } + + // There may be broken timelines on disk, so delete the whole tenant dir as well. + // Note that we could concurrently create new timelines while we were deleting them, + // so the directory may be not empty. In this case timelines will have bad state + // and timeline background jobs can panic. + delete_dir(TIMELINES_STATE.lock().unwrap().conf.tenant_dir(tenant_id))?; + + let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); + if !tlis_after_delete.is_empty() { + // Some timelines were created while we were deleting them, returning error + // to the caller, so it can retry later. 
+ bail!( + "failed to delete all timelines for tenant {}: some timelines were created while we were deleting them", + tenant_id + ); + } + + Ok(deleted) + } +} + +#[derive(Clone, Copy, Serialize)] +pub struct TimelineDeleteForceResult { + pub dir_existed: bool, + pub was_active: bool, +} + +/// Deletes directory and it's contents. Returns false if directory does not exist. +fn delete_dir(path: PathBuf) -> Result { + match std::fs::remove_dir_all(path) { + Ok(_) => Ok(true), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(e) => Err(e.into()), + } +} diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 85e967e218..c82a003161 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -11,7 +11,8 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Duration; -use postgres_ffi::v14::xlog_utils::{XLogFileName, XLogSegNoOffsetToRecPtr}; +use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; +use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; use remote_storage::GenericRemoteStorage; use tokio::fs::File; @@ -26,8 +27,8 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::broker::{Election, ElectionLeader}; -use crate::timeline::{GlobalTimelines, Timeline}; -use crate::{broker, SafeKeeperConf}; +use crate::timeline::Timeline; +use crate::{broker, GlobalTimelines, SafeKeeperConf}; use once_cell::sync::OnceCell; @@ -53,8 +54,10 @@ pub fn wal_backup_launcher_thread_main( /// Check whether wal backup is required for timeline. If yes, mark that launcher is /// aware of current status and return the timeline. -fn is_wal_backup_required(zttid: TenantTimelineId) -> Option> { - GlobalTimelines::get_loaded(zttid).filter(|t| t.wal_backup_attend()) +fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { + GlobalTimelines::get(ttid) + .ok() + .filter(|tli| tli.wal_backup_attend()) } struct WalBackupTaskHandle { @@ -70,20 +73,20 @@ struct WalBackupTimelineEntry { /// Start per timeline task, if it makes sense for this safekeeper to offload. fn consider_start_task( conf: &SafeKeeperConf, - zttid: TenantTimelineId, + ttid: TenantTimelineId, task: &mut WalBackupTimelineEntry, ) { if !task.timeline.can_wal_backup() { return; } - info!("starting WAL backup task for {}", zttid); + info!("starting WAL backup task for {}", ttid); // TODO: decide who should offload right here by simply checking current // state instead of running elections in offloading task. let election_name = SubscriptionKey { cluster_prefix: conf.broker_etcd_prefix.clone(), kind: SubscriptionKind::Operation( - zttid, + ttid, NodeKind::Safekeeper, OperationKind::Safekeeper(SkOperationKind::WalBackup), ), @@ -97,11 +100,11 @@ fn consider_start_task( ); let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&zttid); + let timeline_dir = conf.timeline_dir(&ttid); let handle = tokio::spawn( - backup_task_main(zttid, timeline_dir, shutdown_rx, election) - .instrument(info_span!("WAL backup task", zttid = %zttid)), + backup_task_main(ttid, timeline_dir, shutdown_rx, election) + .instrument(info_span!("WAL backup task", ttid = %ttid)), ); task.handle = Some(WalBackupTaskHandle { @@ -140,33 +143,33 @@ async fn wal_backup_launcher_main_loop( let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); loop { tokio::select! 
{ - zttid = wal_backup_launcher_rx.recv() => { + ttid = wal_backup_launcher_rx.recv() => { // channel is never expected to get closed - let zttid = zttid.unwrap(); + let ttid = ttid.unwrap(); if conf.remote_storage.is_none() || !conf.wal_backup_enabled { continue; /* just drain the channel and do nothing */ } - let timeline = is_wal_backup_required(zttid); + let timeline = is_wal_backup_required(ttid); // do we need to do anything at all? - if timeline.is_some() != tasks.contains_key(&zttid) { + if timeline.is_some() != tasks.contains_key(&ttid) { if let Some(timeline) = timeline { // need to start the task - let entry = tasks.entry(zttid).or_insert(WalBackupTimelineEntry { + let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry { timeline, handle: None, }); - consider_start_task(&conf, zttid, entry); + consider_start_task(&conf, ttid, entry); } else { // need to stop the task - info!("stopping WAL backup task for {}", zttid); + info!("stopping WAL backup task for {}", ttid); - let entry = tasks.remove(&zttid).unwrap(); + let entry = tasks.remove(&ttid).unwrap(); if let Some(wb_handle) = entry.handle { // Tell the task to shutdown. Error means task exited earlier, that's ok. let _ = wb_handle.shutdown_tx.send(()).await; // Await the task itself. TODO: restart panicked tasks earlier. if let Err(e) = wb_handle.handle.await { - warn!("WAL backup task for {} panicked: {}", zttid, e); + warn!("WAL backup task for {} panicked: {}", ttid, e); } } } @@ -174,8 +177,8 @@ async fn wal_backup_launcher_main_loop( } // Start known tasks, if needed and possible. _ = ticker.tick() => { - for (zttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) { - consider_start_task(&conf, *zttid, entry); + for (ttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) { + consider_start_task(&conf, *ttid, entry); } } } @@ -191,26 +194,26 @@ struct WalBackupTask { election: Election, } -/// Offload single timeline. +/// Offload single timeline. Called only after we checked that backup +/// is required (wal_backup_attend) and possible (can_wal_backup). async fn backup_task_main( - zttid: TenantTimelineId, + ttid: TenantTimelineId, timeline_dir: PathBuf, mut shutdown_rx: Receiver<()>, election: Election, ) { info!("started"); - let timeline: Arc = if let Some(tli) = GlobalTimelines::get_loaded(zttid) { - tli - } else { - /* Timeline could get deleted while task was starting, just exit then. */ - info!("no timeline, exiting"); + let res = GlobalTimelines::get(ttid); + if let Err(e) = res { + error!("backup error for timeline {}: {}", ttid, e); return; - }; + } + let tli = res.unwrap(); let mut wb = WalBackupTask { - wal_seg_size: timeline.get_wal_seg_size(), - commit_lsn_watch_rx: timeline.get_commit_lsn_watch_rx(), - timeline, + wal_seg_size: tli.get_wal_seg_size(), + commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), + timeline: tli, timeline_dir, leader: None, election, @@ -322,7 +325,11 @@ impl WalBackupTask { { Ok(backup_lsn_result) => { backup_lsn = backup_lsn_result; - self.timeline.set_wal_backup_lsn(backup_lsn_result); + let res = self.timeline.set_wal_backup_lsn(backup_lsn_result); + if let Err(e) = res { + error!("backup error: {}", e); + return; + } retry_attempt = 0; } Err(e) => { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 58b69f06e7..bc5e2d7b24 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -7,17 +7,15 @@ //! //! 
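In the wal_storage.rs changes that follow, the per-timeline histogram statics are removed in favour of a WalStorageMetrics value owned by the storage and surfaced through get_metrics(), with time_io_closure measuring write/fsync durations. The helper itself lives in the metrics module, which is not part of this excerpt, so the version below is an assumed minimal shape rather than the actual implementation:

use std::time::Instant;

/// Assumed helper: run an IO closure and report how long it took, in seconds.
fn time_io_closure(f: impl FnOnce() -> anyhow::Result<()>) -> anyhow::Result<f64> {
    let start = Instant::now();
    f()?;
    Ok(start.elapsed().as_secs_f64())
}

/// Assumed shape of the per-timeline accumulator exposed by get_metrics().
#[derive(Default, Clone)]
struct WalStorageMetrics {
    write_wal_bytes: u64,
    write_wal_seconds: f64,
    flush_wal_seconds: f64,
}

fn main() -> anyhow::Result<()> {
    let mut metrics = WalStorageMetrics::default();
    let buf = vec![0u8; 8192];

    // A real write path would wrap the actual write(2) call; a short sleep stands in for it.
    metrics.write_wal_seconds += time_io_closure(|| {
        std::thread::sleep(std::time::Duration::from_millis(1));
        Ok(())
    })?;
    metrics.write_wal_bytes += buf.len() as u64;

    println!("{} bytes in {:.6}s", metrics.write_wal_bytes, metrics.write_wal_seconds);
    Ok(())
}

The collector shown earlier then copies these counters into the per-timeline gauges on each scrape instead of keeping one histogram family per timeline alive forever.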
Note that last file has `.partial` suffix, that's different from postgres. -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{bail, Context, Result}; + use std::io::{self, Seek, SeekFrom}; use std::pin::Pin; use tokio::io::AsyncRead; -use once_cell::sync::Lazy; -use postgres_ffi::v14::xlog_utils::{ - find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, -}; +use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; use postgres_ffi::{XLogSegNo, PG_TLI}; -use std::cmp::min; +use std::cmp::{max, min}; use std::fs::{self, remove_file, File, OpenOptions}; use std::io::Write; @@ -27,83 +25,22 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; +use crate::metrics::{time_io_closure, WalStorageMetrics}; use crate::safekeeper::SafeKeeperState; use crate::wal_backup::read_object; use crate::SafeKeeperConf; -use postgres_ffi::v14::xlog_utils::XLogFileName; +use postgres_ffi::XLogFileName; use postgres_ffi::XLOG_BLCKSZ; -use postgres_ffi::v14::waldecoder::WalStreamDecoder; - -use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; +use postgres_ffi::waldecoder::WalStreamDecoder; use tokio::io::{AsyncReadExt, AsyncSeekExt}; -// The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). -// i64 is faster than f64, so update to u64 when available. -static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_write_wal_bytes", - "Bytes written to WAL in a single request, grouped by timeline", - &["tenant_id", "timeline_id"], - vec![ - 1.0, - 10.0, - 100.0, - 1024.0, - 8192.0, - 128.0 * 1024.0, - 1024.0 * 1024.0, - 10.0 * 1024.0 * 1024.0 - ] - ) - .expect("Failed to register safekeeper_write_wal_bytes histogram vec") -}); -static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_write_wal_seconds", - "Seconds spent writing and syncing WAL to a disk in a single request, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register safekeeper_write_wal_seconds histogram vec") -}); -static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { - register_histogram_vec!( - "safekeeper_flush_wal_seconds", - "Seconds spent syncing WAL to a disk, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register safekeeper_flush_wal_seconds histogram vec") -}); - -struct WalStorageMetrics { - write_wal_bytes: Histogram, - write_wal_seconds: Histogram, - flush_wal_seconds: Histogram, -} - -impl WalStorageMetrics { - fn new(zttid: &TenantTimelineId) -> Self { - let tenant_id = zttid.tenant_id.to_string(); - let timeline_id = zttid.timeline_id.to_string(); - Self { - write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&tenant_id, &timeline_id]), - write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), - flush_wal_seconds: FLUSH_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), - } - } -} - pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; - /// Init storage with wal_seg_size and read WAL from disk to get latest LSN. - fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()>; - /// Write piece of WAL from buf to disk, but not necessarily sync it. fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; @@ -116,10 +53,13 @@ pub trait Storage { /// Remove all segments <= given segno. 
Returns closure as we want to do /// that without timeline lock. fn remove_up_to(&self) -> Box Result<()>>; + + /// Get metrics for this timeline. + fn get_metrics(&self) -> WalStorageMetrics; } /// PhysicalStorage is a storage that stores WAL on disk. Writes are separated from flushes -/// for better performance. Storage must be initialized before use. +/// for better performance. Storage is initialized in the constructor. /// /// WAL is stored in segments, each segment is a file. Last segment has ".partial" suffix in /// its filename and may be not fully flushed. @@ -127,16 +67,14 @@ pub trait Storage { /// Relationship of LSNs: /// `write_lsn` >= `write_record_lsn` >= `flush_record_lsn` /// -/// When storage is just created, all LSNs are zeroes and there are no segments on disk. +/// When storage is created first time, all LSNs are zeroes and there are no segments on disk. pub struct PhysicalStorage { metrics: WalStorageMetrics, - zttid: TenantTimelineId, timeline_dir: PathBuf, conf: SafeKeeperConf, - // fields below are filled upon initialization - /// None if uninitialized, Some(usize) if storage is initialized. - wal_seg_size: Option, + /// Size of WAL segment in bytes. + wal_seg_size: usize, /// Written to disk, but possibly still in the cache and not fully persisted. /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. @@ -161,52 +99,88 @@ pub struct PhysicalStorage { } impl PhysicalStorage { - pub fn new(zttid: &TenantTimelineId, conf: &SafeKeeperConf) -> PhysicalStorage { - let timeline_dir = conf.timeline_dir(zttid); - PhysicalStorage { - metrics: WalStorageMetrics::new(zttid), - zttid: *zttid, + /// Create new storage. If commit_lsn is not zero, flush_lsn is tried to be restored from + /// the disk. Otherwise, all LSNs are set to zero. + pub fn new( + ttid: &TenantTimelineId, + conf: &SafeKeeperConf, + state: &SafeKeeperState, + ) -> Result { + let timeline_dir = conf.timeline_dir(ttid); + let wal_seg_size = state.server.wal_seg_size as usize; + + // Find out where stored WAL ends, starting at commit_lsn which is a + // known recent record boundary (unless we don't have WAL at all). + // + // NB: find_end_of_wal MUST be backwards compatible with the previously + // written WAL. If find_end_of_wal fails to read any WAL written by an + // older version of the code, we could lose data forever. + let write_lsn = if state.commit_lsn == Lsn(0) { + Lsn(0) + } else { + match state.server.pg_version / 10000 { + 14 => postgres_ffi::v14::xlog_utils::find_end_of_wal( + &timeline_dir, + wal_seg_size, + state.commit_lsn, + )?, + 15 => postgres_ffi::v15::xlog_utils::find_end_of_wal( + &timeline_dir, + wal_seg_size, + state.commit_lsn, + )?, + _ => bail!("unsupported postgres version: {}", state.server.pg_version), + } + }; + + // TODO: do we really know that write_lsn is fully flushed to disk? + // If not, maybe it's better to call fsync() here to be sure? 
+ let flush_lsn = write_lsn; + + debug!( + "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", + ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, + ); + if flush_lsn < state.commit_lsn || flush_lsn < state.peer_horizon_lsn { + warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", ttid.timeline_id); + } + + Ok(PhysicalStorage { + metrics: WalStorageMetrics::default(), timeline_dir, conf: conf.clone(), - wal_seg_size: None, - write_lsn: Lsn(0), - write_record_lsn: Lsn(0), - flush_record_lsn: Lsn(0), - decoder: WalStreamDecoder::new(Lsn(0)), + wal_seg_size, + write_lsn, + write_record_lsn: write_lsn, + flush_record_lsn: flush_lsn, + decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000), file: None, - } - } - - /// Wrapper for flush_lsn updates that also updates metrics. - fn update_flush_lsn(&mut self) { - self.flush_record_lsn = self.write_record_lsn; + }) } /// Call fdatasync if config requires so. - fn fdatasync_file(&self, file: &mut File) -> Result<()> { + fn fdatasync_file(&mut self, file: &mut File) -> Result<()> { if !self.conf.no_sync { self.metrics - .flush_wal_seconds - .observe_closure_duration(|| file.sync_data())?; + .observe_flush_seconds(time_io_closure(|| Ok(file.sync_data()?))?); } Ok(()) } /// Call fsync if config requires so. - fn fsync_file(&self, file: &mut File) -> Result<()> { + fn fsync_file(&mut self, file: &mut File) -> Result<()> { if !self.conf.no_sync { self.metrics - .flush_wal_seconds - .observe_closure_duration(|| file.sync_all())?; + .observe_flush_seconds(time_io_closure(|| Ok(file.sync_all()?))?); } Ok(()) } /// Open or create WAL segment file. Caller must call seek to the wanted position. /// Returns `file` and `is_partial`. - fn open_or_create(&self, segno: XLogSegNo, wal_seg_size: usize) -> Result<(File, bool)> { + fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; // Try to open already completed segment if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { @@ -222,24 +196,18 @@ impl PhysicalStorage { .open(&wal_file_partial_path) .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?; - write_zeroes(&mut file, wal_seg_size)?; + write_zeroes(&mut file, self.wal_seg_size)?; self.fsync_file(&mut file)?; Ok((file, true)) } } /// Write WAL bytes, which are known to be located in a single WAL segment. - fn write_in_segment( - &mut self, - segno: u64, - xlogoff: usize, - buf: &[u8], - wal_seg_size: usize, - ) -> Result<()> { + fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> { let mut file = if let Some(file) = self.file.take() { file } else { - let (mut file, is_partial) = self.open_or_create(segno, wal_seg_size)?; + let (mut file, is_partial) = self.open_or_create(segno)?; assert!(is_partial, "unexpected write into non-partial segment file"); file.seek(SeekFrom::Start(xlogoff as u64))?; file @@ -247,13 +215,13 @@ impl PhysicalStorage { file.write_all(buf)?; - if xlogoff + buf.len() == wal_seg_size { + if xlogoff + buf.len() == self.wal_seg_size { // If we reached the end of a WAL segment, flush and close it. 
self.fdatasync_file(&mut file)?; // Rename partial file to completed file let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; fs::rename(&wal_file_partial_path, &wal_file_path)?; } else { // otherwise, file can be reused later @@ -269,10 +237,6 @@ impl PhysicalStorage { /// /// Updates `write_lsn`. fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> { - let wal_seg_size = self - .wal_seg_size - .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - if self.write_lsn != pos { // need to flush the file before discarding it if let Some(mut file) = self.file.take() { @@ -284,17 +248,17 @@ impl PhysicalStorage { while !buf.is_empty() { // Extract WAL location for this block - let xlogoff = self.write_lsn.segment_offset(wal_seg_size) as usize; - let segno = self.write_lsn.segment_number(wal_seg_size); + let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size) as usize; + let segno = self.write_lsn.segment_number(self.wal_seg_size); // If crossing a WAL boundary, only write up until we reach wal segment size. - let bytes_write = if xlogoff + buf.len() > wal_seg_size { - wal_seg_size - xlogoff + let bytes_write = if xlogoff + buf.len() > self.wal_seg_size { + self.wal_seg_size - xlogoff } else { buf.len() }; - self.write_in_segment(segno, xlogoff, &buf[..bytes_write], wal_seg_size)?; + self.write_in_segment(segno, xlogoff, &buf[..bytes_write])?; self.write_lsn += bytes_write as u64; buf = &buf[bytes_write..]; } @@ -309,53 +273,6 @@ impl Storage for PhysicalStorage { self.flush_record_lsn } - /// Storage needs to know wal_seg_size to know which segment to read/write, but - /// wal_seg_size is not always known at the moment of storage creation. This method - /// allows to postpone its initialization. - fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()> { - if state.server.wal_seg_size == 0 { - // wal_seg_size is still unknown. This is dead path normally, should - // be used only in tests. - return Ok(()); - } - - if let Some(wal_seg_size) = self.wal_seg_size { - // physical storage is already initialized - assert_eq!(wal_seg_size, state.server.wal_seg_size as usize); - return Ok(()); - } - - // initialize physical storage - let wal_seg_size = state.server.wal_seg_size as usize; - self.wal_seg_size = Some(wal_seg_size); - - // Find out where stored WAL ends, starting at commit_lsn which is a - // known recent record boundary (unless we don't have WAL at all). - self.write_lsn = if state.commit_lsn == Lsn(0) { - Lsn(0) - } else { - find_end_of_wal(&self.timeline_dir, wal_seg_size, state.commit_lsn)? - }; - - self.write_record_lsn = self.write_lsn; - - // TODO: do we really know that write_lsn is fully flushed to disk? - // If not, maybe it's better to call fsync() here to be sure? - self.update_flush_lsn(); - - info!( - "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", - self.zttid.timeline_id, self.flush_record_lsn, state.commit_lsn, state.peer_horizon_lsn, - ); - if self.flush_record_lsn < state.commit_lsn - || self.flush_record_lsn < state.peer_horizon_lsn - { - warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", self.zttid.timeline_id); - } - - Ok(()) - } - /// Write WAL to disk. 
fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { // Disallow any non-sequential writes, which can result in gaps or overwrites. @@ -375,13 +292,10 @@ impl Storage for PhysicalStorage { ); } - { - let _timer = self.metrics.write_wal_seconds.start_timer(); - self.write_exact(startpos, buf)?; - } - + let write_seconds = time_io_closure(|| self.write_exact(startpos, buf))?; // WAL is written, updating write metrics - self.metrics.write_wal_bytes.observe(buf.len() as f64); + self.metrics.observe_write_seconds(write_seconds); + self.metrics.observe_write_bytes(buf.len()); // figure out last record's end lsn for reporting (if we got the // whole record) @@ -391,7 +305,8 @@ impl Storage for PhysicalStorage { self.decoder.available(), startpos, ); - self.decoder = WalStreamDecoder::new(startpos); + let pg_version = self.decoder.pg_version; + self.decoder = WalStreamDecoder::new(startpos, pg_version); } self.decoder.feed_bytes(buf); loop { @@ -419,80 +334,87 @@ impl Storage for PhysicalStorage { // We have unflushed data (write_lsn != flush_lsn), but no file. // This should only happen if last file was fully written and flushed, // but haven't updated flush_lsn yet. - assert!(self.write_lsn.segment_offset(self.wal_seg_size.unwrap()) == 0); + if self.write_lsn.segment_offset(self.wal_seg_size) != 0 { + bail!( + "unexpected unflushed data with no open file, write_lsn={}, flush_lsn={}", + self.write_lsn, + self.flush_record_lsn + ); + } } // everything is flushed now, let's update flush_lsn - self.update_flush_lsn(); + self.flush_record_lsn = self.write_record_lsn; Ok(()) } /// Truncate written WAL by removing all WAL segments after the given LSN. /// end_pos must point to the end of the WAL record. fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { - let wal_seg_size = self - .wal_seg_size - .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - // Streaming must not create a hole, so truncate cannot be called on non-written lsn - assert!(self.write_lsn == Lsn(0) || self.write_lsn >= end_pos); + if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { + bail!( + "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}", + self.write_lsn, + end_pos + ); + } // Close previously opened file, if any if let Some(mut unflushed_file) = self.file.take() { self.fdatasync_file(&mut unflushed_file)?; } - let xlogoff = end_pos.segment_offset(wal_seg_size) as usize; - let segno = end_pos.segment_number(wal_seg_size); - let (mut file, is_partial) = self.open_or_create(segno, wal_seg_size)?; + let xlogoff = end_pos.segment_offset(self.wal_seg_size) as usize; + let segno = end_pos.segment_number(self.wal_seg_size); + + // Remove all segments after the given LSN. 
+ remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno)?; + + let (mut file, is_partial) = self.open_or_create(segno)?; // Fill end with zeroes file.seek(SeekFrom::Start(xlogoff as u64))?; - write_zeroes(&mut file, wal_seg_size - xlogoff)?; + write_zeroes(&mut file, self.wal_seg_size - xlogoff)?; self.fdatasync_file(&mut file)?; if !is_partial { // Make segment partial once again let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; fs::rename(&wal_file_path, &wal_file_partial_path)?; } - // Remove all subsequent segments - let mut segno = segno; - loop { - segno += 1; - let (wal_file_path, wal_file_partial_path) = - wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; - // TODO: better use fs::try_exists which is currently available only in nightly build - if wal_file_path.exists() { - fs::remove_file(&wal_file_path)?; - } else if wal_file_partial_path.exists() { - fs::remove_file(&wal_file_partial_path)?; - } else { - break; - } - } - // Update LSNs self.write_lsn = end_pos; self.write_record_lsn = end_pos; - self.update_flush_lsn(); + self.flush_record_lsn = end_pos; Ok(()) } fn remove_up_to(&self) -> Box Result<()>> { let timeline_dir = self.timeline_dir.clone(); - let wal_seg_size = self.wal_seg_size.unwrap(); + let wal_seg_size = self.wal_seg_size; Box::new(move |segno_up_to: XLogSegNo| { - remove_up_to(&timeline_dir, wal_seg_size, segno_up_to) + remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to) }) } + + fn get_metrics(&self) -> WalStorageMetrics { + self.metrics.clone() + } } -/// Remove all WAL segments in timeline_dir <= given segno. -fn remove_up_to(timeline_dir: &Path, wal_seg_size: usize, segno_up_to: XLogSegNo) -> Result<()> { +/// Remove all WAL segments in timeline_dir that match the given predicate. +fn remove_segments_from_disk( + timeline_dir: &Path, + wal_seg_size: usize, + remove_predicate: impl Fn(XLogSegNo) -> bool, +) -> Result<()> { let mut n_removed = 0; + let mut min_removed = u64::MAX; + let mut max_removed = u64::MIN; + for entry in fs::read_dir(&timeline_dir)? 
{ let entry = entry?; let entry_path = entry.path(); @@ -504,19 +426,21 @@ fn remove_up_to(timeline_dir: &Path, wal_seg_size: usize, segno_up_to: XLogSegNo continue; } let (segno, _) = XLogFromFileName(fname_str, wal_seg_size); - if segno <= segno_up_to { + if remove_predicate(segno) { remove_file(entry_path)?; n_removed += 1; + min_removed = min(min_removed, segno); + max_removed = max(max_removed, segno); } } } - let segno_from = segno_up_to - n_removed + 1; - info!( - "removed {} WAL segments [{}; {}]", - n_removed, - XLogFileName(PG_TLI, segno_from, wal_seg_size), - XLogFileName(PG_TLI, segno_up_to, wal_seg_size) - ); + + if n_removed > 0 { + info!( + "removed {} WAL segments [{}; {}]", + n_removed, min_removed, max_removed + ); + } Ok(()) } @@ -526,8 +450,10 @@ pub struct WalReader { pos: Lsn, wal_segment: Option>>, - enable_remote_read: bool, // S3 will be used to read WAL if LSN is not available locally + enable_remote_read: bool, + + // We don't have WAL locally if LSN is less than local_start_lsn local_start_lsn: Lsn, } diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index af847be49e..6f6c3864dd 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -80,11 +80,13 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: class PgBin: """A helper class for executing postgres binaries""" - def __init__(self, log_dir: Path, pg_distrib_dir): + def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "bin") + self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), "lib") + self.env["LD_LIBRARY_PATH"] = os.path.join( + str(pg_distrib_dir), "v{}".format(pg_version), "lib" + ) def _fixpath(self, command: List[str]): if "/" not in command[0]: @@ -470,9 +472,10 @@ def import_timeline( last_lsn, prev_lsn, tar_filename, + pg_version, ): # Import timelines to new pageserver - import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn}" + import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}" full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") @@ -483,7 +486,7 @@ def import_timeline( with open(stdout_filename, "w") as stdout_f: with open(stderr_filename2, "w") as stderr_f: print(f"(capturing output to {stdout_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) subprocess.run( full_cmd, stdout=stdout_f, @@ -502,7 +505,15 @@ def import_timeline( def export_timeline( - args, psql_path, pageserver_connstr, tenant_id, timeline_id, last_lsn, prev_lsn, tar_filename + args, + psql_path, + pageserver_connstr, + tenant_id, + timeline_id, + last_lsn, + prev_lsn, + tar_filename, + pg_version, ): # Choose filenames incomplete_filename = tar_filename + ".incomplete" @@ -517,13 +528,13 @@ def export_timeline( with open(incomplete_filename, "w") as stdout_f: with open(stderr_filename, "w") as stderr_f: print(f"(capturing output to {incomplete_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) subprocess.run( cmd, 
stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True ) # Add missing rels - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir) + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin) # Log more info @@ -532,7 +543,8 @@ def export_timeline( def main(args: argparse.Namespace): - psql_path = str(Path(args.pg_distrib_dir) / "bin" / "psql") + # any psql version will do here. use current DEFAULT_PG_VERSION = 14 + psql_path = str(Path(args.pg_distrib_dir) / "v14" / "bin" / "psql") old_pageserver_host = args.old_pageserver_host new_pageserver_host = args.new_pageserver_host @@ -565,6 +577,8 @@ def main(args: argparse.Namespace): args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" ) + pg_version = timeline["local"]["pg_version"] + # Export timeline from old pageserver if args.only_import is False: last_lsn, prev_lsn = get_rlsn( @@ -581,6 +595,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, tar_filename, + pg_version, ) # Import into new pageserver @@ -594,6 +609,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, tar_filename, + pg_version, ) # Re-export and compare @@ -607,6 +623,7 @@ def main(args: argparse.Namespace): last_lsn, prev_lsn, re_export_filename, + pg_version, ) # Check the size is the same @@ -693,8 +710,8 @@ if __name__ == "__main__": "--psql-path", dest="psql_path", required=False, - default="/usr/local/bin/psql", - help="Path to the psql binary. Default: /usr/local/bin/psql", + default="/usr/local/v14/bin/psql", + help="Path to the psql binary. Default: /usr/local/v14/bin/psql", ) parser.add_argument( "--only-import", diff --git a/scripts/ninstall.sh b/scripts/ninstall.sh new file mode 100755 index 0000000000..3554e3e4df --- /dev/null +++ b/scripts/ninstall.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -euo pipefail +# GNU coreutil's `install -C` always overrides the destination if the source +# is not a regular file, which is the case with lots of headers symlinked into +# the build directory by `./configure`. That causes Rust's Cargo to think that +# Postgres headers have been updated after `make` call even if no files have been +# touched. That causes long recompilation of `postgres_ffi` and all dependent +# packages. To counter that, we handle a special case here: do not copy the file +# if its content did not change. We only handle a single case where `install` +# installs a single file with a specific set of arguments, the rest does not +# matter in our configuration. +# +# Such behavior may be incorrect if e.g. permissions have changed, but it should +# not happen during normal Neon development that often, and rebuild should help. +# +# See https://github.com/neondatabase/neon/issues/1873 +if [ "$#" == "5" ]; then + if [ "$1" == "-C" ] && [ "$2" == "-m" ] && [ "$3" == "644" ]; then + if [ -e "$5" ] && diff -q "$4" "$5" >/dev/null 2>&1; then + exit 0 + fi + fi +fi +install "$@" diff --git a/test_runner/README.md b/test_runner/README.md index 44751944b3..e066ac3235 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -6,6 +6,9 @@ Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) - Neon and Postgres binaries - See the root [README.md](/README.md) for build directions + If you want to test tests with test-only APIs, you would need to add `--features testing` to Rust code build commands. 
+ For convenience, the repository cargo config contains a `build_testing` alias, which serves as a subcommand that adds the required feature flags.
+ Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release`
 - Tests can be run from the git tree; or see the environment variables below to run from other directories.
 - The neon git repo, including the postgres submodule
@@ -53,10 +56,24 @@
 If you want to run all tests that have the string "bench" in their names:
 
 `./scripts/pytest -k bench`
 
+To run tests in parallel we use the `pytest-xdist` plugin. By default, everything runs single-threaded. The number of workers can be specified with the `-n` argument:
+
+`./scripts/pytest -n4`
+
+By default, performance tests are excluded. To run them, explicitly pass the performance test selection to the script:
+
+`./scripts/pytest test_runner/performance`
+
 Useful environment variables:
 
 `NEON_BIN`: The directory where neon binaries can be found.
 `POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found.
+Since pageserver supports several postgres versions, `POSTGRES_DISTRIB_DIR` must contain
+a subdirectory for each version, with the naming convention `v{PG_VERSION}/`.
+Inside that dir, a `bin/postgres` binary should be present.
+`DEFAULT_PG_VERSION`: The version of Postgres to use.
+This is used to construct the full path to the postgres binaries.
+The format is the 2-digit major version number, e.g. `DEFAULT_PG_VERSION="14"`
 `TEST_OUTPUT`: Set the directory where test state and test output files should go.
 `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index b61fd2d975..c5be558cd4 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -89,6 +89,7 @@ class NeonCompare(PgCompare):
         self.env = neon_simple_env
         self._zenbenchmark = zenbenchmark
         self._pg_bin = pg_bin
+        self.pageserver_http_client = self.env.pageserver.http_client()
 
         self.tenant, _ = self.env.neon_cli.create_tenant(
             conf={
@@ -110,10 +111,6 @@ class NeonCompare(PgCompare):
         self._pg = self.env.postgres.create_start(
             branch_name, "main", self.tenant, config_lines=["shared_buffers=2GB"])
 
-        # Long-lived cursor, useful for flushing
-        self.psconn = self.env.pageserver.connect()
-        self.pscur = self.psconn.cursor()
-
     @property
     def pg(self):
         return self._pg
@@ -127,10 +124,10 @@ class NeonCompare(PgCompare):
         return self._pg_bin
 
     def flush(self):
-        self.pscur.execute(f"do_gc {self.tenant} {self.timeline} 0")
+        self.pageserver_http_client.timeline_gc(self.env.initial_tenant, self.timeline, 0)
 
     def compact(self):
-        self.pscur.execute(f"compact {self.tenant} {self.timeline}")
+        self.pageserver_http_client.timeline_compact(self.env.initial_tenant, self.timeline)
 
     def report_peak_memory_use(self) -> None:
         self.zenbenchmark.record(
diff --git a/test_runner/fixtures/log_helper.py b/test_runner/fixtures/log_helper.py
index 7d112fce89..17f2402391 100644
--- a/test_runner/fixtures/log_helper.py
+++ b/test_runner/fixtures/log_helper.py
@@ -1,6 +1,5 @@
 import logging
 import logging.config
-import re
 
 """
 This file configures logging to use in python tests.
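The `NeonCompare.flush()`/`compact()` changes above, and the test changes further below, replace the old `pageserver.safe_psql("do_gc ...")`, `"compact ..."` and `"failpoints ..."` commands with calls to the pageserver's HTTP management API. A rough sketch of the requests behind those helpers, based only on the endpoints visible in this diff; the port number and the tenant/timeline IDs are placeholder values, not taken from the patch:

```python
import requests

# Placeholder values for illustration; real tests obtain these from the NeonEnv fixture.
mgmt_port = 9898
tenant_id = "0123456789abcdef0123456789abcdef"
timeline_id = "fedcba9876543210fedcba9876543210"
base = f"http://localhost:{mgmt_port}/v1"

# Old: pageserver.safe_psql("failpoints flush-frozen-before-sync=sleep(10000)")
requests.put(
    f"{base}/failpoints",
    json=[{"name": "flush-frozen-before-sync", "actions": "sleep(10000)"}],
).raise_for_status()

# Old: pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0")
requests.put(
    f"{base}/tenant/{tenant_id}/timeline/{timeline_id}/do_gc",
    json={"gc_horizon": 0},
).raise_for_status()

# Old: pageserver.safe_psql(f"compact {tenant_id} {timeline_id}")
requests.put(f"{base}/tenant/{tenant_id}/timeline/{timeline_id}/compact").raise_for_status()
```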
@@ -30,17 +29,6 @@ LOGGING = { } -class PasswordFilter(logging.Filter): - """Filter out password from logs.""" - - # Good enough to filter our passwords produced by PgProtocol.connstr - FILTER = re.compile(r"(\s*)password=[^\s]+(\s*)") - - def filter(self, record: logging.LogRecord) -> bool: - record.msg = self.FILTER.sub(r"\1password=\2", str(record.msg)) - return True - - def getLogger(name="root") -> logging.Logger: """Method to get logger for tests. @@ -50,6 +38,5 @@ def getLogger(name="root") -> logging.Logger: # default logger for tests log = getLogger() -log.addFilter(PasswordFilter()) logging.config.dictConfig(LOGGING) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 084fa52d8f..22d267f05c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -59,8 +59,8 @@ Env = Dict[str, str] Fn = TypeVar("Fn", bound=Callable[..., Any]) DEFAULT_OUTPUT_DIR = "test_output" -DEFAULT_POSTGRES_DIR = "pg_install/v14" DEFAULT_BRANCH_NAME = "main" +DEFAULT_PG_VERSION_DEFAULT = "14" BASE_PORT = 15000 WORKER_PORT_NUM = 1000 @@ -71,6 +71,7 @@ base_dir = "" neon_binpath = "" pg_distrib_dir = "" top_output_dir = "" +default_pg_version = "" def pytest_configure(config): @@ -100,20 +101,36 @@ def pytest_configure(config): Path(top_output_dir).mkdir(exist_ok=True) # Find the postgres installation. + global default_pg_version + log.info(f"default_pg_version is {default_pg_version}") + env_default_pg_version = os.environ.get("DEFAULT_PG_VERSION") + if env_default_pg_version: + default_pg_version = env_default_pg_version + log.info(f"default_pg_version is set to {default_pg_version}") + else: + default_pg_version = DEFAULT_PG_VERSION_DEFAULT + global pg_distrib_dir + env_postgres_bin = os.environ.get("POSTGRES_DISTRIB_DIR") if env_postgres_bin: pg_distrib_dir = env_postgres_bin else: - pg_distrib_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR)) + pg_distrib_dir = os.path.normpath(os.path.join(base_dir, "pg_install")) + log.info(f"pg_distrib_dir is {pg_distrib_dir}") + psql_bin_path = os.path.join(pg_distrib_dir, "v{}".format(default_pg_version), "bin/psql") + postgres_bin_path = os.path.join( + pg_distrib_dir, "v{}".format(default_pg_version), "bin/postgres" + ) + if os.getenv("REMOTE_ENV"): # When testing against a remote server, we only need the client binary. - if not os.path.exists(os.path.join(pg_distrib_dir, "bin/psql")): - raise Exception('psql not found at "{}"'.format(pg_distrib_dir)) + if not os.path.exists(psql_bin_path): + raise Exception('psql not found at "{}"'.format(psql_bin_path)) else: - if not os.path.exists(os.path.join(pg_distrib_dir, "bin/postgres")): - raise Exception('postgres not found at "{}"'.format(pg_distrib_dir)) + if not os.path.exists(postgres_bin_path): + raise Exception('postgres not found at "{}"'.format(postgres_bin_path)) if os.getenv("REMOTE_ENV"): # we are in remote env and do not have neon binaries locally @@ -266,10 +283,15 @@ class PgProtocol: return str(make_dsn(**self.conn_options(**kwargs))) def conn_options(self, **kwargs): + """ + Construct a dictionary of connection options from default values and extra parameters. + An option can be dropped from the returning dictionary by None-valued extra parameter. + """ result = self.default_options.copy() if "dsn" in kwargs: result.update(parse_dsn(kwargs["dsn"])) result.update(kwargs) + result = {k: v for k, v in result.items() if v is not None} # Individual statement timeout in seconds. 
2 minutes should be # enough for our tests, but if you need a longer, you can @@ -433,6 +455,9 @@ class RemoteStorageKind(enum.Enum): LOCAL_FS = "local_fs" MOCK_S3 = "mock_s3" REAL_S3 = "real_s3" + # Pass to tests that are generic to remote storage + # to ensure the test pass with or without the remote storage + NOOP = "noop" def available_remote_storages() -> List[RemoteStorageKind]: @@ -539,6 +564,7 @@ class NeonEnvBuilder: self.env: Optional[NeonEnv] = None self.remote_storage_prefix: Optional[str] = None self.keep_remote_storage_contents: bool = True + self.pg_version = default_pg_version def init(self) -> NeonEnv: # Cannot create more than one environment from one builder @@ -560,7 +586,9 @@ class NeonEnvBuilder: test_name: str, force_enable: bool = True, ): - if remote_storage_kind == RemoteStorageKind.LOCAL_FS: + if remote_storage_kind == RemoteStorageKind.NOOP: + return + elif remote_storage_kind == RemoteStorageKind.LOCAL_FS: self.enable_local_fs_remote_storage(force_enable=force_enable) elif remote_storage_kind == RemoteStorageKind.MOCK_S3: self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable) @@ -751,6 +779,7 @@ class NeonEnv: self.broker = config.broker self.remote_storage = config.remote_storage self.remote_storage_users = config.remote_storage_users + self.pg_version = config.pg_version # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. @@ -964,6 +993,24 @@ class NeonPageserverHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]) -> None: + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + def tenant_list(self) -> List[Dict[Any, Any]]: res = self.get(f"http://localhost:{self.port}/v1/tenant") self.verbose_error(res) @@ -1061,6 +1108,45 @@ class NeonPageserverHttpClient(requests.Session): assert res_json is None return res_json + def timeline_gc( + self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] + ) -> dict[str, Any]: + log.info( + f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}" + ) + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc", + json={"gc_horizon": gc_horizon}, + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + return res_json + + def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId): + log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact" + ) + log.info(f"Got compact request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + + def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: 
TimelineId): + log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint" + ) + log.info(f"Got checkpoint request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) @@ -1194,6 +1280,8 @@ class NeonCli(AbstractNeonCli): str(tenant_id), "--timeline-id", str(timeline_id), + "--pg-version", + self.env.pg_version, ] ) else: @@ -1205,6 +1293,8 @@ class NeonCli(AbstractNeonCli): str(tenant_id), "--timeline-id", str(timeline_id), + "--pg-version", + self.env.pg_version, ] + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) ) @@ -1230,7 +1320,9 @@ class NeonCli(AbstractNeonCli): return res def create_timeline( - self, new_branch_name: str, tenant_id: Optional[TenantId] = None + self, + new_branch_name: str, + tenant_id: Optional[TenantId] = None, ) -> TimelineId: cmd = [ "timeline", @@ -1239,6 +1331,8 @@ class NeonCli(AbstractNeonCli): new_branch_name, "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--pg-version", + self.env.pg_version, ] res = self.raw_cli(cmd) @@ -1252,7 +1346,11 @@ class NeonCli(AbstractNeonCli): return TimelineId(str(created_timeline_id)) - def create_root_branch(self, branch_name: str, tenant_id: Optional[TenantId] = None): + def create_root_branch( + self, + branch_name: str, + tenant_id: Optional[TenantId] = None, + ): cmd = [ "timeline", "create", @@ -1260,6 +1358,8 @@ class NeonCli(AbstractNeonCli): branch_name, "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--pg-version", + self.env.pg_version, ] res = self.raw_cli(cmd) @@ -1329,7 +1429,9 @@ class NeonCli(AbstractNeonCli): return timelines_cli def init( - self, config_toml: str, initial_timeline_id: Optional[TimelineId] = None + self, + config_toml: str, + initial_timeline_id: Optional[TimelineId] = None, ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: tmp.write(config_toml) @@ -1338,6 +1440,9 @@ class NeonCli(AbstractNeonCli): cmd = ["init", f"--config={tmp.name}"] if initial_timeline_id: cmd.extend(["--timeline-id", str(initial_timeline_id)]) + + cmd.extend(["--pg-version", self.env.pg_version]) + append_pageserver_param_overrides( params_to_update=cmd, remote_storage=self.env.remote_storage, @@ -1364,7 +1469,10 @@ class NeonCli(AbstractNeonCli): log.info(f"pageserver_enabled_features success: {res.stdout}") return json.loads(res.stdout) - def pageserver_start(self, overrides=()) -> "subprocess.CompletedProcess[str]": + def pageserver_start( + self, + overrides=(), + ) -> "subprocess.CompletedProcess[str]": start_args = ["pageserver", "start", *overrides] append_pageserver_param_overrides( params_to_update=start_args, @@ -1419,6 +1527,8 @@ class NeonCli(AbstractNeonCli): str(tenant_id or self.env.initial_tenant), "--branch-name", branch_name, + "--pg-version", + self.env.pg_version, ] if lsn is not None: args.extend(["--lsn", str(lsn)]) @@ -1443,6 +1553,8 @@ class NeonCli(AbstractNeonCli): "start", "--tenant-id", str(tenant_id or self.env.initial_tenant), + "--pg-version", + self.env.pg_version, ] if lsn is not None: args.append(f"--lsn={lsn}") @@ -1572,11 +1684,13 @@ def append_pageserver_param_overrides( class PgBin: """A helper class for executing postgres binaries""" - def __init__(self, log_dir: Path): + def 
__init__(self, log_dir: Path, pg_version: str): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "bin") + self.pg_version = pg_version + self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") + self.pg_lib_dir = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "lib") self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), "lib") + self.env["LD_LIBRARY_PATH"] = self.pg_lib_dir def _fixpath(self, command: List[str]): if "/" not in command[0]: @@ -1631,8 +1745,8 @@ class PgBin: @pytest.fixture(scope="function") -def pg_bin(test_output_dir: Path) -> PgBin: - return PgBin(test_output_dir) +def pg_bin(test_output_dir: Path, pg_version: str) -> PgBin: + return PgBin(test_output_dir, pg_version) @dataclass @@ -1710,12 +1824,19 @@ class VanillaPostgres(PgProtocol): self.stop() +@pytest.fixture(scope="session") +def pg_version() -> str: + return default_pg_version + + @pytest.fixture(scope="function") def vanilla_pg( - test_output_dir: Path, port_distributor: PortDistributor + test_output_dir: Path, + port_distributor: PortDistributor, + pg_version: str, ) -> Iterator[VanillaPostgres]: pgdatadir = test_output_dir / "pgdata-vanilla" - pg_bin = PgBin(test_output_dir) + pg_bin = PgBin(test_output_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: yield vanilla_pg @@ -1751,8 +1872,8 @@ class RemotePostgres(PgProtocol): @pytest.fixture(scope="function") -def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: - pg_bin = PgBin(test_output_dir) +def remote_pg(test_output_dir: Path, pg_version: str) -> Iterator[RemotePostgres]: + pg_bin = PgBin(test_output_dir, pg_version) connstr = os.getenv("BENCHMARK_CONNSTR") if connstr is None: @@ -2481,7 +2602,11 @@ def list_files_to_compare(pgdata_dir: Path): # pg is the existing and running compute node, that we want to compare with a basebackup -def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres): +def check_restored_datadir_content( + test_output_dir: Path, + env: NeonEnv, + pg: Postgres, +): # Get the timeline ID. We need it for the 'basebackup' command timeline = TimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) @@ -2492,7 +2617,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post restored_dir_path = env.repo_dir / f"{pg.node_name}_restored_datadir" restored_dir_path.mkdir(exist_ok=True) - pg_bin = PgBin(test_output_dir) + pg_bin = PgBin(test_output_dir, env.pg_version) psql_path = os.path.join(pg_bin.pg_bin_path, "psql") cmd = rf""" @@ -2505,7 +2630,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Post # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {"LD_LIBRARY_PATH": os.path.join(str(pg_distrib_dir), "lib")} + psql_env = {"LD_LIBRARY_PATH": pg_bin.pg_lib_dir} result = subprocess.run(cmd, env=psql_env, capture_output=True, text=True, shell=True) # Print captured stdout/stderr if basebackup cmd failed. 
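The `pytest_configure` and `PgBin` changes above implement the versioned layout described in the test-runner README: binaries now live under `POSTGRES_DISTRIB_DIR/v{DEFAULT_PG_VERSION}/bin`, and `LD_LIBRARY_PATH` points at the sibling `lib` directory. A minimal sketch of that lookup, assuming only the two environment variables and the documented defaults (`pg_install`, version `14`); the helper name is illustrative and not part of the fixtures:

```python
import os

def locate_pg_binary(name: str) -> str:
    """Illustrative helper: resolve a postgres binary under the versioned install layout."""
    distrib_dir = os.environ.get("POSTGRES_DISTRIB_DIR", "pg_install")
    pg_version = os.environ.get("DEFAULT_PG_VERSION", "14")
    path = os.path.join(distrib_dir, f"v{pg_version}", "bin", name)
    if not os.path.exists(path):
        raise FileNotFoundError(f'{name} not found at "{path}"')
    return path

# Example: with the defaults this resolves to "pg_install/v14/bin/psql",
# and the matching LD_LIBRARY_PATH would be "pg_install/v14/lib".
# locate_pg_binary("psql")
```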
diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py new file mode 100644 index 0000000000..d71fb6d12c --- /dev/null +++ b/test_runner/performance/test_layer_map.py @@ -0,0 +1,39 @@ +import time + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# +# Benchmark searching the layer map, when there are a lot of small layer files. +# +def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): + + env = neon_env_builder.init_start() + n_iters = 10 + n_records = 100000 + + # We want to have a lot of lot of layer files to exercise the layer map. Make + # gc_horizon and checkpoint_distance very small, so that we get a lot of small layer files. + tenant, _ = env.neon_cli.create_tenant( + conf={ + "gc_period": "100 m", + "gc_horizon": "1048576", + "checkpoint_distance": "8192", + "compaction_period": "1 s", + "compaction_threshold": "1", + "compaction_target_size": "8192", + } + ) + + env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant) + pg = env.postgres.create_start("test_layer_map", tenant_id=tenant) + cur = pg.connect().cursor() + cur.execute("create table t(x integer)") + for i in range(n_iters): + cur.execute(f"insert into t values (generate_series(1,{n_records}))") + time.sleep(1) + + cur.execute("vacuum t") + with zenbenchmark.record_duration("test_query"): + cur.execute("SELECT count(*) from t") + assert cur.fetchone() == (n_iters * n_records,) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index d7aa1911b9..b99ae33232 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -4,7 +4,7 @@ import os import timeit from datetime import datetime from pathlib import Path -from typing import List +from typing import Dict, List import pytest from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult @@ -24,14 +24,18 @@ def utc_now_timestamp() -> int: return calendar.timegm(datetime.utcnow().utctimetuple()) -def init_pgbench(env: PgCompare, cmdline): +def init_pgbench(env: PgCompare, cmdline, password: None): + environ: Dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + # calculate timestamps and durations separately # timestamp is intended to be used for linking to grafana and logs # duration is actually a metric and uses float instead of int for timestamp start_timestamp = utc_now_timestamp() t0 = timeit.default_timer() with env.record_pageserver_writes("init.pageserver_writes"): - out = env.pg_bin.run_capture(cmdline) + out = env.pg_bin.run_capture(cmdline, env=environ) env.flush() duration = timeit.default_timer() - t0 @@ -48,13 +52,15 @@ def init_pgbench(env: PgCompare, cmdline): env.zenbenchmark.record_pg_bench_init_result("init", res) -def run_pgbench(env: PgCompare, prefix: str, cmdline): +def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None): + environ: Dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + with env.record_pageserver_writes(f"{prefix}.pageserver_writes"): run_start_timestamp = utc_now_timestamp() t0 = timeit.default_timer() - out = env.pg_bin.run_capture( - cmdline, - ) + out = env.pg_bin.run_capture(cmdline, env=environ) run_duration = timeit.default_timer() - t0 run_end_timestamp = utc_now_timestamp() env.flush() @@ -82,11 +88,14 @@ def run_pgbench(env: PgCompare, prefix: str, cmdline): def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: PgBenchLoadType): 
env.zenbenchmark.record("scale", scale, "", MetricReport.TEST_PARAM) + password = env.pg.default_options.get("password", None) + options = "-cstatement_timeout=1h " + env.pg.default_options.get("options", "") + # drop password from the connection string by passing password=None and set password separately + connstr = env.pg.connstr(password=None, options=options) + if workload_type == PgBenchLoadType.INIT: # Run initialize - init_pgbench( - env, ["pgbench", f"-s{scale}", "-i", env.pg.connstr(options="-cstatement_timeout=1h")] - ) + init_pgbench(env, ["pgbench", f"-s{scale}", "-i", connstr], password=password) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: # Run simple-update workload @@ -100,8 +109,9 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P f"-T{duration}", "-P2", "--progress-timestamp", - env.pg.connstr(), + connstr, ], + password=password, ) if workload_type == PgBenchLoadType.SELECT_ONLY: @@ -116,8 +126,9 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P f"-T{duration}", "-P2", "--progress-timestamp", - env.pg.connstr(), + connstr, ], + password=password, ) env.report_size() diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index cb2621ff02..d7aebfb938 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -9,6 +9,7 @@ from fixtures.utils import query_scalar # def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() # Override defaults, 1M gc_horizon and 4M checkpoint_distance. # Extend compaction_period and gc_period to disable background compaction and gc. @@ -23,7 +24,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): } ) - env.pageserver.safe_psql("failpoints flush-frozen-before-sync=sleep(10000)") + pageserver_http.configure_failpoints(("flush-frozen-before-sync", "sleep(10000)")) pg_branch0 = env.postgres.create_start("main", tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() @@ -92,9 +93,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 300k rows: {lsn_300}") # Run compaction on branch1. 
- compact = f"compact {tenant} {branch1_timeline} {lsn_200}" + compact = f"compact {tenant} {branch1_timeline}" log.info(compact) - env.pageserver.safe_psql(compact) + pageserver_http.timeline_compact(tenant, branch1_timeline) assert query_scalar(branch0_cur, "SELECT count(*) FROM foo") == 100000 diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py index 81a46ee2f0..94d3999d17 100644 --- a/test_runner/regress/test_basebackup_error.py +++ b/test_runner/regress/test_basebackup_error.py @@ -9,9 +9,10 @@ from fixtures.neon_fixtures import NeonEnv def test_basebackup_error(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_basebackup_error", "empty") + pageserver_http = env.pageserver.http_client() # Introduce failpoint - env.pageserver.safe_psql("failpoints basebackup-before-control-file=return") + pageserver_http.configure_failpoints(("basebackup-before-control-file", "return")) with pytest.raises(Exception, match="basebackup-before-control-file"): env.postgres.create_start("test_basebackup_error") diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index c8c5929066..12debe50eb 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -47,6 +47,7 @@ from fixtures.utils import query_scalar # could not find data for key ... at LSN ..., for request at LSN ... def test_branch_and_gc(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() tenant, _ = env.neon_cli.create_tenant( conf={ @@ -84,7 +85,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # Set the GC horizon so that lsn1 is inside the horizon, which means # we can create a new branch starting from lsn1. - env.pageserver.safe_psql(f"do_gc {tenant} {timeline_main} {lsn2 - lsn1 + 1024}") + pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024) env.neon_cli.create_branch( "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1 @@ -113,6 +114,8 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # For more details, see discussion in https://github.com/neondatabase/neon/pull/2101#issuecomment-1185273447. def test_branch_creation_before_gc(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() + # Disable background GC but set the `pitr_interval` to be small, so GC can delete something tenant, _ = env.neon_cli.create_tenant( conf={ @@ -147,10 +150,10 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the # branch creation task but the individual timeline GC iteration happens *after* # the branch creation task. - env.pageserver.safe_psql("failpoints before-timeline-gc=sleep(2000)") + pageserver_http_client.configure_failpoints(("before-timeline-gc", "sleep(2000)")) def do_gc(): - env.pageserver.safe_psql(f"do_gc {tenant} {b0} 0") + pageserver_http_client.timeline_gc(tenant, b0, 0) thread = threading.Thread(target=do_gc, daemon=True) thread.start() @@ -161,7 +164,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): time.sleep(1.0) # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC. 
- with pytest.raises(Exception, match="invalid branch start lsn"): + with pytest.raises(Exception, match="invalid branch start lsn: .*"): env.neon_cli.create_branch("b1", "b0", tenant_id=tenant, ancestor_start_lsn=lsn) thread.join() diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index b0d0737172..0e2a8b346b 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -1,4 +1,3 @@ -import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -96,7 +95,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): assert pg.safe_psql("SELECT 1")[0][0] == 1 # branch at pre-initdb lsn - with pytest.raises(Exception, match="invalid branch start lsn"): + with pytest.raises(Exception, match="invalid branch start lsn: .*"): env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn=Lsn("0/42")) # branch at pre-ancestor lsn @@ -106,13 +105,11 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): ) # check that we cannot create branch based on garbage collected data - with env.pageserver.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - # call gc to advace latest_gc_cutoff_lsn - pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) + with env.pageserver.http_client() as pageserver_http: + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) - with pytest.raises(Exception, match="invalid branch start lsn"): + with pytest.raises(Exception, match="invalid branch start lsn: .*"): # this gced_lsn is pretty random, so if gc is disabled this woudln't fail env.neon_cli.create_branch( "test_branch_create_fail", "test_branch_behind", ancestor_start_lsn=gced_lsn diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index fd81981b2b..7baa67935d 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -113,13 +113,14 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http = env.pageserver.http_client() tenant_id, _ = env.neon_cli.create_tenant() old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) # Introduce failpoint when creating a new timeline - env.pageserver.safe_psql("failpoints before-checkpoint-new-timeline=return") + pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return")) with pytest.raises(Exception, match="before-checkpoint-new-timeline"): _ = env.neon_cli.create_timeline("test_fix_broken_timelines", tenant_id) diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 88d4ad8a6e..332bef225f 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -1,4 +1,5 @@ import asyncio +import concurrent.futures import random from fixtures.log_helper import log @@ -30,10 +31,15 @@ async def update_table(pg: Postgres): # Perform aggressive GC with 0 horizon async def gc(env: NeonEnv, timeline: TimelineId): - psconn = await env.pageserver.connect_async() + pageserver_http = env.pageserver.http_client() - while updates_performed < updates_to_perform: - await psconn.execute(f"do_gc {env.initial_tenant} {timeline} 0") + loop = asyncio.get_running_loop() + + with 
concurrent.futures.ThreadPoolExecutor() as pool: + while updates_performed < updates_to_perform: + await loop.run_in_executor( + pool, lambda: pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + ) # At the same time, run UPDATEs and GC diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 7b61b03b97..c84d282a4d 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -96,6 +96,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build end_lsn, "--wal-tarfile", wal, + "--pg-version", + env.pg_version, ] ) @@ -248,6 +250,8 @@ def _import( str(lsn), "--base-tarfile", os.path.join(tar_output_file), + "--pg-version", + env.pg_version, ] ) @@ -270,8 +274,7 @@ def _import( assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file) # Check that gc works - psconn = env.pageserver.connect() - pscur = psconn.cursor() - pscur.execute(f"do_gc {tenant} {timeline} 0") + pageserver_http = env.pageserver.http_client() + pageserver_http.timeline_gc(tenant, timeline, 0) return tar_output_file diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index c99e13f45f..3e387bb6cc 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,4 +1,3 @@ -import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.types import TimelineId @@ -29,8 +28,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Get the timeline ID of our branch. We need it for the 'do_gc' command timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) - psconn = env.pageserver.connect() - pscur = psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) + pageserver_http = env.pageserver.http_client() # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers. @@ -61,9 +59,8 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. for i in range(10): - pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") - gcrow = pscur.fetchone() - print_gc_result(gcrow) + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) for j in range(100): cur.execute("UPDATE foo SET val = val + 1 WHERE id = 1;") diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index aa5a65f446..f23811b671 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -26,9 +26,9 @@ def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, cap (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. 
- build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/regress") - src_path = os.path.join(base_dir, "vendor/postgres-v14/src/test/regress") - bindir = os.path.join(pg_distrib_dir, "bin") + build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/regress").format(env.pg_version) + src_path = os.path.join(base_dir, "vendor/postgres-v{}/src/test/regress").format(env.pg_version) + bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") schedule = os.path.join(src_path, "parallel_schedule") pg_regress = os.path.join(build_path, "pg_regress") @@ -80,9 +80,11 @@ def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, caps (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/isolation") - src_path = os.path.join(base_dir, "vendor/postgres-v14/src/test/isolation") - bindir = os.path.join(pg_distrib_dir, "bin") + build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/isolation".format(env.pg_version)) + src_path = os.path.join( + base_dir, "vendor/postgres-v{}/src/test/isolation".format(env.pg_version) + ) + bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") schedule = os.path.join(src_path, "isolation_schedule") pg_isolation_regress = os.path.join(build_path, "pg_isolation_regress") @@ -124,9 +126,9 @@ def test_sql_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, ca # Compute all the file locations that pg_regress will need. # This test runs neon specific tests - build_path = os.path.join(pg_distrib_dir, "../build/v14/src/test/regress") + build_path = os.path.join(pg_distrib_dir, "build/v{}/src/test/regress").format(env.pg_version) src_path = os.path.join(base_dir, "test_runner/sql_regress") - bindir = os.path.join(pg_distrib_dir, "bin") + bindir = os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin") schedule = os.path.join(src_path, "parallel_schedule") pg_regress = os.path.join(build_path, "pg_regress") diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 57b2ee1c04..d8b7256577 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -1,6 +1,3 @@ -from contextlib import closing - -import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.types import TimelineId @@ -54,13 +51,11 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 10000 rows: {debug_lsn} xid {debug_xid}") # run GC - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - pscur.execute(f"compact {env.initial_tenant} {timeline}") - # perform aggressive GC. Data still should be kept because of the PITR setting. - pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) + with env.pageserver.http_client() as pageserver_http: + pageserver_http.timeline_compact(env.initial_tenant, timeline) + # perform aggressive GC. Data still should be kept because of the PITR setting. 
+ gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) # Branch at the point where only 100 rows were inserted # It must have been preserved by PITR setting diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 3be64e077f..dfa57aec25 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -106,6 +106,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): # Similar test, but with more data, and we force checkpoints def test_timetravel(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() env.neon_cli.create_branch("test_timetravel", "empty") pg = env.postgres.create_start("test_timetravel") @@ -136,7 +137,7 @@ def test_timetravel(neon_simple_env: NeonEnv): wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to force a new layer file - env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http_client.timeline_checkpoint(tenant_id, timeline_id) ##### Restart pageserver env.postgres.stop_all() diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 08c15d8f09..d0ba96e8e0 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -1,7 +1,6 @@ import time from contextlib import closing -import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder @@ -19,8 +18,8 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): f = env.neon_cli.pageserver_enabled_features() assert ( - "failpoints" in f["features"] - ), "Build pageserver with --features=failpoints option to run this test" + "testing" in f["features"] + ), "Build pageserver with --features=testing option to run this test" neon_env_builder.start() # Create a branch for us @@ -31,26 +30,28 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): with closing(pg.connect()) as conn: with conn.cursor() as cur: - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - # Create and initialize test table - cur.execute("CREATE TABLE foo(x bigint)") - cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))") + with env.pageserver.http_client() as pageserver_http: + # Create and initialize test table + cur.execute("CREATE TABLE foo(x bigint)") + cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))") - # Sleep for some time to let checkpoint create image layers - time.sleep(2) + # Sleep for some time to let checkpoint create image layers + time.sleep(2) - # Configure failpoints - pscur.execute( - "failpoints flush-frozen-before-sync=sleep(2000);checkpoint-after-sync=exit" - ) + # Configure failpoints + pageserver_http.configure_failpoints( + [ + ("flush-frozen-before-sync", "sleep(2000)"), + ("checkpoint-after-sync", "exit"), + ] + ) - # Do some updates until pageserver is crashed - try: - while True: - cur.execute("update foo set x=x+1") - except Exception as err: - log.info(f"Expected server crash {err}") + # Do some updates until pageserver is crashed + try: + while True: + cur.execute("update foo set x=x+1") + except Exception as err: + log.info(f"Expected server crash {err}") log.info("Wait before server restart") env.pageserver.stop() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 
cbe74cad5c..3e775b10b0 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -57,6 +57,7 @@ def test_remote_storage_backup_and_restore( ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") client = env.pageserver.http_client() @@ -80,7 +81,7 @@ def test_remote_storage_backup_and_restore( wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) # run checkpoint manually to be sure that data landed in remote storage - env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) log.info(f"waiting for checkpoint {checkpoint_number} upload") # wait until pageserver successfully uploaded a checkpoint to remote storage @@ -99,7 +100,7 @@ def test_remote_storage_backup_and_restore( env.pageserver.start() # Introduce failpoint in download - env.pageserver.safe_psql("failpoints remote-storage-download-pre-rename=return") + pageserver_http.configure_failpoints(("remote-storage-download-pre-rename", "return")) client.tenant_attach(tenant_id) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index e3c9a091f9..f18e6867a9 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -1,16 +1,21 @@ from threading import Thread -import psycopg2 import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + NeonPageserverApiException, + NeonPageserverHttpClient, +) from fixtures.types import TenantId, TimelineId -def do_gc_target(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): +def do_gc_target( + pageserver_http: NeonPageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId +): """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: - env.pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0") + pageserver_http.timeline_gc(tenant_id, timeline_id, 0) except Exception as e: log.error("do_gc failed: %s", e) @@ -44,13 +49,13 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # gc should not try to even start with pytest.raises( - expected_exception=psycopg2.DatabaseError, match="gc target timeline does not exist" + expected_exception=NeonPageserverApiException, match="gc target timeline does not exist" ): bogus_timeline_id = TimelineId.generate() - env.pageserver.safe_psql(f"do_gc {tenant_id} {bogus_timeline_id} 0") + pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) # try to concurrently run gc and detach - gc_thread = Thread(target=lambda: do_gc_target(env, tenant_id, timeline_id)) + gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id)) gc_thread.start() last_error = None @@ -73,6 +78,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() with pytest.raises( - expected_exception=psycopg2.DatabaseError, match=f"Tenant {tenant_id} not found" + expected_exception=NeonPageserverApiException, match=f"Tenant {tenant_id} not found" ): - env.pageserver.safe_psql(f"do_gc {tenant_id} {timeline_id} 0") + pageserver_http.timeline_gc(tenant_id, timeline_id, 0) diff --git a/test_runner/regress/test_tenant_relocation.py 
b/test_runner/regress/test_tenant_relocation.py index aa7d92f1fd..2b01546198 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -147,14 +147,13 @@ def populate_branch( def ensure_checkpoint( - pageserver_cur, pageserver_http: NeonPageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, current_lsn: Lsn, ): # run checkpoint manually to be sure that data landed in remote storage - pageserver_cur.execute(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) # wait until pageserver successfully uploaded a checkpoint to remote storage wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) @@ -324,22 +323,19 @@ def test_tenant_relocation( # this requirement introduces a problem # if user creates a branch during migration # it wont appear on the new pageserver - with pg_cur(env.pageserver) as cur: - ensure_checkpoint( - cur, - pageserver_http=pageserver_http, - tenant_id=tenant_id, - timeline_id=timeline_id_main, - current_lsn=current_lsn_main, - ) + ensure_checkpoint( + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_main, + current_lsn=current_lsn_main, + ) - ensure_checkpoint( - cur, - pageserver_http=pageserver_http, - tenant_id=tenant_id, - timeline_id=timeline_id_second, - current_lsn=current_lsn_second, - ) + ensure_checkpoint( + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_second, + current_lsn=current_lsn_second, + ) log.info("inititalizing new pageserver") # bootstrap second pageserver diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 4500395c8f..f49b6fccb9 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,4 +1,5 @@ import os +import shutil from contextlib import closing from datetime import datetime from pathlib import Path @@ -7,8 +8,13 @@ from typing import List import pytest from fixtures.log_helper import log from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder -from fixtures.types import Lsn, TenantId +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + RemoteStorageKind, + available_remote_storages, +) +from fixtures.types import Lsn, TenantId, TimelineId from prometheus_client.samples import Sample @@ -19,7 +25,8 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ) initial_tenant_dirs = set([d for d in tenants_dir.iterdir()]) - neon_simple_env.pageserver.safe_psql("failpoints tenant-creation-before-tmp-rename=return") + pageserver_http = neon_simple_env.pageserver.http_client() + pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return")) with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"): _ = neon_simple_env.neon_cli.create_tenant() @@ -200,3 +207,63 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde post_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) assert post_detach_samples == set() + + +# Check that empty tenants work with or without the remote storage +@pytest.mark.parametrize( + "remote_storage_kind", available_remote_storages() + [RemoteStorageKind.NOOP] +) +def test_pageserver_with_empty_tenants( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, 
+ test_name="test_pageserver_with_empty_tenants", + ) + + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + tenant_without_timelines_dir = env.initial_tenant + log.info( + f"Tenant {tenant_without_timelines_dir} becomes broken: it abnormally looses tenants/ directory and is expected to be completely ignored when pageserver restarts" + ) + shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines") + + tenant_with_empty_timelines_dir = client.tenant_create() + log.info( + f"Tenant {tenant_with_empty_timelines_dir} gets all of its timelines deleted: still should be functional" + ) + temp_timelines = client.timeline_list(tenant_with_empty_timelines_dir) + for temp_timeline in temp_timelines: + client.timeline_delete( + tenant_with_empty_timelines_dir, TimelineId(temp_timeline["timeline_id"]) + ) + files_in_timelines_dir = sum( + 1 + for _p in Path.iterdir( + Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines" + ) + ) + assert ( + files_in_timelines_dir == 0 + ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory" + + # Trigger timeline reinitialization after pageserver restart + env.postgres.stop_all() + env.pageserver.stop() + env.pageserver.start() + + client = env.pageserver.http_client() + tenants = client.tenant_list() + + assert ( + len(tenants) == 1 + ), "Pageserver should attach only tenants with empty timelines/ dir on restart" + loaded_tenant = tenants[0] + assert loaded_tenant["id"] == str( + tenant_with_empty_timelines_dir + ), f"Tenant {tenant_with_empty_timelines_dir} should be loaded as the only one with tenants/ directory" + assert loaded_tenant["state"] == { + "Active": {"background_jobs_running": False} + }, "Empty tenant should be loaded and ready for timeline creation" diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 85f371c845..d8424e22c8 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -7,19 +7,25 @@ # import asyncio +import os +from pathlib import Path from typing import List, Tuple import pytest +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserverHttpClient, Postgres, RemoteStorageKind, available_remote_storages, wait_for_last_record_lsn, wait_for_upload, + wait_until, ) from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import query_scalar async def tenant_workload(env: NeonEnv, pg: Postgres): @@ -91,5 +97,95 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) # run final checkpoint manually to flush all the data to remote storage - env.pageserver.safe_psql(f"checkpoint {tenant_id} {timeline_id}") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_tenants_attached_after_download( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="remote_storage_kind", + ) + + data_id = 1 + data_secret = "very secret secret" + + ##### First start, insert secret data and upload it to the remote storage + 
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+    pg = env.postgres.create_start("main")
+
+    client = env.pageserver.http_client()
+
+    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+
+    for checkpoint_number in range(1, 3):
+        with pg.cursor() as cur:
+            cur.execute(
+                f"""
+                CREATE TABLE t{checkpoint_number}(id int primary key, secret text);
+                INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}');
+                """
+            )
+            current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+        # wait until pageserver receives that data
+        wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+
+        # run checkpoint manually to be sure that data landed in remote storage
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+        log.info(f"waiting for checkpoint {checkpoint_number} upload")
+        # wait until pageserver successfully uploaded a checkpoint to remote storage
+        wait_for_upload(client, tenant_id, timeline_id, current_lsn)
+        log.info(f"upload of checkpoint {checkpoint_number} is done")
+
+    ##### Stop the pageserver, erase its layer file to force it to be downloaded from S3
+    env.postgres.stop_all()
+    env.pageserver.stop()
+
+    timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
+    local_layer_deleted = False
+    for path in Path.iterdir(timeline_dir):
+        if path.name.startswith("00000"):
+            # Looks like a layer file. Remove it
+            os.remove(path)
+            local_layer_deleted = True
+            break
+    assert local_layer_deleted, f"Found no local layer files to delete in directory {timeline_dir}"
+
+    ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory
+    env.pageserver.start()
+    client = env.pageserver.http_client()
+
+    wait_until(
+        number_of_iterations=5,
+        interval=1,
+        func=lambda: expect_tenant_to_download_timeline(client, tenant_id),
+    )
+
+    restored_timelines = client.timeline_list(tenant_id)
+    assert (
+        len(restored_timelines) == 1
+    ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage"
+    restored_timeline = restored_timelines[0]
+    assert restored_timeline["timeline_id"] == str(
+        timeline_id
+    ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage"
+
+
+def expect_tenant_to_download_timeline(
+    client: NeonPageserverHttpClient,
+    tenant_id: TenantId,
+):
+    for tenant in client.tenant_list():
+        if tenant["id"] == str(tenant_id):
+            assert not tenant.get(
+                "has_in_progress_downloads", True
+            ), f"Tenant {tenant_id} should have no downloads in progress"
+            return
+    assert False, f"Tenant {tenant_id} is missing on pageserver"
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 83018f46f5..3a482be5db 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -3,6 +3,7 @@ import random
 import re
 import time
 from contextlib import closing
+from pathlib import Path
 
 import psycopg2.errors
 import psycopg2.extras
@@ -11,7 +12,10 @@ from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
     NeonPageserverHttpClient,
+    PgBin,
+    PortDistributor,
     Postgres,
+    VanillaPostgres,
     assert_timeline_local,
     wait_for_last_flush_lsn,
 )
@@ -238,6 +242,7 @@ def 
test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): env = neon_simple_env + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_checkpoint") pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint") @@ -251,7 +256,7 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -264,6 +269,7 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder ) env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") pg = env.postgres.create_start("test_timeline_physical_size_post_compaction") @@ -278,8 +284,8 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") - env.pageserver.safe_psql(f"compact {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) + pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id) assert_physical_size(env, env.initial_tenant, new_timeline_id) @@ -290,6 +296,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='10m', gc_period='10m', pitr_interval='1s'}" env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") pg = env.postgres.create_start("test_timeline_physical_size_post_gc") @@ -304,7 +311,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pg.safe_psql( """ @@ -315,17 +322,23 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"do_gc {env.initial_tenant} {new_timeline_id} 0") + pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) assert_physical_size(env, env.initial_tenant, new_timeline_id) # The timeline logical and physical sizes are also exposed as prometheus metrics. # Test the metrics. 
-def test_timeline_size_metrics(neon_simple_env: NeonEnv): +def test_timeline_size_metrics( + neon_simple_env: NeonEnv, + test_output_dir: Path, + port_distributor: PortDistributor, + pg_version: str, +): env = neon_simple_env + pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_size_metrics") pg = env.postgres.create_start("test_timeline_size_metrics") @@ -340,7 +353,7 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) - env.pageserver.safe_psql(f"checkpoint {env.initial_tenant} {new_timeline_id}") + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) # get the metrics and parse the metric for the current timeline's physical size metrics = env.pageserver.http_client().get_metrics() @@ -365,11 +378,28 @@ def test_timeline_size_metrics(neon_simple_env: NeonEnv): assert matches tl_logical_size_metric = int(matches.group(1)) - # An empty database is around 8 MB. There at least 3 databases, 'postgres', - # 'template0', 'template1'. So the total size should be about 32 MB. This isn't - # very accurate and can change with different PostgreSQL versions, so allow a - # couple of MB of slack. - assert math.isclose(tl_logical_size_metric, 32 * 1024 * 1024, abs_tol=2 * 1024 * 1024) + pgdatadir = test_output_dir / "pgdata-vanilla" + pg_bin = PgBin(test_output_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: + vanilla_pg.configure([f"port={port}"]) + vanilla_pg.start() + + # Create database based on template0 because we can't connect to template0 + vanilla_pg.safe_psql("CREATE TABLE foo (t text)") + vanilla_pg.safe_psql( + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""" + ) + vanilla_size_sum = vanilla_pg.safe_psql( + "select sum(pg_database_size(oid)) from pg_database" + )[0][0] + + # Compare the size with Vanilla postgres. + # Allow some slack, because the logical size metric includes some things like + # the SLRUs that are not included in pg_database_size(). + assert math.isclose(tl_logical_size_metric, vanilla_size_sum, abs_tol=2 * 1024 * 1024) # The sum of the sizes of all databases, as seen by pg_database_size(), should also # be close. 
Again allow some slack, the logical size metric includes some things like @@ -382,6 +412,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): random.seed(100) env = neon_simple_env + pageserver_http = env.pageserver.http_client() client = env.pageserver.http_client() tenant, timeline = env.neon_cli.create_tenant() @@ -405,7 +436,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): ) wait_for_last_flush_lsn(env, pg, tenant, timeline) - env.pageserver.safe_psql(f"checkpoint {tenant} {timeline}") + pageserver_http.timeline_checkpoint(tenant, timeline) timeline_total_size += get_timeline_physical_size(timeline) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 089ed91c98..d5a5ec2f36 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -59,9 +59,7 @@ def wait_lsn_force_checkpoint( ) # force checkpoint to advance remote_consistent_lsn - with closing(ps.connect(**pageserver_conn_options)) as psconn: - with psconn.cursor() as pscur: - pscur.execute(f"checkpoint {tenant_id} {timeline_id}") + ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) # ensure that remote_consistent_lsn is advanced wait_for_upload( @@ -636,6 +634,9 @@ class ProposerPostgres(PgProtocol): } basepath = self.pg_bin.run_capture(command, env) + + log.info(f"postgres --sync-safekeepers output: {basepath}") + stdout_filename = basepath + ".stdout" with open(stdout_filename, "r") as stdout_f: @@ -664,7 +665,9 @@ class ProposerPostgres(PgProtocol): # insert wal in all safekeepers and run sync on proposer def test_sync_safekeepers( - neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + port_distributor: PortDistributor, ): # We don't really need the full environment for this test, just the @@ -701,6 +704,7 @@ def test_sync_safekeepers( "begin_lsn": int(begin_lsn), "epoch_start_lsn": int(epoch_start_lsn), "truncate_lsn": int(epoch_start_lsn), + "pg_version": int(env.pg_version) * 10000, }, ) lsn = Lsn(res["inserted_wal"]["end_lsn"]) diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 21921a3bc2..db6f1e5137 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -26,11 +26,11 @@ def test_wal_restore( env.neon_cli.pageserver_stop() port = port_distributor.get_port() data_dir = test_output_dir / "pgsql.restored" - with VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored: + with VanillaPostgres(data_dir, PgBin(test_output_dir, env.pg_version), port) as restored: pg_bin.run_capture( [ os.path.join(base_dir, "libs/utils/scripts/restore_from_wal.sh"), - os.path.join(pg_distrib_dir, "bin"), + os.path.join(pg_distrib_dir, "v{}".format(env.pg_version), "bin"), str(test_output_dir / "repo" / "safekeepers" / "sk1" / str(tenant_id) / "*"), str(data_dir), str(port), diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index ce723ee499..19d948fd47 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit ce723ee499450cb108aede464a35a17f3d75cf84 +Subproject commit 19d948fd47f45d83367062d9a54709cf2d9c8921 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 0858387047..9383aaa9c2 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 08583870479e30c64aeb5a97d6fee9cf470f05fb +Subproject commit 9383aaa9c2616fd81cfafb058fe0d692f5e43ac3 diff --git 
a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 096b3a5d70..6977665c7d 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -19,16 +19,10 @@ anyhow = { version = "1", features = ["backtrace", "std"] } bstr = { version = "0.2", features = ["lazy_static", "regex-automata", "serde", "serde1", "serde1-nostd", "std", "unicode"] } bytes = { version = "1", features = ["serde", "std"] } chrono = { version = "0.4", features = ["clock", "libc", "oldtime", "serde", "std", "time", "winapi"] } -clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } +crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] } either = { version = "1", features = ["use_std"] } fail = { version = "0.5", default-features = false, features = ["failpoints"] } -futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } -futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] } -futures-util = { version = "0.3", default-features = false, features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] } -generic-array = { version = "0.14", default-features = false, features = ["more_lengths"] } hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } -hex = { version = "0.4", features = ["alloc", "serde", "std"] } -hyper = { version = "0.14", features = ["client", "full", "h2", "http1", "http2", "runtime", "server", "socket2", "stream", "tcp"] } indexmap = { version = "1", default-features = false, features = ["std"] } itoa = { version = "0.4", features = ["i128", "std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } @@ -37,7 +31,7 @@ memchr = { version = "2", features = ["std", "use_std"] } nom = { version = "7", features = ["alloc", "std"] } num-bigint = { version = "0.4", features = ["std"] } num-integer = { version = "0.1", default-features = false, features = ["i128", "std"] } -num-traits = { version = "0.2", features = ["i128", "std"] } +num-traits = { version = "0.2", features = ["i128", "libm", "std"] } prost = { version = "0.10", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } @@ -45,17 +39,18 @@ regex-automata = { version = "0.1", features = ["regex-syntax", "std"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } +stable_deref_trait = { version = "1", features = ["alloc", "std"] } time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } -tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "winapi"] } +tokio = { version = "1", features = 
["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } -tracing-core = { version = "0.1", features = ["once_cell", "std", "valuable"] } +tracing-core = { version = "0.1", features = ["once_cell", "std"] } +uuid = { version = "0.8", features = ["getrandom", "serde", "std", "v4"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } -clap = { version = "2", features = ["ansi_term", "atty", "color", "strsim", "suggestions", "vec_map"] } either = { version = "1", features = ["use_std"] } hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } indexmap = { version = "1", default-features = false, features = ["std"] }