diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 1d1b50e458..b7e0be761a 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -33,9 +33,14 @@ config-variables: - REMOTE_STORAGE_AZURE_CONTAINER - REMOTE_STORAGE_AZURE_REGION - SLACK_CICD_CHANNEL_ID + - SLACK_COMPUTE_CHANNEL_ID - SLACK_ON_CALL_DEVPROD_STREAM - SLACK_ON_CALL_QA_STAGING_STREAM - SLACK_ON_CALL_STORAGE_STAGING_STREAM + - SLACK_ONCALL_COMPUTE_GROUP + - SLACK_ONCALL_PROXY_GROUP + - SLACK_ONCALL_STORAGE_GROUP + - SLACK_PROXY_CHANNEL_ID - SLACK_RUST_CHANNEL_ID - SLACK_STORAGE_CHANNEL_ID - SLACK_UPCOMING_RELEASE_CHANNEL_ID diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index a5b4104908..d7ff05be1a 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -49,10 +49,6 @@ inputs: description: 'A JSON object with project settings' required: false default: '{}' - default_endpoint_settings: - description: 'A JSON object with the default endpoint settings' - required: false - default: '{}' outputs: dsn: @@ -139,21 +135,6 @@ runs: -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ -d "{\"scheduling\": \"Essential\"}" fi - # XXX - # This is a workaround for the default endpoint settings, which currently do not allow some settings in the public API. 
- # https://github.com/neondatabase/cloud/issues/27108 - if [[ -n ${DEFAULT_ENDPOINT_SETTINGS} && ${DEFAULT_ENDPOINT_SETTINGS} != "{}" ]] ; then - PROJECT_DATA=$(curl -X GET \ - "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}" \ - -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ - -d "{\"scheduling\": \"Essential\"}" - ) - NEW_DEFAULT_ENDPOINT_SETTINGS=$(echo ${PROJECT_DATA} | jq -rc ".project.default_endpoint_settings + ${DEFAULT_ENDPOINT_SETTINGS}") - curl -X POST --fail \ - "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}/default_endpoint_settings" \ - -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ - --data "${NEW_DEFAULT_ENDPOINT_SETTINGS}" - fi env: @@ -171,4 +152,3 @@ runs: PSQL: ${{ inputs.psql_path }} LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }} PROJECT_SETTINGS: ${{ inputs.project_settings }} - DEFAULT_ENDPOINT_SETTINGS: ${{ inputs.default_endpoint_settings }} diff --git a/.github/scripts/lint-release-pr.sh b/.github/scripts/lint-release-pr.sh index 6dc5b99f0e..d3badf9562 100755 --- a/.github/scripts/lint-release-pr.sh +++ b/.github/scripts/lint-release-pr.sh @@ -41,7 +41,7 @@ echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}" LAST_COMMIT=$(git rev-parse HEAD) MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}") -EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2}$" +EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2} UTC$" if ! 
[[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then report_error "Merge commit message does not match expected pattern: ' release YYYY-MM-DD' diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 7cede309f3..663afa2c8b 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -279,18 +279,14 @@ jobs: # run all non-pageserver tests ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' - # run pageserver tests with different settings - for get_vectored_concurrent_io in sequential sidecar-task; do - for io_engine in std-fs tokio-epoll-uring ; do - for io_mode in buffered direct direct-rw ; do - NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \ - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \ - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE=$io_mode \ - ${cov_prefix} \ - cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' - done - done - done + # run pageserver tests + # (When developing new pageserver features gated by config fields, we commonly make the rust + # unit tests sensitive to an environment variable NEON_PAGESERVER_UNIT_TEST_FEATURENAME. + # Then run the nextest invocation below for all relevant combinations. Singling out the + # pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.) 
+ NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring \ + ${cov_prefix} \ + cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty @@ -405,8 +401,6 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task - PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} # Temporary disable this step until we figure out why it's so flaky diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml deleted file mode 100644 index f96ed7d69b..0000000000 --- a/.github/workflows/_create-release-pr.yml +++ /dev/null @@ -1,103 +0,0 @@ -name: Create Release PR - -on: - workflow_call: - inputs: - component-name: - description: 'Component name' - required: true - type: string - source-branch: - description: 'Source branch' - required: true - type: string - secrets: - ci-access-token: - description: 'CI access token' - required: true - -defaults: - run: - shell: bash -euo pipefail {0} - -permissions: - contents: read - -jobs: - create-release-branch: - runs-on: ubuntu-22.04 - - permissions: - contents: write # for `git push` - - steps: - - name: Harden the runner (Audit all outbound calls) - uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 - with: - egress-policy: audit - - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - ref: ${{ inputs.source-branch }} - fetch-depth: 0 - - - name: Set variables - id: vars - env: - COMPONENT_NAME: ${{ inputs.component-name }} - RELEASE_BRANCH: >- - ${{ - false - || inputs.component-name == 'Storage' && 'release' - || inputs.component-name == 'Proxy' && 'release-proxy' - || inputs.component-name == 'Compute' && 'release-compute' - }} - run: | - 
now_date=$(date -u +'%Y-%m-%d') - now_time=$(date -u +'%H-%M-%Z') - { - echo "title=${COMPONENT_NAME} release ${now_date}" - echo "rc-branch=rc/${RELEASE_BRANCH}/${now_date}_${now_time}" - echo "release-branch=${RELEASE_BRANCH}" - } | tee -a ${GITHUB_OUTPUT} - - - name: Configure git - run: | - git config user.name "github-actions[bot]" - git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - - - name: Create RC branch - env: - RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} - RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} - TITLE: ${{ steps.vars.outputs.title }} - run: | - git switch -c "${RC_BRANCH}" - - # Manually create a merge commit on the current branch, keeping the - # tree and setting the parents to the current HEAD and the HEAD of the - # release branch. This commit is what we'll fast-forward the release - # branch to when merging the release branch. - # For details on why, look at - # https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs - current_tree=$(git rev-parse 'HEAD^{tree}') - release_head=$(git rev-parse "origin/${RELEASE_BRANCH}") - current_head=$(git rev-parse HEAD) - merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}") - - # Fast-forward the current branch to the newly created merge_commit - git merge --ff-only ${merge_commit} - - git push origin "${RC_BRANCH}" - - - name: Create a PR into ${{ steps.vars.outputs.release-branch }} - env: - GH_TOKEN: ${{ secrets.ci-access-token }} - RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} - RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} - TITLE: ${{ steps.vars.outputs.title }} - run: | - gh pr create --title "${TITLE}" \ - --body "" \ - --head "${RC_BRANCH}" \ - --base "${RELEASE_BRANCH}" diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 5107f457e2..79371ec704 100644 --- a/.github/workflows/benchmarking.yml +++ 
b/.github/workflows/benchmarking.yml @@ -53,6 +53,77 @@ concurrency: cancel-in-progress: true jobs: + cleanup: + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: ghcr.io/neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --init + env: + ORG_ID: org-solitary-dew-09443886 + LIMIT: 100 + SEARCH: "GITHUB_RUN_ID=" + BASE_URL: https://console-stage.neon.build/api/v2 + DRY_RUN: "false" # Set to "true" to just test out the workflow + + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Cleanup inactive Neon projects left over from prior runs + env: + API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + run: | + set -euo pipefail + + NOW=$(date -u +%s) + DAYS_AGO=$((NOW - 5 * 86400)) + + REQUEST_URL="$BASE_URL/projects?limit=$LIMIT&search=$(printf '%s' "$SEARCH" | jq -sRr @uri)&org_id=$ORG_ID" + + echo "Requesting project list from:" + echo "$REQUEST_URL" + + response=$(curl -s -X GET "$REQUEST_URL" \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" ) + + echo "Response:" + echo "$response" | jq . + + projects_to_delete=$(echo "$response" | jq --argjson cutoff "$DAYS_AGO" ' + .projects[] + | select(.compute_last_active_at != null) + | select((.compute_last_active_at | fromdateiso8601) < $cutoff) + | {id, name, compute_last_active_at} + ') + + if [ -z "$projects_to_delete" ]; then + echo "No projects eligible for deletion." 
+ exit 0 + fi + + echo "Projects that will be deleted:" + echo "$projects_to_delete" | jq -r '.id' + + if [ "$DRY_RUN" = "false" ]; then + echo "$projects_to_delete" | jq -r '.id' | while read -r project_id; do + echo "Deleting project: $project_id" + curl -s -X DELETE "$BASE_URL/projects/$project_id" \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" + done + else + echo "Dry run enabled — no projects were deleted." + fi bench: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} permissions: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1791cddacc..9f2fa3d52c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -69,7 +69,7 @@ jobs: submodules: true - name: Check for file changes - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 + uses: step-security/paths-filter@v3 id: files-changed with: token: ${{ secrets.GITHUB_TOKEN }} @@ -314,7 +314,8 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ github.ref_name == 'main' }} - extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} + # test_pageserver_max_throughput_getpage_at_latest_lsn is run in separate workflow periodic_pagebench.yml because it needs snapshots + extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} --ignore=test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} pg_version: v16 aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} @@ -323,8 +324,6 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - 
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task - PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw SYNC_BETWEEN_TESTS: true # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones @@ -824,7 +823,7 @@ jobs: - pg: v17 debian: bookworm env: - VM_BUILDER_VERSION: v0.42.2 + VM_BUILDER_VERSION: v0.46.0 steps: - name: Harden the runner (Audit all outbound calls) @@ -965,7 +964,7 @@ jobs: fi - name: Verify docker-compose example and test extensions - timeout-minutes: 20 + timeout-minutes: 60 env: TAG: >- ${{ @@ -1434,10 +1433,10 @@ jobs: ;; esac - notify-storage-release-deploy-failure: - needs: [ deploy ] + notify-release-deploy-failure: + needs: [ meta, deploy ] # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs. - if: github.ref_name == 'release' && needs.deploy.result != 'success' && always() + if: contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) && needs.deploy.result != 'success' && always() runs-on: ubuntu-22.04 steps: - name: Harden the runner (Audit all outbound calls) @@ -1445,15 +1444,40 @@ jobs: with: egress-policy: audit - - name: Post release-deploy failure to team-storage slack channel + - name: Post release-deploy failure to team slack channel uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 + env: + TEAM_ONCALL: >- + ${{ + fromJSON(format('{ + "storage-release": "", + "compute-release": "", + "proxy-release": "" + }', + vars.SLACK_ONCALL_STORAGE_GROUP, + vars.SLACK_ONCALL_COMPUTE_GROUP, + vars.SLACK_ONCALL_PROXY_GROUP + ))[needs.meta.outputs.run-kind] + }} + CHANNEL: >- + ${{ + fromJSON(format('{ + "storage-release": "{0}", + "compute-release": "{1}", + "proxy-release": "{2}" + }', + vars.SLACK_STORAGE_CHANNEL_ID, + vars.SLACK_COMPUTE_CHANNEL_ID, + vars.SLACK_PROXY_CHANNEL_ID + 
))[needs.meta.outputs.run-kind] + }} with: method: chat.postMessage token: ${{ secrets.SLACK_BOT_TOKEN }} payload: | - channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }} + channel: ${{ env.CHANNEL }} text: | - 🔴 : deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. + 🔴 ${{ env.TEAM_ONCALL }}: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>. # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: diff --git a/.github/workflows/cloud-extensions.yml b/.github/workflows/cloud-extensions.yml index 7d60469f92..25fe0877d9 100644 --- a/.github/workflows/cloud-extensions.yml +++ b/.github/workflows/cloud-extensions.yml @@ -35,7 +35,7 @@ jobs: matrix: pg-version: [16, 17] - runs-on: [ self-hosted, small ] + runs-on: us-east-2 container: # We use the neon-test-extensions image here as it contains the source code for the extensions. image: ghcr.io/neondatabase/neon-test-extensions-v${{ matrix.pg-version }}:latest @@ -68,23 +68,10 @@ jobs: id: create-neon-project uses: ./.github/actions/neon-project-create with: - region_id: ${{ inputs.region_id }} + region_id: ${{ inputs.region_id || 'aws-us-east-2' }} postgres_version: ${{ matrix.pg-version }} project_settings: ${{ steps.project-settings.outputs.settings }} - # We need these settings to get the expected output results. - # We cannot use the environment variables e.g. 
PGTZ due to - # https://github.com/neondatabase/neon/issues/1287 - default_endpoint_settings: > - { - "pg_settings": { - "DateStyle": "Postgres,MDY", - "TimeZone": "America/Los_Angeles", - "compute_query_id": "off", - "neon.allow_unstable_extensions": "on" - } - } api_key: ${{ secrets.NEON_STAGING_API_KEY }} - admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} - name: Run the regression tests run: /run-tests.sh -r /ext-src diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 79467c8f95..3427a0eb49 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -53,7 +53,7 @@ jobs: submodules: true - name: Check for Postgres changes - uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3 + uses: step-security/paths-filter@v3 id: files_changed with: token: ${{ github.token }} @@ -63,8 +63,10 @@ jobs: - name: Filter out only v-string for build matrix id: postgres_changes + env: + CHANGES: ${{ steps.files_changed.outputs.changes }} run: | - v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.chnages }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c) + v_strings_only_as_json_array=$(echo ${CHANGES} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c) echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}" check-macos-build: diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 532da435c2..317db94052 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -1,4 +1,4 @@ -name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region +name: Periodic pagebench performance test on unit-perf hetzner runner on: schedule: @@ -8,7 +8,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 */3 * * *' 
# Runs every 3 hours + - cron: '0 */4 * * *' # Runs every 4 hours workflow_dispatch: # Allows manual triggering of the workflow inputs: commit_hash: @@ -16,6 +16,11 @@ on: description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.' required: false default: '' + recreate_snapshots: + type: boolean + description: 'Recreate snapshots - !!!WARNING!!! We should only recreate snapshots if the previous ones are no longer compatible. Otherwise benchmarking results are not comparable across runs.' + required: false + default: false defaults: run: @@ -29,13 +34,13 @@ permissions: contents: read jobs: - trigger_bench_on_ec2_machine_in_eu_central_1: + run_periodic_pagebench_test: permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, small ] + runs-on: [ self-hosted, unit-perf ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: @@ -44,10 +49,13 @@ jobs: options: --init timeout-minutes: 360 # Set the timeout to 6 hours env: - API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }} RUN_ID: ${{ github.run_id }} - AWS_DEFAULT_REGION : "eu-central-1" - AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74" + DEFAULT_PG_VERSION: 16 + BUILD_TYPE: release + RUST_BACKTRACE: 1 + # NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS: 1 - doesn't work without root in container + S3_BUCKET: neon-github-public-dev + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" steps: # we don't need the neon source code because we run everything remotely # however we still need the local github actions to run the allure step below @@ -56,99 +64,194 @@ jobs: with: egress-policy: audit - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive + id: set-env + shell: bash -euxo pipefail {0} + run: | + { + echo "NEON_DIR=${RUNNER_TEMP}/neon" + echo 
"NEON_BIN=${RUNNER_TEMP}/neon/bin" + echo "POSTGRES_DISTRIB_DIR=${RUNNER_TEMP}/neon/pg_install" + echo "LD_LIBRARY_PATH=${RUNNER_TEMP}/neon/pg_install/v${DEFAULT_PG_VERSION}/lib" + echo "BACKUP_DIR=${RUNNER_TEMP}/instance_store/saved_snapshots" + echo "TEST_OUTPUT=${RUNNER_TEMP}/neon/test_output" + echo "PERF_REPORT_DIR=${RUNNER_TEMP}/neon/test_output/perf-report-local" + echo "ALLURE_DIR=${RUNNER_TEMP}/neon/test_output/allure-results" + echo "ALLURE_RESULTS_DIR=${RUNNER_TEMP}/neon/test_output/allure-results/results" + } >> "$GITHUB_ENV" - - name: Show my own (github runner) external IP address - usefull for IP allowlisting - run: curl https://ifconfig.me + echo "allure_results_dir=${RUNNER_TEMP}/neon/test_output/allure-results/results" >> "$GITHUB_OUTPUT" - - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine) - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 + - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }} - role-duration-seconds: 3600 - - - name: Start EC2 instance and wait for the instance to boot up - run: | - aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID - aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID - sleep 60 # sleep some time to allow cloudinit and our API server to start up - - - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US - run: | - public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text) - echo "Public IP of the EC2 instance: $public_ip" - echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV - + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # max 5 hours (needed in case commit hash is still being built) - name: Determine 
commit hash + id: commit_hash + shell: bash -euxo pipefail {0} env: INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }} run: | - if [ -z "$INPUT_COMMIT_HASH" ]; then - echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV + if [[ -z "${INPUT_COMMIT_HASH}" ]]; then + COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha') + echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV + echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT" echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV else - echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV + COMMIT_HASH="${INPUT_COMMIT_HASH}" + echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV + echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT" echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV fi + - name: Checkout the neon repository at given commit hash + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + ref: ${{ steps.commit_hash.outputs.commit_hash }} - - name: Start Bench with run_id + # does not reuse ./.github/actions/download because we need to download the artifact for the given commit hash + # example artifact + # s3://neon-github-public-dev/artifacts/48b870bc078bd2c450eb7b468e743b9c118549bf/15036827400/1/neon-Linux-X64-release-artifact.tar.zst /instance_store/artifacts/neon-Linux-release-artifact.tar.zst + - name: Determine artifact S3_KEY for given commit hash and download and extract artifact + id: artifact_prefix + shell: bash -euxo pipefail {0} + env: + ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst + COMMIT_HASH: ${{ env.COMMIT_HASH }} + COMMIT_HASH_TYPE: ${{ env.COMMIT_HASH_TYPE }} run: | - curl -k -X 'POST' \ - "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -H "Authorization: Bearer $API_KEY" \ - -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", 
\"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}" + attempt=0 + max_attempts=24 # 5 minutes * 24 = 2 hours - - name: Poll Test Status - id: poll_step - run: | - status="" - while [[ "$status" != "failure" && "$status" != "success" ]]; do - response=$(curl -k -X 'GET' \ - "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \ - -H 'accept: application/json' \ - -H "Authorization: Bearer $API_KEY") - echo "Response: $response" - set +x - status=$(echo $response | jq -r '.status') - echo "Test status: $status" - if [[ "$status" == "failure" ]]; then - echo "Test failed" - exit 1 # Fail the job step if status is failure - elif [[ "$status" == "success" || "$status" == "null" ]]; then + while [[ $attempt -lt $max_attempts ]]; do + # the following command will fail until the artifacts are available ... + S3_KEY=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix "artifacts/$COMMIT_HASH/" \ + | jq -r '.Contents[]?.Key' \ + | grep "neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst" \ + | sort --version-sort \ + | tail -1) || true # ... thus ignore errors from the command + if [[ -n "${S3_KEY}" ]]; then + echo "Artifact found: $S3_KEY" + echo "S3_KEY=$S3_KEY" >> $GITHUB_ENV break - elif [[ "$status" == "too_many_runs" ]]; then - echo "Too many runs already running" - echo "too_many_runs=true" >> "$GITHUB_OUTPUT" - exit 1 fi - - sleep 60 # Poll every 60 seconds + + # Increment attempt counter and sleep for 5 minutes + attempt=$((attempt + 1)) + echo "Attempt $attempt of $max_attempts to find artifacts in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH failed. Retrying in 5 minutes..." 
+ sleep 300 # Sleep for 5 minutes done - - name: Retrieve Test Logs - if: always() && steps.poll_step.outputs.too_many_runs != 'true' - run: | - curl -k -X 'GET' \ - "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \ - -H 'accept: application/gzip' \ - -H "Authorization: Bearer $API_KEY" \ - --output "test_log_${GITHUB_RUN_ID}.gz" + if [[ -z "${S3_KEY}" ]]; then + echo "Error: artifact not found in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH" after 2 hours + else + mkdir -p $(dirname $ARCHIVE) + time aws s3 cp --only-show-errors s3://$S3_BUCKET/${S3_KEY} ${ARCHIVE} + mkdir -p ${NEON_DIR} + time tar -xf ${ARCHIVE} -C ${NEON_DIR} + rm -f ${ARCHIVE} + fi - - name: Unzip Test Log and Print it into this job's log - if: always() && steps.poll_step.outputs.too_many_runs != 'true' + - name: Download snapshots from S3 + if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.recreate_snapshots == 'false' || github.event.inputs.recreate_snapshots == '' }} + id: download_snapshots + shell: bash -euxo pipefail {0} run: | - gzip -d "test_log_${GITHUB_RUN_ID}.gz" - cat "test_log_${GITHUB_RUN_ID}" + # Download the snapshots from S3 + mkdir -p ${TEST_OUTPUT} + mkdir -p $BACKUP_DIR + cd $BACKUP_DIR + mkdir parts + cd parts + PART=$(aws s3api list-objects-v2 --bucket $S3_BUCKET --prefix performance/pagebench/ \ + | jq -r '.Contents[]?.Key' \ + | grep -E 'shared-snapshots-[0-9]{4}-[0-9]{2}-[0-9]{2}' \ + | sort \ + | tail -1) + echo "Latest PART: $PART" + if [[ -z "$PART" ]]; then + echo "ERROR: No matching S3 key found" >&2 + exit 1 + fi + S3_KEY=$(dirname $PART) + time aws s3 cp --only-show-errors --recursive s3://${S3_BUCKET}/$S3_KEY/ . 
+ cd $TEST_OUTPUT + time cat $BACKUP_DIR/parts/* | zstdcat | tar --extract --preserve-permissions + rm -rf ${BACKUP_DIR} + + - name: Cache poetry deps + uses: actions/cache@v4 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + shell: bash -euxo pipefail {0} + run: ./scripts/pysync + + # we need high number of open files for pagebench + - name: show ulimits + shell: bash -euxo pipefail {0} + run: | + ulimit -a + + - name: Run pagebench testcase + shell: bash -euxo pipefail {0} + env: + CI: false # need to override this env variable set by github to enforce using snapshots + run: | + export PLATFORM=hetzner-unit-perf-${COMMIT_HASH_TYPE} + # report the commit hash of the neon repository in the revision of the test results + export GITHUB_SHA=${COMMIT_HASH} + rm -rf ${PERF_REPORT_DIR} + rm -rf ${ALLURE_RESULTS_DIR} + mkdir -p ${PERF_REPORT_DIR} + mkdir -p ${ALLURE_RESULTS_DIR} + PARAMS="--alluredir=${ALLURE_RESULTS_DIR} --tb=short --verbose -rA" + EXTRA_PARAMS="--out-dir ${PERF_REPORT_DIR} --durations-path $TEST_OUTPUT/benchmark_durations.json" + # run only two selected tests + # environment set by parent: + # RUST_BACKTRACE=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release + ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_throughput_with_n_tenants ${EXTRA_PARAMS} + ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant ${EXTRA_PARAMS} + + - name: upload the performance metrics to the Neon performance database which is used by grafana dashboards to display the results + shell: bash -euxo pipefail {0} + run: | + export REPORT_FROM="$PERF_REPORT_DIR" + export 
GITHUB_SHA=${COMMIT_HASH} + time ./scripts/generate_and_push_perf_report.sh + + - name: Upload test results + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-store + with: + report-dir: ${{ steps.set-env.outputs.allure_results_dir }} + unique-key: ${{ env.BUILD_TYPE }}-${{ env.DEFAULT_PG_VERSION }}-${{ runner.arch }} + aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Allure report + id: create-allure-report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + - name: Upload snapshots + if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.recreate_snapshots != 'false' && github.event.inputs.recreate_snapshots != '' }} + id: upload_snapshots + shell: bash -euxo pipefail {0} + run: | + mkdir -p $BACKUP_DIR + cd $TEST_OUTPUT + tar --create --preserve-permissions --file - shared-snapshots | zstd -o $BACKUP_DIR/shared_snapshots.tar.zst + cd $BACKUP_DIR + mkdir parts + split -b 1G shared_snapshots.tar.zst ./parts/shared_snapshots.tar.zst.part. + SNAPSHOT_DATE=$(date +%F) # YYYY-MM-DD + cd parts + time aws s3 cp --recursive . 
s3://${S3_BUCKET}/performance/pagebench/shared-snapshots-${SNAPSHOT_DATE}/ + - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1 @@ -157,26 +260,22 @@ jobs: slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} - + - name: Cleanup Test Resources if: always() + shell: bash -euxo pipefail {0} + env: + ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst run: | - curl -k -X 'POST' \ - "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \ - -H 'accept: application/json' \ - -H "Authorization: Bearer $API_KEY" \ - -d '' + # Cleanup the test resources + if [[ -d "${BACKUP_DIR}" ]]; then + rm -rf ${BACKUP_DIR} + fi + if [[ -d "${TEST_OUTPUT}" ]]; then + rm -rf ${TEST_OUTPUT} + fi + if [[ -d "${NEON_DIR}" ]]; then + rm -rf ${NEON_DIR} + fi + rm -rf $(dirname $ARCHIVE) - - name: Assume AWS OIDC role that allows to manage (start/stop/describe... 
EC machine) - if: always() && steps.poll_step.outputs.too_many_runs != 'true' - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 - with: - aws-region: eu-central-1 - role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }} - role-duration-seconds: 3600 - - - name: Stop EC2 instance and wait for the instance to be stopped - if: always() && steps.poll_step.outputs.too_many_runs != 'true' - run: | - aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID - aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID diff --git a/.github/workflows/release-compute.yml b/.github/workflows/release-compute.yml new file mode 100644 index 0000000000..f123dd2f44 --- /dev/null +++ b/.github/workflows/release-compute.yml @@ -0,0 +1,12 @@ +name: Create compute release PR + +on: + schedule: + - cron: '0 7 * * FRI' + +jobs: + create-release-pr: + uses: ./.github/workflows/release.yml + with: + component: compute + secrets: inherit diff --git a/.github/workflows/release-proxy.yml b/.github/workflows/release-proxy.yml new file mode 100644 index 0000000000..d9055984d2 --- /dev/null +++ b/.github/workflows/release-proxy.yml @@ -0,0 +1,12 @@ +name: Create proxy release PR + +on: + schedule: + - cron: '0 6 * * TUE' + +jobs: + create-release-pr: + uses: ./.github/workflows/release.yml + with: + component: proxy + secrets: inherit diff --git a/.github/workflows/release-storage.yml b/.github/workflows/release-storage.yml new file mode 100644 index 0000000000..91f02fddda --- /dev/null +++ b/.github/workflows/release-storage.yml @@ -0,0 +1,12 @@ +name: Create storage release PR + +on: + schedule: + - cron: '0 6 * * FRI' + +jobs: + create-release-pr: + uses: ./.github/workflows/release.yml + with: + component: storage + secrets: inherit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4068eafb95..0f97cf7c87 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,25 +1,34 
@@ -name: Create Release Branch +name: Create release PR on: - schedule: - # It should be kept in sync with if-condition in jobs - - cron: '0 6 * * TUE' # Proxy release - - cron: '0 6 * * FRI' # Storage release - - cron: '0 7 * * FRI' # Compute release workflow_dispatch: inputs: - create-storage-release-branch: - type: boolean - description: 'Create Storage release PR' + component: + description: "Component to release" + required: true + type: choice + options: + - compute + - proxy + - storage + cherry-pick: + description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)" required: false - create-proxy-release-branch: - type: boolean - description: 'Create Proxy release PR' - required: false - create-compute-release-branch: - type: boolean - description: 'Create Compute release PR' + type: string + default: '' + + workflow_call: + inputs: + component: + description: "Component to release" + required: true + type: string + cherry-pick: + description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)" required: false + type: string + default: '' + # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
permissions: {} @@ -29,41 +38,31 @@ defaults: shell: bash -euo pipefail {0} jobs: - create-storage-release-branch: - if: ${{ github.event.schedule == '0 6 * * FRI' || inputs.create-storage-release-branch }} + create-release-pr: + runs-on: ubuntu-22.04 permissions: contents: write - uses: ./.github/workflows/_create-release-pr.yml - with: - component-name: 'Storage' - source-branch: ${{ github.ref_name }} - secrets: - ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit - create-proxy-release-branch: - if: ${{ github.event.schedule == '0 6 * * TUE' || inputs.create-proxy-release-branch }} + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 - permissions: - contents: write + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" - uses: ./.github/workflows/_create-release-pr.yml - with: - component-name: 'Proxy' - source-branch: ${{ github.ref_name }} - secrets: - ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} - - create-compute-release-branch: - if: ${{ github.event.schedule == '0 7 * * FRI' || inputs.create-compute-release-branch }} - - permissions: - contents: write - - uses: ./.github/workflows/_create-release-pr.yml - with: - component-name: 'Compute' - source-branch: ${{ github.ref_name }} - secrets: - ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} + - name: Create release PR + uses: neondatabase/dev-actions/release-pr@290dec821d86fa8a93f019e8c69720f5865b5677 + with: + component: ${{ inputs.component }} + cherry-pick: ${{ inputs.cherry-pick }} + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} diff --git a/Cargo.lock b/Cargo.lock index c881c68a97..052e85ed66 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -739,7 +739,7 @@ dependencies = [ 
"http 1.1.0", "http-body 1.0.0", "http-body-util", - "hyper 1.4.1", + "hyper 1.6.0", "hyper-util", "itoa", "matchit 0.8.4", @@ -1189,6 +1189,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "cgroups-rs" version = "0.3.3" @@ -1354,15 +1360,15 @@ dependencies = [ "libc", "metrics", "neonart", - "nix 0.27.1", + "nix 0.30.1", "pageserver_client_grpc", "pageserver_page_api", "prometheus", - "prost 0.13.3", + "prost 0.13.5", "thiserror 1.0.69", "tokio", "tokio-pipe", - "tonic", + "tonic 0.12.3", "tracing", "tracing-subscriber", "uring-common", @@ -1391,6 +1397,7 @@ name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", + "async-compression", "aws-config", "aws-sdk-kms", "aws-sdk-s3", @@ -1409,9 +1416,10 @@ dependencies = [ "futures", "http 1.1.0", "indexmap 2.9.0", + "itertools 0.10.5", "jsonwebtoken", "metrics", - "nix 0.27.1", + "nix 0.30.1", "notify", "num_cpus", "once_cell", @@ -1527,13 +1535,14 @@ dependencies = [ "clap", "comfy-table", "compute_api", + "endpoint_storage", "futures", "http-utils", "humantime", "humantime-serde", "hyper 0.14.30", "jsonwebtoken", - "nix 0.27.1", + "nix 0.30.1", "once_cell", "pageserver_api", "pageserver_client", @@ -2428,7 +2437,7 @@ dependencies = [ "futures-core", "futures-sink", "http-body-util", - "hyper 1.4.1", + "hyper 1.6.0", "hyper-util", "pin-project", "rand 0.8.5", @@ -2979,14 +2988,14 @@ dependencies = [ "pprof", "regex", "routerify", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-pemfile 2.1.1", "serde", "serde_json", "serde_path_to_error", "thiserror 1.0.69", "tokio", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-stream", "tokio-util", "tracing", @@ -2999,9 +3008,9 
@@ dependencies = [ [[package]] name = "httparse" -version = "1.8.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" [[package]] name = "httpdate" @@ -3051,9 +3060,9 @@ dependencies = [ [[package]] name = "hyper" -version = "1.4.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" dependencies = [ "bytes", "futures-channel", @@ -3093,7 +3102,7 @@ checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c" dependencies = [ "futures-util", "http 1.1.0", - "hyper 1.4.1", + "hyper 1.6.0", "hyper-util", "rustls 0.22.4", "rustls-pki-types", @@ -3108,7 +3117,7 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793" dependencies = [ - "hyper 1.4.1", + "hyper 1.6.0", "hyper-util", "pin-project-lite", "tokio", @@ -3117,20 +3126,20 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.7" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9" +checksum = "cf9f1e950e0d9d1d3c47184416723cf29c0d1f93bd8cccf37e4beb6b44f31710" dependencies = [ "bytes", "futures-channel", "futures-util", "http 1.1.0", "http-body 1.0.0", - "hyper 1.4.1", + "hyper 1.6.0", + "libc", "pin-project-lite", "socket2", "tokio", - "tower 0.4.13", "tower-service", "tracing", ] @@ -3634,9 +3643,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.169" +version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "libloading" @@ -3916,6 +3925,16 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "neon-shmem" +version = "0.1.0" +dependencies = [ + "nix 0.30.1", + "tempfile", + "thiserror 1.0.69", + "workspace_hack", +] + [[package]] name = "neonart" version = "0.1.0" @@ -3960,12 +3979,13 @@ dependencies = [ [[package]] name = "nix" -version = "0.27.1" +version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ "bitflags 2.8.0", "cfg-if", + "cfg_aliases", "libc", "memoffset 0.9.0", ] @@ -4020,6 +4040,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num" version = "0.4.1" @@ -4224,7 +4254,7 @@ dependencies = [ "opentelemetry-http", "opentelemetry-proto", "opentelemetry_sdk", - "prost 0.13.3", + "prost 0.13.5", "reqwest", "thiserror 1.0.69", ] @@ -4237,8 +4267,8 @@ checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6" dependencies = [ "opentelemetry", "opentelemetry_sdk", - "prost 0.13.3", - "tonic", + "prost 0.13.5", + "tonic 0.12.3", ] [[package]] @@ -4304,6 +4334,12 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +[[package]] +name = "overload" +version = "0.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "p256" version = "0.11.1" @@ -4411,6 +4447,7 @@ dependencies = [ "enumset", "fail", "futures", + "hashlink", "hex", "hex-literal", "http-utils", @@ -4422,7 +4459,7 @@ dependencies = [ "jsonwebtoken", "md5", "metrics", - "nix 0.27.1", + "nix 0.30.1", "num-traits", "num_cpus", "once_cell", @@ -4442,7 +4479,7 @@ dependencies = [ "pprof", "pq_proto", "procfs", - "prost 0.13.3", + "prost 0.13.5", "rand 0.8.5", "range-set-blaze", "regex", @@ -4450,7 +4487,7 @@ dependencies = [ "reqwest", "rpds", "rstest", - "rustls 0.23.18", + "rustls 0.23.27", "scopeguard", "send-future", "serde", @@ -4469,14 +4506,15 @@ dependencies = [ "tokio-epoll-uring", "tokio-io-timeout", "tokio-postgres", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-stream", "tokio-tar", "tokio-util", "toml_edit", - "tonic", + "tonic 0.13.1", "tracing", "tracing-utils", + "twox-hash", "url", "utils", "uuid", @@ -4501,7 +4539,7 @@ dependencies = [ "humantime", "humantime-serde", "itertools 0.10.5", - "nix 0.27.1", + "nix 0.30.1", "once_cell", "postgres_backend", "postgres_ffi", @@ -4546,10 +4584,15 @@ dependencies = [ "bytes", "futures", "http 1.1.0", + "hyper 1.6.0", + "hyper-util", "pageserver_page_api", + "rand 0.8.5", "thiserror 1.0.69", "tokio", - "tonic", + "tokio-util", + "tonic 0.13.1", + "tower 0.4.13", "tracing", "utils", "uuid", @@ -4583,12 +4626,12 @@ name = "pageserver_page_api" version = "0.1.0" dependencies = [ "bytes", - "prost 0.13.3", - "smallvec", + "prost 0.13.5", "thiserror 1.0.69", - "tonic", + "tonic 0.13.1", "tonic-build", "utils", + "workspace_hack", ] [[package]] @@ -4976,14 +5019,14 @@ dependencies = [ "bytes", "once_cell", "pq_proto", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-pemfile 2.1.1", "serde", "thiserror 1.0.69", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.26.0", + "tokio-rustls 
0.26.2", "tokio-util", "tracing", ] @@ -5032,6 +5075,19 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "posthog_client_lite" +version = "0.1.0" +dependencies = [ + "anyhow", + "reqwest", + "serde", + "serde_json", + "sha2", + "thiserror 1.0.69", + "workspace_hack", +] + [[package]] name = "powerfmt" version = "0.2.0" @@ -5077,7 +5133,7 @@ dependencies = [ "inferno 0.12.0", "num", "paste", - "prost 0.13.3", + "prost 0.13.5", ] [[package]] @@ -5182,12 +5238,12 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.3" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" dependencies = [ "bytes", - "prost-derive 0.13.3", + "prost-derive 0.13.5", ] [[package]] @@ -5225,7 +5281,7 @@ dependencies = [ "once_cell", "petgraph", "prettyplease", - "prost 0.13.3", + "prost 0.13.5", "prost-types 0.13.3", "regex", "syn 2.0.100", @@ -5247,9 +5303,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.13.3" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", "itertools 0.12.1", @@ -5273,7 +5329,7 @@ version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" dependencies = [ - "prost 0.13.3", + "prost 0.13.5", ] [[package]] @@ -5319,7 +5375,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.30", - "hyper 1.4.1", + "hyper 1.6.0", "hyper-util", "indexmap 2.9.0", "ipnet", @@ -5355,7 +5411,7 @@ dependencies = [ "rsa", "rstest", "rustc-hash 1.1.0", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-native-certs 0.8.0", "rustls-pemfile 
2.1.1", "scopeguard", @@ -5374,13 +5430,14 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-postgres2", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-tungstenite 0.21.0", "tokio-util", "tracing", "tracing-log", "tracing-opentelemetry", "tracing-subscriber", + "tracing-test", "tracing-utils", "try-lock", "typed-json", @@ -5642,13 +5699,13 @@ dependencies = [ "num-bigint", "percent-encoding", "pin-project-lite", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-native-certs 0.8.0", "ryu", "sha1_smol", "socket2", "tokio", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-util", "url", ] @@ -5759,7 +5816,7 @@ dependencies = [ "http-body-util", "http-types", "humantime-serde", - "hyper 1.4.1", + "hyper 1.6.0", "itertools 0.10.5", "metrics", "once_cell", @@ -5799,7 +5856,7 @@ dependencies = [ "http 1.1.0", "http-body 1.0.0", "http-body-util", - "hyper 1.4.1", + "hyper 1.6.0", "hyper-rustls 0.26.0", "hyper-util", "ipnet", @@ -5856,7 +5913,7 @@ dependencies = [ "futures", "getrandom 0.2.11", "http 1.1.0", - "hyper 1.4.1", + "hyper 1.6.0", "parking_lot 0.11.2", "reqwest", "reqwest-middleware", @@ -6096,15 +6153,15 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.18" +version = "0.23.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c9cc1d47e243d655ace55ed38201c19ae02c148ae56412ab8750e8f0166ab7f" +checksum = "730944ca083c1c233a75c09f199e973ca499344a2b7ba9e755c457e86fb4a321" dependencies = [ "log", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.102.8", + "rustls-webpki 0.103.3", "subtle", "zeroize", ] @@ -6193,6 +6250,17 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rustls-webpki" +version = "0.103.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.12" @@ -6244,7 +6312,7 
@@ dependencies = [ "regex", "remote_storage", "reqwest", - "rustls 0.23.18", + "rustls 0.23.27", "safekeeper_api", "safekeeper_client", "scopeguard", @@ -6261,7 +6329,7 @@ dependencies = [ "tokio", "tokio-io-timeout", "tokio-postgres", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-stream", "tokio-tar", "tokio-util", @@ -6433,7 +6501,7 @@ checksum = "255914a8e53822abd946e2ce8baa41d4cded6b8e938913b7f7b9da5b7ab44335" dependencies = [ "httpdate", "reqwest", - "rustls 0.23.18", + "rustls 0.23.27", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -6786,12 +6854,12 @@ dependencies = [ [[package]] name = "socket2" -version = "0.5.5" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9" +checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef" dependencies = [ "libc", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -6860,16 +6928,16 @@ dependencies = [ "http-body-util", "http-utils", "humantime", - "hyper 1.4.1", + "hyper 1.6.0", "hyper-util", "metrics", "once_cell", "parking_lot 0.12.1", - "prost 0.13.3", - "rustls 0.23.18", + "prost 0.13.5", + "rustls 0.23.27", "tokio", - "tokio-rustls 0.26.0", - "tonic", + "tokio-rustls 0.26.2", + "tonic 0.13.1", "tonic-build", "tracing", "utils", @@ -6911,7 +6979,7 @@ dependencies = [ "regex", "reqwest", "routerify", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-native-certs 0.8.0", "safekeeper_api", "safekeeper_client", @@ -6926,7 +6994,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-util", "tracing", "utils", @@ -6964,7 +7032,7 @@ dependencies = [ "postgres_ffi", "remote_storage", "reqwest", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-native-certs 0.8.0", "serde", "serde_json", @@ -7508,10 +7576,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab" dependencies = [ "ring", - "rustls 0.23.18", + "rustls 0.23.27", "tokio", "tokio-postgres", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "x509-certificate", ] @@ -7555,12 +7623,11 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.0" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" dependencies = [ - "rustls 0.23.18", - "rustls-pki-types", + "rustls 0.23.27", "tokio", ] @@ -7676,22 +7743,18 @@ dependencies = [ "axum 0.7.9", "base64 0.22.1", "bytes", - "flate2", "h2 0.4.4", "http 1.1.0", "http-body 1.0.0", "http-body-util", - "hyper 1.4.1", + "hyper 1.6.0", "hyper-timeout", "hyper-util", "percent-encoding", "pin-project", - "prost 0.13.3", - "rustls-native-certs 0.8.0", - "rustls-pemfile 2.1.1", + "prost 0.13.5", "socket2", "tokio", - "tokio-rustls 0.26.0", "tokio-stream", "tower 0.4.13", "tower-layer", @@ -7700,10 +7763,42 @@ dependencies = [ ] [[package]] -name = "tonic-build" -version = "0.12.3" +name = "tonic" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11" +checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9" +dependencies = [ + "async-trait", + "axum 0.8.1", + "base64 0.22.1", + "bytes", + "flate2", + "h2 0.4.4", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "hyper 1.6.0", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "prost 0.13.5", + "rustls-native-certs 0.8.0", + "socket2", + "tokio", + "tokio-rustls 0.26.2", + "tokio-stream", + "tower 0.5.2", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.13.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac6f67be712d12f0b41328db3137e0d0757645d8904b4cb7d51cd9c2279e847" dependencies = [ "prettyplease", "proc-macro2", @@ -7741,9 +7836,12 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", + "indexmap 2.9.0", "pin-project-lite", + "slab", "sync_wrapper 1.0.1", "tokio", + "tokio-util", "tower-layer", "tower-service", "tracing", @@ -7894,6 +7992,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ "matchers", + "nu-ansi-term", "once_cell", "regex", "serde", @@ -7907,6 +8006,27 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "tracing-test" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "557b891436fe0d5e0e363427fc7f217abf9ccd510d5136549847bdcbcd011d68" +dependencies = [ + "tracing-core", + "tracing-subscriber", + "tracing-test-macro", +] + +[[package]] +name = "tracing-test-macro" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04659ddb06c87d233c566112c1c9c5b9e98256d9af50ec3bc9c8327f873a7568" +dependencies = [ + "quote", + "syn 2.0.100", +] + [[package]] name = "tracing-utils" version = "0.1.0" @@ -8049,7 +8169,7 @@ dependencies = [ "base64 0.22.1", "log", "once_cell", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-pki-types", "url", "webpki-roots", @@ -8132,7 +8252,7 @@ dependencies = [ "humantime", "jsonwebtoken", "metrics", - "nix 0.27.1", + "nix 0.30.1", "once_cell", "pem", "pin-project-lite", @@ -8244,7 +8364,7 @@ dependencies = [ "pageserver_api", "postgres_ffi", "pprof", - "prost 0.13.3", + "prost 0.13.5", "remote_storage", "serde", "serde_json", @@ -8704,8 +8824,10 @@ dependencies = [ "fail", "form_urlencoded", "futures-channel", + "futures-core", "futures-executor", "futures-io", + "futures-task", 
"futures-util", "generic-array", "getrandom 0.2.11", @@ -8714,9 +8836,8 @@ dependencies = [ "hex", "hmac", "hyper 0.14.30", - "hyper 1.4.1", + "hyper 1.6.0", "hyper-util", - "indexmap 1.9.3", "indexmap 2.9.0", "itertools 0.12.1", "lazy_static", @@ -8724,6 +8845,7 @@ dependencies = [ "log", "memchr", "nix 0.26.4", + "nix 0.30.1", "nom", "num", "num-bigint", @@ -8735,18 +8857,19 @@ dependencies = [ "once_cell", "p256 0.13.2", "parquet", + "percent-encoding", "prettyplease", "proc-macro2", - "prost 0.13.3", + "prost 0.13.5", "quote", "rand 0.8.5", "regex", "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest", - "rustls 0.23.18", + "rustls 0.23.27", "rustls-pki-types", - "rustls-webpki 0.102.8", + "rustls-webpki 0.103.3", "scopeguard", "sec1 0.7.3", "serde", @@ -8764,15 +8887,15 @@ dependencies = [ "time", "time-macros", "tokio", - "tokio-rustls 0.26.0", + "tokio-rustls 0.26.2", "tokio-stream", "tokio-util", "toml_edit", - "tonic", - "tower 0.4.13", + "tower 0.5.2", "tracing", "tracing-core", "tracing-log", + "tracing-subscriber", "url", "uuid", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index 0cf8d0ba38..825d05375b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ members = [ "pageserver/client", "pageserver/client_grpc", "pageserver/pagebench", + "pageserver/page_api", "proxy", "safekeeper", "safekeeper/client", @@ -24,9 +25,11 @@ members = [ "libs/postgres_ffi", "libs/safekeeper_api", "libs/desim", + "libs/neon-shmem", "libs/utils", "libs/consumption_metrics", "libs/postgres_backend", + "libs/posthog_client_lite", "libs/pq_proto", "libs/tenant_size_model", "libs/metrics", @@ -130,7 +133,7 @@ md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" -nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } +nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] } # Do not update to >= 7.0.0, at least. 
The update will have a significant impact # on compute startup metrics (start_postgres_ms), >= 25% degradation. notify = "6.0.0" @@ -151,7 +154,7 @@ pin-project-lite = "0.2" pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] } procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency -prost = "0.13" +prost = "0.13.5" rand = "0.8" redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" @@ -201,7 +204,7 @@ tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.8" toml_edit = "0.22" -tonic = {version = "0.12.3", default-features = false, features = ["channel", "server", "tls", "tls-roots", "gzip"]} +tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "gzip", "prost", "router", "server", "tls-ring", "tls-native-roots"] } tower = { version = "0.5.2", default-features = false } tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] } @@ -251,6 +254,7 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } +endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" } http-utils = { version = "0.1", path = "./libs/http-utils/" } metrics = { version = "0.1", path = "./libs/metrics/" } neonart = { version = "0.1", path = "./libs/neonart/" } @@ -258,8 +262,8 @@ pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } pageserver_client_grpc = { path = "./pageserver/client_grpc" } -pageserver_page_api = { path = "./pageserver/page_api" } pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } 
+pageserver_page_api = { path = "./pageserver/page_api" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } @@ -287,7 +291,7 @@ criterion = "0.5.1" rcgen = "0.13" rstest = "0.18" camino-tempfile = "1.0.2" -tonic-build = "0.12" +tonic-build = "0.13.1" [patch.crates-io] diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index f63d844afd..9d4c93e1cd 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -155,7 +155,7 @@ RUN set -e \ # Keep the version the same as in compute/compute-node.Dockerfile and # test_runner/regress/test_compute_metrics.py. -ENV SQL_EXPORTER_VERSION=0.17.0 +ENV SQL_EXPORTER_VERSION=0.17.3 RUN curl -fsSL \ "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \ --output sql_exporter.tar.gz \ @@ -292,7 +292,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.86.0 +ENV RUSTC_VERSION=1.87.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index b9299eee90..f4a5593b71 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -582,6 +582,38 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control +######################################################################################### +# +# Layer "online_advisor-build" +# compile online_advisor extension +# 
+######################################################################################### +FROM build-deps AS online_advisor-src +ARG PG_VERSION + +# online_advisor supports all Postgres version starting from PG14, but prior to PG17 has to be included in preload_shared_libraries +# last release 1.0 - May 15, 2025 +WORKDIR /ext-src +RUN case "${PG_VERSION:?}" in \ + "v17") \ + ;; \ + *) \ + echo "skipping the version of online_advistor for $PG_VERSION" && exit 0 \ + ;; \ + esac && \ + wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \ + echo "059b7d9e5a90013a58bdd22e9505b88406ce05790675eb2d8434e5b215652d54 online_advisor.tar.gz" | sha256sum --check && \ + mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C . + +FROM pg-build AS online_advisor-build +COPY --from=online_advisor-src /ext-src/ /ext-src/ +WORKDIR /ext-src/ +RUN if [ -d online_advisor-src ]; then \ + cd online_advisor-src && \ + make -j install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/online_advisor.control; \ + fi + ######################################################################################### # # Layer "pg_hashids-build" @@ -1085,6 +1117,23 @@ RUN cargo install --locked --version 0.12.9 cargo-pgrx && \ USER root +######################################################################################### +# +# Layer "rust extensions pgrx14" +# +# Version 14 is now required by a few +# This layer should be used as a base for new pgrx extensions, +# and eventually get merged with `rust-extensions-build` +# +######################################################################################### +FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14 +ARG PG_VERSION + +RUN cargo install --locked --version 0.14.1 cargo-pgrx && \ + /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + +USER root + 
######################################################################################### # # Layers "pg-onnx-build" and "pgrag-build" @@ -1100,11 +1149,11 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar. mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ echo "#nothing to test here" > neon-test.sh -RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ - echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.2.tar.gz -O pgrag.tar.gz && \ + echo "7361654ea24f08cbb9db13c2ee1c0fe008f6114076401bb871619690dafc5225 pgrag.tar.gz" | sha256sum --check && \ mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . -FROM rust-extensions-build-pgrx12 AS pgrag-build +FROM rust-extensions-build-pgrx14 AS pgrag-build COPY --from=pgrag-src /ext-src/ /ext-src/ # Install build-time dependencies @@ -1124,19 +1173,19 @@ RUN . 
venv/bin/activate && \ WORKDIR /ext-src/pgrag-src RUN cd exts/rag && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control RUN cd exts/rag_bge_small_en_v15 && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ cargo pgrx install --release --features remote_onnx && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control RUN cd exts/rag_jina_reranker_v1_tiny_en && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ cargo pgrx install --release --features remote_onnx && \ @@ -1305,8 +1354,8 @@ ARG PG_VERSION # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs WORKDIR /ext-src -RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \ - echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.1.tar.gz -O pg_session_jwt.tar.gz && \ + echo 
"62fec9e472cb805c53ba24a0765afdb8ea2720cfc03ae7813e61687b36d1b0ad pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ @@ -1319,6 +1368,40 @@ COPY --from=pg_session_jwt-src /ext-src/ /ext-src/ WORKDIR /ext-src/pg_session_jwt-src RUN cargo pgrx install --release +######################################################################################### +# +# Layer "pg-anon-pg-build" +# compile anon extension +# +######################################################################################### +FROM pg-build AS pg_anon-src +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +WORKDIR /ext-src +COPY compute/patches/anon_v2.patch . + +# This is an experimental extension, never got to real production. +# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. +ENV PATH="/usr/local/pgsql/bin/:$PATH" +RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/2.1.0/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \ + echo "48e7f5ae2f1ca516df3da86c5c739d48dd780a4e885705704ccaad0faa89d6c0 pg_anon.tar.gz" | sha256sum --check && \ + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . 
&& \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt && \ + sed -i 's/pgrx = "0.14.1"/pgrx = { version = "=0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + patch -p1 < /ext-src/anon_v2.patch + +FROM rust-extensions-build-pgrx14 AS pg-anon-pg-build +ARG PG_VERSION +COPY --from=pg_anon-src /ext-src/ /ext-src/ +WORKDIR /ext-src +RUN cd pg_anon-src && \ + make -j $(getconf _NPROCESSORS_ONLN) extension PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \ + chmod -R a+r ../pg_anon-src && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; + +######################################################################################## + ######################################################################################### # # Layer "wal2json-build" @@ -1597,6 +1680,7 @@ COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=online_advisor-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1615,6 +1699,7 @@ COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg_partman-build /usr/local/pgsql/ 
/usr/local/pgsql/ COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1699,17 +1784,17 @@ ARG TARGETARCH RUN if [ "$TARGETARCH" = "amd64" ]; then\ postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\ pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\ - sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\ + sql_exporter_sha256='9a41127a493e8bfebfe692bf78c7ed2872a58a3f961ee534d1b0da9ae584aaab';\ else\ postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\ pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\ - sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\ + sql_exporter_sha256='530e6afc77c043497ed965532c4c9dfa873bc2a4f0b3047fad367715c0081d6a';\ fi\ && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\ | tar xzf - --strip-components=1 -C.\ && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\ | tar xzf - --strip-components=1 -C.\ - && curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.0/sql_exporter-0.17.0.linux-${TARGETARCH}.tar.gz\ + && curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.3/sql_exporter-0.17.3.linux-${TARGETARCH}.tar.gz\ | tar xzf - --strip-components=1 -C.\ && echo "${postgres_exporter_sha256} postgres_exporter" | sha256sum -c -\ && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\ @@ -1771,6 +1856,7 @@ COPY --from=pgjwt-src /ext-src/ /ext-src/ COPY --from=pg_graphql-src /ext-src/ /ext-src/ #COPY --from=pg_tiktoken-src /ext-src/ /ext-src/ COPY --from=hypopg-src /ext-src/ /ext-src/ +COPY --from=online_advisor-src /ext-src/ 
/ext-src/ COPY --from=pg_hashids-src /ext-src/ /ext-src/ COPY --from=rum-src /ext-src/ /ext-src/ COPY --from=pgtap-src /ext-src/ /ext-src/ @@ -1919,7 +2005,8 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml # Make the libraries we built available -RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig +COPY --chmod=0666 compute/etc/ld.so.conf.d/00-neon.conf /etc/ld.so.conf.d/00-neon.conf +RUN /sbin/ldconfig # rsyslog config permissions # directory for rsyslogd pid file diff --git a/compute/etc/ld.so.conf.d/00-neon.conf b/compute/etc/ld.so.conf.d/00-neon.conf new file mode 100644 index 0000000000..e8e4bdcd42 --- /dev/null +++ b/compute/etc/ld.so.conf.d/00-neon.conf @@ -0,0 +1 @@ +/usr/local/lib diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index 449e1199d0..e64d907fe4 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -23,6 +23,8 @@ import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', import 'sql_exporter/getpage_prefetches_buffered.libsonnet', import 'sql_exporter/getpage_sync_requests_total.libsonnet', + import 'sql_exporter/compute_getpage_stuck_requests_total.libsonnet', + import 'sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet', import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', import 'sql_exporter/getpage_wait_seconds_count.libsonnet', import 'sql_exporter/getpage_wait_seconds_sum.libsonnet', diff --git a/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet b/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet new file mode 100644 index 0000000000..bc1100c832 --- /dev/null +++ b/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 
'compute_getpage_max_inflight_stuck_time_ms', + type: 'gauge', + help: 'Max wait time for stuck requests among all backends. Includes only active stuck requests, terminated or disconnected ones are not accounted for', + values: [ + 'compute_getpage_max_inflight_stuck_time_ms', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet b/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet new file mode 100644 index 0000000000..5f72f43254 --- /dev/null +++ b/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'compute_getpage_stuck_requests_total', + type: 'counter', + help: 'Total number of Getpage requests left without an answer for more than pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout', + values: [ + 'compute_getpage_stuck_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql index 4a36f3bf2f..39a9d03412 100644 --- a/compute/etc/sql_exporter/neon_perf_counters.sql +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -9,6 +9,8 @@ SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( getpage_wait_seconds_sum numeric, getpage_prefetch_requests_total numeric, getpage_sync_requests_total numeric, + compute_getpage_stuck_requests_total numeric, + compute_getpage_max_inflight_stuck_time_ms numeric, getpage_prefetch_misses_total numeric, getpage_prefetch_discards_total numeric, getpage_prefetches_buffered numeric, diff --git a/compute/patches/anon_v2.patch b/compute/patches/anon_v2.patch new file mode 100644 index 0000000000..e833a6dfd3 --- /dev/null +++ b/compute/patches/anon_v2.patch @@ -0,0 +1,129 @@ +diff --git a/sql/anon.sql b/sql/anon.sql +index 0cdc769..f6cc950 100644 +--- a/sql/anon.sql ++++ b/sql/anon.sql +@@ -1141,3 +1141,8 @@ $$ + -- TODO : 
https://en.wikipedia.org/wiki/L-diversity + + -- TODO : https://en.wikipedia.org/wiki/T-closeness ++ ++-- NEON Patches ++ ++GRANT ALL ON SCHEMA anon to neon_superuser; ++GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser; +diff --git a/sql/init.sql b/sql/init.sql +index 7da6553..9b6164b 100644 +--- a/sql/init.sql ++++ b/sql/init.sql +@@ -74,50 +74,49 @@ $$ + + SECURITY LABEL FOR anon ON FUNCTION anon.load_csv IS 'UNTRUSTED'; + +--- load fake data from a given path +-CREATE OR REPLACE FUNCTION anon.init( +- datapath TEXT +-) ++CREATE OR REPLACE FUNCTION anon.load_fake_data() + RETURNS BOOLEAN + AS $$ + DECLARE +- datapath_check TEXT; + success BOOLEAN; ++ sharedir TEXT; ++ datapath TEXT; + BEGIN + +- IF anon.is_initialized() THEN +- RAISE NOTICE 'The anon extension is already initialized.'; +- RETURN TRUE; +- END IF; ++ datapath := '/extension/anon/'; ++ -- find the local extension directory ++ SELECT setting INTO sharedir ++ FROM pg_catalog.pg_config ++ WHERE name = 'SHAREDIR'; + + SELECT bool_or(results) INTO success + FROM unnest(array[ +- anon.load_csv('anon.identifiers_category',datapath||'/identifiers_category.csv'), +- anon.load_csv('anon.identifier',datapath ||'/identifier.csv'), +- anon.load_csv('anon.address',datapath ||'/address.csv'), +- anon.load_csv('anon.city',datapath ||'/city.csv'), +- anon.load_csv('anon.company',datapath ||'/company.csv'), +- anon.load_csv('anon.country',datapath ||'/country.csv'), +- anon.load_csv('anon.email', datapath ||'/email.csv'), +- anon.load_csv('anon.first_name',datapath ||'/first_name.csv'), +- anon.load_csv('anon.iban',datapath ||'/iban.csv'), +- anon.load_csv('anon.last_name',datapath ||'/last_name.csv'), +- anon.load_csv('anon.postcode',datapath ||'/postcode.csv'), +- anon.load_csv('anon.siret',datapath ||'/siret.csv'), +- anon.load_csv('anon.lorem_ipsum',datapath ||'/lorem_ipsum.csv') ++ anon.load_csv('anon.identifiers_category',sharedir || datapath || '/identifiers_category.csv'), ++ 
anon.load_csv('anon.identifier',sharedir || datapath || '/identifier.csv'), ++ anon.load_csv('anon.address',sharedir || datapath || '/address.csv'), ++ anon.load_csv('anon.city',sharedir || datapath || '/city.csv'), ++ anon.load_csv('anon.company',sharedir || datapath || '/company.csv'), ++ anon.load_csv('anon.country',sharedir || datapath || '/country.csv'), ++ anon.load_csv('anon.email', sharedir || datapath || '/email.csv'), ++ anon.load_csv('anon.first_name',sharedir || datapath || '/first_name.csv'), ++ anon.load_csv('anon.iban',sharedir || datapath || '/iban.csv'), ++ anon.load_csv('anon.last_name',sharedir || datapath || '/last_name.csv'), ++ anon.load_csv('anon.postcode',sharedir || datapath || '/postcode.csv'), ++ anon.load_csv('anon.siret',sharedir || datapath || '/siret.csv'), ++ anon.load_csv('anon.lorem_ipsum',sharedir || datapath || '/lorem_ipsum.csv') + ]) results; + RETURN success; +- + END; + $$ +- LANGUAGE PLPGSQL ++ LANGUAGE plpgsql + VOLATILE + RETURNS NULL ON NULL INPUT +- PARALLEL UNSAFE -- because load_csv is unsafe +- SECURITY INVOKER ++ PARALLEL UNSAFE -- because of the EXCEPTION ++ SECURITY DEFINER + SET search_path='' + ; +-SECURITY LABEL FOR anon ON FUNCTION anon.init(TEXT) IS 'UNTRUSTED'; ++ ++SECURITY LABEL FOR anon ON FUNCTION anon.load_fake_data IS 'UNTRUSTED'; + + -- People tend to forget the anon.init() step + -- This is a friendly notice for them +@@ -144,7 +143,7 @@ SECURITY LABEL FOR anon ON FUNCTION anon.notice_if_not_init IS 'UNTRUSTED'; + CREATE OR REPLACE FUNCTION anon.load(TEXT) + RETURNS BOOLEAN AS + $$ +- SELECT anon.init($1); ++ SELECT anon.init(); + $$ + LANGUAGE SQL + VOLATILE +@@ -159,16 +158,16 @@ SECURITY LABEL FOR anon ON FUNCTION anon.load(TEXT) IS 'UNTRUSTED'; + CREATE OR REPLACE FUNCTION anon.init() + RETURNS BOOLEAN + AS $$ +- WITH conf AS ( +- -- find the local extension directory +- SELECT setting AS sharedir +- FROM pg_catalog.pg_config +- WHERE name = 'SHAREDIR' +- ) +- SELECT anon.init(conf.sharedir || 
'/extension/anon/') +- FROM conf; ++BEGIN ++ IF anon.is_initialized() THEN ++ RAISE NOTICE 'The anon extension is already initialized.'; ++ RETURN TRUE; ++ END IF; ++ ++ RETURN anon.load_fake_data(); ++END; + $$ +- LANGUAGE SQL ++ LANGUAGE plpgsql + VOLATILE + PARALLEL UNSAFE -- because init is unsafe + SECURITY INVOKER diff --git a/compute/patches/rum.patch b/compute/patches/rum.patch index b45afe2874..aed1badc13 100644 --- a/compute/patches/rum.patch +++ b/compute/patches/rum.patch @@ -7,7 +7,7 @@ index 255e616..1c6edb7 100644 RelationGetRelationName(index)); +#ifdef NEON_SMGR -+ smgr_start_unlogged_build(index->rd_smgr); ++ smgr_start_unlogged_build(RelationGetSmgr(index)); +#endif + initRumState(&buildstate.rumstate, index); @@ -18,7 +18,7 @@ index 255e616..1c6edb7 100644 rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild); +#ifdef NEON_SMGR -+ smgr_finish_unlogged_build_phase_1(index->rd_smgr); ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); +#endif + /* @@ -29,7 +29,7 @@ index 255e616..1c6edb7 100644 } +#ifdef NEON_SMGR -+ smgr_end_unlogged_build(index->rd_smgr); ++ smgr_end_unlogged_build(RelationGetSmgr(index)); +#endif + /* diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index ec24d73242..057099994a 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -22,7 +22,7 @@ commands: - name: local_proxy user: postgres sysvInitAction: respawn - shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' - name: postgres-exporter user: nobody sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 
b40bdecebc..d048e20b2e 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -22,7 +22,7 @@ commands: - name: local_proxy user: postgres sysvInitAction: respawn - shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' - name: postgres-exporter user: nobody sysvInitAction: respawn diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8c1e7ad149..f9da3ba700 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -10,6 +10,7 @@ default = [] testing = ["fail/failpoints"] [dependencies] +async-compression.workspace = true base64.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true @@ -27,6 +28,7 @@ flate2.workspace = true futures.workspace = true http.workspace = true indexmap.workspace = true +itertools.workspace = true jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e337ee7b15..20b5e567a8 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -60,12 +60,16 @@ use utils::failpoint_support; // Compatibility hack: if the control plane specified any remote-ext-config // use the default value for extension storage proxy gateway. 
// Remove this once the control plane is updated to pass the gateway URL -fn parse_remote_ext_config(arg: &str) -> Result { - if arg.starts_with("http") { - Ok(arg.trim_end_matches('/').to_string()) +fn parse_remote_ext_base_url(arg: &str) -> Result { + const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str = + "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"; + + Ok(if arg.starts_with("http") { + arg } else { - Ok("http://pg-ext-s3-gateway".to_string()) + FALLBACK_PG_EXT_GATEWAY_BASE_URL } + .to_owned()) } #[derive(Parser)] @@ -74,8 +78,10 @@ struct Cli { #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] pub pgbin: String, - #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] - pub remote_ext_config: Option, + /// The base URL for the remote extension storage proxy gateway. + /// Should be in the form of `http(s)://[:]`. + #[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")] + pub remote_ext_base_url: Option, /// The port to bind the external listening HTTP server to. Clients running /// outside the compute will talk to the compute through this port. 
Keep @@ -164,7 +170,7 @@ fn main() -> Result<()> { pgversion: get_pg_version_string(&cli.pgbin), external_http_port: cli.external_http_port, internal_http_port: cli.internal_http_port, - ext_remote_storage: cli.remote_ext_config.clone(), + remote_ext_base_url: cli.remote_ext_base_url.clone(), resize_swap_on_bind: cli.resize_swap_on_bind, set_disk_quota_for_fs: cli.set_disk_quota_for_fs, #[cfg(target_os = "linux")] @@ -265,4 +271,18 @@ mod test { fn verify_cli() { Cli::command().debug_assert() } + + #[test] + fn parse_pg_ext_gateway_base_url() { + let arg = "http://pg-ext-s3-gateway2"; + let result = super::parse_remote_ext_base_url(arg).unwrap(); + assert_eq!(result, arg); + + let arg = "pg-ext-s3-gateway"; + let result = super::parse_remote_ext_base_url(arg).unwrap(); + assert_eq!( + result, + "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local" + ); + } } diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 537028cde1..78acd78585 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -348,6 +348,7 @@ async fn run_dump_restore( "--no-security-labels".to_string(), "--no-subscriptions".to_string(), "--no-tablespaces".to_string(), + "--no-event-triggers".to_string(), // format "--format".to_string(), "directory".to_string(), diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 8834f0d63d..f494e2444a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,4 +1,26 @@ -use std::collections::HashMap; +use anyhow::{Context, Result}; +use chrono::{DateTime, Utc}; +use compute_api::privilege::Privilege; +use compute_api::responses::{ + ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState, + LfcPrewarmState, +}; +use compute_api::spec::{ + ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, +}; +use futures::StreamExt; +use futures::future::join_all; +use 
futures::stream::FuturesUnordered; +use itertools::Itertools; +use nix::sys::signal::{Signal, kill}; +use nix::unistd::Pid; +use once_cell::sync::Lazy; +use postgres; +use postgres::NoTls; +use postgres::error::SqlState; +use remote_storage::{DownloadError, RemotePath}; +use std::collections::{HashMap, HashSet}; +use std::net::SocketAddr; use std::os::unix::fs::{PermissionsExt, symlink}; use std::path::Path; use std::process::{Command, Stdio}; @@ -7,24 +29,6 @@ use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Condvar, Mutex, RwLock}; use std::time::{Duration, Instant}; use std::{env, fs}; - -use anyhow::{Context, Result}; -use chrono::{DateTime, Utc}; -use compute_api::privilege::Privilege; -use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus}; -use compute_api::spec::{ - ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, -}; -use futures::StreamExt; -use futures::future::join_all; -use futures::stream::FuturesUnordered; -use nix::sys::signal::{Signal, kill}; -use nix::unistd::Pid; -use once_cell::sync::Lazy; -use postgres; -use postgres::NoTls; -use postgres::error::SqlState; -use remote_storage::{DownloadError, RemotePath}; use tokio::spawn; use tracing::{Instrument, debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; @@ -92,7 +96,7 @@ pub struct ComputeNodeParams { pub internal_http_port: u16, /// the address of extension storage proxy gateway - pub ext_remote_storage: Option, + pub remote_ext_base_url: Option, } /// Compute node info shared across several `compute_ctl` threads. @@ -150,6 +154,9 @@ pub struct ComputeState { /// set up the span relationship ourselves. 
pub startup_span: Option, + pub lfc_prewarm_state: LfcPrewarmState, + pub lfc_offload_state: LfcOffloadState, + pub metrics: ComputeMetrics, } @@ -163,6 +170,8 @@ impl ComputeState { pspec: None, startup_span: None, metrics: ComputeMetrics::default(), + lfc_prewarm_state: LfcPrewarmState::default(), + lfc_offload_state: LfcOffloadState::default(), } } @@ -198,6 +207,8 @@ pub struct ParsedSpec { pub pageserver_connstr: String, pub safekeeper_connstrings: Vec, pub storage_auth_token: Option, + pub endpoint_storage_addr: Option, + pub endpoint_storage_token: Option, } impl TryFrom for ParsedSpec { @@ -251,6 +262,18 @@ impl TryFrom for ParsedSpec { .or(Err("invalid timeline id"))? }; + let endpoint_storage_addr: Option = spec + .endpoint_storage_addr + .clone() + .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr")) + .unwrap_or_default() + .parse() + .ok(); + let endpoint_storage_token = spec + .endpoint_storage_token + .clone() + .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_token")); + Ok(ParsedSpec { spec, pageserver_connstr, @@ -258,6 +281,8 @@ impl TryFrom for ParsedSpec { storage_auth_token, tenant_id, timeline_id, + endpoint_storage_addr, + endpoint_storage_token, }) } } @@ -305,11 +330,39 @@ struct StartVmMonitorResult { impl ComputeNode { pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result { let connstr = params.connstr.as_str(); - let conn_conf = postgres::config::Config::from_str(connstr) + let mut conn_conf = postgres::config::Config::from_str(connstr) .context("cannot build postgres config from connstr")?; - let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) + let mut tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr) .context("cannot build tokio postgres config from connstr")?; + // Users can set some configuration parameters per database with + // ALTER DATABASE ... SET ... 
+ // + // There are at least these parameters: + // + // - role=some_other_role + // - default_transaction_read_only=on + // - statement_timeout=1, i.e., 1ms, which will cause most of the queries to fail + // - search_path=non_public_schema, this should be actually safe because + // we don't call any functions in user databases, but better to always reset + // it to public. + // + // that can affect `compute_ctl` and prevent it from properly configuring the database schema. + // Unset them via connection string options before connecting to the database. + // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. + // + // TODO(ololobus): we currently pass `-c default_transaction_read_only=off` from control plane + // as well. After rolling out this code, we can remove this parameter from control plane. + // In the meantime, double-passing is fine, the last value is applied. + // See: + const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + let options = match conn_conf.get_options() { + Some(options) => format!("{} {}", options, EXTRA_OPTIONS), + None => EXTRA_OPTIONS.to_string(), + }; + conn_conf.options(&options); + tokio_conn_conf.options(&options); + let mut new_state = ComputeState::new(); if let Some(spec) = config.spec { let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; @@ -736,6 +789,9 @@ impl ComputeNode { // Log metrics so that we can search for slow operations in logs info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished"); + if pspec.spec.prewarm_lfc_on_startup { + self.prewarm_lfc(); + } Ok(()) } @@ -1422,15 +1478,20 @@ impl ComputeNode { Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { - // Connect with zenith_admin if cloud_admin could not authenticate + // Connect with `zenith_admin` if `cloud_admin` could not authenticate info!( - 
"cannot connect to postgres: {}, retrying with `zenith_admin` username", + "cannot connect to Postgres: {}, retrying with 'zenith_admin' username", e ); let mut zenith_admin_conf = postgres::config::Config::from(conf.clone()); zenith_admin_conf.application_name("compute_ctl:apply_config"); zenith_admin_conf.user("zenith_admin"); + // It doesn't matter what were the options before, here we just want + // to connect and create a new superuser role. + const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + zenith_admin_conf.options(ZENITH_OPTIONS); + let mut client = zenith_admin_conf.connect(NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; @@ -1596,9 +1657,7 @@ impl ComputeNode { self.pg_reload_conf()?; if spec.mode == ComputeMode::Primary { - let mut conf = - tokio_postgres::Config::from_str(self.params.connstr.as_str()).unwrap(); - conf.application_name("apply_config"); + let conf = self.get_tokio_conn_conf(Some("compute_ctl:reconfigure")); let conf = Arc::new(conf); let spec = Arc::new(spec.clone()); @@ -1838,9 +1897,9 @@ LIMIT 100", real_ext_name: String, ext_path: RemotePath, ) -> Result { - let ext_remote_storage = + let remote_ext_base_url = self.params - .ext_remote_storage + .remote_ext_base_url .as_ref() .ok_or(DownloadError::BadInput(anyhow::anyhow!( "Remote extensions storage is not configured", @@ -1902,7 +1961,7 @@ LIMIT 100", let download_size = extension_server::download_extension( &real_ext_name, &ext_path, - ext_remote_storage, + remote_ext_base_url, &self.params.pgbin, ) .await @@ -1937,23 +1996,40 @@ LIMIT 100", tokio::spawn(conn); // TODO: support other types of grants apart from schemas? - let query = format!( - "GRANT {} ON SCHEMA {} TO {}", - privileges - .iter() - // should not be quoted as it's part of the command. 
- // is already sanitized so it's ok - .map(|p| p.as_str()) - .collect::>() - .join(", "), - // quote the schema and role name as identifiers to sanitize them. - schema_name.pg_quote(), - role_name.pg_quote(), - ); - db_client - .simple_query(&query) + + // check the role grants first - to gracefully handle read-replicas. + let select = "SELECT privilege_type + FROM pg_namespace + JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true + JOIN pg_user users ON acl.grantee = users.usesysid + WHERE users.usename = $1 + AND nspname = $2"; + let rows = db_client + .query(select, &[role_name, schema_name]) .await - .with_context(|| format!("Failed to execute query: {}", query))?; + .with_context(|| format!("Failed to execute query: {select}"))?; + + let already_granted: HashSet = rows.into_iter().map(|row| row.get(0)).collect(); + + let grants = privileges + .iter() + .filter(|p| !already_granted.contains(p.as_str())) + // should not be quoted as it's part of the command. + // is already sanitized so it's ok + .map(|p| p.as_str()) + .join(", "); + + if !grants.is_empty() { + // quote the schema and role name as identifiers to sanitize them. 
+ let schema_name = schema_name.pg_quote(); + let role_name = role_name.pg_quote(); + + let query = format!("GRANT {grants} ON SCHEMA {schema_name} TO {role_name}",); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + } Ok(()) } @@ -2011,7 +2087,7 @@ LIMIT 100", &self, spec: &ComputeSpec, ) -> Result { - if self.params.ext_remote_storage.is_none() { + if self.params.remote_ext_base_url.is_none() { return Ok(RemoteExtensionMetrics { num_ext_downloaded: 0, largest_ext_size: 0, diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs new file mode 100644 index 0000000000..a6a84b3f1f --- /dev/null +++ b/compute_tools/src/compute_prewarm.rs @@ -0,0 +1,202 @@ +use crate::compute::ComputeNode; +use anyhow::{Context, Result, bail}; +use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder}; +use compute_api::responses::LfcOffloadState; +use compute_api::responses::LfcPrewarmState; +use http::StatusCode; +use reqwest::Client; +use std::sync::Arc; +use tokio::{io::AsyncReadExt, spawn}; +use tracing::{error, info}; + +#[derive(serde::Serialize, Default)] +pub struct LfcPrewarmStateWithProgress { + #[serde(flatten)] + base: LfcPrewarmState, + total: i32, + prewarmed: i32, + skipped: i32, +} + +/// A pair of url and a token to query endpoint storage for LFC prewarm-related tasks +struct EndpointStoragePair { + url: String, + token: String, +} + +const KEY: &str = "lfc_state"; +impl TryFrom<&crate::compute::ParsedSpec> for EndpointStoragePair { + type Error = anyhow::Error; + fn try_from(pspec: &crate::compute::ParsedSpec) -> Result { + let Some(ref endpoint_id) = pspec.spec.endpoint_id else { + bail!("pspec.endpoint_id missing") + }; + let Some(ref base_uri) = pspec.endpoint_storage_addr else { + bail!("pspec.endpoint_storage_addr missing") + }; + let tenant_id = pspec.tenant_id; + let timeline_id = pspec.timeline_id; + + let url = 
format!("http://{base_uri}/{tenant_id}/{timeline_id}/{endpoint_id}/{KEY}"); + let Some(ref token) = pspec.endpoint_storage_token else { + bail!("pspec.endpoint_storage_token missing") + }; + let token = token.clone(); + Ok(EndpointStoragePair { url, token }) + } +} + +impl ComputeNode { + // If prewarm failed, we want to get overall number of segments as well as done ones. + // However, this function should be reliable even if querying postgres failed. + pub async fn lfc_prewarm_state(&self) -> LfcPrewarmStateWithProgress { + info!("requesting LFC prewarm state from postgres"); + let mut state = LfcPrewarmStateWithProgress::default(); + { + state.base = self.state.lock().unwrap().lfc_prewarm_state.clone(); + } + + let client = match ComputeNode::get_maintenance_client(&self.tokio_conn_conf).await { + Ok(client) => client, + Err(err) => { + error!(%err, "connecting to postgres"); + return state; + } + }; + let row = match client + .query_one("select * from get_prewarm_info()", &[]) + .await + { + Ok(row) => row, + Err(err) => { + error!(%err, "querying LFC prewarm status"); + return state; + } + }; + state.total = row.try_get(0).unwrap_or_default(); + state.prewarmed = row.try_get(1).unwrap_or_default(); + state.skipped = row.try_get(2).unwrap_or_default(); + state + } + + pub fn lfc_offload_state(&self) -> LfcOffloadState { + self.state.lock().unwrap().lfc_offload_state.clone() + } + + /// Returns false if there is a prewarm request ongoing, true otherwise + pub fn prewarm_lfc(self: &Arc) -> bool { + crate::metrics::LFC_PREWARM_REQUESTS.inc(); + { + let state = &mut self.state.lock().unwrap().lfc_prewarm_state; + if let LfcPrewarmState::Prewarming = + std::mem::replace(state, LfcPrewarmState::Prewarming) + { + return false; + } + } + + let cloned = self.clone(); + spawn(async move { + let Err(err) = cloned.prewarm_impl().await else { + cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed; + return; + }; + error!(%err); + 
cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed { + error: err.to_string(), + }; + }); + true + } + + fn endpoint_storage_pair(&self) -> Result { + let state = self.state.lock().unwrap(); + state.pspec.as_ref().context("pspec missing")?.try_into() + } + + async fn prewarm_impl(&self) -> Result<()> { + let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?; + info!(%url, "requesting LFC state from endpoint storage"); + + let request = Client::new().get(&url).bearer_auth(token); + let res = request.send().await.context("querying endpoint storage")?; + let status = res.status(); + if status != StatusCode::OK { + bail!("{status} querying endpoint storage") + } + + let mut uncompressed = Vec::new(); + let lfc_state = res + .bytes() + .await + .context("getting request body from endpoint storage")?; + ZstdDecoder::new(lfc_state.iter().as_slice()) + .read_to_end(&mut uncompressed) + .await + .context("decoding LFC state")?; + let uncompressed_len = uncompressed.len(); + info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into postgres"); + + ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")? 
+ .query_one("select prewarm_local_cache($1)", &[&uncompressed]) + .await + .context("loading LFC state into postgres") + .map(|_| ()) + } + + /// Returns false if there is an offload request ongoing, true otherwise + pub fn offload_lfc(self: &Arc) -> bool { + crate::metrics::LFC_OFFLOAD_REQUESTS.inc(); + { + let state = &mut self.state.lock().unwrap().lfc_offload_state; + if let LfcOffloadState::Offloading = + std::mem::replace(state, LfcOffloadState::Offloading) + { + return false; + } + } + + let cloned = self.clone(); + spawn(async move { + let Err(err) = cloned.offload_lfc_impl().await else { + cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; + return; + }; + error!(%err); + cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { + error: err.to_string(), + }; + }); + true + } + + async fn offload_lfc_impl(&self) -> Result<()> { + let EndpointStoragePair { url, token } = self.endpoint_storage_pair()?; + info!(%url, "requesting LFC state from postgres"); + + let mut compressed = Vec::new(); + ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")? + .query_one("select get_local_cache_state()", &[]) + .await + .context("querying LFC state")? + .try_get::(0) + .context("deserializing LFC state") + .map(ZstdEncoder::new)? 
+ .read_to_end(&mut compressed) + .await + .context("compressing LFC state")?; + let compressed_len = compressed.len(); + info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage"); + + let request = Client::new().put(url).bearer_auth(token).body(compressed); + match request.send().await { + Ok(res) if res.status() == StatusCode::OK => Ok(()), + Ok(res) => bail!("Error writing to endpoint storage: {}", res.status()), + Err(err) => Err(err).context("writing to endpoint storage"), + } + } +} diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 71c6123c3b..933b30134f 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -223,6 +223,12 @@ pub fn write_postgres_conf( // TODO: tune this after performance testing writeln!(file, "pgaudit.log_rotation_age=5")?; + // Enable audit logs for pg_session_jwt extension + // TODO: Consider a good approach for shipping pg_session_jwt logs to the same sink as + // pgAudit - additional context in https://github.com/neondatabase/cloud/issues/28863 + // + // writeln!(file, "pg_session_jwt.audit_log=on")?; + // Add audit shared_preload_libraries, if they are not present. 
// // The caller who sets the flag is responsible for ensuring that the necessary diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index ee889e0c40..3439383699 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -158,14 +158,14 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion { pub async fn download_extension( ext_name: &str, ext_path: &RemotePath, - ext_remote_storage: &str, + remote_ext_base_url: &str, pgbin: &str, ) -> Result { info!("Download extension {:?} from {:?}", ext_name, ext_path); // TODO add retry logic let download_buffer = - match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await { + match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await { Ok(buffer) => buffer, Err(error_message) => { return Err(anyhow::anyhow!( @@ -272,8 +272,8 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { // Do request to extension storage proxy, e.g., // curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst // using HTTP GET and return the response body as bytes. 
-async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { - let uri = format!("{}/{}", ext_remote_storage, ext_path); +async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result { + let uri = format!("{}/{}", remote_ext_base_url, ext_path); let filename = Path::new(ext_path) .file_name() .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index 2d0f411d7a..a82f46e062 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -1,12 +1,10 @@ -use std::collections::HashSet; - use anyhow::{Result, anyhow}; use axum::{RequestExt, body::Body}; use axum_extra::{ TypedHeader, headers::{Authorization, authorization::Bearer}, }; -use compute_api::requests::ComputeClaims; +use compute_api::requests::{COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope}; use futures::future::BoxFuture; use http::{Request, Response, StatusCode}; use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet}; @@ -25,13 +23,14 @@ pub(in crate::http) struct Authorize { impl Authorize { pub fn new(compute_id: String, jwks: JwkSet) -> Self { let mut validation = Validation::new(Algorithm::EdDSA); - // Nothing is currently required - validation.required_spec_claims = HashSet::new(); validation.validate_exp = true; // Unused by the control plane - validation.validate_aud = false; - // Unused by the control plane validation.validate_nbf = false; + // Unused by the control plane + validation.validate_aud = false; + validation.set_audience(&[COMPUTE_AUDIENCE]); + // Nothing is currently required + validation.set_required_spec_claims(&[] as &[&str; 0]); Self { compute_id, @@ -64,11 +63,47 @@ impl AsyncAuthorizeRequest for Authorize { Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)), }; - if data.claims.compute_id != compute_id { - return 
Err(JsonResponse::error( - StatusCode::UNAUTHORIZED, - "invalid compute ID in authorization token claims", - )); + match data.claims.scope { + // TODO: We should validate audience for every token, but + // instead of this ad-hoc validation, we should turn + // [`Validation::validate_aud`] on. This is merely a stopgap + // while we roll out `aud` deployment. We return a 401 + // Unauthorized because when we eventually do use + // [`Validation`], we will hit the above `Err` match arm which + // returns 401 Unauthorized. + Some(ComputeClaimsScope::Admin) => { + let Some(ref audience) = data.claims.audience else { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "missing audience in authorization token claims", + )); + }; + + if !audience.iter().any(|a| a == COMPUTE_AUDIENCE) { + return Err(JsonResponse::error( + StatusCode::UNAUTHORIZED, + "invalid audience in authorization token claims", + )); + } + } + + // If the scope is not [`ComputeClaimsScope::Admin`], then we + // must validate the compute_id + _ => { + let Some(ref claimed_compute_id) = data.claims.compute_id else { + return Err(JsonResponse::error( + StatusCode::FORBIDDEN, + "missing compute_id in authorization token claims", + )); + }; + + if *claimed_compute_id != compute_id { + return Err(JsonResponse::error( + StatusCode::FORBIDDEN, + "invalid compute ID in authorization token claims", + )); + } + } } // Make claims available to any subsequent middleware or request diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index 6508de6eee..e141a48b7f 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -22,7 +22,7 @@ pub(in crate::http) async fn download_extension( State(compute): State>, ) -> Response { // Don't even try to download extensions if no remote storage is configured - if compute.params.ext_remote_storage.is_none() { + if 
compute.params.remote_ext_base_url.is_none() { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "remote storage is not configured", diff --git a/compute_tools/src/http/routes/lfc.rs b/compute_tools/src/http/routes/lfc.rs new file mode 100644 index 0000000000..07bcc6bfb7 --- /dev/null +++ b/compute_tools/src/http/routes/lfc.rs @@ -0,0 +1,39 @@ +use crate::compute_prewarm::LfcPrewarmStateWithProgress; +use crate::http::JsonResponse; +use axum::response::{IntoResponse, Response}; +use axum::{Json, http::StatusCode}; +use compute_api::responses::LfcOffloadState; +type Compute = axum::extract::State>; + +pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json { + Json(compute.lfc_prewarm_state().await) +} + +// Following functions are marked async for axum, as it's more convenient than wrapping these +// in async lambdas at call site + +pub(in crate::http) async fn offload_state(compute: Compute) -> Json { + Json(compute.lfc_offload_state()) +} + +pub(in crate::http) async fn prewarm(compute: Compute) -> Response { + if compute.prewarm_lfc() { + StatusCode::ACCEPTED.into_response() + } else { + JsonResponse::error( + StatusCode::TOO_MANY_REQUESTS, + "Multiple requests for prewarm are not allowed", + ) + } +} + +pub(in crate::http) async fn offload(compute: Compute) -> Response { + if compute.offload_lfc() { + StatusCode::ACCEPTED.into_response() + } else { + JsonResponse::error( + StatusCode::TOO_MANY_REQUESTS, + "Multiple requests for prewarm offload are not allowed", + ) + } +} diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs index a67be7fd5a..432e66a830 100644 --- a/compute_tools/src/http/routes/mod.rs +++ b/compute_tools/src/http/routes/mod.rs @@ -11,6 +11,7 @@ pub(in crate::http) mod extensions; pub(in crate::http) mod failpoints; pub(in crate::http) mod grants; pub(in crate::http) mod insights; +pub(in crate::http) mod lfc; pub(in crate::http) mod metrics; pub(in crate::http) mod metrics_json; pub(in 
crate::http) mod status; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 10f767e97c..d5d2427971 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -23,7 +23,7 @@ use super::{ middleware::authorize::Authorize, routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, insights, metrics, metrics_json, status, terminate, + grants, insights, lfc, metrics, metrics_json, status, terminate, }, }; use crate::compute::ComputeNode; @@ -85,6 +85,8 @@ impl From<&Server> for Router> { Router::>::new().route("/metrics", get(metrics::get_metrics)); let authenticated_router = Router::>::new() + .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm)) + .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload)) .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) .route("/database_schema", get(database_schema::get_schema_dump)) diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index a681fad0b0..7218067a8a 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -11,6 +11,7 @@ pub mod http; pub mod logger; pub mod catalog; pub mod compute; +pub mod compute_prewarm; pub mod disk_quota; pub mod extension_server; pub mod installed_extensions; diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index e37d6120ac..90326b2074 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,7 +1,7 @@ use metrics::core::{AtomicF64, AtomicU64, Collector, GenericCounter, GenericGauge}; use metrics::proto::MetricFamily; use metrics::{ - IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter, + IntCounter, IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter, register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec, }; use once_cell::sync::Lazy; @@ -97,6 
+97,24 @@ pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy> = Lazy:: .expect("failed to define a metric") }); +/// Needed as neon.file_cache_prewarm_batch == 0 doesn't mean we never tried to prewarm. +/// On the other hand, LFC_PREWARMED_PAGES is excessive as we can GET /lfc/prewarm +pub(crate) static LFC_PREWARM_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter!( + "compute_ctl_lfc_prewarm_requests_total", + "Total number of LFC prewarm requests made by compute_ctl", + ) + .expect("failed to define a metric") +}); + +pub(crate) static LFC_OFFLOAD_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter!( + "compute_ctl_lfc_offload_requests_total", + "Total number of LFC offload requests made by compute_ctl", + ) + .expect("failed to define a metric") +}); + pub fn collect() -> Vec { let mut metrics = COMPUTE_CTL_UP.collect(); metrics.extend(INSTALLED_EXTENSIONS.collect()); @@ -106,5 +124,7 @@ pub fn collect() -> Vec { metrics.extend(AUDIT_LOG_DIR_SIZE.collect()); metrics.extend(PG_CURR_DOWNTIME_MS.collect()); metrics.extend(PG_TOTAL_DOWNTIME_MS.collect()); + metrics.extend(LFC_PREWARM_REQUESTS.collect()); + metrics.extend(LFC_OFFLOAD_REQUESTS.collect()); metrics } diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 5a07eec833..3311ee47b3 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -424,10 +424,10 @@ pub fn launch_monitor(compute: &Arc) -> thread::JoinHandle<()> { experimental, }; - let span = span!(Level::INFO, "compute_monitor"); thread::Builder::new() .name("compute-monitor".into()) .spawn(move || { + let span = span!(Level::INFO, "compute_monitor"); let _enter = span.enter(); monitor.run(); }) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 10d8f2c878..94467a0d2f 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -213,8 +213,10 @@ impl Escaping for PgIdent { // Find the first suitable tag that is not present in the string. 
// Postgres' max role/DB name length is 63 bytes, so even in the - // worst case it won't take long. - while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) { + // worst case it won't take long. Outer tag is always `tag + "x"`, + // so if `tag` is not present in the string, `outer_tag` is not + // present in the string either. + while self.contains(&tag.to_string()) { tag += "x"; outer_tag = tag.clone() + "x"; } diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index 7be97046a0..c873697623 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -27,6 +27,40 @@ fn get_rsyslog_pid() -> Option { } } +fn wait_for_rsyslog_pid() -> Result { + const MAX_WAIT: Duration = Duration::from_secs(5); + const INITIAL_SLEEP: Duration = Duration::from_millis(2); + + let mut sleep_duration = INITIAL_SLEEP; + let start = std::time::Instant::now(); + let mut attempts = 1; + + for attempt in 1.. { + attempts = attempt; + match get_rsyslog_pid() { + Some(pid) => return Ok(pid), + None => { + if start.elapsed() >= MAX_WAIT { + break; + } + info!( + "rsyslogd is not running, attempt {}. Sleeping for {} ms", + attempt, + sleep_duration.as_millis() + ); + std::thread::sleep(sleep_duration); + sleep_duration *= 2; + } + } + } + + Err(anyhow::anyhow!( + "rsyslogd is not running after waiting for {} seconds and {} attempts", + start.elapsed().as_secs(), + attempts + )) +} + // Restart rsyslogd to apply the new configuration. // This is necessary, because there is no other way to reload the rsyslog configuration. 
// @@ -36,14 +70,14 @@ fn get_rsyslog_pid() -> Option { // TODO: test it properly // fn restart_rsyslog() -> Result<()> { - let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?; - info!("rsyslogd is running with pid: {}, restart it", old_pid); - // kill it to restart let _ = Command::new("pkill") .arg("rsyslogd") .output() - .context("Failed to stop rsyslogd")?; + .context("Failed to restart rsyslogd")?; + + // ensure rsyslogd is running + wait_for_rsyslog_pid()?; Ok(()) } @@ -131,15 +165,11 @@ pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result return Ok(()); } - // When new config is empty we can simply remove the configuration file. + // Nothing to configure if new_config.is_empty() { - info!("removing rsyslog config file: {}", POSTGRES_LOGS_CONF_PATH); - match std::fs::remove_file(POSTGRES_LOGS_CONF_PATH) { - Ok(_) => {} - Err(err) if err.kind() == ErrorKind::NotFound => {} - Err(err) => return Err(err.into()), - } - restart_rsyslog()?; + // When the configuration is removed, PostgreSQL will stop sending data + // to the files watched by rsyslog, so restarting rsyslog is more effort + // than just ignoring this change. 
return Ok(()); } diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index b72c1293ee..04b6ed2256 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -30,6 +30,7 @@ mod pg_helpers_tests { r#"fsync = off wal_level = logical hot_standby = on +prewarm_lfc_on_startup = off neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501' wal_log_hints = on log_connections = on @@ -70,6 +71,14 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor ("name$$$", ("$x$name$$$$x$", "xx")), ("name$$$$", ("$x$name$$$$$x$", "xx")), ("name$x$", ("$xx$name$x$$xx$", "xxx")), + ("x", ("$xx$x$xx$", "xxx")), + ("xx", ("$xxx$xx$xxx$", "xxxx")), + ("$x", ("$xx$$x$xx$", "xxx")), + ("x$", ("$xx$x$$xx$", "xxx")), + ("$x$", ("$xx$$x$$xx$", "xxx")), + ("xx$", ("$xxx$xx$$xxx$", "xxxx")), + ("$xx", ("$xxx$$xx$xxx$", "xxxx")), + ("$xx$", ("$xxx$$xx$$xxx$", "xxxx")), ]; for (input, expected) in test_cases { diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 92f0071bac..62c039047f 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -41,7 +41,7 @@ storage_broker.workspace = true http-utils.workspace = true utils.workspace = true whoami.workspace = true - +endpoint_storage.workspace = true compute_api.workspace = true workspace_hack.workspace = true tracing.workspace = true diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 1eac4f7ff0..4f0934e411 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -14,7 +14,7 @@ use std::ffi::OsStr; use std::io::Write; -use std::os::unix::prelude::AsRawFd; +use std::os::fd::AsFd; use std::os::unix::process::CommandExt; use std::path::Path; use std::process::Command; @@ -356,7 +356,7 @@ where let file = pid_file::claim_for_current_process(&path).expect("claim pid file"); // Remove the FD_CLOEXEC flag on the 
pidfile descriptor so that the pidfile // remains locked after exec. - nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty())) + nix::fcntl::fcntl(file.as_fd(), FcntlArg::F_SETFD(FdFlag::empty())) .expect("remove FD_CLOEXEC"); // Don't run drop(file), it would close the file before we actually exec. std::mem::forget(file); diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 6f55c0310f..98ab6e5657 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,7 +8,6 @@ use std::borrow::Cow; use std::collections::{BTreeSet, HashMap}; use std::fs::File; -use std::os::fd::AsRawFd; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; @@ -16,10 +15,11 @@ use std::time::Duration; use anyhow::{Context, Result, anyhow, bail}; use clap::Parser; +use compute_api::requests::ComputeClaimsScope; use compute_api::spec::ComputeMode; use control_plane::broker::StorageBroker; use control_plane::endpoint::ComputeControlPlane; -use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_PORT, EndpointStorage}; +use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage}; use control_plane::local_env; use control_plane::local_env::{ EndpointStorageConf, InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, @@ -30,7 +30,7 @@ use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::{ NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, }; -use nix::fcntl::{FlockArg, flock}; +use nix::fcntl::{Flock, FlockArg}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, @@ -643,9 +643,10 @@ struct EndpointStartCmdArgs { #[clap( long, - help = "Configure the remote extensions storage proxy gateway to request for extensions." 
+ help = "Configure the remote extensions storage proxy gateway URL to request for extensions.", + alias = "remote-ext-config" )] - remote_ext_config: Option, + remote_ext_base_url: Option, #[clap( long, @@ -705,6 +706,9 @@ struct EndpointStopCmdArgs { struct EndpointGenerateJwtCmdArgs { #[clap(help = "Postgres endpoint id")] endpoint_id: String, + + #[clap(short = 's', long, help = "Scope to generate the JWT with", value_parser = ComputeClaimsScope::from_str)] + scope: Option, } #[derive(clap::Subcommand)] @@ -744,16 +748,16 @@ struct TimelineTreeEl { /// A flock-based guard over the neon_local repository directory struct RepoLock { - _file: File, + _file: Flock, } impl RepoLock { fn new() -> Result { let repo_dir = File::open(local_env::base_path())?; - let repo_dir_fd = repo_dir.as_raw_fd(); - flock(repo_dir_fd, FlockArg::LockExclusive)?; - - Ok(Self { _file: repo_dir }) + match Flock::lock(repo_dir, FlockArg::LockExclusive) { + Ok(f) => Ok(Self { _file: f }), + Err((_, e)) => Err(e).context("flock error"), + } } } @@ -1018,7 +1022,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { }) .collect(), endpoint_storage: EndpointStorageConf { - port: ENDPOINT_STORAGE_DEFAULT_PORT, + listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR, }, pg_distrib_dir: None, neon_distrib_dir: None, @@ -1410,9 +1414,16 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res EndpointCmd::Start(args) => { let endpoint_id = &args.endpoint_id; let pageserver_id = args.endpoint_pageserver_id; - let remote_ext_config = &args.remote_ext_config; + let remote_ext_base_url = &args.remote_ext_base_url; - let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new); + let default_generation = env + .storage_controller + .timelines_onto_safekeepers + .then_some(1); + let safekeepers_generation = args + .safekeepers_generation + .or(default_generation) + .map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed 
// safekeeper nodes; otherwise all from the env. let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? { @@ -1484,14 +1495,29 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res None }; + let exp = (std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)? + + Duration::from_secs(86400)) + .as_secs(); + let claims = endpoint_storage::claims::EndpointStorageClaims { + tenant_id: endpoint.tenant_id, + timeline_id: endpoint.timeline_id, + endpoint_id: endpoint_id.to_string(), + exp, + }; + + let endpoint_storage_token = env.generate_auth_token(&claims)?; + let endpoint_storage_addr = env.endpoint_storage.listen_addr.to_string(); + println!("Starting existing endpoint {endpoint_id}..."); endpoint .start( &auth_token, + endpoint_storage_token, + endpoint_storage_addr, safekeepers_generation, safekeepers, pageservers, - remote_ext_config.as_ref(), + remote_ext_base_url.as_ref(), stripe_size.0 as usize, args.create_test_user, args.start_timeout, @@ -1540,12 +1566,16 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res endpoint.stop(&args.mode, args.destroy)?; } EndpointCmd::GenerateJwt(args) => { - let endpoint_id = &args.endpoint_id; - let endpoint = cplane - .endpoints - .get(endpoint_id) - .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - let jwt = endpoint.generate_jwt()?; + let endpoint = { + let endpoint_id = &args.endpoint_id; + + cplane + .endpoints + .get(endpoint_id) + .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))? 
+ }; + + let jwt = endpoint.generate_jwt(args.scope)?; print!("{jwt}"); } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 4071b620d6..708745446d 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -45,7 +45,9 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use anyhow::{Context, Result, anyhow, bail}; -use compute_api::requests::{ComputeClaims, ConfigurationRequest}; +use compute_api::requests::{ + COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest, +}; use compute_api::responses::{ ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig, }; @@ -630,9 +632,17 @@ impl Endpoint { } /// Generate a JWT with the correct claims. - pub fn generate_jwt(&self) -> Result { + pub fn generate_jwt(&self, scope: Option) -> Result { self.env.generate_auth_token(&ComputeClaims { - compute_id: self.endpoint_id.clone(), + audience: match scope { + Some(ComputeClaimsScope::Admin) => Some(vec![COMPUTE_AUDIENCE.to_owned()]), + _ => None, + }, + compute_id: match scope { + Some(ComputeClaimsScope::Admin) => None, + _ => Some(self.endpoint_id.clone()), + }, + scope, }) } @@ -640,10 +650,12 @@ impl Endpoint { pub async fn start( &self, auth_token: &Option, + endpoint_storage_token: String, + endpoint_storage_addr: String, safekeepers_generation: Option, safekeepers: Vec, pageservers: Vec<(Host, u16)>, - remote_ext_config: Option<&String>, + remote_ext_base_url: Option<&String>, shard_stripe_size: usize, create_test_user: bool, start_timeout: Duration, @@ -733,6 +745,9 @@ impl Endpoint { drop_subscriptions_before_start: self.drop_subscriptions_before_start, audit_log_level: ComputeAudit::Disabled, logs_export_host: None::, + endpoint_storage_addr: Some(endpoint_storage_addr), + endpoint_storage_token: Some(endpoint_storage_token), + prewarm_lfc_on_startup: false, }; // this strange code is needed to support respec() in tests @@ -810,8 +825,8 @@ impl Endpoint { 
.stderr(logfile.try_clone()?) .stdout(logfile); - if let Some(remote_ext_config) = remote_ext_config { - cmd.args(["--remote-ext-config", remote_ext_config]); + if let Some(remote_ext_base_url) = remote_ext_base_url { + cmd.args(["--remote-ext-base-url", remote_ext_base_url]); } let child = cmd.spawn()?; @@ -903,7 +918,7 @@ impl Endpoint { self.external_http_address.port() ), ) - .bearer_auth(self.generate_jwt()?) + .bearer_auth(self.generate_jwt(None::)?) .send() .await?; @@ -980,7 +995,7 @@ impl Endpoint { self.external_http_address.port() )) .header(CONTENT_TYPE.as_str(), "application/json") - .bearer_auth(self.generate_jwt()?) + .bearer_auth(self.generate_jwt(None::)?) .body( serde_json::to_string(&ConfigurationRequest { spec, diff --git a/control_plane/src/endpoint_storage.rs b/control_plane/src/endpoint_storage.rs index 102db91a22..171aaeddb4 100644 --- a/control_plane/src/endpoint_storage.rs +++ b/control_plane/src/endpoint_storage.rs @@ -3,17 +3,19 @@ use crate::local_env::LocalEnv; use anyhow::{Context, Result}; use camino::Utf8PathBuf; use std::io::Write; +use std::net::SocketAddr; use std::time::Duration; /// Directory within .neon which will be used by default for LocalFs remote storage. 
pub const ENDPOINT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/endpoint_storage"; -pub const ENDPOINT_STORAGE_DEFAULT_PORT: u16 = 9993; +pub const ENDPOINT_STORAGE_DEFAULT_ADDR: SocketAddr = + SocketAddr::new(std::net::IpAddr::V4(std::net::Ipv4Addr::LOCALHOST), 9993); pub struct EndpointStorage { pub bin: Utf8PathBuf, pub data_dir: Utf8PathBuf, pub pemfile: Utf8PathBuf, - pub port: u16, + pub addr: SocketAddr, } impl EndpointStorage { @@ -22,7 +24,7 @@ impl EndpointStorage { bin: Utf8PathBuf::from_path_buf(env.endpoint_storage_bin()).unwrap(), data_dir: Utf8PathBuf::from_path_buf(env.endpoint_storage_data_dir()).unwrap(), pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(), - port: env.endpoint_storage.port, + addr: env.endpoint_storage.listen_addr, } } @@ -31,7 +33,7 @@ impl EndpointStorage { } fn listen_addr(&self) -> Utf8PathBuf { - format!("127.0.0.1:{}", self.port).into() + format!("{}:{}", self.addr.ip(), self.addr.port()).into() } pub fn init(&self) -> Result<()> { diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index a18b34daa4..4a8892c6de 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -20,7 +20,9 @@ use utils::auth::encode_from_key_file; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use crate::broker::StorageBroker; -use crate::endpoint_storage::{ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage}; +use crate::endpoint_storage::{ + ENDPOINT_STORAGE_DEFAULT_ADDR, ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage, +}; use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode}; use crate::safekeeper::SafekeeperNode; @@ -151,10 +153,10 @@ pub struct NeonLocalInitConf { pub generate_local_ssl_certs: bool, } -#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct EndpointStorageConf { - pub port: u16, + pub 
listen_addr: SocketAddr, } /// Broker config for cluster internal communication. @@ -241,6 +243,14 @@ impl Default for NeonStorageControllerConf { } } +impl Default for EndpointStorageConf { + fn default() -> Self { + Self { + listen_addr: ENDPOINT_STORAGE_DEFAULT_ADDR, + } + } +} + impl NeonBroker { pub fn client_url(&self) -> Url { let url = if let Some(addr) = self.listen_https_addr { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 79e87eba9b..756f2b02db 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -546,6 +546,16 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("Falied to parse 'sampling_ratio'")?, + relsize_snapshot_cache_capacity: settings + .remove("relsize_snapshot_cache_capacity") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'relsize_snapshot_cache_capacity' as integer")?, + basebackup_cache_enabled: settings + .remove("basebackup_cache_enabled") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'basebackup_cache_enabled' as bool")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 948e3c8c93..eec2c997e6 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -112,7 +112,7 @@ impl SafekeeperNode { } /// Initializes a safekeeper node by creating all necessary files, - /// e.g. SSL certificates. + /// e.g. SSL certificates and JWT token file. 
pub fn initialize(&self) -> anyhow::Result<()> { if self.env.generate_local_ssl_certs { self.env.generate_ssl_cert( @@ -120,6 +120,17 @@ impl SafekeeperNode { &self.datadir_path().join("server.key"), )?; } + + // Generate a token file for authentication with other safekeepers + if self.conf.auth_enabled { + let token = self + .env + .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; + + let token_path = self.datadir_path().join("peer_jwt_token"); + std::fs::write(token_path, token)?; + } + Ok(()) } @@ -218,14 +229,26 @@ impl SafekeeperNode { args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); } + if self.conf.auth_enabled { + let token_path = self.datadir_path().join("peer_jwt_token"); + let token_path_str = token_path + .to_str() + .with_context(|| { + format!("Token path {token_path:?} cannot be represented as a unicode string") + })? + .to_owned(); + args.extend(["--auth-token-path".to_owned(), token_path_str]); + } + args.extend_from_slice(extra_opts); + let env_variables = Vec::new(); background_process::start_process( &format!("safekeeper-{id}"), &datadir, &self.env.safekeeper_bin(), &args, - self.safekeeper_env_variables()?, + env_variables, background_process::InitialPidFile::Expect(self.pid_file()), retry_timeout, || async { @@ -239,18 +262,6 @@ impl SafekeeperNode { .await } - fn safekeeper_env_variables(&self) -> anyhow::Result> { - // Generate a token to connect from safekeeper to peers - if self.conf.auth_enabled { - let token = self - .env - .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; - Ok(vec![("SAFEKEEPER_AUTH_TOKEN".to_owned(), token)]) - } else { - Ok(Vec::new()) - } - } - /// /// Stop the server. 
/// diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index a36815d27e..755d67a7ad 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -10,7 +10,8 @@ use camino::{Utf8Path, Utf8PathBuf}; use hyper0::Uri; use nix::unistd::Pid; use pageserver_api::controller_api::{ - NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, + SafekeeperSchedulingPolicyRequest, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; use pageserver_api::models::{ @@ -20,7 +21,7 @@ use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use pem::Pem; use postgres_backend::AuthType; -use reqwest::Method; +use reqwest::{Method, Response}; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use tokio::process::Command; @@ -570,6 +571,11 @@ impl StorageController { let peer_jwt_token = encode_from_key_file(&peer_claims, private_key) .expect("failed to generate jwt token"); args.push(format!("--peer-jwt-token={peer_jwt_token}")); + + let claims = Claims::new(None, Scope::SafekeeperData); + let jwt_token = + encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); + args.push(format!("--safekeeper-jwt-token={jwt_token}")); } if let Some(public_key) = &self.public_key { @@ -614,6 +620,10 @@ impl StorageController { self.env.base_data_dir.display() )); + if self.env.safekeepers.iter().any(|sk| sk.auth_enabled) && self.private_key.is_none() { + anyhow::bail!("Safekeeper set up for auth but no private key specified"); + } + if self.config.timelines_onto_safekeepers { args.push("--timelines-onto-safekeepers".to_string()); } @@ -640,6 +650,10 @@ impl StorageController { ) .await?; + if self.config.timelines_onto_safekeepers { + self.register_safekeepers().await?; + } + Ok(()) } @@ -743,6 
+757,23 @@ impl StorageController { where RQ: Serialize + Sized, RS: DeserializeOwned + Sized, + { + let response = self.dispatch_inner(method, path, body).await?; + Ok(response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) + } + + /// Simple HTTP request wrapper for calling into storage controller + async fn dispatch_inner( + &self, + method: reqwest::Method, + path: String, + body: Option, + ) -> anyhow::Result + where + RQ: Serialize + Sized, { // In the special case of the `storage_controller start` subcommand, we wish // to use the API endpoint of the newly started storage controller in order @@ -785,10 +816,31 @@ impl StorageController { let response = builder.send().await?; let response = response.error_from_body().await?; - Ok(response - .json() - .await - .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) + Ok(response) + } + + /// Register the safekeepers in the storage controller + #[instrument(skip(self))] + async fn register_safekeepers(&self) -> anyhow::Result<()> { + for sk in self.env.safekeepers.iter() { + let sk_id = sk.id; + let body = serde_json::json!({ + "id": sk_id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "127.0.0.1", + "port": sk.pg_port, + "http_port": sk.http_port, + "https_port": sk.https_port, + "version": 5957, + "availability_zone_id": format!("us-east-2b-{sk_id}"), + }); + self.upsert_safekeeper(sk_id, body).await?; + self.safekeeper_scheduling_policy(sk_id, SkSchedulingPolicy::Active) + .await?; + } + Ok(()) } /// Call into the attach_hook API, for use before handing out attachments to pageservers @@ -816,6 +868,42 @@ impl StorageController { Ok(response.generation) } + #[instrument(skip(self))] + pub async fn upsert_safekeeper( + &self, + node_id: NodeId, + request: serde_json::Value, + ) -> anyhow::Result<()> { + let resp = self + .dispatch_inner::( + Method::POST, + 
format!("control/v1/safekeeper/{node_id}"), + Some(request), + ) + .await?; + if !resp.status().is_success() { + anyhow::bail!( + "setting scheduling policy unsuccessful for safekeeper {node_id}: {}", + resp.status() + ); + } + Ok(()) + } + + #[instrument(skip(self))] + pub async fn safekeeper_scheduling_policy( + &self, + node_id: NodeId, + scheduling_policy: SkSchedulingPolicy, + ) -> anyhow::Result<()> { + self.dispatch::( + Method::POST, + format!("control/v1/safekeeper/{node_id}/scheduling_policy"), + Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }), + ) + .await + } + #[instrument(skip(self))] pub async fn inspect( &self, diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 723b2f8afb..20a1ffb7a0 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -14,6 +14,14 @@ PG_VERSION=${PG_VERSION:-14} CONFIG_FILE_ORG=/var/db/postgres/configs/config.json CONFIG_FILE=/tmp/config.json +# Test that the first library path that the dynamic loader looks in is the path +# that we use for custom compiled software +first_path="$(ldconfig --verbose 2>/dev/null \ + | grep --invert-match ^$'\t' \ + | cut --delimiter=: --fields=1 \ + | head --lines=1)" +test "$first_path" == '/usr/local/lib' || true # Remove the || true in a follow-up PR. Needed for backwards compat. + echo "Waiting pageserver become ready." while ! nc -z pageserver 6400; do sleep 1; diff --git a/docker-compose/ext-src/alter_db.sh b/docker-compose/ext-src/alter_db.sh new file mode 100755 index 0000000000..6df37e1c9b --- /dev/null +++ b/docker-compose/ext-src/alter_db.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# We need these settings to get the expected output results. +# We cannot use the environment variables e.g. 
PGTZ due to +# https://github.com/neondatabase/neon/issues/1287 +export DATABASE=${1:-contrib_regression} +psql -c "ALTER DATABASE ${DATABASE} SET neon.allow_unstable_extensions='on'" \ + -c "ALTER DATABASE ${DATABASE} SET DateStyle='Postgres,MDY'" \ + -c "ALTER DATABASE ${DATABASE} SET TimeZone='America/Los_Angeles'" \ diff --git a/docker-compose/ext-src/pg_graphql-src/regular-test.sh b/docker-compose/ext-src/pg_graphql-src/regular-test.sh index 85e1ae057a..9e7d63b612 100755 --- a/docker-compose/ext-src/pg_graphql-src/regular-test.sh +++ b/docker-compose/ext-src/pg_graphql-src/regular-test.sh @@ -18,6 +18,7 @@ TESTS=${TESTS/row_level_security/} TESTS=${TESTS/sqli_connection/} dropdb --if-exist contrib_regression createdb contrib_regression +. ../alter_db.sh psql -v ON_ERROR_STOP=1 -f test/fixtures.sql -d contrib_regression ${REGRESS} --use-existing --dbname=contrib_regression --inputdir=${TESTDIR} ${TESTS} diff --git a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out index ca54864ecd..ff6a7404cb 100644 --- a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out +++ b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out @@ -12,6 +12,7 @@ ERROR: invalid JWT encoding -- Test creating a session with an expired JWT SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw'); ERROR: Token used after it has expired +DETAIL: exp=1742564432 -- Test creating a session with a valid JWT SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg'); jwt_session_init diff --git 
a/docker-compose/ext-src/pgrag-src/regular-test.sh b/docker-compose/ext-src/pgrag-src/regular-test.sh index 6cb1b049a4..22eb7498fd 100755 --- a/docker-compose/ext-src/pgrag-src/regular-test.sh +++ b/docker-compose/ext-src/pgrag-src/regular-test.sh @@ -3,6 +3,7 @@ set -ex cd "$(dirname "${0}")" dropdb --if-exist contrib_regression createdb contrib_regression +. ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag" PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --load-extension=vector --load-extension=rag --dbname=contrib_regression basic_functions text_processing api_keys chunking_functions document_processing embedding_api_functions voyageai_functions diff --git a/docker-compose/ext-src/pgx_ulid-src/Makefile b/docker-compose/ext-src/pgx_ulid-src/Makefile index 6480c48441..00975e8c48 100644 --- a/docker-compose/ext-src/pgx_ulid-src/Makefile +++ b/docker-compose/ext-src/pgx_ulid-src/Makefile @@ -20,5 +20,6 @@ installcheck: regression-test regression-test: dropdb --if-exists contrib_regression createdb contrib_regression + ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION $(EXTNAME)" $(PG_REGRESS) --inputdir=. --outputdir=. --use-existing --dbname=contrib_regression $(REGRESS) diff --git a/docker-compose/ext-src/plv8-src/regular-test.sh b/docker-compose/ext-src/plv8-src/regular-test.sh index b10cc65e8a..d5224e341c 100755 --- a/docker-compose/ext-src/plv8-src/regular-test.sh +++ b/docker-compose/ext-src/plv8-src/regular-test.sh @@ -3,6 +3,7 @@ set -ex cd "$(dirname ${0})" dropdb --if-exist contrib_regression createdb contrib_regression +. 
../alter_db.sh PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress REGRESS="$(make -n installcheck | awk '{print substr($0,index($0,"init-extension"));}')" REGRESS="${REGRESS/startup_perms/}" diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile index ac87cc511b..de6bdd06c0 100644 --- a/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile +++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile @@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress installcheck: dropdb --if-exists contrib_regression createdb contrib_regression + ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_bge_small_en_v15" $(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS) \ No newline at end of file diff --git a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile index e81f94ef47..7adcad32f7 100644 --- a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile +++ b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile @@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress installcheck: dropdb --if-exists contrib_regression createdb contrib_regression + ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_jina_reranker_v1_tiny_en" $(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS) diff --git a/docker-compose/ext-src/rum-src/regular-test.sh b/docker-compose/ext-src/rum-src/regular-test.sh index d1d45a36ef..815c1adb53 100755 --- a/docker-compose/ext-src/rum-src/regular-test.sh +++ b/docker-compose/ext-src/rum-src/regular-test.sh @@ -3,5 +3,6 @@ set -ex cd "$(dirname ${0})" dropdb --if-exist contrib_regression createdb contrib_regression +. 
../alter_db.sh PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression rum rum_hash ruminv timestamp orderby orderby_hash altorder altorder_hash limits int2 int4 int8 float4 float8 money oid time timetz date interval macaddr inet cidr text varchar char bytea bit varbit numeric rum_weight expr array \ No newline at end of file diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml index 76935453b6..81445ed412 100644 --- a/docker-compose/pageserver_config/pageserver.toml +++ b/docker-compose/pageserver_config/pageserver.toml @@ -3,3 +3,6 @@ pg_distrib_dir='/usr/local/' listen_pg_addr='0.0.0.0:6400' listen_http_addr='0.0.0.0:9898' remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' } +control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address +control_plane_emergency_mode=true +virtual_file_io_mode="buffered" # the CI runners where we run the docker compose tests have slow disks diff --git a/docs/consumption_metrics.md b/docs/consumption_metrics.md index 6bcd28ab10..eb211af646 100644 --- a/docs/consumption_metrics.md +++ b/docs/consumption_metrics.md @@ -38,11 +38,6 @@ Currently, the following metrics are collected: Amount of WAL produced , by a timeline, i.e. last_record_lsn This is an absolute, per-timeline metric. -- `resident_size` - -Size of all the layer files in the tenant's directory on disk on the pageserver. -This is an absolute, per-tenant metric. - - `remote_storage_size` Size of the remote storage (S3) directory. 
diff --git a/docs/rfcs/030-vectored-timeline-get.md b/docs/rfcs/030-vectored-timeline-get.md index 093a964f38..e933eac5fe 100644 --- a/docs/rfcs/030-vectored-timeline-get.md +++ b/docs/rfcs/030-vectored-timeline-get.md @@ -7,6 +7,8 @@ Author: Christian Schwarz A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver. +**EDIT**: the implementation of this feature is described in [Vlad's (internal) tech talk](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link). + # Motivation During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space. diff --git a/docs/rfcs/043-bottom-most-gc-compaction.md b/docs/rfcs/043-bottom-most-gc-compaction.md new file mode 100644 index 0000000000..4bba758b31 --- /dev/null +++ b/docs/rfcs/043-bottom-most-gc-compaction.md @@ -0,0 +1,194 @@ +# Bottommost Garbage-Collection Compaction + +## Summary + +The goal of this doc is to propose a way to reliably collect garbage below the GC horizon. This process is called bottom-most garbage-collect-compaction, and is part of the broader legacy-enhanced compaction that we plan to implement in the future. + +## Motivation + +The current GC algorithm waits until a key region is covered by image layers before collecting the garbage in that region. Relying on image layer generation to generate covering images is not reliable. There is prior art that feeds information from the GC algorithm back into the image generation process to accelerate garbage collection, but it slows down the system and creates write amplification. + +# Basic Idea + +![](images/036-bottom-most-gc-compaction/01-basic-idea.svg) + +The idea of bottom-most compaction is simple: we rewrite all layers that are below or intersect with the GC horizon to produce a flat level of image layers at the GC horizon and deltas above the GC horizon.
In this process, + +- All images and deltas ≤ GC horizon LSN will be dropped. This process collects garbage. +- We produce images for all keys involved in the compaction process at the GC horizon. + +Therefore, it can precisely collect all garbage below the horizon, and reduce the space amplification, e.g., in the staircase pattern (test_gc_feedback). + +![The staircase pattern in test_gc_feedback in the original compaction algorithm. The goal is to collect garbage below the red horizontal line.](images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png) + +The staircase pattern in test_gc_feedback in the original compaction algorithm. The goal is to collect garbage below the red horizontal line. + +# Branches + +With branches, the bottom-most compaction should retain a snapshot of the keyspace at the `retain_lsn` so that the child branch can access data at the branch point. This requires some modifications to the basic bottom-most compaction algorithm that we sketched above. + +![](images/036-bottom-most-gc-compaction/03-retain-lsn.svg) + +## Single Timeline w/ Snapshots: handle `retain_lsn` + +First let’s look into the case where we create branches over the main branch but don’t write any data to them (aka “snapshots”). + +The bottom-most compaction algorithm collects all deltas and images of a key and can make decisions on what data to retain.
Given that we have a single key’s history as below: + +``` +LSN 0x10 -> A +LSN 0x20 -> append B +retain_lsn: 0x20 +LSN 0x30 -> append C +LSN 0x40 -> append D +retain_lsn: 0x40 +LSN 0x50 -> append E +GC horizon: 0x50 +LSN 0x60 -> append F +``` + +The algorithm will produce: + +``` +LSN 0x20 -> AB +(drop all history below the earliest retain_lsn) +LSN 0x40 -> ABCD +(assume the cost of replaying 2 deltas is higher than storing the full image, we generate an image here) +LSN 0x50 -> append E +(replay one delta is cheap) +LSN 0x60 -> append F +(keep everything as-is above the GC horizon) +``` + +![](images/036-bottom-most-gc-compaction/05-btmgc-parent.svg) + +What happens is that we balance the space taken by each retain_lsn and the cost of replaying deltas during the bottom-most compaction process. This is controlled by a threshold. If `count(deltas) < $threshold`, the deltas will be retained. Otherwise, an image will be generated and the deltas will be dropped. + +In the example above, the `$threshold` is 2. + +## Child Branches with data: pull + partial images + +In the previous section we have shown how bottom-most compaction respects `retain_lsn` so that all data that was readable at branch creation remains readable. But branches can have data on their own, and that data can fall out of the branch’s PITR window. So, this section explains how we deal with that. + +We will run the same bottom-most compaction for these branches, to ensure the space amplification on the child branch is reasonable. + +``` +branch_lsn: 0x20 +LSN 0x30 -> append P +LSN 0x40 -> append Q +LSN 0x50 -> append R +GC horizon: 0x50 +LSN 0x60 -> append S +``` + +Note that bottom-most compaction happens on a per-timeline basis. When it processes this key, it only reads the history from LSN 0x30 without a base image. 
Therefore, on child branches, the bottom-most compaction process will make image creation decisions based on the same `count(deltas) < $threshold` criteria, and if it decides to create an image, the base image will be retrieved from the ancestor branch. + +``` +branch_lsn: 0x20 +LSN 0x50 -> ABPQR +(we pull the image at LSN 0x20 from the ancestor branch to get AB, and then apply append PQ to the page; we replace the record at 0x40 with an image and drop the delta) +GC horizon: 0x50 +LSN 0x60 -> append S +``` + +![](images/036-bottom-most-gc-compaction/06-btmgc-child.svg) + +Note that for child branches, we do not create image layers for the images when bottom-most compaction runs. Instead, we drop the 0x30/0x40/0x50 delta records and directly place the image ABPQR@0x50 into the delta layer, which serves as a sparse image layer. For child branches, if we create image layers, we will need to put all keys in the range into the image layer. This causes space bloat and slow compactions. In this proposal, the compaction process will only compact and process keys modified inside the child branch. + +# Result + +Bottom-most compaction ensures all garbage under the GC horizon gets collected right away (compared with “eventually” in the current algorithm). Meanwhile, it generates images at each of the retain_lsn to ensure branch reads are fast. As we make per-key decisions on whether to generate an image or not, the theoretical lower bound of the storage space we need to retain for a branch is lower than before. + +Before: min(sum(logs for each key), sum(image for each key)), for each partition — we always generate image layers on a key range + +After: sum(min(logs for each key, image for each key)) + +# Compaction Trigger + +The bottom-most compaction can be automatically triggered. The goal of the trigger is that it should ensure a constant factor for write amplification. Say that the user writes 1GB of WAL into the system; then we should write 1GB x C data to S3.
The legacy compaction algorithm does not have such a constant factor C. The data we write to S3 is quadratic to the logical size of the database (see [A Theoretical View of Neon Storage](https://www.notion.so/A-Theoretical-View-of-Neon-Storage-8d7ad7555b0c41b2a3597fa780911194?pvs=21)). + +We propose the following compaction trigger that generates a constant write amplification factor. Write amplification >= total writes to S3 / total user writes. We only analyze the write amplification caused by the bottom-most GC-compaction process, ignoring the legacy create image layers amplification. + +Given that we have ***X*** bytes of the delta layers above the GC horizon, ***A*** bytes of the delta layers intersecting with the GC horizon, ***B*** bytes of the delta layers below the GC horizon, and ***C*** bytes of the image layers below the GC horizon. + +The legacy GC + compaction loop will always keep ***A*** unchanged, reduce ***B and C*** when there are image layers covering the key range. This yields 0 write amplification (only file deletions) and extra ***B*** bytes of space. + +![](images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg) + +The bottom-most compaction proposed here will split ***A*** into deltas above the GC horizon and below the GC horizon. Everything below the GC horizon will be image layers after the compaction (not considering branches). Therefore, this yields ***A+C*** extra write traffic each iteration, plus 0 extra space. + +![](images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg) + +Also considering read amplification (below the GC horizon). When a read request reaches the GC horizon, the read amplification will be (A+B+C)/C=1+(A+B)/C. Reducing ***A*** and ***B*** can help reduce the read amplification below the GC horizon. + +The metrics-based trigger will wait until a point that space amplification is not that large and write amplification is not that large before the compaction gets triggered. 
The trigger is defined as **(A+B)/C ≥ 1 (or some other ratio)**. + +To reason about this trigger, consider the two cases: + +**Data Ingestion** + +The user keeps ingesting data into the database, which indicates that WAL size roughly equals the database logical size. The compaction gets triggered only when the newly-written WAL roughly equals the current bottom-most image size (=X). Therefore, it’s triggered when the database size gets doubled. This is a reasonable amount of work. Write amplification is 2X/X=1 for the X amount of data written. + +![](images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg) + +**Updates/Deletion** + +In this case, WAL size will be larger than the database logical size ***D***. The compaction gets triggered for every ***D*** bytes of WAL written. Therefore, for every ***D*** bytes of WAL, we rewrite the bottom-most layer, which produces an extra ***D*** bytes of write amplification. This incurs exactly 2x write amplification (by the write of D), 1.5x write amplification (if we count from the start of the process) and no space amplification. + +![](images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg) + +Note that here I try to reason that write amplification is a constant (i.e., the data we write to S3 is proportional to the data the user writes). The main problem with the current legacy compaction algorithm is that write amplification is proportional to the database size. + +The next step is to optimize the write amplification above the GC horizon (i.e., change the image creation criteria, top-most compaction, or introduce tiered compaction), to ensure the write amplification of the whole system is a constant factor. + +20GB layers → +20GB layers → delete 20GB, need 40GB temporary space + +# Sub-Compactions + +The gc-compaction algorithm may take a long time and we need to split the job into multiple sub-compaction jobs.
+ +![](images/036-bottom-most-gc-compaction/13-job-split.svg) + +As in the figure, the auto-trigger schedules a compaction job covering the full keyspace below a specific LSN. In case we cannot finish compacting it in one run in a reasonable amount of time, the algorithm will vertically split it into multiple jobs (in this case, 5). + +Each gc-compaction job will create one level of delta layers and one flat level of image layers for each LSN. Those layers will be automatically split based on size, which means that if the sub-compaction job produces 1GB of deltas, it will produce 4 * 256MB delta layers. Layers that are not fully contained within the sub-compaction job rectangles will be rewritten to only contain the keys outside of the key range. + +# Implementation + +The main implementation of gc-compaction is in `compaction.rs`. + +* `compact_with_gc`: The main loop of gc-compaction. It takes a rectangle range of the layer map and compacts that specific range. It selects layers intersecting with the rectangle, downloads the layers, creates the k-merge iterator to read those layers in the key-lsn order, and decides which keys to keep or insert a reconstructed page. The process is the basic unit of a gc-compaction and is not interruptible. If the process gets preempted by L0 compaction, it has to be restarted from scratch. For layers that overlap with the rectangle but are not fully inside, the main loop will also rewrite them so that the new layer (or two layers if both left and right ends are outside of the rectangle) has the same LSN range as the original one but only contains the keys outside of the compaction range. +* `gc_compaction_split_jobs`: Splits a big gc-compaction job into sub-compactions based on heuristics in the layer map. The function looks at the layer map and splits the compaction job based on the size of the layers so that each compaction job only pulls ~4GB of layer files.
+* `generate_key_retention` and `KeyHistoryRetention`: Implements the algorithm described in the "basic idea" and "branch" chapter of this RFC. It takes a vector of history of a key (key-lsn-value) and decides which LSNs of the key to retain. If there are too many deltas between two retain_lsns, it will reconstruct the page and insert an image into the compaction result. Also, we implement `KeyHistoryRetention::verify` to ensure the generated result is not corrupted -- all retain_lsns and all LSNs above the gc-horizon should be accessible. +* `GcCompactionQueue`: the automatic trigger implementation for gc-compaction. `GcCompactionQueue::iteration` is called at the end of the tenant compaction loop. It will then call `trigger_auto_compaction` to decide whether to trigger a gc-compaction job for this tenant. If yes, the compaction-job will be added to the compaction queue, and the queue will be slowly drained once there are no other compaction jobs running. gc-compaction has the lowest priority. If a sub-compaction job is not successful or gets preempted by L0 compaction (see limitations for reasons why a compaction job would fail), it will _not_ be retried. +* Changes to `index_part.json`: we added a `last_completed_lsn` field to the index part for the auto-trigger to decide when to trigger a compaction. +* Changes to the read path: when gc-compaction updates the layer map, all reads need to wait. See `gc_compaction_layer_update_lock` and comments in the code path for more information. + +Gc-compaction can also be scheduled over the HTTP API. 
Example: + +``` +curl 'localhost:9898/v1/tenant/:tenant_id/timeline/:timeline_id/compact?enhanced_gc_bottom_most_compaction=true&dry_run=true' -X PUT -H "Content-Type: application/json" -d '{"scheduled": true, "compact_key_range": { "start": "000000067F0000A0000002A1CF0100000000", "end": "000000067F0000A0000002A1D70100000000" } }' +``` + +The `dry_run` mode can be specified in the query string so that the compaction will go through all layers to estimate how much space can be saved without writing the compaction result into the layer map. + +The auto-trigger is controlled by tenant-level flag `gc_compaction_enabled`. If this is set to false, no gc-compaction will be automatically scheduled on this tenant (but manual trigger still works). + +# Next Steps + +There are still some limitations of gc-compaction itself that need to be resolved and tested: + +- gc-compaction is currently only automatically triggered on root branches. We have not tested gc-compaction on child branches in staging. +- gc-compaction will skip aux key regions because of the possible conflict with the assumption of aux file tombstones. +- gc-compaction does not consider keyspaces at retain_lsns and only looks at keys in the layers. This also causes us to give up some sub-compaction jobs because a key might have only part of its history available due to traditional GC removing part of the history. +- We limit gc-compaction to run over shards <= 150GB to avoid gc-compaction taking too much time blocking other compaction jobs. The sub-compaction split algorithm needs to be improved to be able to split vertically and horizontally. Also, we need to move the download layer process out of the compaction loop so that we don't block other compaction jobs for too long. +- The compaction trigger always schedules gc-compaction from the lowest LSN to the gc-horizon. Currently we do not schedule compaction jobs that only select layers in the middle.
Allowing this could potentially reduce the number of layers read and written throughout the process. +- gc-compaction will give up if there are too many layers to rewrite or if there is not enough disk space for the compaction. +- gc-compaction sometimes fails with "no key produced during compaction", which means that all existing keys within the compaction range can be collected; but we don't have a way to write this information back to the layer map -- we cannot generate an empty image layer. +- We limit the maximum size of deltas for a single key to 512MB. If above this size, gc-compaction will give up. This can be resolved by changing `generate_key_retention` to be a stream instead of requiring it to collect all the key history. + +In the future, + +- Top-most compaction: ensure we always have an image coverage for the latest data (or near the latest data), so that reads will be fast at the latest LSN. +- Tiered compaction on deltas: ensure read from any LSN is fast. +- Per-timeline compaction → tenant-wide compaction? diff --git a/docs/rfcs/2025-04-30-direct-io-for-pageserver.md b/docs/rfcs/2025-04-30-direct-io-for-pageserver.md new file mode 100644 index 0000000000..847f5e4040 --- /dev/null +++ b/docs/rfcs/2025-04-30-direct-io-for-pageserver.md @@ -0,0 +1,362 @@ +# Direct IO For Pageserver + +Date: Apr 30, 2025 + +## Summary + +This document is a retroactive RFC. It +- provides some background on what direct IO is, +- motivates why Pageserver should be using it for its IO, and +- describes how we changed Pageserver to use it. + +The [initial proposal](https://github.com/neondatabase/neon/pull/8240) that kicked off the work can be found in this closed GitHub PR. + +People primarily involved in this project were: +- Yuchen Liang +- Vlad Lazar +- Christian Schwarz + +## Timeline + +For posterity, here is the rough timeline of the development work that got us to where we are today.
+ +- Jan 2024: [integrate `tokio-epoll-uring`](https://github.com/neondatabase/neon/pull/5824) along with owned buffers API +- March 2024: `tokio-epoll-uring` enabled in all regions in buffered IO mode +- Feb 2024 to June 2024: PS PageCache Bypass For Data Blocks + - Feb 2024: [Vectored Get Implementation](https://github.com/neondatabase/neon/pull/6576) bypasses delta & image layer blocks for page requests + - Apr to June 2024: [Epic: bypass PageCache for user data blocks](https://github.com/neondatabase/neon/issues/7386) addresses remaining users +- Aug to Nov 2024: direct IO: first code; preliminaries; read path coding; BufferedWriter; benchmarks show perf regressions too high, no-go. +- Nov 2024 to Jan 2025: address perf regressions by developing page_service pipelining (aka batching) and concurrent IO ([Epic](https://github.com/neondatabase/neon/issues/9376)) +- Feb to March 2025: roll out batching, then concurrent+direct IO => read path and InMemoryLayer are now direct IO +- Apr 2025: develop & roll out direct IO for the write path + +## Background: Terminology & Glossary + +**kernel page cache**: the Linux kernel's page cache is a write-back cache for filesystem contents. +The cached unit is memory-page-sized & aligned chunks of the files that are being cached (typically 4k). +The cache lives in kernel memory and is not directly accessible through userspace. + +**Buffered IO**: an application's read/write system calls go through the kernel page cache. +For example, a 10 byte sized read or write to offset 5000 in a file will load the file contents +at offset `[4096,8192)` into a free page in the kernel page cache. If necessary, it will evict +a page to make room (cf eviction). Then, the kernel performs a memory-to-memory copy of 10 bytes +from/to the offset `904` (`5000 = 4096 + 904`) within the cached page. If it's a write, the kernel keeps +track of the fact that the page is now "dirty" in some ancillary structure.
+ +**Writeback**: a buffered read/write syscall returns after the memory-to-memory copy. The modifications +made by e.g. write system calls are not even *issued* to disk, let alone durable. Instead, the kernel +asynchronously writes back dirtied pages based on a variety of conditions. For us, the most relevant +ones are a) explicit request by userspace (`fsync`) and b) memory pressure. + +**Memory pressure**: the kernel page cache is a best effort service and a user of spare memory capacity. +If there is no free memory, the kernel page allocator will take pages used by page cache to satisfy allocations. +Before reusing a page like that, the page has to be written back (writeback, see above). +The far-reaching consequence of this is that **any allocation of anonymous memory can do IO** if the only +way to get that memory is by eviction & re-using a dirty page cache page. +Notably, this includes a simple `malloc` in userspace, because eventually that boils down to `mmap(..., MAP_ANON, ...)`. +I refer to this effect as the "malloc latency backscatter" caused by buffered IO. + +**Direct IO** allows application's read/write system calls to bypass the kernel page cache. The filesystem +is still involved because it is ultimately in charge of mapping the concept of files & offsets within them +to sectors on block devices. Typically, the filesystem poses size and alignment requirements for memory buffers +and file offsets (statx `Dio_mem_align` / `Dio_offset_align`), see [this gist](https://gist.github.com/problame/1c35cac41b7cd617779f8aae50f97155). +The IO operations will fail at runtime with EINVAL if the alignment requirements are not met. + +**"buffered" vs "direct"**: the central distinction between buffered and direct IO is about who allocates and +fills the IO buffers, and who controls when exactly the IOs are issued. In buffered IO, it's the syscall handlers, +kernel page cache, and memory management subsystems (cf "writeback"). 
In direct IO, all of it is done by +the application. +It takes more effort by the application to program with direct instead of buffered IO. +The return is precise control over and a clear distinction between consumption/modification of memory vs disk. + +**Pageserver PageCache**: Pageserver has an additional `PageCache` (referred to as PS PageCache from here on, as opposed to "kernel page cache"). +Its caching unit is 8KiB blocks of the layer files written by Pageserver. +A miss in PageCache is filled by reading from the filesystem, through the `VirtualFile` abstraction layer. +The default size is tiny (64MiB), very much like Postgres's `shared_buffers`. +We ran production at 128MiB for a long time but gradually moved it up to 2GiB over the past ~year. + +**VirtualFile** is Pageserver's abstraction for file IO, very similar to the facility in Postgres that bears the same name. +Its historical purpose appears to be working around open file descriptor limitations, which is practically irrelevant on Linux. +However, the facility in Pageserver is useful as an intermediary layer for metrics and abstracts over the different kinds of +IO engines that Pageserver supports (`std-fs` vs `tokio-epoll-uring`). + +## Background: History Of Caching In Pageserver + +For multiple years, Pageserver's `PageCache` was on the path of all read _and write_ IO. +It performed write-back to the kernel using buffered IO. + +We converted it into a read-only cache of immutable data in [PR 4994](https://github.com/neondatabase/neon/pull/4994). + +The introduction of `tokio-epoll-uring` required converting the code base to use owned IO buffers. +The `PageCache` pages are usable as owned IO buffers. + +We then started bypassing PageCache for user data blocks. +Data blocks are the 8k blocks of data in layer files that hold the multiple `Value`s, as opposed to the disk btree index blocks that tell us which values exist in a file at what offsets. 
+The disk btree embedded in delta & image layers remains `PageCache`'d. +Epics for that work were: +- Vectored `Timeline::get` (cf RFC 30) skipped delta and image layer data block `PageCache`ing outright. +- Epic https://github.com/neondatabase/neon/issues/7386 took care of the remaining users for data blocks: + - Materialized page cache (cached materialized pages; shown to be ~0% hit rate in practice) + - InMemoryLayer + - Compaction + +The outcome of the above: +1. All data blocks are always read through the `VirtualFile` APIs, hitting the kernel buffered read path (=> kernel page cache). +2. Indirect blocks (=disk btree blocks) would be cached in the PS `PageCache`. + +In production we size the PS `PageCache` to be 2GiB. +This drives the hit rate up to ~99.95% and the eviction rate / replacement rates down to less than 200/second on a 1-minute average, on the busiest machines. +High baseline replacement rates are treated as a signal of resource exhaustion (page cache insufficient to host working set of the PS). +The response to this is to migrate tenants away, or increase PS `PageCache` size. +It is currently manual but could be automated, e.g., in Storage Controller. + +In the future, we may eliminate the `PageCache` even for indirect blocks. +For example with an LRU cache that has as unit the entire disk btree content +instead of individual blocks. + +## High-Level Design + +So, before work on this project started, all data block reads and the entire write path of Pageserver were using kernel-buffered IO, i.e., the kernel page cache. +We now want to get the kernel page cache out of the picture by using direct IO for all interaction with the filesystem. +This achieves the following system properties: + +**Predictable VirtualFile latencies** +* With buffered IO, reads are sometimes fast, sometimes slow, depending on kernel page cache hit/miss. 
+* With buffered IO, appends when writing out new layer files during ingest or compaction are sometimes fast, sometimes slow because of write-back backpressure. +* With buffered IO, the "malloc backscatter" phenomenon pointed out in the Glossary section is not something we actively observe. + But we do have occasional spikes in Dirty memory amount and Memory PSI graphs, so it may already be affecting to some degree. +* By switching to direct IO, above operations will have the (predictable) device latency -- always. + Reads and appends always go to disk. + And malloc will not have to write back dirty data. + +**Explicitness & Tangibility of resource usage** +* In a multi-tenant system, it is generally desirable and valuable to be *explicit* about the main resources we use for each tenant. +* By using direct IO, we become explicit about the resources *disk IOPs* and *memory capacity* in a way that was previously being conflated through the kernel page cache, outside our immediate control. +* We will be able to build per-tenant observability of resource usage ("what tenant is causing the actual IOs that are sent to the disk?"). +* We will be able to build accounting & QoS by implementing an IO scheduler that is tenant aware. The kernel is not tenant-aware and can't do that. + +**CPU Efficiency** +* The involvement of the kernel page cache means one additional memory-to-memory copy on read and write path. +* Direct IO will eliminate that memory-to-memory copy, if we can make the userspace buffers used for the IO calls satisfy direct IO alignment requirements. + +The **trade-off** is that we no longer get the theoretical benefits of the kernel page cache. 
These are: +- read latency improvements for repeat reads of the same data ("locality of reference") + - asterisk: only if that state is still cache-resident by time of next access +- write throughput by having kernel page cache batch small VFS writes into bigger disk writes + - asterisk: only if memory pressure is low enough that the kernel can afford to delay writeback + +We are **happy to make this trade-off**: +- Because of the advantages listed above. +- Because we empirically have enough DRAM on Pageservers to serve metadata (=index blocks) from PS PageCache. + (At just 2GiB PS PageCache size, we average a 99.95% hit rate). + So, the latency of going to disk is only for data block reads, not the index traversal. +- Because **the kernel page cache is ineffective** at high tenant density anyway (#tenants/pageserver instance). + And because dense packing of tenants will always be desirable to drive COGS down, we should design the system for it. + (See the appendix for a more detailed explanation why this is). +- So, we accept that some reads that used to be fast by circumstance will have higher but **predictable** latency than before. + +### Desired End State + +The desired end state of the project is as follows, and with some asterisks, we have achieved it. + +All IOs of the Pageserver data path use direct IO, thereby bypassing the kernel page cache. + +In particular, the "data path" includes +- the wal ingest path +- compaction +- anything on the `Timeline::get` / `Timeline::get_vectored` path. + +The production Pageserver config is tuned such that virtually all non-data blocks are cached in the PS PageCache. +Hit rate target is 99.95%. + +There are no regressions to ingest latency. + +The total "wait-for-disk time" contribution to random getpage request latency is `O(1 read IOP latency)`. +We accomplish that by having a near 100% PS PageCache hit rate so that layer index traversal effectively never needs to wait for IO. 
+Thereby, it can issue all the data blocks as it traverses the index, and only wait at the end of it (concurrent IO). + +The amortized "wait-for-disk time" contribution of this direct IO proposal to a series of sequential getpage requests is `1/32 * read IOP latency` for each getpage request. +We accomplish this by server-side batching of up to 32 reads into a single `Timeline::get_vectored` call. +(This is an ideal world where our batches are full - that's not the case in prod today because of lack of queue depth). + +## Design & Implementation + +### Prerequisites + +A lot of prerequisite work had to happen to enable use of direct IO. + +To meet the "wait-for-disk time" requirements from the DoD, we implement for the read path: +- page_service level server-side batching (config field `page_service_pipelining`) +- concurrent IO (config field `get_vectored_concurrent_io`) +The work for both of these was tracked [in the epic](https://github.com/neondatabase/neon/issues/9376). +Server-side batching will likely be obsoleted by the [#proj-compute-communicator](https://github.com/neondatabase/neon/pull/10799). +The Concurrent IO work is described in retroactive RFC `2025-04-30-pageserver-concurrent-io-on-read-path.md`. +The implementation is relatively brittle and needs further investment, see the `Future Work` section in that RFC. + +For the write path, and especially WAL ingest, we need to hide write latency. +We accomplish this by implementing a (`BufferedWriter`) type that does double-buffering: flushes of the filled +buffer happen in a sidecar tokio task while new writes fill a new buffer. +We refactor InMemoryLayer as well as BlobWriter (=> delta and image layer writers) to use this new `BufferedWriter`. +The most comprehensive write-up of this work is in [the PR description](https://github.com/neondatabase/neon/pull/11558). 
+ +### Ensuring Adherence to Alignment Requirements + +Direct IO puts requirements on +- memory buffer alignment +- io size (=memory buffer size) +- file offset alignment + +The requirements are specific to a combination of filesystem/block-device/architecture(hardware page size!). + +In Neon production environments we currently use ext4 with Linux 6.1.X on AWS and Azure storage-optimized instances (locally attached NVMe). +Instead of dynamic discovery using `statx`, we statically hard-code 512 bytes as the buffer/offset alignment and size-multiple. +We made this decision because: +- a) it is compatible with all the environments we need to run in +- b) our primary workload can be small-random-read-heavy (we do merge adjacent reads if possible, but the worst case is that all `Value`s that needs to be read are far apart) +- c) 512-byte tail latency on the production instance types is much better than 4k (p99.9: 3x lower, p99.99 5x lower). +- d) hard-coding at compile-time allows us to use the Rust type system to enforce the use of only aligned IO buffers, eliminating a source of runtime errors typically associated with direct IO. + +This was [discussed here](https://neondb.slack.com/archives/C07BZ38E6SD/p1725036790965549?thread_ts=1725026845.455259&cid=C07BZ38E6SD). + +The new `IoBufAligned` / `IoBufAlignedMut` marker traits indicate that a given buffer meets memory alignment requirements. +All `VirtualFile` APIs and several software layers built on top of them only accept buffers that implement those traits. +Implementors of the marker traits are: +- `IoBuffer` / `IoBufferMut`: used for most reads and writes +- `PageWriteGuardBuf`: for filling PS PageCache pages (index blocks!) + +The alignment requirement is infectious; it permeates bottom-up throughout the code base. +We stop the infection at roughly the same layers in the code base where we stopped permeating the +use of owned-buffers-style API for tokio-epoll-uring. 
The way the stopping works is by introducing +a memory-to-memory copy from/to some unaligned memory location on the stack/current/heap. +The places where we currently stop permeating are sort of arbitrary. For example, it would probably +make sense to replace more usage of `Bytes` that we know holds 8k pages with 8k-sized `IoBuffer`s. + +The `IoBufAligned` / `IoBufAlignedMut` types do not protect us from the following types of runtime errors: +- non-adherence to file offset alignment requirements +- non-adherence to io size requirements + +The following higher-level constructs ensure we meet the requirements: +- read path: the `ChunkedVectoredReadBuilder` and `mod vectored_dio_read` ensure reads happen at aligned offsets and in appropriate size multiples. +- write path: `BufferedWriter` only writes in multiples of the capacity, at offsets that are `start_offset+N*capacity`; see its doc comment. + +Note that these types are used always, regardless of whether direct IO is enabled or not. +There are some cases where this adds unnecessary overhead to buffered IO (e.g. all memcpy's inflated to multiples of 512). +But we could not identify meaningful impact in practice when we shipped these changes while we were still using buffered IO. + +### Configuration / Feature Flagging + +In the previous section we described how all users of VirtualFile were changed to always adhere to direct IO alignment and size-multiple requirements. +To actually enable direct IO, all we need to do is set the `O_DIRECT` flag in `open` syscalls / io_uring operations. + +We set `O_DIRECT` based on: +- the VirtualFile API used to create/open the VirtualFile instance +- the `virtual_file_io_mode` configuration flag +- the OpenOptions `read` and/or `write` flags. + +The VirtualFile APIs suffixed with `_v2` are the only ones that _may_ open with `O_DIRECT` depending on the other two factors in above list. +Other APIs never use `O_DIRECT`. +(The name is bad and should really be `_maybe_direct_io`.) 
+ +The reason for having new APIs is because all code used VirtualFile but implementation and rollout happened in consecutive phases (read path, InMemoryLayer, write path). +At the VirtualFile level, context on whether an instance of VirtualFile is on read path, InMemoryLayer, or write path is not available. + +The `_v2` APIs then make the decision to set `O_DIRECT` based on the `virtual_file_io_mode` flag and the OpenOptions `read`/`write` flags. +The result is the following runtime behavior: + +|what|OpenOptions|`v_f_io_mode`
=`buffered`|`v_f_io_mode`
=`direct`|`v_f_io_mode`
=`direct-rw`| +|-|-|-|-|-| +|`DeltaLayerInner`|read|()|O_DIRECT|O_DIRECT| +|`ImageLayerInner`|read|()|O_DIRECT|O_DIRECT| +|`InMemoryLayer`|read + write|()|()*|O_DIRECT| +|`DeltaLayerWriter`| write | () | () | O_DIRECT | +|`ImageLayerWriter`| write | () | () | O_DIRECT | +|`download_layer_file`|write |()|()|O_DIRECT| + +The `InMemoryLayer` is marked with `*` because there was a period when it *did* use O_DIRECT under `=direct`. +That period was when we implemented and shipped the first version of `BufferedWriter`. +We used it in `InMemoryLayer` and `download_layer_file` but it was only sensitive to `v_f_io_mode` in `InMemoryLayer`. +The introduction of `=direct-rw`, and the switch of the remaining write path to `BufferedWriter`, happened later, +in https://github.com/neondatabase/neon/pull/11558. + +Note that this way of feature flagging inside VirtualFile makes it less and less a general purpose POSIX file access abstraction. +For example, with `=direct-rw` enabled, it is no longer possible to open a `VirtualFile` without `O_DIRECT`. It'll always be set. + +## Correctness Validation + +The correctness risks with this project were: +- Memory safety issues in the `IoBuffer` / `IoBufferMut` implementation. + These types expose an API that is largely identical to that of the `bytes` crate and/or Vec. +- Runtime errors (=> downtime / unavailability) because of non-adherence to alignment/size-multiple requirements, resulting in EINVAL on the read path. + +We sadly do not have infrastructure to run pageserver under `cargo miri`. +So for memory safety issues, we relied on careful peer review. + +We do assert the production-like alignment requirements in testing builds. +However, these asserts were added retroactively. +The actual validation before rollout happened in staging and pre-prod. +We eventually enabled `=direct`/`=direct-rw` for Rust unit tests and the regression test suite. 
+I cannot recall a single instance of staging/pre-prod/production errors caused by non-adherence to alignment/size-multiple requirements. +Evidently developer testing was good enough. + +## Performance Validation + +The read path went through a lot of iterations of benchmarking in staging and pre-prod. +The benchmarks in those environments demonstrated performance regressions early in the implementation. +It was actually this performance testing that made us implement batching and concurrent IO to avoid unacceptable regressions. + +The write path was much quicker to validate because `bench_ingest` covered all of the (less numerous) access patterns. + +## Future Work + +There is minor and major follow-up work that can be considered in the future. +Check the (soon-to-be-closed) Epic https://github.com/neondatabase/neon/issues/8130's "Follow-Ups" section for a current list. + +Read Path: +- PS PageCache hit rate is crucial to unlock concurrent IO and reasonable latency for random reads generally. + Instead of reactively sizing PS PageCache, we should estimate the required PS PageCache size + and potentially also use that to drive placement decisions of shards from StorageController + https://github.com/neondatabase/neon/issues/9288 +- ... unless we get rid of PS PageCache entirely and cache the index block in a more specialized cache. + But even then, an estimation of the working set would be helpful to figure out caching strategy. + +Write Path: +- BlobWriter and its users could switch back to a borrowed API https://github.com/neondatabase/neon/issues/10129 +- ... 
unless we want to implement bypass mode for large writes https://github.com/neondatabase/neon/issues/10101 +- The `TempVirtualFile` introduced as part of this project could internalize more of the common usage pattern: https://github.com/neondatabase/neon/issues/11692 +- Reduce conditional compilation around `virtual_file_io_mode`: https://github.com/neondatabase/neon/issues/11676 + +Both: +- A performance simulation mode that pads VirtualFile op latencies to typical NVMe latencies, even if the underlying storage is faster. + This would avoid misleadingly good performance on developer systems and in benchmarks on systems that are less busy than production hosts. + However, padding latencies at microsecond scale is non-trivial. + +Misc: +- We should finish trimming VirtualFile's scope to be truly limited to core data path read & write. + Abstractions for reading & writing pageserver config, location config, heatmaps, etc, should use + APIs in a different package (`VirtualFile::crashsafe_overwrite` and `VirtualFile::read_to_string` + are good entrypoints for cleanup.) https://github.com/neondatabase/neon/issues/11809 + +# Appendix + +## Why Kernel Page Cache Is Ineffective At High Tenant Density + +In the High-Level Design section, we stated: + +> - **The kernel page cache is ineffective** at high tenant density anyway (#tenants/pageserver instance). + +The reason is that the Pageserver workload sent from Computes is whatever is a Compute cache(s) miss. +That's either sequential scans or random reads. +A random read workload simply causes cache thrashing because a packed Pageserver NVMe drive (`im4gn.2xlarge`) has ~100x more capacity than DRAM available. +It is a complete waste to have the kernel page cache cache data blocks in this case. +Sequential read workloads *can* benefit iff those pages have been updated recently (=no image layer yet) and together in time/LSN space. +In such cases, the WAL records of those updates likely sit on the same delta layer block. 
+When Compute does a sequential scan, it sends a series of single-page requests for these individual pages. +When Pageserver processes the second request in such a series, it goes to the same delta layer block and has a kernel page cache hit. +This dependence on kernel page cache for sequential scan performance is significant, but the solution is at a higher level than generic data block caching. +We can either add a small per-connection LRU cache for such delta layer blocks. +Or we can merge those sequential requests into a larger vectored get request, which is designed to never read a block twice. +This amortizes the read latency for our delta layer block across the vectored get batch size (which currently is up to 32). + +There are Pageserver-internal workloads that do sequential access (compaction, image layer generation), but these +1. are not latency-critical and can do batched access outside of the `page_service` protocol constraints (image layer generation) +2. don't actually need to reconstruct images and therefore can use totally different access methods (=> compaction can use k-way merge iterators with their own internal buffering / prefetching). diff --git a/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md b/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md new file mode 100644 index 0000000000..2dc937d298 --- /dev/null +++ b/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md @@ -0,0 +1,251 @@ +# Concurrent IO for Pageserver Read Path + +Date: May 6, 2025 + +## Summary + +This document is a retroactive RFC on the Pageserver Concurrent IO work that happened in late 2024 / early 2025. + +The gist of it is that Pageserver's `Timeline::get_vectored` now _issues_ the data block read operations against layer files +_as it traverses the layer map_ and only _waits_ once, for all of them, after traversal is complete. 
+ +Assuming good PS PageCache hit rates on the index blocks during traversal, this drives the "wait-for-disk" time +contribution down from `random_read_io_latency * O(number_of_values)` to `random_read_io_latency * O(1 + traversal)`. + +The motivation for why this work had to happen when it happened was the switch of Pageserver to +- not cache user data blocks in PS PageCache and +- switch to use direct IO. +More context on this is given in the complementary RFC `./rfcs/2025-04-30-direct-io-for-pageserver.md`. + +### Refs + +- Epic: https://github.com/neondatabase/neon/issues/9378 +- Prototyping happened during the Lisbon 2024 Offsite hackathon: https://github.com/neondatabase/neon/pull/9002 +- Main implementation PR with good description: https://github.com/neondatabase/neon/issues/9378 + +Design and implementation by: +- Vlad Lazar +- Christian Schwarz + +## Background & Motivation + +The Pageserver read path (`Timeline::get_vectored`) consists of two high-level steps: +- Retrieve the delta and image `Value`s required to reconstruct the requested Page@LSN (`Timeline::get_values_reconstruct_data`). +- Pass these values to walredo to reconstruct the page images. + +The read path used to be single-key but has been made multi-key some time ago. +([Internal tech talk by Vlad](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link)) +However, for simplicity, most of this doc will explain things in terms of a single key being requested. + +The `Value` retrieval step above can be broken down into the following functions: +- **Traversal** of the layer map to figure out which `Value`s from which layer files are required for the page reconstruction. +- **Read IO Planning**: planning of the read IOs that need to be issued to the layer files / filesystem / disk. + The main job here is to coalesce the small value reads into larger filesystem-level read operations. 
+ This layer also takes care of direct IO alignment and size-multiple requirements (cf the RFC for details.) + Check `struct VectoredReadPlanner` and `mod vectored_dio_read` for how it's done. +- **Perform the read IO** using `tokio-epoll-uring`. + +Before this project, above functions were sequentially interleaved, meaning: +1. we would advance traversal, ... +2. discover, that we need to read a value, ... +3. read it from disk using `tokio-epoll-uring`, ... +4. goto 1 unless we're done. + +This meant that if N `Value`s need to be read to reconstruct a page, +the time we spend waiting for disk will be `random_read_io_latency * O(number_of_values)`. + +## Design + +The **traversal** and **read IO Planning** jobs still happen sequentially, layer by layer, as before. +But instead of performing the read IOs inline, we submit the IOs to a concurrent tokio task for execution. +After the last read from the last layer is submitted, we wait for the IOs to complete. + +Assuming the filesystem / disk is able to actually process the submitted IOs without queuing, +we arrive at _time spent waiting for disk_ ~ `random_read_io_latency * O(1 + traversal)`. + +Note this whole RFC is concerned with the steady state where all layer files required for reconstruction are resident on local NVMe. +Traversal will stall on on-demand layer download if a layer is not yet resident. +It cannot proceed without the layer being resident because its next step depends on the contents of the layer index. + +### Avoiding Waiting For IO During Traversal + +The `traversal` component in above time-spent-waiting-for-disk estimation is dominant and needs to be minimized. + +Before this project, traversal needed to perform IOs for the following: +1. The time we are waiting on PS PageCache to page in the visited layers' disk btree index blocks. +2. 
When visiting a delta layer, reading the data block that contains a `Value` for a requested key, + to determine whether the `Value::will_init` the page and therefore traversal can stop for this key. + +The solution for (1) is to raise the PS PageCache size such that the hit rate is practically 100%. +(Check out the `Background: History Of Caching In Pageserver` section in the RFC on Direct IO for more details.) + +The solution for (2) is to source `will_init` from the disk btree index keys, which fortunately +already encode this bit of information since the introduction of the current storage/layer format. + +### Concurrent IOs, Submission & Completion + +To separate IO submission from waiting for its completion, +we introduce the notion of an `IoConcurrency` struct through which IOs are issued. + +An IO is an opaque future that +- captures the `tx` side of a `oneshot` channel +- performs the read IO by calling `VirtualFile::read_exact_at().await` +- sends the result into the `tx` + +Issuing an IO means `Box`ing the future above and handing that `Box` over to the `IoConcurrency` struct. + +The traversal code that submits the IO stores the corresponding `oneshot::Receiver` +in the `VectoredValueReconstructState`, in the place where we previously stored +the sequentially read `img` and `records` fields. + +When we're done with traversal, we wait for all submitted IOs: +for each key, there is a future that awaits all the `oneshot::Receiver`s +for that key, and then calls into walredo to reconstruct the page image. +Walredo is now invoked concurrently for each value instead of sequentially. +Walredo itself remains unchanged. + +The spawned IO futures are driven to completion by a sidecar tokio task that +is separate from the task that performs all the layer visiting and spawning of IOs. +That task receives the IO futures via an unbounded mpsc channel and +drives them to completion inside a `FuturesUnordered`. 
+ +### Error handling, Panics, Cancellation-Safety + +There are two error classes during reconstruct data retrieval: +* traversal errors: index lookup, move to next layer, and the like +* value read IO errors + +A traversal error fails the entire `get_vectored` request, as before this PR. +A value read error only fails reconstruction of that value. + +Panics and dropping of the `get_vectored` future before it completes +leaves the sidecar task running and does not cancel submitted IOs +(see next section for details on sidecar task lifecycle). +All of this is safe, but, today's preference in the team is to close out +all resource usage explicitly if possible, rather than cancelling + forgetting +about it on drop. So, there is a warning if we drop a +`VectoredValueReconstructState`/`ValuesReconstructState` that still has uncompleted IOs. + +### Sidecar Task Lifecycle + +The sidecar tokio task is spawned as part of the `IoConcurrency::spawn_from_conf` struct. +The `IoConcurrency` object acts as a handle through which IO futures are submitted. + +The spawned tokio task holds the `Timeline::gate` open. +It is _not_ sensitive to `Timeline::cancel`, but instead to the `IoConcurrency` object being dropped. + +Once the `IoConcurrency` struct is dropped, no new IO futures can come in +but already submitted IO futures will be driven to completion regardless. +We _could_ safely stop polling these futures because `tokio-epoll-uring` op futures are cancel-safe. +But the underlying kernel and hardware resources are not magically freed up by that. +So, again, in the interest of closing out all outstanding resource usage, we make timeline shutdown wait for sidecar tasks and their IOs to complete. +Under normal conditions, this should be in the low hundreds of microseconds. + +It is advisable to make the `IoConcurrency` as long-lived as possible to minimize the amount of +tokio task churn (=> lower pressure on tokio). Generally this means creating it "high up" in the call stack. 
+The pain with this is that the `IoConcurrency` reference needs to be propagated "down" to +the (short-lived) functions/scope where we issue the IOs. +We would like to use `RequestContext` for this propagation in the future (issue [here](https://github.com/neondatabase/neon/issues/10460)). +For now, we just add another argument to the relevant code paths. + +### Feature Gating + +The `IoConcurrency` is an `enum` with two variants: `Sequential` and `SidecarTask`. + +The behavior from before this project is available through `IoConcurrency::Sequential`, +which awaits the IO futures in place, without "spawning" or "submitting" them anywhere. + +The `get_vectored_concurrent_io` pageserver config variable determines the runtime value, +**except** for the places that use `IoConcurrency::sequential` to get an `IoConcurrency` object. + +### Alternatives Explored & Caveats Encountered + +A few words on the rationale behind having a sidecar *task* and what +alternatives were considered but abandoned. + +#### Why We Need A Sidecar *Task* / Why Just `FuturesUnordered` Doesn't Work + +We explored to not have a sidecar task, and instead have a `FuturesUnordered` per +`Timeline::get_vectored`. We would queue all IO futures in it and poll it for the +first time after traversal is complete (i.e., at `collect_pending_ios`). + +The obvious disadvantage, but not showstopper, is that we wouldn't be submitting +IOs until traversal is complete. + +The showstopper however, is that deadlocks happen if we don't drive the +IO futures to completion independently of the traversal task. +The reason is that both the IO futures and the traversal task may hold _some_, +_and_ try to acquire _more_, shared limited resources. 
+For example, both the traversal task and IO future may try to acquire +* a `VirtualFile` file descriptor cache slot async mutex (observed during impl) +* a `tokio-epoll-uring` submission slot (observed during impl) +* a `PageCache` slot (currently this is not the case but we may move more code into the IO futures in the future) + +#### Why We Don't Do `tokio::task`-per-IO-future + +Another option is to spawn a short-lived `tokio::task` for each IO future. +We implemented and benchmarked it during development, but found little +throughput improvement and moderate mean & tail latency degradation. +Concerns about pressure on the tokio scheduler led us to abandon this variant. + +## Future Work + +In addition to what is listed here, also check the "Punted" list in the epic: +https://github.com/neondatabase/neon/issues/9378 + +### Enable `Timeline::get` + +The only major code path that still uses `IoConcurrency::sequential` is `Timeline::get`. +The impact is that roughly the following parts of pageserver do not benefit yet: +- parts of basebackup +- reads performed by the ingest path +- most internal operations that read metadata keys (e.g. `collect_keyspace`!) + +The solution is to propagate `IoConcurrency` via `RequestContext`: https://github.com/neondatabase/neon/issues/10460 + +The tricky part is to figure out at which level of the code the `IoConcurrency` is spawned (and added to the RequestContext). + +Also, propagation via `RequestContext` makes it harder to tell during development whether a given +piece of code uses concurrent vs sequential mode: one has to recursively walk up the call tree to find the +place that puts the `IoConcurrency` into the `RequestContext`. +We'd have to use `::Sequential` as the conservative default value in a fresh `RequestContext`, and add some +observability to weed out places that fail to enrich with a properly spawned `IoConcurrency::spawn_from_conf`.
+ +### Concurrent On-Demand Downloads enabled by Detached Indices + +As stated earlier, traversal stalls on on-demand download because its next step depends on the contents of the layer index. +Once we have separated indices from data blocks (=> https://github.com/neondatabase/neon/issues/11695) +we will only need to stall if the index is not resident. The download of the data blocks can happen concurrently or in the background. For example: +- Move the `Layer::get_or_maybe_download().await` inside the IO futures. + This goes in the opposite direction of the next "future work" item below, but it's easy to do. +- Serve the IO future directly from object storage and dispatch the layer download + to some other actor, e.g., an actor that is responsible for both downloads & eviction. + +### New `tokio-epoll-uring` API That Separates Submission & Wait-For-Completion + +Instead of `$op().await` style API, it would be useful to have a different `tokio-epoll-uring` API +that separates enqueuing (without necessarily `io_uring_enter`ing the kernel each time), submission, +and then wait for completion. + +The `$op().await` API is too opaque, so we _have_ to stuff it into a `FuturesUnordered`. + +A split API as sketched above would allow traversal to ensure an IO operation is enqueued to the kernel/disk (and get back-pressure iff the io_uring squeue is full). +While avoiding spending of CPU cycles on processing of completions while we're still traversing. + +The idea gets muddied by the fact that we may self-deadlock if we submit too much without completing. +So, the submission part of the split API needs to process completions if squeue is full. + +In any case, this split API is a precondition for the bigger issue with the design presented here, +which we discuss in the next section. + +### Opaque Futures Are Brittle + +The use of opaque futures to represent submitted IOs is a clever hack to minimize changes & allow for near-perfect feature-gating.
+However, we take on **brittleness** because callers must guarantee that the submitted futures are independent. +By our experience, it is non-trivial to identify or rule out the interdependencies. +See the lengthy doc comment on the `IoConcurrency::spawn_io` method for more details. + +The better interface and proper subsystem boundary is a _descriptive_ struct of what needs to be done ("read this range from this VirtualFile into this buffer") +and get back a means to wait for completion. +The subsystem can thereby reason by its own how operations may be related; +unlike today, where the submitted opaque future can do just about anything. diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg new file mode 100644 index 0000000000..7107198c0a --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg @@ -0,0 +1,135 @@ + + + + + + 01-basic-idea + + + Layer 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + GC Horizon + + + + + Images + at earlier LSN + + + + + Deltas + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Deltas + + + + + Images + at GC LSN + + + + + + Deltas above GC Horizon + + + + + + Deltas below GC Horizon + + + + + + Deltas above GC Horizon + + + + + + Deltas and image below GC Horizon gets garbage-collected + + + + + WAL replay of deltas+image below GC Horizon + Reshuffle deltas + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg new file mode 100644 index 0000000000..792db6d69e --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg @@ -0,0 +1,141 @@ + + + + + + + + + + + + 03-retain-lsn + + + Layer 1 + + + + + + + + + + + + + + + + + + GC Horizon + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + retain_lsn 1 + + + + + 
+ + + retain_lsn 2 + + + + + + + + retain_lsn 3 + + + + + + + + retain_lsn 4 + + + + + + + + + Dependent Branch + + + + + retain_lsn 3 + + + + + + + + Branch GC Horizon + + + + + + + + + Partial Image Coverage + + + + + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg new file mode 100644 index 0000000000..9593ed969e --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg @@ -0,0 +1,187 @@ + + + + + + 05-btmgc-parent + + + Layer 1 + + + + + Append C@0x30 + + + + + + + + + + + + + + + + GC Horizon + + + + + + + Append F@0x60 + + + + + + + + + + + + + + + Append E@0x50 + Append D@0x40 + + + + + + + + + + + + + + + A@0x10, Append B@0x20 + + + + + + + + + + + + + + + + 0x20 + + + + + 0x50 + + + + + + + + 0x40 + + + + + + + + GC Horizon + + + + + + + Append F@0x60 + + + + + + + Append E@0x50 + + + + + + + + 0x20 + + + + + 0x50 + + + + + + + + 0x40 + + + + + + + AB@0x20 + + + + + + + + + + + + + + + ABCD@0x40 + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg new file mode 100644 index 0000000000..b8a93d5b5f --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg @@ -0,0 +1,184 @@ + + + + + + 06-btmgc-child + + + Layer 1 + + + + + + + + + Append P@0x30 + + + + + + + + + + + + + + + + GC Horizon + + + + + + + Append S@0x60 + + + + + + + + + + + + + + + Append R@0x50 + Append Q@0x40 + + + + + + + + + + + + + 0x50 + + + + + + + + 0x20 + + + + + + + AB@0x20 + + + + + + + + + + + + + Ancestor Branch + + + + + + + + + + + + GC Horizon + + + + + + + Append S@0x60 + + + + + 0x50 + + + + + + + + 0x20 + + + + + + + AB + @0x20 + + + + + + + + + + + + + Ancestor Branch + + + + + + + + + + + AB + PQR@0x50 + + + + + + + + + + + + + + + + + + + + + diff --git 
a/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg new file mode 100644 index 0000000000..65034226da --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg @@ -0,0 +1,180 @@ + + + + + + 07-btmgc-analysis-1 + + + Layer 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + GC Horizon + + + + + Images + at earlier LSN + + + + + Deltas + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Deltas + + + + + Images + at GC LSN + + + + + + Deltas above GC Horizon + + + + + + Deltas below GC Horizon + + + + + + Deltas above GC Horizon + + + + + + Deltas and image below GC Horizon gets garbage-collected + + + + + size=A + + + + + + + + + + + + + + + + + + + + + + + + + + + + + size=B + + + + + size=C + + + + + A + + + + + + B + + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg new file mode 100644 index 0000000000..16a17ec56e --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg @@ -0,0 +1,158 @@ + + + + + + 08-optimization + + + Layer 1 + + + + + + + + + + + + + + GC Horizon + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0x20 + + + + + + + + 0x50 + + + + + 0x60 + + + + + + + + 0x40 + + + + + + + + + + + + + + + + GC Horizon + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0x20 + + + + + 0x70 + + + + + + + + 0x40 + + + + + + + + + + + + + + + + 0x50 + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg new file mode 100644 index 0000000000..243f038c88 --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg @@ -0,0 +1,184 @@ + + + + + + 09-btmgc-analysis-2 + + + Layer 1 + + + + + C + + + + 
+ + + C + + + + + + + C + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + + GC Horizon + + + + + + + + + + + B + + + + + + + B + + + + + + + B + + + + + + + C + + + + + + + C + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + + GC Horizon + + + + + + + B + + + + + + + C + + + + + B + + + + + + C + + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg new file mode 100644 index 0000000000..1e49ec017b --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg @@ -0,0 +1,81 @@ + + + + + + 10-btmgc-analysis-3 + + + Layer 1 + + + + + 1/5 X + + + + + + + 1/5 X + + + + + + + 1/5 X + + + + + + + 1/5 X + + + + + + + + GC Horizon + + + + + + + X + + + + + + + + + + + + GC Horizon + + + + + + + 2X + + + + + + + 1/5 X + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg new file mode 100644 index 0000000000..510d7a0c3e --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg @@ -0,0 +1,81 @@ + + + + + + 11-btmgc-analysis-4 + + + Layer 1 + + + + + 1/5 D + + + + + + + 1/5 D + + + + + + + 1/5 D + + + + + + + 1/5 D + + + + + + + 1/5 D + + + + + + + + GC Horizon + + + + + + + D + + + + + + + + + + + + GC Horizon + + + + + + + D + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png b/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png new file mode 100644 index 0000000000..c106f3ee89 Binary files /dev/null and b/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png differ diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg 
b/docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg new file mode 100644 index 0000000000..37c38c727c --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg @@ -0,0 +1,176 @@ + + + + + + gc-compaction-split + + + Layer 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + branch point + + + + + + + + last branch point + + + + + + Job 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + branch point + + + + + + + + last branch point + + + + + + Job 2 + + + + + + Job 3 + + + + + + Job 4 + + + + + + Job 5 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Delta Layer + + + + + + + Image Layer + + + + + diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs index f07ef06328..f44efe6d7a 100644 --- a/endpoint_storage/src/app.rs +++ b/endpoint_storage/src/app.rs @@ -343,7 +343,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]); const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg"; fn token() -> String { - let claims = endpoint_storage::Claims { + let claims = endpoint_storage::claims::EndpointStorageClaims { tenant_id: TENANT_ID, timeline_id: TIMELINE_ID, endpoint_id: ENDPOINT_ID.into(), @@ -462,6 +462,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH if var(REAL_S3_ENV).is_ok() { assert!(body.contains("remote_storage_s3_deleted_objects_total")); } + + #[cfg(target_os = "linux")] assert!(body.contains("process_threads")); } @@ -489,16 +491,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH } fn delete_prefix_token(uri: &str) -> String { - use serde::Serialize; let parts = uri.split("/").collect::>(); - #[derive(Serialize)] - struct PrefixClaims { - tenant_id: TenantId, - timeline_id: Option, - endpoint_id: Option, - exp: u64, - } - let claims = PrefixClaims { + let claims = endpoint_storage::claims::DeletePrefixClaims { 
tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(), timeline_id: parts.get(2).map(|c| c.parse().unwrap()), endpoint_id: parts.get(3).map(ToString::to_string), diff --git a/endpoint_storage/src/claims.rs b/endpoint_storage/src/claims.rs new file mode 100644 index 0000000000..ef0f0eb0b4 --- /dev/null +++ b/endpoint_storage/src/claims.rs @@ -0,0 +1,52 @@ +use serde::{Deserialize, Serialize}; +use std::fmt::Display; +use utils::id::{EndpointId, TenantId, TimelineId}; + +/// Claims to add, remove, or retrieve endpoint data. Used by compute_ctl +#[derive(Deserialize, Serialize, PartialEq)] +pub struct EndpointStorageClaims { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub endpoint_id: EndpointId, + pub exp: u64, +} + +/// Claims to remove tenant, timeline, or endpoint data. Used by control plane +#[derive(Deserialize, Serialize, PartialEq)] +pub struct DeletePrefixClaims { + pub tenant_id: TenantId, + /// None when tenant is deleted (endpoint_id is also None in this case) + pub timeline_id: Option, + /// None when timeline is deleted + pub endpoint_id: Option, + pub exp: u64, +} + +impl Display for EndpointStorageClaims { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "EndpointClaims(tenant_id={} timeline_id={} endpoint_id={} exp={})", + self.tenant_id, self.timeline_id, self.endpoint_id, self.exp + ) + } +} + +impl Display for DeletePrefixClaims { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "DeletePrefixClaims(tenant_id={} timeline_id={} endpoint_id={}, exp={})", + self.tenant_id, + self.timeline_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()), + self.endpoint_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()), + self.exp + ) + } +} diff --git a/endpoint_storage/src/lib.rs b/endpoint_storage/src/lib.rs index eb6b80c487..d1625dc843 100644 --- a/endpoint_storage/src/lib.rs +++ b/endpoint_storage/src/lib.rs @@ -1,3 +1,5 
@@ +pub mod claims; +use crate::claims::{DeletePrefixClaims, EndpointStorageClaims}; use anyhow::Result; use axum::extract::{FromRequestParts, Path}; use axum::response::{IntoResponse, Response}; @@ -13,7 +15,7 @@ use std::result::Result as StdResult; use std::sync::Arc; use tokio_util::sync::CancellationToken; use tracing::{debug, error}; -use utils::id::{TenantId, TimelineId}; +use utils::id::{EndpointId, TenantId, TimelineId}; // simplified version of utils::auth::JwtAuth pub struct JwtAuth { @@ -79,26 +81,6 @@ pub struct Storage { pub max_upload_file_limit: usize, } -pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rc - -#[derive(Deserialize, Serialize, PartialEq)] -pub struct Claims { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub endpoint_id: EndpointId, - pub exp: u64, -} - -impl Display for Claims { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})", - self.tenant_id, self.timeline_id, self.endpoint_id, self.exp - ) - } -} - #[derive(Deserialize, Serialize)] struct KeyRequest { tenant_id: TenantId, @@ -107,6 +89,13 @@ struct KeyRequest { path: String, } +#[derive(Deserialize, Serialize, PartialEq)] +struct PrefixKeyRequest { + tenant_id: TenantId, + timeline_id: Option, + endpoint_id: Option, +} + #[derive(Debug, PartialEq)] pub struct S3Path { pub path: RemotePath, @@ -165,7 +154,7 @@ impl FromRequestParts> for S3Path { .extract::>>() .await .map_err(|e| bad_request(e, "invalid token"))?; - let claims: Claims = state + let claims: EndpointStorageClaims = state .auth .decode(bearer.token()) .map_err(|e| bad_request(e, "decoding token"))?; @@ -178,7 +167,7 @@ impl FromRequestParts> for S3Path { path.endpoint_id.clone() }; - let route = Claims { + let route = EndpointStorageClaims { tenant_id: path.tenant_id, timeline_id: path.timeline_id, endpoint_id, @@ -193,38 +182,13 @@ impl FromRequestParts> for S3Path { } } 
-#[derive(Deserialize, Serialize, PartialEq)] -pub struct PrefixKeyPath { - pub tenant_id: TenantId, - pub timeline_id: Option, - pub endpoint_id: Option, -} - -impl Display for PrefixKeyPath { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})", - self.tenant_id, - self.timeline_id - .as_ref() - .map(ToString::to_string) - .unwrap_or("".to_string()), - self.endpoint_id - .as_ref() - .map(ToString::to_string) - .unwrap_or("".to_string()) - ) - } -} - #[derive(Debug, PartialEq)] pub struct PrefixS3Path { pub path: RemotePath, } -impl From<&PrefixKeyPath> for PrefixS3Path { - fn from(path: &PrefixKeyPath) -> Self { +impl From<&DeletePrefixClaims> for PrefixS3Path { + fn from(path: &DeletePrefixClaims) -> Self { let timeline_id = path .timeline_id .as_ref() @@ -250,21 +214,27 @@ impl FromRequestParts> for PrefixS3Path { state: &Arc, ) -> Result { let Path(path) = parts - .extract::>() + .extract::>() .await .map_err(|e| bad_request(e, "invalid route"))?; let TypedHeader(Authorization(bearer)) = parts .extract::>>() .await .map_err(|e| bad_request(e, "invalid token"))?; - let claims: PrefixKeyPath = state + let claims: DeletePrefixClaims = state .auth .decode(bearer.token()) .map_err(|e| bad_request(e, "invalid token"))?; - if path != claims { - return Err(unauthorized(path, claims)); + let route = DeletePrefixClaims { + tenant_id: path.tenant_id, + timeline_id: path.timeline_id, + endpoint_id: path.endpoint_id, + exp: claims.exp, + }; + if route != claims { + return Err(unauthorized(route, claims)); } - Ok((&path).into()) + Ok((&route).into()) } } @@ -297,7 +267,7 @@ mod tests { #[test] fn s3_path() { - let auth = Claims { + let auth = EndpointStorageClaims { tenant_id: TENANT_ID, timeline_id: TIMELINE_ID, endpoint_id: ENDPOINT_ID.into(), @@ -327,10 +297,11 @@ mod tests { #[test] fn prefix_s3_path() { - let mut path = PrefixKeyPath { + let mut path = DeletePrefixClaims { 
tenant_id: TENANT_ID, timeline_id: None, endpoint_id: None, + exp: 0, }; let prefix_path = |s: String| RemotePath::from_string(&s).unwrap(); assert_eq!( diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 98f2fc297c..bbab271474 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,16 +1,58 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. +use std::str::FromStr; + use serde::{Deserialize, Serialize}; use crate::privilege::Privilege; use crate::responses::ComputeCtlConfig; use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; +/// The value to place in the [`ComputeClaims::audience`] claim. +pub static COMPUTE_AUDIENCE: &str = "compute"; + +/// Available scopes for a compute's JWT. +#[derive(Copy, Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ComputeClaimsScope { + /// An admin-scoped token allows access to all of `compute_ctl`'s authorized + /// facilities. + Admin, +} + +impl FromStr for ComputeClaimsScope { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "admin" => Ok(ComputeClaimsScope::Admin), + _ => Err(anyhow::anyhow!("invalid compute claims scope \"{s}\"")), + } + } +} + /// When making requests to the `compute_ctl` external HTTP server, the client /// must specify a set of claims in `Authorization` header JWTs such that /// `compute_ctl` can authorize the request. #[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename = "snake_case")] pub struct ComputeClaims { - pub compute_id: String, + /// The compute ID that will validate the token. The only case in which this + /// can be [`None`] is if [`Self::scope`] is + /// [`ComputeClaimsScope::Admin`]. + pub compute_id: Option, + + /// The scope of what the token authorizes. + pub scope: Option, + + /// The recipient the token is intended for. 
+ /// + /// See [RFC 7519](https://www.rfc-editor.org/rfc/rfc7519#section-4.1.3) for + /// more information. + /// + /// TODO: Remove the [`Option`] wrapper when control plane learns to send + /// the claim. + #[serde(rename = "aud")] + pub audience: Option>, } /// Request of the /configure API diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index b7d6b7ca34..24d371c6eb 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -46,6 +46,30 @@ pub struct ExtensionInstallResponse { pub version: ExtVersion, } +#[derive(Serialize, Default, Debug, Clone)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum LfcPrewarmState { + #[default] + NotPrewarmed, + Prewarming, + Completed, + Failed { + error: String, + }, +} + +#[derive(Serialize, Default, Debug, Clone)] +#[serde(tag = "status", rename_all = "snake_case")] +pub enum LfcOffloadState { + #[default] + NotOffloaded, + Offloading, + Completed, + Failed { + error: String, + }, +} + /// Response of the /status API #[derive(Serialize, Debug, Deserialize)] #[serde(rename_all = "snake_case")] diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index ad246c48ec..09b550b96c 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -172,6 +172,15 @@ pub struct ComputeSpec { /// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding. /// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514 pub logs_export_host: Option, + + /// Address of endpoint storage service + pub endpoint_storage_addr: Option, + /// JWT for authorizing requests to endpoint storage service + pub endpoint_storage_token: Option, + + /// If true, download LFC state from endpoint_storage and pass it to Postgres on startup + #[serde(default)] + pub prewarm_lfc_on_startup: bool, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. 
diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index 37de24be5b..30e788a601 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -84,6 +84,11 @@ "value": "on", "vartype": "bool" }, + { + "name": "prewarm_lfc_on_startup", + "value": "off", + "vartype": "bool" + }, { "name": "neon.safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501", diff --git a/libs/metrics/src/more_process_metrics.rs b/libs/metrics/src/more_process_metrics.rs index 13a745e031..f91800685f 100644 --- a/libs/metrics/src/more_process_metrics.rs +++ b/libs/metrics/src/more_process_metrics.rs @@ -16,6 +16,7 @@ pub struct Collector { const NMETRICS: usize = 2; static CLK_TCK_F64: Lazy = Lazy::new(|| { + // SAFETY: libc::sysconf is safe, it merely returns a value. let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) }; if long == -1 { panic!("sysconf(_SC_CLK_TCK) failed"); diff --git a/libs/neon-shmem/Cargo.toml b/libs/neon-shmem/Cargo.toml new file mode 100644 index 0000000000..2a636bec40 --- /dev/null +++ b/libs/neon-shmem/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "neon-shmem" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +thiserror.workspace = true +nix.workspace=true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +[target.'cfg(target_os = "macos")'.dependencies] +tempfile = "3.14.0" diff --git a/libs/neon-shmem/src/lib.rs b/libs/neon-shmem/src/lib.rs new file mode 100644 index 0000000000..e1b14b1371 --- /dev/null +++ b/libs/neon-shmem/src/lib.rs @@ -0,0 +1,418 @@ +//! 
Shared memory utilities for neon communicator + +use std::num::NonZeroUsize; +use std::os::fd::{AsFd, BorrowedFd, OwnedFd}; +use std::ptr::NonNull; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use nix::errno::Errno; +use nix::sys::mman::MapFlags; +use nix::sys::mman::ProtFlags; +use nix::sys::mman::mmap as nix_mmap; +use nix::sys::mman::munmap as nix_munmap; +use nix::unistd::ftruncate as nix_ftruncate; + +/// ShmemHandle represents a shared memory area that can be shared by processes over fork(). +/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's +/// specified at creation. +/// +/// The area is backed by an anonymous file created with memfd_create(). The full address space for +/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`], +/// the underlying file is resized. Do not access the area beyond the current size. Currently, that +/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the +/// future. +pub struct ShmemHandle { + /// memfd file descriptor + fd: OwnedFd, + + max_size: usize, + + // Pointer to the beginning of the shared memory area. The header is stored there. + shared_ptr: NonNull, + + // Pointer to the beginning of the user data + pub data_ptr: NonNull, +} + +/// This is stored at the beginning in the shared memory area. +struct SharedStruct { + max_size: usize, + + /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag + current_size: AtomicUsize, +} + +const RESIZE_IN_PROGRESS: usize = 1 << 63; + +const HEADER_SIZE: usize = std::mem::size_of::(); + +/// Error type returned by the ShmemHandle functions. 
+#[derive(thiserror::Error, Debug)]
+#[error("{msg}: {errno}")]
+pub struct Error {
+    /// Description of the operation that failed. Rendered verbatim; the errno is appended
+    /// by the `Display` implementation, so the message itself must not try to embed it.
+    pub msg: String,
+    /// The errno associated with the failure.
+    pub errno: Errno,
+}
+
+impl Error {
+    /// Build an error from a plain message and an errno.
+    ///
+    /// NB: `msg` is stored as-is, it is *not* a format string. Placeholders like `{e}` would
+    /// show up literally in the rendered error.
+    fn new(msg: &str, errno: Errno) -> Error {
+        Error {
+            msg: msg.to_string(),
+            errno,
+        }
+    }
+}
+
+impl ShmemHandle {
+    /// Create a new shared memory area. To communicate between processes, the processes need to be
+    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
+    ///
+    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
+    /// processes can continue using it, however.
+    ///
+    /// Panics if 'max_size' is 2^48 or larger, or if 'initial_size' is larger than 'max_size'.
+    /// Returns an Error if creating, mapping or sizing the backing file fails.
+    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
+        // create the backing anonymous file.
+        let fd = create_backing_file(name)?;
+
+        Self::new_with_fd(fd, initial_size, max_size)
+    }
+
+    /// Map the given backing file and initialize the header at the start of the mapping.
+    ///
+    /// 'initial_size' and 'max_size' are user-visible sizes, i.e. they do not include the header.
+    fn new_with_fd(
+        fd: OwnedFd,
+        initial_size: usize,
+        max_size: usize,
+    ) -> Result<ShmemHandle, Error> {
+        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
+        // is a little larger than this because of the SharedStruct header. Make the upper limit
+        // somewhat smaller than that, because with anything close to that, you'll run out of
+        // memory anyway.
+        if max_size >= 1 << 48 {
+            panic!("max size {} too large", max_size);
+        }
+        if initial_size > max_size {
+            panic!("initial size {initial_size} larger than max size {max_size}");
+        }
+
+        // The actual initial / max size is the one given by the caller, plus the size of
+        // 'SharedStruct'.
+        let initial_size = HEADER_SIZE + initial_size;
+        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
+
+        // Reserve address space for it with mmap
+        //
+        // TODO: Use MAP_HUGETLB if possible
+        //
+        // SAFETY: we let the kernel pick the address (no hint is passed), so this cannot
+        // clobber an existing mapping.
+        let start_ptr = unsafe {
+            nix_mmap(
+                None,
+                max_size,
+                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
+                MapFlags::MAP_SHARED,
+                &fd,
+                0,
+            )
+        }
+        .map_err(|e| Error::new("mmap failed", e))?;
+
+        // Reserve space for the initial size. If that fails, no ShmemHandle gets constructed
+        // and hence Drop will never run, so we have to release the mapping here ourselves.
+        if let Err(err) = enlarge_file(fd.as_fd(), initial_size as u64) {
+            // SAFETY: 'start_ptr' and 'max_size' are exactly what mmap() returned / was given
+            // above, and nothing else references the mapping yet.
+            let _ = unsafe { nix_munmap(start_ptr, max_size.get()) };
+            return Err(err);
+        }
+
+        // Initialize the header
+        let shared: NonNull<SharedStruct> = start_ptr.cast();
+        // SAFETY: the mapping is at least HEADER_SIZE bytes long, the file backing the first
+        // 'initial_size' (>= HEADER_SIZE) bytes was reserved above, and no one else can see
+        // the area yet.
+        unsafe {
+            shared.write(SharedStruct {
+                max_size: max_size.into(),
+                current_size: AtomicUsize::new(initial_size),
+            })
+        };
+
+        // The user data begins after the header
+        // SAFETY: the mapping is HEADER_SIZE + max_size bytes long, so the offset stays within
+        // (or one past the end of) the mapping.
+        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
+
+        Ok(ShmemHandle {
+            fd,
+            max_size: max_size.into(),
+            shared_ptr: shared,
+            data_ptr,
+        })
+    }
+
+    // return reference to the header
+    fn shared(&self) -> &SharedStruct {
+        // SAFETY: 'shared_ptr' points to the header written by new_with_fd(), and the mapping
+        // lives until this ShmemHandle is dropped.
+        unsafe { self.shared_ptr.as_ref() }
+    }
+
+    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
+    /// when creating the area.
+    ///
+    /// This may only be called from one process/thread concurrently. We detect that case
+    /// and return an Error.
+    ///
+    /// Panics if 'new_size' is larger than 'max_size'. If resizing the backing file fails, the
+    /// recorded size is left at its previous value and the error is returned.
+    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
+        let new_size = new_size + HEADER_SIZE;
+        let shared = self.shared();
+
+        if new_size > self.max_size {
+            panic!(
+                "new size ({}) is greater than max size ({})",
+                new_size, self.max_size
+            );
+        }
+        assert_eq!(self.max_size, shared.max_size);
+
+        // Lock the area by setting the bit in 'current_size'
+        //
+        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
+        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
+        // since this is not performance-critical, better safe than sorry .
+        let mut old_size = shared.current_size.load(Ordering::Acquire);
+        loop {
+            if (old_size & RESIZE_IN_PROGRESS) != 0 {
+                return Err(Error::new(
+                    "concurrent resize detected",
+                    Errno::UnknownErrno,
+                ));
+            }
+            // Keep the old size in the low bits while the flag is set, so that current_size()
+            // keeps reporting the old size until the backing file has actually been resized.
+            match shared.current_size.compare_exchange(
+                old_size,
+                old_size | RESIZE_IN_PROGRESS,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                Err(x) => old_size = x,
+            }
+        }
+
+        // Ok, we got the lock. 'old_size' is the size before the resize, without the flag bit.
+        //
+        // NB: If anything goes wrong, we *must* clear the bit!
+        let result = {
+            use std::cmp::Ordering::{Equal, Greater, Less};
+            match new_size.cmp(&old_size) {
+                Less => nix_ftruncate(&self.fd, new_size as i64)
+                    .map_err(|e| Error::new("could not shrink shmem segment, ftruncate failed", e)),
+                Equal => Ok(()),
+                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
+            }
+        };
+
+        // Unlock. Storing a plain size (without the flag bit) publishes the outcome and releases
+        // the lock in one step.
+        shared.current_size.store(
+            if result.is_ok() { new_size } else { old_size },
+            Ordering::Release,
+        );
+
+        result
+    }
+
+    /// Returns the current user-visible size of the shared memory segment.
+    ///
+    /// While a set_size() call is in progress, this returns the size from before the resize.
+    ///
+    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
+    /// responsibility not to access the area beyond the current size.
+    pub fn current_size(&self) -> usize {
+        let total_current_size =
+            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
+        total_current_size - HEADER_SIZE
+    }
+}
+
+impl Drop for ShmemHandle {
+    fn drop(&mut self) {
+        // SAFETY: The pointer was obtained from mmap() with the given size.
+        // We unmap the entire region.
+        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
+        // The fd is dropped automatically by OwnedFd.
+    }
+}
+
+/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
+/// anonymous in-memory file. On macos, fall back to a regular file. That's good enough for
+/// development and testing, but in production we want the file to stay in memory.
+///
+/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
+#[allow(unused_variables)]
+fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
+            .map_err(|e| Error::new("memfd_create failed", e))
+    }
+    #[cfg(target_os = "macos")]
+    {
+        let file = tempfile::tempfile().map_err(|e| {
+            Error::new(
+                "could not create temporary file to back shmem area",
+                // An io::Error without an OS error code is mapped to errno 0.
+                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
+            )
+        })?;
+        Ok(OwnedFd::from(file))
+    }
+}
+
+/// Grow the backing file to 'size' bytes (the total size, including the header).
+fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
+    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
+    // we don't get a segfault later when trying to actually use it.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::fcntl::posix_fallocate(fd, 0, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, posix_fallocate failed", e))
+    }
+    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'ftruncate'
+    #[cfg(target_os = "macos")]
+    {
+        nix::unistd::ftruncate(fd, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use nix::unistd::ForkResult;
+    use std::ops::Range;
+
+    /// check that all bytes in given range have the expected value.
+ fn assert_range(ptr: *const u8, expected: u8, range: Range) { + for i in range { + let b = unsafe { *(ptr.add(i)) }; + assert_eq!(expected, b, "unexpected byte at offset {}", i); + } + } + + /// Write 'b' to all bytes in the given range + fn write_range(ptr: *mut u8, b: u8, range: Range) { + unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) }; + } + + // simple single-process test of growing and shrinking + #[test] + fn test_shmem_resize() -> Result<(), Error> { + let max_size = 1024 * 1024; + let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?; + + assert_eq!(init_struct.current_size(), 0); + + // Initial grow + let size1 = 10000; + init_struct.set_size(size1).unwrap(); + assert_eq!(init_struct.current_size(), size1); + + // Write some data + let data_ptr = init_struct.data_ptr.as_ptr(); + write_range(data_ptr, 0xAA, 0..size1); + assert_range(data_ptr, 0xAA, 0..size1); + + // Shrink + let size2 = 5000; + init_struct.set_size(size2).unwrap(); + assert_eq!(init_struct.current_size(), size2); + + // Grow again + let size3 = 20000; + init_struct.set_size(size3).unwrap(); + assert_eq!(init_struct.current_size(), size3); + + // Try to read it. The area that was shrunk and grown again should read as all zeros now + assert_range(data_ptr, 0xAA, 0..5000); + assert_range(data_ptr, 0, 5000..size1); + + // Try to grow beyond max_size + //let size4 = max_size + 1; + //assert!(init_struct.set_size(size4).is_err()); + + // Dropping init_struct should unmap the memory + drop(init_struct); + + Ok(()) + } + + /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier, + /// but is stored in the shared memory area and works across processes. It's implemented by + /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes. 
+ struct SimpleBarrier { + num_procs: usize, + count: AtomicUsize, + } + + impl SimpleBarrier { + unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) { + unsafe { + *ptr = SimpleBarrier { + num_procs, + count: AtomicUsize::new(0), + } + } + } + + pub fn wait(&self) { + let old = self.count.fetch_add(1, Ordering::Relaxed); + + let generation = old / self.num_procs; + + let mut current = old + 1; + while current < (generation + 1) * self.num_procs { + std::thread::sleep(std::time::Duration::from_millis(10)); + current = self.count.load(Ordering::Relaxed); + } + } + } + + #[test] + fn test_multi_process() { + // Initialize + let max_size = 1_000_000_000_000; + let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap(); + let ptr = init_struct.data_ptr.as_ptr(); + + // Store the SimpleBarrier in the first 1k of the area. + init_struct.set_size(10000).unwrap(); + let barrier_ptr: *mut SimpleBarrier = unsafe { + ptr.add(ptr.align_offset(std::mem::align_of::())) + .cast() + }; + unsafe { SimpleBarrier::init(barrier_ptr, 2) }; + let barrier = unsafe { barrier_ptr.as_ref().unwrap() }; + + // Fork another test process. The code after this runs in both processes concurrently. + let fork_result = unsafe { nix::unistd::fork().unwrap() }; + + // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000 + if fork_result.is_parent() { + write_range(ptr, 0xAA, 1000..2000); + } else { + write_range(ptr, 0xBB, 2000..3000); + } + barrier.wait(); + // Verify the contents. 
(in both processes) + assert_range(ptr, 0xAA, 1000..2000); + assert_range(ptr, 0xBB, 2000..3000); + + // Grow, from the child this time + let size = 10_000_000; + if !fork_result.is_parent() { + init_struct.set_size(size).unwrap(); + } + barrier.wait(); + + // make some writes at the end + if fork_result.is_parent() { + write_range(ptr, 0xAA, (size - 10)..size); + } else { + write_range(ptr, 0xBB, (size - 20)..(size - 10)); + } + barrier.wait(); + + // Verify the contents. (This runs in both processes) + assert_range(ptr, 0, (size - 1000)..(size - 20)); + assert_range(ptr, 0xBB, (size - 20)..(size - 10)); + assert_range(ptr, 0xAA, (size - 10)..size); + + if let ForkResult::Parent { child } = fork_result { + nix::sys::wait::waitpid(child, None).unwrap(); + } + } +} diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index b64c42a808..0fb2ff38ff 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -182,6 +182,9 @@ pub struct ConfigToml { pub tracing: Option, pub enable_tls_page_service_api: bool, pub dev_mode: bool, + pub timeline_import_config: TimelineImportConfig, + #[serde(skip_serializing_if = "Option::is_none")] + pub basebackup_cache_config: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -234,7 +237,7 @@ pub enum PageServiceProtocolPipelinedBatchingStrategy { ScatteredLsn, } -#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case")] pub enum GetVectoredConcurrentIo { /// The read path is fully sequential: layers are visited @@ -300,6 +303,33 @@ impl From for tracing_utils::Protocol { } } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct TimelineImportConfig { + pub import_job_concurrency: NonZeroUsize, + pub import_job_soft_size_limit: NonZeroUsize, + pub 
import_job_checkpoint_threshold: NonZeroUsize, +} + +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(default)] +pub struct BasebackupCacheConfig { + #[serde(with = "humantime_serde")] + pub cleanup_period: Duration, + // FIXME: Support max_size_bytes. + // pub max_size_bytes: usize, + pub max_size_entries: i64, +} + +impl Default for BasebackupCacheConfig { + fn default() -> Self { + Self { + cleanup_period: Duration::from_secs(60), + // max_size_bytes: 1024 * 1024 * 1024, // 1 GiB + max_size_entries: 1000, + } + } +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -483,6 +513,14 @@ pub struct TenantConfigToml { /// Tenant level performance sampling ratio override. Controls the ratio of get page requests /// that will get perf sampling for the tenant. pub sampling_ratio: Option, + + /// Capacity of relsize snapshot cache (used by replicas). + pub relsize_snapshot_cache_capacity: usize, + + /// Enable preparing basebackup on XLOG_CHECKPOINT_SHUTDOWN and using it in basebackup requests. + // FIXME: Remove skip_serializing_if when the feature is stable. + #[serde(skip_serializing_if = "std::ops::Not::not")] + pub basebackup_cache_enabled: bool, } pub mod defaults { @@ -632,23 +670,15 @@ impl Default for ConfigToml { tenant_config: TenantConfigToml::default(), no_sync: None, wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, - page_service_pipelining: if !cfg!(test) { - PageServicePipeliningConfig::Serial - } else { - // Do not turn this into the default until scattered reads have been - // validated and rolled-out fully. 
- PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + page_service_pipelining: PageServicePipeliningConfig::Pipelined( + PageServicePipeliningConfigPipelined { max_batch_size: NonZeroUsize::new(32).unwrap(), execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn, - }) - }, - get_vectored_concurrent_io: if !cfg!(test) { - GetVectoredConcurrentIo::Sequential - } else { - GetVectoredConcurrentIo::SidecarTask - }, - enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") { + }, + ), + get_vectored_concurrent_io: GetVectoredConcurrentIo::SidecarTask, + enable_read_path_debugging: if cfg!(feature = "testing") { Some(true) } else { None @@ -659,6 +689,12 @@ impl Default for ConfigToml { tracing: None, enable_tls_page_service_api: false, dev_mode: false, + timeline_import_config: TimelineImportConfig { + import_job_concurrency: NonZeroUsize::new(128).unwrap(), + import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(), + import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(), + }, + basebackup_cache_config: None, } } } @@ -725,6 +761,7 @@ pub mod tenant_conf_defaults { pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true; pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; + pub const DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY: usize = 1000; } impl Default for TenantConfigToml { @@ -782,6 +819,8 @@ impl Default for TenantConfigToml { gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, sampling_ratio: None, + relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY, + basebackup_cache_enabled: false, } } } diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 0c4d7fd4cb..c14975167b 100644 
--- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -910,6 +910,11 @@ impl Key { self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff } + #[inline(always)] + pub fn is_rel_block_of_rel(&self, rel: Oid) -> bool { + self.is_rel_block_key() && self.field4 == rel + } + #[inline(always)] pub fn is_rel_dir_key(&self) -> bool { self.field1 == 0x00 diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ff911499ab..383939a13f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -336,14 +336,30 @@ impl TimelineCreateRequest { #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub enum ShardImportStatus { - InProgress, + InProgress(Option), Done, Error(String), } + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub enum ShardImportProgress { + V1(ShardImportProgressV1), +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub struct ShardImportProgressV1 { + /// Total number of jobs in the import plan + pub jobs: usize, + /// Number of jobs completed + pub completed: usize, + /// Hash of the plan + pub import_plan_hash: u64, +} + impl ShardImportStatus { pub fn is_terminal(&self) -> bool { match self { - ShardImportStatus::InProgress => false, + ShardImportStatus::InProgress(_) => false, ShardImportStatus::Done | ShardImportStatus::Error(_) => true, } } @@ -614,6 +630,10 @@ pub struct TenantConfigPatch { pub gc_compaction_ratio_percent: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub sampling_ratio: FieldPatch>, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub relsize_snapshot_cache_capacity: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub basebackup_cache_enabled: FieldPatch, } /// Like [`crate::config::TenantConfigToml`], but preserves the information @@ -743,6 +763,12 @@ pub struct TenantConfig { #[serde(skip_serializing_if = "Option::is_none")] pub 
sampling_ratio: Option>, + + #[serde(skip_serializing_if = "Option::is_none")] + pub relsize_snapshot_cache_capacity: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub basebackup_cache_enabled: Option, } impl TenantConfig { @@ -788,6 +814,8 @@ impl TenantConfig { mut gc_compaction_initial_threshold_kb, mut gc_compaction_ratio_percent, mut sampling_ratio, + mut relsize_snapshot_cache_capacity, + mut basebackup_cache_enabled, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -889,6 +917,12 @@ impl TenantConfig { .gc_compaction_ratio_percent .apply(&mut gc_compaction_ratio_percent); patch.sampling_ratio.apply(&mut sampling_ratio); + patch + .relsize_snapshot_cache_capacity + .apply(&mut relsize_snapshot_cache_capacity); + patch + .basebackup_cache_enabled + .apply(&mut basebackup_cache_enabled); Ok(Self { checkpoint_distance, @@ -928,6 +962,8 @@ impl TenantConfig { gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, sampling_ratio, + relsize_snapshot_cache_capacity, + basebackup_cache_enabled, }) } @@ -1036,6 +1072,12 @@ impl TenantConfig { .gc_compaction_ratio_percent .unwrap_or(global_conf.gc_compaction_ratio_percent), sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio), + relsize_snapshot_cache_capacity: self + .relsize_snapshot_cache_capacity + .unwrap_or(global_conf.relsize_snapshot_cache_capacity), + basebackup_cache_enabled: self + .basebackup_cache_enabled + .unwrap_or(global_conf.basebackup_cache_enabled), } } } @@ -1803,7 +1845,6 @@ pub struct TopTenantShardsResponse { } pub mod virtual_file { - use std::sync::LazyLock; #[derive( Copy, @@ -1832,6 +1873,7 @@ pub mod virtual_file { Eq, Hash, strum_macros::EnumString, + strum_macros::EnumIter, strum_macros::Display, serde_with::DeserializeFromStr, serde_with::SerializeDisplay, @@ -1843,37 +1885,14 @@ pub mod virtual_file { /// Uses buffered IO. Buffered, /// Uses direct IO for reads only. 
- #[cfg(target_os = "linux")] Direct, /// Use direct IO for reads and writes. - #[cfg(target_os = "linux")] DirectRw, } impl IoMode { pub fn preferred() -> Self { - // The default behavior when running Rust unit tests without any further - // flags is to use the newest behavior (DirectRw). - // The CI uses the following environment variable to unit tests for all - // different modes. - // NB: the Python regression & perf tests have their own defaults management - // that writes pageserver.toml; they do not use this variable. - if cfg!(test) { - static CACHED: LazyLock = LazyLock::new(|| { - utils::env::var_serde_json_string( - "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE", - ) - .unwrap_or( - #[cfg(target_os = "linux")] - IoMode::DirectRw, - #[cfg(not(target_os = "linux"))] - IoMode::Buffered, - ) - }); - *CACHED - } else { - IoMode::Buffered - } + IoMode::DirectRw } } @@ -1883,9 +1902,7 @@ pub mod virtual_file { fn try_from(value: u8) -> Result { Ok(match value { v if v == (IoMode::Buffered as u8) => IoMode::Buffered, - #[cfg(target_os = "linux")] v if v == (IoMode::Direct as u8) => IoMode::Direct, - #[cfg(target_os = "linux")] v if v == (IoMode::DirectRw as u8) => IoMode::DirectRw, x => return Err(x), }) diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 7ee63f9036..4dce5f7817 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -4,6 +4,7 @@ //! 
See docs/rfcs/025-generation-numbers.md use serde::{Deserialize, Serialize}; +use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use crate::controller_api::NodeRegisterRequest; @@ -63,9 +64,17 @@ pub struct ValidateResponseTenant { pub valid: bool, } +#[derive(Serialize, Deserialize)] +pub struct TimelineImportStatusRequest { + pub tenant_shard_id: TenantShardId, + pub timeline_id: TimelineId, + pub generation: Generation, +} + #[derive(Serialize, Deserialize)] pub struct PutTimelineImportStatusRequest { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub status: ShardImportStatus, + pub generation: Generation, } diff --git a/libs/pageserver_api/src/value.rs b/libs/pageserver_api/src/value.rs index 883d903ff3..e9000939c3 100644 --- a/libs/pageserver_api/src/value.rs +++ b/libs/pageserver_api/src/value.rs @@ -36,6 +36,24 @@ impl Value { Value::WalRecord(rec) => rec.will_init(), } } + + #[inline(always)] + pub fn estimated_size(&self) -> usize { + match self { + Value::Image(image) => image.len(), + Value::WalRecord(NeonWalRecord::AuxFile { + content: Some(content), + .. + }) => content.len(), + Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(), + Value::WalRecord(NeonWalRecord::ClogSetAborted { xids }) => xids.len() * 4, + Value::WalRecord(NeonWalRecord::ClogSetCommitted { xids, .. }) => xids.len() * 4, + Value::WalRecord(NeonWalRecord::MultixactMembersCreate { members, .. 
}) => { + members.len() * 8 + } + _ => 8192, /* use image size as the estimation */ + } + } } #[derive(Debug, PartialEq)] diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 654dde8da6..714d8ac403 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -841,6 +841,10 @@ impl PostgresBackend { let expected_end = match &end { ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true, + // The timeline doesn't exist and we have been requested to not auto-create it. + // Compute requests for timelines that haven't been created yet + // might reach us before the storcon request to create those timelines. + TimelineNoCreate => true, CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error)) if is_expected_io_error(io_error) => { @@ -1059,6 +1063,8 @@ pub enum CopyStreamHandlerEnd { Terminate, #[error("EOF on COPY stream")] EOF, + #[error("timeline not found, and allow_timeline_creation is false")] + TimelineNoCreate, /// The connection was lost #[error("connection error: {0}")] Disconnected(#[from] ConnectionError), diff --git a/libs/posthog_client_lite/Cargo.toml b/libs/posthog_client_lite/Cargo.toml new file mode 100644 index 0000000000..7c19bf2ccb --- /dev/null +++ b/libs/posthog_client_lite/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "posthog_client_lite" +version = "0.1.0" +edition = "2024" +license.workspace = true + +[dependencies] +anyhow.workspace = true +reqwest.workspace = true +serde.workspace = true +serde_json.workspace = true +sha2.workspace = true +workspace_hack.workspace = true +thiserror.workspace = true diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs new file mode 100644 index 0000000000..53deb26ab7 --- /dev/null +++ b/libs/posthog_client_lite/src/lib.rs @@ -0,0 +1,634 @@ +//! A lite version of the PostHog client that only supports local evaluation of feature flags. 
+ +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; +use serde_json::json; +use sha2::Digest; + +#[derive(Debug, thiserror::Error)] +pub enum PostHogEvaluationError { + /// The feature flag is not available, for example, because the local evaluation data is not populated yet. + #[error("Feature flag not available: {0}")] + NotAvailable(String), + #[error("No condition group is matched")] + NoConditionGroupMatched, + /// Real errors, e.g., the rollout percentage does not add up to 100. + #[error("Failed to evaluate feature flag: {0}")] + Internal(String), +} + +#[derive(Deserialize)] +pub struct LocalEvaluationResponse { + #[allow(dead_code)] + flags: Vec, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlag { + key: String, + filters: LocalEvaluationFlagFilters, + active: bool, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagFilters { + groups: Vec, + multivariate: LocalEvaluationFlagMultivariate, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagFilterGroup { + variant: Option, + properties: Option>, + rollout_percentage: i64, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagFilterProperty { + key: String, + value: PostHogFlagFilterPropertyValue, + operator: String, +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(untagged)] +pub enum PostHogFlagFilterPropertyValue { + String(String), + Number(f64), + Boolean(bool), + List(Vec), +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagMultivariate { + variants: Vec, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagMultivariateVariant { + key: String, + rollout_percentage: i64, +} + +pub struct FeatureStore { + flags: HashMap, +} + +impl Default for FeatureStore { + fn default() -> Self { + Self::new() + } +} + +enum GroupEvaluationResult { + MatchedAndOverride(String), + MatchedAndEvaluate, + Unmatched, +} + +impl FeatureStore { + pub fn new() -> Self { + Self { + flags: HashMap::new(), + } + } + + pub fn set_flags(&mut self, 
flags: Vec) { + self.flags.clear(); + for flag in flags { + self.flags.insert(flag.key.clone(), flag); + } + } + + /// Generate a consistent hash for a user ID (e.g., tenant ID). + /// + /// The implementation is different from PostHog SDK. In PostHog SDK, it is sha1 of `user_id.distinct_id.salt`. + /// However, as we do not upload all of our tenant IDs to PostHog, we do not have the PostHog distinct_id for a + /// tenant. Therefore, the way we compute it is sha256 of `user_id.feature_id.salt`. + fn consistent_hash(user_id: &str, flag_key: &str, salt: &str) -> f64 { + let mut hasher = sha2::Sha256::new(); + hasher.update(user_id); + hasher.update("."); + hasher.update(flag_key); + hasher.update("."); + hasher.update(salt); + let hash = hasher.finalize(); + let hash_int = u64::from_le_bytes(hash[..8].try_into().unwrap()); + hash_int as f64 / u64::MAX as f64 + } + + /// Evaluate a condition. Returns an error if the condition cannot be evaluated due to parsing error or missing + /// property. 
+ fn evaluate_condition( + &self, + operator: &str, + provided: &PostHogFlagFilterPropertyValue, + requested: &PostHogFlagFilterPropertyValue, + ) -> Result { + match operator { + "exact" => { + let PostHogFlagFilterPropertyValue::String(provided) = provided else { + // Left should be a string + return Err(PostHogEvaluationError::Internal(format!( + "The left side of the condition is not a string: {:?}", + provided + ))); + }; + let PostHogFlagFilterPropertyValue::List(requested) = requested else { + // Right should be a list of string + return Err(PostHogEvaluationError::Internal(format!( + "The right side of the condition is not a list: {:?}", + requested + ))); + }; + Ok(requested.contains(provided)) + } + "lt" | "gt" => { + let PostHogFlagFilterPropertyValue::String(requested) = requested else { + // Right should be a string + return Err(PostHogEvaluationError::Internal(format!( + "The right side of the condition is not a string: {:?}", + requested + ))); + }; + let Ok(requested) = requested.parse::() else { + return Err(PostHogEvaluationError::Internal(format!( + "Can not parse the right side of the condition as a number: {:?}", + requested + ))); + }; + // Left can either be a number or a string + let provided = match provided { + PostHogFlagFilterPropertyValue::Number(provided) => *provided, + PostHogFlagFilterPropertyValue::String(provided) => { + let Ok(provided) = provided.parse::() else { + return Err(PostHogEvaluationError::Internal(format!( + "Can not parse the left side of the condition as a number: {:?}", + provided + ))); + }; + provided + } + _ => { + return Err(PostHogEvaluationError::Internal(format!( + "The left side of the condition is not a number or a string: {:?}", + provided + ))); + } + }; + match operator { + "lt" => Ok(provided < requested), + "gt" => Ok(provided > requested), + op => Err(PostHogEvaluationError::Internal(format!( + "Unsupported operator: {}", + op + ))), + } + } + _ => Err(PostHogEvaluationError::Internal(format!( + 
"Unsupported operator: {}", + operator + ))), + } + } + + /// Evaluate a percentage. + fn evaluate_percentage(&self, mapped_user_id: f64, percentage: i64) -> bool { + mapped_user_id <= percentage as f64 / 100.0 + } + + /// Evaluate a filter group for a feature flag. Returns an error if there are errors during the evaluation. + /// + /// Return values: + /// Ok(GroupEvaluationResult::MatchedAndOverride(variant)): matched and evaluated to this value + /// Ok(GroupEvaluationResult::MatchedAndEvaluate): condition matched but no variant override, use the global rollout percentage + /// Ok(GroupEvaluationResult::Unmatched): condition unmatched + fn evaluate_group( + &self, + group: &LocalEvaluationFlagFilterGroup, + hash_on_group_rollout_percentage: f64, + provided_properties: &HashMap, + ) -> Result { + if let Some(ref properties) = group.properties { + for property in properties { + if let Some(value) = provided_properties.get(&property.key) { + // The user provided the property value + if !self.evaluate_condition( + property.operator.as_ref(), + value, + &property.value, + )? { + return Ok(GroupEvaluationResult::Unmatched); + } + } else { + // We cannot evaluate, the property is not available + return Err(PostHogEvaluationError::NotAvailable(format!( + "The required property in the condition is not available: {}", + property.key + ))); + } + } + } + + // The group has no condition matchers or we matched the properties + if self.evaluate_percentage(hash_on_group_rollout_percentage, group.rollout_percentage) { + if let Some(ref variant_override) = group.variant { + Ok(GroupEvaluationResult::MatchedAndOverride( + variant_override.clone(), + )) + } else { + Ok(GroupEvaluationResult::MatchedAndEvaluate) + } + } else { + Ok(GroupEvaluationResult::Unmatched) + } + } + + /// Evaluate a multivariate feature flag. Returns `None` if the flag is not available or if there are errors + /// during the evaluation. 
+ /// + /// The parsing logic is as follows: + /// + /// * Match each filter group. + /// - If a group is matched, it will first determine whether the user is in the range of the group's rollout + /// percentage. We will generate a consistent hash for the user ID on the group rollout percentage. This hash + /// is shared across all groups. + /// - If the hash falls within the group's rollout percentage, return the variant if it's overridden, or + /// - Evaluate the variant using the global config and the global rollout percentage. + /// * Otherwise, continue with the next group until all groups are evaluated and no group is within the + /// rollout percentage. + /// * If there are no matching groups, return an error. + /// + /// Example: we have a multivariate flag with 3 groups of the configured global rollout percentage: A (10%), B (20%), C (70%). + /// There is a single group with a condition that has a rollout percentage of 10% and it does not have a variant override. + /// Then, we will have 1% of the users evaluated to A, 2% to B, and 7% to C. + pub fn evaluate_multivariate( + &self, + flag_key: &str, + user_id: &str, + ) -> Result { + let hash_on_global_rollout_percentage = + Self::consistent_hash(user_id, flag_key, "multivariate"); + let hash_on_group_rollout_percentage = + Self::consistent_hash(user_id, flag_key, "within_group"); + self.evaluate_multivariate_inner( + flag_key, + hash_on_global_rollout_percentage, + hash_on_group_rollout_percentage, + &HashMap::new(), + ) + } + + /// Evaluate a multivariate feature flag. Note that we directly take the mapped user ID + /// (a consistent hash ranging from 0 to 1) so that it is easier to use it in the tests + /// and avoid duplicate computations. + /// + /// Use a different consistent hash for evaluating the group rollout percentage. 
+ /// The behavior: if the condition is set to rolling out to 10% of the users, and + /// we set the variant A to 20% in the global config, then 2% of the total users will + /// be evaluated to variant A. + /// + /// Note that the hash to determine group rollout percentage is shared across all groups. So if we have two + /// exactly-the-same conditions with 10% and 20% rollout percentage respectively, a total of 20% of the users + /// will be evaluated (versus 30% if group evaluation is done independently). + pub(crate) fn evaluate_multivariate_inner( + &self, + flag_key: &str, + hash_on_global_rollout_percentage: f64, + hash_on_group_rollout_percentage: f64, + properties: &HashMap, + ) -> Result { + if let Some(flag_config) = self.flags.get(flag_key) { + if !flag_config.active { + return Err(PostHogEvaluationError::NotAvailable(format!( + "The feature flag is not active: {}", + flag_key + ))); + } + // TODO: sort the groups so that variant overrides always get evaluated first and it follows the PostHog + // Python SDK behavior; for now we do not configure conditions without variant overrides in Neon so it + // does not matter. + for group in &flag_config.filters.groups { + match self.evaluate_group(group, hash_on_group_rollout_percentage, properties)? { + GroupEvaluationResult::MatchedAndOverride(variant) => return Ok(variant), + GroupEvaluationResult::MatchedAndEvaluate => { + let mut percentage = 0; + for variant in &flag_config.filters.multivariate.variants { + percentage += variant.rollout_percentage; + if self + .evaluate_percentage(hash_on_global_rollout_percentage, percentage) + { + return Ok(variant.key.clone()); + } + } + // This should not happen because the rollout percentage always adds up to 100, but just in case that PostHog + // returned invalid spec, we return an error. 
+ return Err(PostHogEvaluationError::Internal(format!( + "Rollout percentage does not add up to 100: {}", + flag_key + ))); + } + GroupEvaluationResult::Unmatched => continue, + } + } + // If no group is matched, the feature is not available, and up to the caller to decide what to do. + Err(PostHogEvaluationError::NoConditionGroupMatched) + } else { + // The feature flag is not available yet + Err(PostHogEvaluationError::NotAvailable(format!( + "Not found in the local evaluation spec: {}", + flag_key + ))) + } + } +} + +/// A lite PostHog client. +/// +/// At the point of writing this code, PostHog does not have a functional Rust client with feature flag support. +/// This is a lite version that only supports local evaluation of feature flags and only supports those JSON specs +/// that will be used within Neon. +/// +/// PostHog is designed as a browser-server system: the browser (client) side uses the client key and is exposed +/// to the end users; the server side uses a server key and is not exposed to the end users. The client and the +/// server has different API keys and provide a different set of APIs. In Neon, we only have the server (that is +/// pageserver), and it will use both the client API and the server API. So we need to store two API keys within +/// our PostHog client. +/// +/// The server API is used to fetch the feature flag specs. The client API is used to capture events in case we +/// want to report the feature flag usage back to PostHog. The current plan is to use PostHog only as an UI to +/// configure feature flags so it is very likely that the client API will not be used. +pub struct PostHogClient { + /// The server API key. + server_api_key: String, + /// The client API key. + client_api_key: String, + /// The project ID. + project_id: String, + /// The private API URL. + private_api_url: String, + /// The public API URL. + public_api_url: String, + /// The HTTP client. 
+    client: reqwest::Client,
+}
+
+impl PostHogClient {
+    pub fn new(
+        server_api_key: String,
+        client_api_key: String,
+        project_id: String,
+        private_api_url: String,
+        public_api_url: String,
+    ) -> Self {
+        let client = reqwest::Client::new();
+        Self {
+            server_api_key,
+            client_api_key,
+            project_id,
+            private_api_url,
+            public_api_url,
+            client,
+        }
+    }
+
+    pub fn new_with_us_region(
+        server_api_key: String,
+        client_api_key: String,
+        project_id: String,
+    ) -> Self {
+        Self::new(
+            server_api_key,
+            client_api_key,
+            project_id,
+            "https://us.posthog.com".to_string(),
+            "https://us.i.posthog.com".to_string(),
+        )
+    }
+
+    /// Fetch the feature flag specs from the server. Non-2xx responses are returned as errors.
+    ///
+    /// This is unfortunately an undocumented API at:
+    /// -
+    /// -
+    ///
+    /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation.
+    /// See `_compute_flag_locally` in
+    pub async fn get_feature_flags_local_evaluation(
+        &self,
+    ) -> anyhow::Result<LocalEvaluationResponse> {
+        // BASE_URL/api/projects/:project_id/feature_flags/local_evaluation
+        // with bearer token of self.server_api_key
+        let url = format!(
+            "{}/api/projects/{}/feature_flags/local_evaluation",
+            self.private_api_url, self.project_id
+        );
+        let response = self
+            .client
+            .get(url)
+            .bearer_auth(&self.server_api_key)
+            .send()
+            .await?;
+        let body = response.error_for_status()?.text().await?;
+        Ok(serde_json::from_str(&body)?)
+    }
+
+    /// Capture an event. This will only be used to report the feature flag usage back to PostHog, though
+    /// it also supports a lot of other functionalities.
+ /// + /// + pub async fn capture_event( + &self, + event: &str, + distinct_id: &str, + properties: &HashMap, + ) -> anyhow::Result<()> { + // PUBLIC_URL/capture/ + // with bearer token of self.client_api_key + let url = format!("{}/capture/", self.public_api_url); + self.client + .post(url) + .body(serde_json::to_string(&json!({ + "api_key": self.client_api_key, + "distinct_id": distinct_id, + "event": event, + "properties": properties, + }))?) + .send() + .await?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn data() -> &'static str { + r#"{ + "flags": [ + { + "id": 132794, + "team_id": 152860, + "name": "", + "key": "gc-compaction", + "filters": { + "groups": [ + { + "variant": "enabled-stage-2", + "properties": [ + { + "key": "plan_type", + "type": "person", + "value": [ + "free" + ], + "operator": "exact" + }, + { + "key": "pageserver_remote_size", + "type": "person", + "value": "10000000", + "operator": "lt" + } + ], + "rollout_percentage": 50 + }, + { + "properties": [ + { + "key": "plan_type", + "type": "person", + "value": [ + "free" + ], + "operator": "exact" + }, + { + "key": "pageserver_remote_size", + "type": "person", + "value": "10000000", + "operator": "lt" + } + ], + "rollout_percentage": 80 + } + ], + "payloads": {}, + "multivariate": { + "variants": [ + { + "key": "disabled", + "name": "", + "rollout_percentage": 90 + }, + { + "key": "enabled-stage-1", + "name": "", + "rollout_percentage": 10 + }, + { + "key": "enabled-stage-2", + "name": "", + "rollout_percentage": 0 + }, + { + "key": "enabled-stage-3", + "name": "", + "rollout_percentage": 0 + }, + { + "key": "enabled", + "name": "", + "rollout_percentage": 0 + } + ] + } + }, + "deleted": false, + "active": true, + "ensure_experience_continuity": false, + "has_encrypted_payloads": false, + "version": 6 + } + ], + "group_type_mapping": {}, + "cohorts": {} + }"# + } + + #[test] + fn parse_local_evaluation() { + let data = data(); + let _: LocalEvaluationResponse = 
serde_json::from_str(data).unwrap(); + } + + #[test] + fn evaluate_multivariate() { + let mut store = FeatureStore::new(); + let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap(); + store.set_flags(response.flags); + + // This lacks the required properties and cannot be evaluated. + let variant = + store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &HashMap::new()); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NotAvailable(_)) + ),); + + let properties_unmatched = HashMap::from([ + ( + "plan_type".to_string(), + PostHogFlagFilterPropertyValue::String("paid".to_string()), + ), + ( + "pageserver_remote_size".to_string(), + PostHogFlagFilterPropertyValue::Number(1000.0), + ), + ]); + + // This does not match any group so there will be an error. + let variant = + store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &properties_unmatched); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NoConditionGroupMatched) + ),); + let variant = + store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.80, &properties_unmatched); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NoConditionGroupMatched) + ),); + + let properties = HashMap::from([ + ( + "plan_type".to_string(), + PostHogFlagFilterPropertyValue::String("free".to_string()), + ), + ( + "pageserver_remote_size".to_string(), + PostHogFlagFilterPropertyValue::Number(1000.0), + ), + ]); + + // It matches the first group as 0.10 <= 0.50 and the properties are matched. Then it gets evaluated to the variant override. + let variant = store.evaluate_multivariate_inner("gc-compaction", 0.10, 0.10, &properties); + assert_eq!(variant.unwrap(), "enabled-stage-2".to_string()); + + // It matches the second group as 0.50 <= 0.60 <= 0.80 and the properties are matched. Then it gets evaluated using the global percentage. 
+ let variant = store.evaluate_multivariate_inner("gc-compaction", 0.99, 0.60, &properties); + assert_eq!(variant.unwrap(), "enabled-stage-1".to_string()); + let variant = store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.60, &properties); + assert_eq!(variant.unwrap(), "disabled".to_string()); + + // It matches the group conditions but not the group rollout percentage. + let variant = store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.90, &properties); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NoConditionGroupMatched) + ),); + } +} diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs index b447290ea8..9faed2c065 100644 --- a/libs/proxy/postgres-protocol2/src/message/frontend.rs +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -25,6 +25,7 @@ where Ok(()) } +#[derive(Debug)] pub enum BindError { Conversion(Box), Serialization(io::Error), @@ -288,6 +289,12 @@ pub fn sync(buf: &mut BytesMut) { write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); } +#[inline] +pub fn flush(buf: &mut BytesMut) { + buf.put_u8(b'H'); + write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); +} + #[inline] pub fn terminate(buf: &mut BytesMut) { buf.put_u8(b'X'); diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index b6bcabc922..7c9874bda3 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -9,7 +9,6 @@ use std::error::Error; use std::fmt; use std::sync::Arc; -use bytes::BytesMut; use fallible_iterator::FallibleIterator; #[doc(inline)] pub use postgres_protocol2::Oid; @@ -27,41 +26,6 @@ macro_rules! accepts { ) } -/// Generates an implementation of `ToSql::to_sql_checked`. -/// -/// All `ToSql` implementations should use this macro. -macro_rules! 
to_sql_checked { - () => { - fn to_sql_checked( - &self, - ty: &$crate::Type, - out: &mut $crate::private::BytesMut, - ) -> ::std::result::Result< - $crate::IsNull, - Box, - > { - $crate::__to_sql_checked(self, ty, out) - } - }; -} - -// WARNING: this function is not considered part of this crate's public API. -// It is subject to change at any time. -#[doc(hidden)] -pub fn __to_sql_checked( - v: &T, - ty: &Type, - out: &mut BytesMut, -) -> Result> -where - T: ToSql, -{ - if !T::accepts(ty) { - return Err(Box::new(WrongType::new::(ty.clone()))); - } - v.to_sql(ty, out) -} - // mod pg_lsn; #[doc(hidden)] pub mod private; @@ -142,7 +106,7 @@ pub enum Kind { /// An array type along with the type of its elements. Array(Type), /// A range type along with the type of its elements. - Range(Type), + Range(Oid), /// A multirange type along with the type of its elements. Multirange(Type), /// A domain type along with its underlying type. @@ -377,43 +341,6 @@ pub enum IsNull { No, } -/// A trait for types that can be converted into Postgres values. -pub trait ToSql: fmt::Debug { - /// Converts the value of `self` into the binary format of the specified - /// Postgres `Type`, appending it to `out`. - /// - /// The caller of this method is responsible for ensuring that this type - /// is compatible with the Postgres `Type`. - /// - /// The return value indicates if this value should be represented as - /// `NULL`. If this is the case, implementations **must not** write - /// anything to `out`. - fn to_sql(&self, ty: &Type, out: &mut BytesMut) -> Result> - where - Self: Sized; - - /// Determines if a value of this type can be converted to the specified - /// Postgres `Type`. - fn accepts(ty: &Type) -> bool - where - Self: Sized; - - /// An adaptor method used internally by Rust-Postgres. - /// - /// *All* implementations of this method should be generated by the - /// `to_sql_checked!()` macro. 
- fn to_sql_checked( - &self, - ty: &Type, - out: &mut BytesMut, - ) -> Result>; - - /// Specify the encode format - fn encode_format(&self, _ty: &Type) -> Format { - Format::Binary - } -} - /// Supported Postgres message format types /// /// Using Text format in a message assumes a Postgres `SERVER_ENCODING` of `UTF8` @@ -424,52 +351,3 @@ pub enum Format { /// Compact, typed binary format Binary, } - -impl ToSql for &str { - fn to_sql(&self, ty: &Type, w: &mut BytesMut) -> Result> { - match *ty { - ref ty if ty.name() == "ltree" => types::ltree_to_sql(self, w), - ref ty if ty.name() == "lquery" => types::lquery_to_sql(self, w), - ref ty if ty.name() == "ltxtquery" => types::ltxtquery_to_sql(self, w), - _ => types::text_to_sql(self, w), - } - Ok(IsNull::No) - } - - fn accepts(ty: &Type) -> bool { - match *ty { - Type::VARCHAR | Type::TEXT | Type::BPCHAR | Type::NAME | Type::UNKNOWN => true, - ref ty - if (ty.name() == "citext" - || ty.name() == "ltree" - || ty.name() == "lquery" - || ty.name() == "ltxtquery") => - { - true - } - _ => false, - } - } - - to_sql_checked!(); -} - -macro_rules! 
simple_to { - ($t:ty, $f:ident, $($expected:ident),+) => { - impl ToSql for $t { - fn to_sql(&self, - _: &Type, - w: &mut BytesMut) - -> Result> { - types::$f(*self, w); - Ok(IsNull::No) - } - - accepts!($($expected),+); - - to_sql_checked!(); - } - } -} - -simple_to!(u32, oid_to_sql, OID); diff --git a/libs/proxy/postgres-types2/src/type_gen.rs b/libs/proxy/postgres-types2/src/type_gen.rs index a1bc3f85c0..6e6163e343 100644 --- a/libs/proxy/postgres-types2/src/type_gen.rs +++ b/libs/proxy/postgres-types2/src/type_gen.rs @@ -393,7 +393,7 @@ impl Inner { } } - pub fn oid(&self) -> Oid { + pub const fn const_oid(&self) -> Oid { match *self { Inner::Bool => 16, Inner::Bytea => 17, @@ -580,7 +580,14 @@ impl Inner { Inner::TstzmultiRangeArray => 6153, Inner::DatemultiRangeArray => 6155, Inner::Int8multiRangeArray => 6157, + Inner::Other(_) => u32::MAX, + } + } + + pub fn oid(&self) -> Oid { + match *self { Inner::Other(ref u) => u.oid, + _ => self.const_oid(), } } @@ -727,17 +734,17 @@ impl Inner { Inner::JsonbArray => &Kind::Array(Type(Inner::Jsonb)), Inner::AnyRange => &Kind::Pseudo, Inner::EventTrigger => &Kind::Pseudo, - Inner::Int4Range => &Kind::Range(Type(Inner::Int4)), + Inner::Int4Range => &const { Kind::Range(Inner::Int4.const_oid()) }, Inner::Int4RangeArray => &Kind::Array(Type(Inner::Int4Range)), - Inner::NumRange => &Kind::Range(Type(Inner::Numeric)), + Inner::NumRange => &const { Kind::Range(Inner::Numeric.const_oid()) }, Inner::NumRangeArray => &Kind::Array(Type(Inner::NumRange)), - Inner::TsRange => &Kind::Range(Type(Inner::Timestamp)), + Inner::TsRange => &const { Kind::Range(Inner::Timestamp.const_oid()) }, Inner::TsRangeArray => &Kind::Array(Type(Inner::TsRange)), - Inner::TstzRange => &Kind::Range(Type(Inner::Timestamptz)), + Inner::TstzRange => &const { Kind::Range(Inner::Timestamptz.const_oid()) }, Inner::TstzRangeArray => &Kind::Array(Type(Inner::TstzRange)), - Inner::DateRange => &Kind::Range(Type(Inner::Date)), + Inner::DateRange => &const { 
Kind::Range(Inner::Date.const_oid()) }, Inner::DateRangeArray => &Kind::Array(Type(Inner::DateRange)), - Inner::Int8Range => &Kind::Range(Type(Inner::Int8)), + Inner::Int8Range => &const { Kind::Range(Inner::Int8.const_oid()) }, Inner::Int8RangeArray => &Kind::Array(Type(Inner::Int8Range)), Inner::Jsonpath => &Kind::Simple, Inner::JsonpathArray => &Kind::Array(Type(Inner::Jsonpath)), diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 186eb07000..a7edfc076a 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -1,14 +1,12 @@ use std::collections::HashMap; use std::fmt; use std::net::IpAddr; -use std::sync::Arc; use std::task::{Context, Poll}; use std::time::Duration; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use futures_util::{TryStreamExt, future, ready}; -use parking_lot::Mutex; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; use serde::{Deserialize, Serialize}; @@ -16,29 +14,52 @@ use tokio::sync::mpsc; use crate::codec::{BackendMessages, FrontendMessage}; use crate::config::{Host, SslMode}; -use crate::connection::{Request, RequestMessages}; use crate::query::RowStream; use crate::simple_query::SimpleQueryStream; use crate::types::{Oid, Type}; use crate::{ - CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Statement, Transaction, - TransactionBuilder, query, simple_query, + CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Transaction, TransactionBuilder, + query, simple_query, }; pub struct Responses { + /// new messages from conn receiver: mpsc::Receiver, + /// current batch of messages cur: BackendMessages, + /// number of total queries sent. + waiting: usize, + /// number of ReadyForQuery messages received. + received: usize, } impl Responses { pub fn poll_next(&mut self, cx: &mut Context<'_>) -> Poll> { loop { - match self.cur.next().map_err(Error::parse)? 
{ - Some(Message::ErrorResponse(body)) => return Poll::Ready(Err(Error::db(body))), - Some(message) => return Poll::Ready(Ok(message)), - None => {} + // get the next saved message + if let Some(message) = self.cur.next().map_err(Error::parse)? { + let received = self.received; + + // increase the query head if this is the last message. + if let Message::ReadyForQuery(_) = message { + self.received += 1; + } + + // check if the client has skipped this query. + if received + 1 < self.waiting { + // grab the next message. + continue; + } + + // convenience: turn the error messaage into a proper error. + let res = match message { + Message::ErrorResponse(body) => Err(Error::db(body)), + message => Ok(message), + }; + return Poll::Ready(res); } + // get the next batch of messages. match ready!(self.receiver.poll_recv(cx)) { Some(messages) => self.cur = messages, None => return Poll::Ready(Err(Error::closed())), @@ -55,44 +76,87 @@ impl Responses { /// (corresponding to the queries in the [crate::prepare] module). #[derive(Default)] pub(crate) struct CachedTypeInfo { - /// A statement for basic information for a type from its - /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its - /// fallback). - pub(crate) typeinfo: Option, - /// Cache of types already looked up. pub(crate) types: HashMap, } pub struct InnerClient { - sender: mpsc::UnboundedSender, + sender: mpsc::UnboundedSender, + responses: Responses, /// A buffer to use when writing out postgres commands. 
- buffer: Mutex, + buffer: BytesMut, } impl InnerClient { - pub fn send(&self, messages: RequestMessages) -> Result { - let (sender, receiver) = mpsc::channel(1); - let request = Request { messages, sender }; - self.sender.send(request).map_err(|_| Error::closed())?; - - Ok(Responses { - receiver, - cur: BackendMessages::empty(), - }) + pub fn start(&mut self) -> Result { + self.responses.waiting += 1; + Ok(PartialQuery(Some(self))) } - /// Call the given function with a buffer to be used when writing out - /// postgres commands. - pub fn with_buf(&self, f: F) -> R + // pub fn send_with_sync(&mut self, f: F) -> Result<&mut Responses, Error> + // where + // F: FnOnce(&mut BytesMut) -> Result<(), Error>, + // { + // self.start()?.send_with_sync(f) + // } + + pub fn send_simple_query(&mut self, query: &str) -> Result<&mut Responses, Error> { + self.responses.waiting += 1; + + self.buffer.clear(); + // simple queries do not need sync. + frontend::query(query, &mut self.buffer).map_err(Error::encode)?; + let buf = self.buffer.split().freeze(); + self.send_message(FrontendMessage::Raw(buf)) + } + + fn send_message(&mut self, messages: FrontendMessage) -> Result<&mut Responses, Error> { + self.sender.send(messages).map_err(|_| Error::closed())?; + Ok(&mut self.responses) + } +} + +pub struct PartialQuery<'a>(Option<&'a mut InnerClient>); + +impl Drop for PartialQuery<'_> { + fn drop(&mut self) { + if let Some(client) = self.0.take() { + client.buffer.clear(); + frontend::sync(&mut client.buffer); + let buf = client.buffer.split().freeze(); + let _ = client.send_message(FrontendMessage::Raw(buf)); + } + } +} + +impl<'a> PartialQuery<'a> { + pub fn send_with_flush(&mut self, f: F) -> Result<&mut Responses, Error> where - F: FnOnce(&mut BytesMut) -> R, + F: FnOnce(&mut BytesMut) -> Result<(), Error>, { - let mut buffer = self.buffer.lock(); - let r = f(&mut buffer); - buffer.clear(); - r + let client = self.0.as_deref_mut().unwrap(); + + client.buffer.clear(); + f(&mut 
client.buffer)?; + frontend::flush(&mut client.buffer); + let buf = client.buffer.split().freeze(); + client.send_message(FrontendMessage::Raw(buf)) + } + + pub fn send_with_sync(mut self, f: F) -> Result<&'a mut Responses, Error> + where + F: FnOnce(&mut BytesMut) -> Result<(), Error>, + { + let client = self.0.as_deref_mut().unwrap(); + + client.buffer.clear(); + f(&mut client.buffer)?; + frontend::sync(&mut client.buffer); + let buf = client.buffer.split().freeze(); + let _ = client.send_message(FrontendMessage::Raw(buf)); + + Ok(&mut self.0.take().unwrap().responses) } } @@ -109,7 +173,7 @@ pub struct SocketConfig { /// The client is one half of what is returned when a connection is established. Users interact with the database /// through this client object. pub struct Client { - inner: Arc, + inner: InnerClient, cached_typeinfo: CachedTypeInfo, socket_config: SocketConfig, @@ -120,17 +184,24 @@ pub struct Client { impl Client { pub(crate) fn new( - sender: mpsc::UnboundedSender, + sender: mpsc::UnboundedSender, + receiver: mpsc::Receiver, socket_config: SocketConfig, ssl_mode: SslMode, process_id: i32, secret_key: i32, ) -> Client { Client { - inner: Arc::new(InnerClient { + inner: InnerClient { sender, + responses: Responses { + receiver, + cur: BackendMessages::empty(), + waiting: 0, + received: 0, + }, buffer: Default::default(), - }), + }, cached_typeinfo: Default::default(), socket_config, @@ -145,19 +216,29 @@ impl Client { self.process_id } - pub(crate) fn inner(&self) -> &Arc { - &self.inner + pub(crate) fn inner_mut(&mut self) -> &mut InnerClient { + &mut self.inner } /// Pass text directly to the Postgres backend to allow it to sort out typing itself and /// to save a roundtrip - pub async fn query_raw_txt(&self, statement: &str, params: I) -> Result + pub async fn query_raw_txt( + &mut self, + statement: &str, + params: I, + ) -> Result where S: AsRef, I: IntoIterator>, I::IntoIter: ExactSizeIterator, { - query::query_txt(&self.inner, statement, 
params).await + query::query_txt( + &mut self.inner, + &mut self.cached_typeinfo, + statement, + params, + ) + .await } /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows. @@ -173,12 +254,15 @@ impl Client { /// Prepared statements should be use for any query which contains user-specified data, as they provided the /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass /// them to this method! - pub async fn simple_query(&self, query: &str) -> Result, Error> { + pub async fn simple_query(&mut self, query: &str) -> Result, Error> { self.simple_query_raw(query).await?.try_collect().await } - pub(crate) async fn simple_query_raw(&self, query: &str) -> Result { - simple_query::simple_query(self.inner(), query).await + pub(crate) async fn simple_query_raw( + &mut self, + query: &str, + ) -> Result { + simple_query::simple_query(self.inner_mut(), query).await } /// Executes a sequence of SQL statements using the simple query protocol. @@ -191,15 +275,11 @@ impl Client { /// Prepared statements should be use for any query which contains user-specified data, as they provided the /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass /// them to this method! - pub async fn batch_execute(&self, query: &str) -> Result { - simple_query::batch_execute(self.inner(), query).await + pub async fn batch_execute(&mut self, query: &str) -> Result { + simple_query::batch_execute(self.inner_mut(), query).await } pub async fn discard_all(&mut self) -> Result { - // clear the prepared statements that are about to be nuked from the postgres session - - self.cached_typeinfo.typeinfo = None; - self.batch_execute("discard all").await } @@ -208,7 +288,7 @@ impl Client { /// The transaction will roll back by default - use the `commit` method to commit it. 
pub async fn transaction(&mut self) -> Result, Error> { struct RollbackIfNotDone<'me> { - client: &'me Client, + client: &'me mut Client, done: bool, } @@ -218,14 +298,7 @@ impl Client { return; } - let buf = self.client.inner().with_buf(|buf| { - frontend::query("ROLLBACK", buf).unwrap(); - buf.split().freeze() - }); - let _ = self - .client - .inner() - .send(RequestMessages::Single(FrontendMessage::Raw(buf))); + let _ = self.client.inner.send_simple_query("ROLLBACK"); } } @@ -239,7 +312,7 @@ impl Client { client: self, done: false, }; - self.batch_execute("BEGIN").await?; + cleaner.client.batch_execute("BEGIN").await?; cleaner.done = true; } @@ -265,11 +338,6 @@ impl Client { } } - /// Query for type information - pub(crate) async fn get_type_inner(&mut self, oid: Oid) -> Result { - crate::prepare::get_type(&self.inner, &mut self.cached_typeinfo, oid).await - } - /// Determines if the connection to the server has already closed. /// /// In that case, all future queries will fail. diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs index f1fd9b47b3..daa5371426 100644 --- a/libs/proxy/tokio-postgres2/src/codec.rs +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -1,21 +1,16 @@ use std::io; -use bytes::{Buf, Bytes, BytesMut}; +use bytes::{Bytes, BytesMut}; use fallible_iterator::FallibleIterator; use postgres_protocol2::message::backend; -use postgres_protocol2::message::frontend::CopyData; use tokio_util::codec::{Decoder, Encoder}; pub enum FrontendMessage { Raw(Bytes), - CopyData(CopyData>), } pub enum BackendMessage { - Normal { - messages: BackendMessages, - request_complete: bool, - }, + Normal { messages: BackendMessages }, Async(backend::Message), } @@ -44,7 +39,6 @@ impl Encoder for PostgresCodec { fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> { match item { FrontendMessage::Raw(buf) => dst.extend_from_slice(&buf), - FrontendMessage::CopyData(data) => data.write(dst), } Ok(()) @@ 
-57,7 +51,6 @@ impl Decoder for PostgresCodec { fn decode(&mut self, src: &mut BytesMut) -> Result, io::Error> { let mut idx = 0; - let mut request_complete = false; while let Some(header) = backend::Header::parse(&src[idx..])? { let len = header.len() as usize + 1; @@ -82,7 +75,6 @@ impl Decoder for PostgresCodec { idx += len; if header.tag() == backend::READY_FOR_QUERY_TAG { - request_complete = true; break; } } @@ -92,7 +84,6 @@ impl Decoder for PostgresCodec { } else { Ok(Some(BackendMessage::Normal { messages: BackendMessages(src.split_to(idx)), - request_complete, })) } } diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 7c3a358bba..39a0a87c74 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -59,9 +59,11 @@ where connect_timeout: config.connect_timeout, }; - let (sender, receiver) = mpsc::unbounded_channel(); + let (client_tx, conn_rx) = mpsc::unbounded_channel(); + let (conn_tx, client_rx) = mpsc::channel(4); let client = Client::new( - sender, + client_tx, + client_rx, socket_config, config.ssl_mode, process_id, @@ -74,7 +76,7 @@ where .map(|m| BackendMessage::Async(Message::NoticeResponse(m))) .collect(); - let connection = Connection::new(stream, delayed, parameters, receiver); + let connection = Connection::new(stream, delayed, parameters, conn_tx, conn_rx); Ok((client, connection)) } diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs index 99d6f3f8e2..fe0372b266 100644 --- a/libs/proxy/tokio-postgres2/src/connection.rs +++ b/libs/proxy/tokio-postgres2/src/connection.rs @@ -4,7 +4,6 @@ use std::pin::Pin; use std::task::{Context, Poll}; use bytes::BytesMut; -use fallible_iterator::FallibleIterator; use futures_util::{Sink, Stream, ready}; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; @@ -19,30 +18,12 @@ use crate::error::DbError; use 
crate::maybe_tls_stream::MaybeTlsStream; use crate::{AsyncMessage, Error, Notification}; -pub enum RequestMessages { - Single(FrontendMessage), -} - -pub struct Request { - pub messages: RequestMessages, - pub sender: mpsc::Sender, -} - -pub struct Response { - sender: PollSender, -} - #[derive(PartialEq, Debug)] enum State { Active, Closing, } -enum WriteReady { - Terminating, - WaitingOnRead, -} - /// A connection to a PostgreSQL database. /// /// This is one half of what is returned when a new connection is established. It performs the actual IO with the @@ -56,9 +37,11 @@ pub struct Connection { pub stream: Framed, PostgresCodec>, /// HACK: we need this in the Neon Proxy to forward params. pub parameters: HashMap, - receiver: mpsc::UnboundedReceiver, + + sender: PollSender, + receiver: mpsc::UnboundedReceiver, + pending_responses: VecDeque, - responses: VecDeque, state: State, } @@ -71,14 +54,15 @@ where stream: Framed, PostgresCodec>, pending_responses: VecDeque, parameters: HashMap, - receiver: mpsc::UnboundedReceiver, + sender: mpsc::Sender, + receiver: mpsc::UnboundedReceiver, ) -> Connection { Connection { stream, parameters, + sender: PollSender::new(sender), receiver, pending_responses, - responses: VecDeque::new(), state: State::Active, } } @@ -110,7 +94,7 @@ where } }; - let (mut messages, request_complete) = match message { + let messages = match message { BackendMessage::Async(Message::NoticeResponse(body)) => { let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?; return Poll::Ready(Ok(AsyncMessage::Notice(error))); @@ -131,41 +115,19 @@ where continue; } BackendMessage::Async(_) => unreachable!(), - BackendMessage::Normal { - messages, - request_complete, - } => (messages, request_complete), + BackendMessage::Normal { messages } => messages, }; - let mut response = match self.responses.pop_front() { - Some(response) => response, - None => match messages.next().map_err(Error::parse)? 
{ - Some(Message::ErrorResponse(error)) => { - return Poll::Ready(Err(Error::db(error))); - } - _ => return Poll::Ready(Err(Error::unexpected_message())), - }, - }; - - match response.sender.poll_reserve(cx) { + match self.sender.poll_reserve(cx) { Poll::Ready(Ok(())) => { - let _ = response.sender.send_item(messages); - if !request_complete { - self.responses.push_front(response); - } + let _ = self.sender.send_item(messages); } Poll::Ready(Err(_)) => { - // we need to keep paging through the rest of the messages even if the receiver's hung up - if !request_complete { - self.responses.push_front(response); - } + return Poll::Ready(Err(Error::closed())); } Poll::Pending => { - self.responses.push_front(response); - self.pending_responses.push_back(BackendMessage::Normal { - messages, - request_complete, - }); + self.pending_responses + .push_back(BackendMessage::Normal { messages }); trace!("poll_read: waiting on sender"); return Poll::Pending; } @@ -174,7 +136,7 @@ where } /// Fetch the next client request and enqueue the response sender. - fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll> { + fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll> { if self.receiver.is_closed() { return Poll::Ready(None); } @@ -182,10 +144,7 @@ where match self.receiver.poll_recv(cx) { Poll::Ready(Some(request)) => { trace!("polled new request"); - self.responses.push_back(Response { - sender: PollSender::new(request.sender), - }); - Poll::Ready(Some(request.messages)) + Poll::Ready(Some(request)) } Poll::Ready(None) => Poll::Ready(None), Poll::Pending => Poll::Pending, @@ -194,7 +153,7 @@ where /// Process client requests and write them to the postgres connection, flushing if necessary. 
/// client -> postgres - fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll> { + fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll> { loop { if Pin::new(&mut self.stream) .poll_ready(cx) @@ -209,14 +168,14 @@ where match self.poll_request(cx) { // send the message to postgres - Poll::Ready(Some(RequestMessages::Single(request))) => { + Poll::Ready(Some(request)) => { Pin::new(&mut self.stream) .start_send(request) .map_err(Error::io)?; } // No more messages from the client, and no more responses to wait for. // Send a terminate message to postgres - Poll::Ready(None) if self.responses.is_empty() => { + Poll::Ready(None) => { trace!("poll_write: at eof, terminating"); let mut request = BytesMut::new(); frontend::terminate(&mut request); @@ -228,16 +187,7 @@ where trace!("poll_write: sent eof, closing"); trace!("poll_write: done"); - return Poll::Ready(Ok(WriteReady::Terminating)); - } - // No more messages from the client, but there are still some responses to wait for. - Poll::Ready(None) => { - trace!( - "poll_write: at eof, pending responses {}", - self.responses.len() - ); - ready!(self.poll_flush(cx))?; - return Poll::Ready(Ok(WriteReady::WaitingOnRead)); + return Poll::Ready(Ok(())); } // Still waiting for a message from the client. Poll::Pending => { @@ -298,7 +248,7 @@ where // if the state is still active, try read from and write to postgres. 
let message = self.poll_read(cx)?; let closing = self.poll_write(cx)?; - if let Poll::Ready(WriteReady::Terminating) = closing { + if let Poll::Ready(()) = closing { self.state = State::Closing; } diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs index b12e76e5bf..8149bceeb9 100644 --- a/libs/proxy/tokio-postgres2/src/error/mod.rs +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -86,6 +86,27 @@ pub struct DbError { } impl DbError { + pub fn new_test_error(code: SqlState, message: String) -> Self { + DbError { + severity: "ERROR".to_string(), + parsed_severity: Some(Severity::Error), + code, + message, + detail: None, + hint: None, + position: None, + where_: None, + schema: None, + table: None, + column: None, + datatype: None, + constraint: None, + file: None, + line: None, + routine: None, + } + } + pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result { let mut severity = None; let mut parsed_severity = None; diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 8e28843347..eeefb45d26 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -1,9 +1,6 @@ #![allow(async_fn_in_trait)] -use postgres_protocol2::Oid; - use crate::query::RowStream; -use crate::types::Type; use crate::{Client, Error, Transaction}; mod private { @@ -15,20 +12,17 @@ mod private { /// This trait is "sealed", and cannot be implemented outside of this crate. pub trait GenericClient: private::Sealed { /// Like `Client::query_raw_txt`. 
- async fn query_raw_txt(&self, statement: &str, params: I) -> Result + async fn query_raw_txt(&mut self, statement: &str, params: I) -> Result where S: AsRef + Sync + Send, I: IntoIterator> + Sync + Send, I::IntoIter: ExactSizeIterator + Sync + Send; - - /// Query for type information - async fn get_type(&mut self, oid: Oid) -> Result; } impl private::Sealed for Client {} impl GenericClient for Client { - async fn query_raw_txt(&self, statement: &str, params: I) -> Result + async fn query_raw_txt(&mut self, statement: &str, params: I) -> Result where S: AsRef + Sync + Send, I: IntoIterator> + Sync + Send, @@ -36,17 +30,12 @@ impl GenericClient for Client { { self.query_raw_txt(statement, params).await } - - /// Query for type information - async fn get_type(&mut self, oid: Oid) -> Result { - self.get_type_inner(oid).await - } } impl private::Sealed for Transaction<'_> {} impl GenericClient for Transaction<'_> { - async fn query_raw_txt(&self, statement: &str, params: I) -> Result + async fn query_raw_txt(&mut self, statement: &str, params: I) -> Result where S: AsRef + Sync + Send, I: IntoIterator> + Sync + Send, @@ -54,9 +43,4 @@ impl GenericClient for Transaction<'_> { { self.query_raw_txt(statement, params).await } - - /// Query for type information - async fn get_type(&mut self, oid: Oid) -> Result { - self.client_mut().get_type(oid).await - } } diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index c8ebba5487..9556070ed5 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -18,7 +18,6 @@ pub use crate::statement::{Column, Statement}; pub use crate::tls::NoTls; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; -use crate::types::ToSql; /// After executing a query, the connection will be in one of these states #[derive(Clone, Copy, Debug, PartialEq)] @@ -120,9 +119,3 @@ pub enum SimpleQueryMessage { /// The 
number of rows modified or selected is returned. CommandComplete(u64), } - -fn slice_iter<'a>( - s: &'a [&'a (dyn ToSql + Sync)], -) -> impl ExactSizeIterator + 'a { - s.iter().map(|s| *s as _) -} diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index b27eabcb0e..16b9cf66f4 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -1,19 +1,14 @@ -use std::future::Future; -use std::pin::Pin; -use std::sync::Arc; - -use bytes::Bytes; +use bytes::BytesMut; use fallible_iterator::FallibleIterator; -use futures_util::{TryStreamExt, pin_mut}; -use postgres_protocol2::message::backend::Message; +use postgres_protocol2::IsNull; +use postgres_protocol2::message::backend::{Message, RowDescriptionBody}; use postgres_protocol2::message::frontend; -use tracing::debug; +use postgres_protocol2::types::oid_to_sql; +use postgres_types2::Format; -use crate::client::{CachedTypeInfo, InnerClient}; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; +use crate::client::{CachedTypeInfo, PartialQuery, Responses}; use crate::types::{Kind, Oid, Type}; -use crate::{Column, Error, Statement, query, slice_iter}; +use crate::{Column, Error, Row, Statement}; pub(crate) const TYPEINFO_QUERY: &str = "\ SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid @@ -23,22 +18,51 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid WHERE t.oid = $1 "; +/// we need to make sure we close this prepared statement. 
+struct CloseStmt<'a, 'b> { + client: Option<&'a mut PartialQuery<'b>>, + name: &'static str, +} + +impl<'a> CloseStmt<'a, '_> { + fn close(mut self) -> Result<&'a mut Responses, Error> { + let client = self.client.take().unwrap(); + client.send_with_flush(|buf| { + frontend::close(b'S', self.name, buf).map_err(Error::encode)?; + Ok(()) + }) + } +} + +impl Drop for CloseStmt<'_, '_> { + fn drop(&mut self) { + if let Some(client) = self.client.take() { + let _ = client.send_with_flush(|buf| { + frontend::close(b'S', self.name, buf).map_err(Error::encode)?; + Ok(()) + }); + } + } +} + async fn prepare_typecheck( - client: &Arc, + client: &mut PartialQuery<'_>, name: &'static str, query: &str, - types: &[Type], ) -> Result { - let buf = encode(client, name, query, types)?; - let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + let responses = client.send_with_flush(|buf| { + frontend::parse(name, query, [], buf).map_err(Error::encode)?; + frontend::describe(b'S', name, buf).map_err(Error::encode)?; + Ok(()) + })?; match responses.next().await? { Message::ParseComplete => {} _ => return Err(Error::unexpected_message()), } - let parameter_description = match responses.next().await? { - Message::ParameterDescription(body) => body, + match responses.next().await? { + Message::ParameterDescription(_) => {} _ => return Err(Error::unexpected_message()), }; @@ -48,13 +72,6 @@ async fn prepare_typecheck( _ => return Err(Error::unexpected_message()), }; - let mut parameters = vec![]; - let mut it = parameter_description.parameters(); - while let Some(oid) = it.next().map_err(Error::parse)? 
{ - let type_ = Type::from_oid(oid).ok_or_else(Error::unexpected_message)?; - parameters.push(type_); - } - let mut columns = vec![]; if let Some(row_description) = row_description { let mut it = row_description.fields(); @@ -65,98 +82,168 @@ async fn prepare_typecheck( } } - Ok(Statement::new(client, name, parameters, columns)) + Ok(Statement::new(name, columns)) } -fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { - if types.is_empty() { - debug!("preparing query {}: {}", name, query); - } else { - debug!("preparing query {} with types {:?}: {}", name, types, query); - } - - client.with_buf(|buf| { - frontend::parse(name, query, types.iter().map(Type::oid), buf).map_err(Error::encode)?; - frontend::describe(b'S', name, buf).map_err(Error::encode)?; - frontend::sync(buf); - Ok(buf.split().freeze()) - }) -} - -pub async fn get_type( - client: &Arc, - typecache: &mut CachedTypeInfo, - oid: Oid, -) -> Result { +fn try_from_cache(typecache: &CachedTypeInfo, oid: Oid) -> Option { if let Some(type_) = Type::from_oid(oid) { - return Ok(type_); + return Some(type_); } if let Some(type_) = typecache.types.get(&oid) { - return Ok(type_.clone()); + return Some(type_.clone()); }; - let stmt = typeinfo_statement(client, typecache).await?; + None +} - let rows = query::query(client, stmt, slice_iter(&[&oid])).await?; - pin_mut!(rows); +pub async fn parse_row_description( + client: &mut PartialQuery<'_>, + typecache: &mut CachedTypeInfo, + row_description: Option, +) -> Result, Error> { + let mut columns = vec![]; - let row = match rows.try_next().await? { - Some(row) => row, - None => return Err(Error::unexpected_message()), + if let Some(row_description) = row_description { + let mut it = row_description.fields(); + while let Some(field) = it.next().map_err(Error::parse)? 
{ + let type_ = try_from_cache(typecache, field.type_oid()).unwrap_or(Type::UNKNOWN); + let column = Column::new(field.name().to_string(), type_, field); + columns.push(column); + } + } + + let all_known = columns.iter().all(|c| c.type_ != Type::UNKNOWN); + if all_known { + // all known, return early. + return Ok(columns); + } + + let typeinfo = "neon_proxy_typeinfo"; + + // make sure to close the typeinfo statement before exiting. + let mut guard = CloseStmt { + name: typeinfo, + client: None, + }; + let client = guard.client.insert(client); + + // get the typeinfo statement. + let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY).await?; + + for column in &mut columns { + column.type_ = get_type(client, typecache, &stmt, column.type_oid()).await?; + } + + // cancel the close guard. + let responses = guard.close()?; + + match responses.next().await? { + Message::CloseComplete => {} + _ => return Err(Error::unexpected_message()), + } + + Ok(columns) +} + +async fn get_type( + client: &mut PartialQuery<'_>, + typecache: &mut CachedTypeInfo, + stmt: &Statement, + mut oid: Oid, +) -> Result { + let mut stack = vec![]; + let mut type_ = loop { + if let Some(type_) = try_from_cache(typecache, oid) { + break type_; + } + + let row = exec(client, stmt, oid).await?; + if stack.len() > 8 { + return Err(Error::unexpected_message()); + } + + let name: String = row.try_get(0)?; + let type_: i8 = row.try_get(1)?; + let elem_oid: Oid = row.try_get(2)?; + let rngsubtype: Option = row.try_get(3)?; + let basetype: Oid = row.try_get(4)?; + let schema: String = row.try_get(5)?; + let relid: Oid = row.try_get(6)?; + + let kind = if type_ == b'e' as i8 { + Kind::Enum + } else if type_ == b'p' as i8 { + Kind::Pseudo + } else if basetype != 0 { + Kind::Domain(basetype) + } else if elem_oid != 0 { + stack.push((name, oid, schema)); + oid = elem_oid; + continue; + } else if relid != 0 { + Kind::Composite(relid) + } else if let Some(rngsubtype) = rngsubtype { + 
Kind::Range(rngsubtype) + } else { + Kind::Simple + }; + + let type_ = Type::new(name, oid, kind, schema); + typecache.types.insert(oid, type_.clone()); + break type_; }; - let name: String = row.try_get(0)?; - let type_: i8 = row.try_get(1)?; - let elem_oid: Oid = row.try_get(2)?; - let rngsubtype: Option = row.try_get(3)?; - let basetype: Oid = row.try_get(4)?; - let schema: String = row.try_get(5)?; - let relid: Oid = row.try_get(6)?; - - let kind = if type_ == b'e' as i8 { - Kind::Enum - } else if type_ == b'p' as i8 { - Kind::Pseudo - } else if basetype != 0 { - Kind::Domain(basetype) - } else if elem_oid != 0 { - let type_ = get_type_rec(client, typecache, elem_oid).await?; - Kind::Array(type_) - } else if relid != 0 { - Kind::Composite(relid) - } else if let Some(rngsubtype) = rngsubtype { - let type_ = get_type_rec(client, typecache, rngsubtype).await?; - Kind::Range(type_) - } else { - Kind::Simple - }; - - let type_ = Type::new(name, oid, kind, schema); - typecache.types.insert(oid, type_.clone()); + while let Some((name, oid, schema)) = stack.pop() { + type_ = Type::new(name, oid, Kind::Array(type_), schema); + typecache.types.insert(oid, type_.clone()); + } Ok(type_) } -fn get_type_rec<'a>( - client: &'a Arc, - typecache: &'a mut CachedTypeInfo, - oid: Oid, -) -> Pin> + Send + 'a>> { - Box::pin(get_type(client, typecache, oid)) -} +/// exec the typeinfo statement returning one row. +async fn exec( + client: &mut PartialQuery<'_>, + statement: &Statement, + param: Oid, +) -> Result { + let responses = client.send_with_flush(|buf| { + encode_bind(statement, param, "", buf); + frontend::execute("", 0, buf).map_err(Error::encode)?; + Ok(()) + })?; -async fn typeinfo_statement( - client: &Arc, - typecache: &mut CachedTypeInfo, -) -> Result { - if let Some(stmt) = &typecache.typeinfo { - return Ok(stmt.clone()); + match responses.next().await? 
{ + Message::BindComplete => {} + _ => return Err(Error::unexpected_message()), } - let typeinfo = "neon_proxy_typeinfo"; - let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY, &[]).await?; + let row = match responses.next().await? { + Message::DataRow(body) => Row::new(statement.clone(), body, Format::Binary)?, + _ => return Err(Error::unexpected_message()), + }; - typecache.typeinfo = Some(stmt.clone()); - Ok(stmt) + match responses.next().await? { + Message::CommandComplete(_) => {} + _ => return Err(Error::unexpected_message()), + }; + + Ok(row) +} + +fn encode_bind(statement: &Statement, param: Oid, portal: &str, buf: &mut BytesMut) { + frontend::bind( + portal, + statement.name(), + [Format::Binary as i16], + [param], + |param, buf| { + oid_to_sql(param, buf); + Ok(IsNull::No) + }, + [Format::Binary as i16], + buf, + ) + .unwrap(); } diff --git a/libs/proxy/tokio-postgres2/src/query.rs b/libs/proxy/tokio-postgres2/src/query.rs index 106bc69d49..5f3ed8ef5a 100644 --- a/libs/proxy/tokio-postgres2/src/query.rs +++ b/libs/proxy/tokio-postgres2/src/query.rs @@ -1,76 +1,43 @@ -use std::fmt; -use std::marker::PhantomPinned; use std::pin::Pin; -use std::sync::Arc; use std::task::{Context, Poll}; -use bytes::{BufMut, Bytes, BytesMut}; -use fallible_iterator::FallibleIterator; +use bytes::BufMut; use futures_util::{Stream, ready}; -use pin_project_lite::pin_project; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; -use postgres_types2::{Format, ToSql, Type}; -use tracing::debug; +use postgres_types2::Format; -use crate::client::{InnerClient, Responses}; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; -use crate::types::IsNull; -use crate::{Column, Error, ReadyForQueryStatus, Row, Statement}; +use crate::client::{CachedTypeInfo, InnerClient, Responses}; +use crate::{Error, ReadyForQueryStatus, Row, Statement}; -struct BorrowToSqlParamsDebug<'a>(&'a [&'a (dyn ToSql + Sync)]); - -impl 
fmt::Debug for BorrowToSqlParamsDebug<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_list().entries(self.0.iter()).finish() - } -} - -pub async fn query<'a, I>( - client: &InnerClient, - statement: Statement, - params: I, -) -> Result -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - let buf = if tracing::enabled!(tracing::Level::DEBUG) { - let params = params.into_iter().collect::>(); - debug!( - "executing statement {} with parameters: {:?}", - statement.name(), - BorrowToSqlParamsDebug(params.as_slice()), - ); - encode(client, &statement, params)? - } else { - encode(client, &statement, params)? - }; - let responses = start(client, buf).await?; - Ok(RowStream { - statement, - responses, - command_tag: None, - status: ReadyForQueryStatus::Unknown, - output_format: Format::Binary, - _p: PhantomPinned, - }) -} - -pub async fn query_txt( - client: &Arc, +pub async fn query_txt<'a, S, I>( + client: &'a mut InnerClient, + typecache: &mut CachedTypeInfo, query: &str, params: I, -) -> Result +) -> Result, Error> where S: AsRef, I: IntoIterator>, I::IntoIter: ExactSizeIterator, { let params = params.into_iter(); + let mut client = client.start()?; - let buf = client.with_buf(|buf| { + // Flow: + // 1. Parse the query + // 2. Inspect the row description for OIDs + // 3. If there's any OIDs we don't already know about, perform the typeinfo routine + // 4. Execute the query + // 5. Sync. + // + // The typeinfo routine: + // 1. Parse the typeinfo query + // 2. Execute the query on each OID + // 3. If the result does not match an OID we know, repeat 2. + + // parse the query and get type info + let responses = client.send_with_flush(|buf| { frontend::parse( "", // unnamed prepared statement query, // query to parse @@ -79,7 +46,30 @@ where ) .map_err(Error::encode)?; frontend::describe(b'S', "", buf).map_err(Error::encode)?; - // Bind, pass params as text, retrieve as binary + Ok(()) + })?; + + match responses.next().await? 
{ + Message::ParseComplete => {} + _ => return Err(Error::unexpected_message()), + } + + match responses.next().await? { + Message::ParameterDescription(_) => {} + _ => return Err(Error::unexpected_message()), + }; + + let row_description = match responses.next().await? { + Message::RowDescription(body) => Some(body), + Message::NoData => None, + _ => return Err(Error::unexpected_message()), + }; + + let columns = + crate::prepare::parse_row_description(&mut client, typecache, row_description).await?; + + let responses = client.send_with_sync(|buf| { + // Bind, pass params as text, retrieve as text match frontend::bind( "", // empty string selects the unnamed portal "", // unnamed prepared statement @@ -102,173 +92,55 @@ where // Execute frontend::execute("", 0, buf).map_err(Error::encode)?; - // Sync - frontend::sync(buf); - Ok(buf.split().freeze()) + Ok(()) })?; - // now read the responses - let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; - - match responses.next().await? { - Message::ParseComplete => {} - _ => return Err(Error::unexpected_message()), - } - - let parameter_description = match responses.next().await? { - Message::ParameterDescription(body) => body, - _ => return Err(Error::unexpected_message()), - }; - - let row_description = match responses.next().await? { - Message::RowDescription(body) => Some(body), - Message::NoData => None, - _ => return Err(Error::unexpected_message()), - }; - match responses.next().await? { Message::BindComplete => {} _ => return Err(Error::unexpected_message()), } - let mut parameters = vec![]; - let mut it = parameter_description.parameters(); - while let Some(oid) = it.next().map_err(Error::parse)? { - let type_ = Type::from_oid(oid).unwrap_or(Type::UNKNOWN); - parameters.push(type_); - } - - let mut columns = vec![]; - if let Some(row_description) = row_description { - let mut it = row_description.fields(); - while let Some(field) = it.next().map_err(Error::parse)? 
{ - let type_ = Type::from_oid(field.type_oid()).unwrap_or(Type::UNKNOWN); - let column = Column::new(field.name().to_string(), type_, field); - columns.push(column); - } - } - Ok(RowStream { - statement: Statement::new_anonymous(parameters, columns), responses, + statement: Statement::new("", columns), command_tag: None, status: ReadyForQueryStatus::Unknown, output_format: Format::Text, - _p: PhantomPinned, }) } -async fn start(client: &InnerClient, buf: Bytes) -> Result { - let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; - - match responses.next().await? { - Message::BindComplete => {} - _ => return Err(Error::unexpected_message()), - } - - Ok(responses) +/// A stream of table rows. +pub struct RowStream<'a> { + responses: &'a mut Responses, + output_format: Format, + pub statement: Statement, + pub command_tag: Option, + pub status: ReadyForQueryStatus, } -pub fn encode<'a, I>(client: &InnerClient, statement: &Statement, params: I) -> Result -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - client.with_buf(|buf| { - encode_bind(statement, params, "", buf)?; - frontend::execute("", 0, buf).map_err(Error::encode)?; - frontend::sync(buf); - Ok(buf.split().freeze()) - }) -} - -pub fn encode_bind<'a, I>( - statement: &Statement, - params: I, - portal: &str, - buf: &mut BytesMut, -) -> Result<(), Error> -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - let param_types = statement.params(); - let params = params.into_iter(); - - assert!( - param_types.len() == params.len(), - "expected {} parameters but got {}", - param_types.len(), - params.len() - ); - - let (param_formats, params): (Vec<_>, Vec<_>) = params - .zip(param_types.iter()) - .map(|(p, ty)| (p.encode_format(ty) as i16, p)) - .unzip(); - - let params = params.into_iter(); - - let mut error_idx = 0; - let r = frontend::bind( - portal, - statement.name(), - param_formats, - params.zip(param_types).enumerate(), - |(idx, (param, ty)), buf| 
match param.to_sql_checked(ty, buf) { - Ok(IsNull::No) => Ok(postgres_protocol2::IsNull::No), - Ok(IsNull::Yes) => Ok(postgres_protocol2::IsNull::Yes), - Err(e) => { - error_idx = idx; - Err(e) - } - }, - Some(1), - buf, - ); - match r { - Ok(()) => Ok(()), - Err(frontend::BindError::Conversion(e)) => Err(Error::to_sql(e, error_idx)), - Err(frontend::BindError::Serialization(e)) => Err(Error::encode(e)), - } -} - -pin_project! { - /// A stream of table rows. - pub struct RowStream { - statement: Statement, - responses: Responses, - command_tag: Option, - output_format: Format, - status: ReadyForQueryStatus, - #[pin] - _p: PhantomPinned, - } -} - -impl Stream for RowStream { +impl Stream for RowStream<'_> { type Item = Result; fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let this = self.project(); + let this = self.get_mut(); loop { match ready!(this.responses.poll_next(cx)?) { Message::DataRow(body) => { return Poll::Ready(Some(Ok(Row::new( this.statement.clone(), body, - *this.output_format, + this.output_format, )?))); } Message::EmptyQueryResponse | Message::PortalSuspended => {} Message::CommandComplete(body) => { if let Ok(tag) = body.tag() { - *this.command_tag = Some(tag.to_string()); + this.command_tag = Some(tag.to_string()); } } Message::ReadyForQuery(status) => { - *this.status = status.into(); + this.status = status.into(); return Poll::Ready(None); } _ => return Poll::Ready(Some(Err(Error::unexpected_message()))), @@ -276,24 +148,3 @@ impl Stream for RowStream { } } } - -impl RowStream { - /// Returns information about the columns of data in the row. - pub fn columns(&self) -> &[Column] { - self.statement.columns() - } - - /// Returns the command tag of this query. - /// - /// This is only available after the stream has been exhausted. - pub fn command_tag(&self) -> Option { - self.command_tag.clone() - } - - /// Returns if the connection is ready for querying, with the status of the connection. 
- /// - /// This might be available only after the stream has been exhausted. - pub fn ready_status(&self) -> ReadyForQueryStatus { - self.status - } -} diff --git a/libs/proxy/tokio-postgres2/src/simple_query.rs b/libs/proxy/tokio-postgres2/src/simple_query.rs index 2cf17188cf..e1ed48cdaf 100644 --- a/libs/proxy/tokio-postgres2/src/simple_query.rs +++ b/libs/proxy/tokio-postgres2/src/simple_query.rs @@ -1,19 +1,14 @@ -use std::marker::PhantomPinned; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use bytes::Bytes; use fallible_iterator::FallibleIterator; use futures_util::{Stream, ready}; use pin_project_lite::pin_project; use postgres_protocol2::message::backend::Message; -use postgres_protocol2::message::frontend; use tracing::debug; use crate::client::{InnerClient, Responses}; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; /// Information about a column of a single query row. @@ -33,28 +28,28 @@ impl SimpleColumn { } } -pub async fn simple_query(client: &InnerClient, query: &str) -> Result { +pub async fn simple_query<'a>( + client: &'a mut InnerClient, + query: &str, +) -> Result, Error> { debug!("executing simple query: {}", query); - let buf = encode(client, query)?; - let responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + let responses = client.send_simple_query(query)?; Ok(SimpleQueryStream { responses, columns: None, status: ReadyForQueryStatus::Unknown, - _p: PhantomPinned, }) } pub async fn batch_execute( - client: &InnerClient, + client: &mut InnerClient, query: &str, ) -> Result { debug!("executing statement batch: {}", query); - let buf = encode(client, query)?; - let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + let responses = client.send_simple_query(query)?; loop { match responses.next().await? 
{ @@ -68,25 +63,16 @@ pub async fn batch_execute( } } -pub(crate) fn encode(client: &InnerClient, query: &str) -> Result { - client.with_buf(|buf| { - frontend::query(query, buf).map_err(Error::encode)?; - Ok(buf.split().freeze()) - }) -} - pin_project! { /// A stream of simple query results. - pub struct SimpleQueryStream { - responses: Responses, + pub struct SimpleQueryStream<'a> { + responses: &'a mut Responses, columns: Option>, status: ReadyForQueryStatus, - #[pin] - _p: PhantomPinned, } } -impl SimpleQueryStream { +impl SimpleQueryStream<'_> { /// Returns if the connection is ready for querying, with the status of the connection. /// /// This might be available only after the stream has been exhausted. @@ -95,7 +81,7 @@ impl SimpleQueryStream { } } -impl Stream for SimpleQueryStream { +impl Stream for SimpleQueryStream<'_> { type Item = Result; fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs index e4828db712..1f22d87fd7 100644 --- a/libs/proxy/tokio-postgres2/src/statement.rs +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -1,35 +1,15 @@ use std::fmt; -use std::sync::{Arc, Weak}; +use std::sync::Arc; +use crate::types::Type; use postgres_protocol2::Oid; use postgres_protocol2::message::backend::Field; -use postgres_protocol2::message::frontend; - -use crate::client::InnerClient; -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; -use crate::types::Type; struct StatementInner { - client: Weak, name: &'static str, - params: Vec, columns: Vec, } -impl Drop for StatementInner { - fn drop(&mut self) { - if let Some(client) = self.client.upgrade() { - let buf = client.with_buf(|buf| { - frontend::close(b'S', self.name, buf).unwrap(); - frontend::sync(buf); - buf.split().freeze() - }); - let _ = client.send(RequestMessages::Single(FrontendMessage::Raw(buf))); - } - } -} - /// A prepared statement. 
/// /// Prepared statements can only be used with the connection that created them. @@ -37,38 +17,14 @@ impl Drop for StatementInner { pub struct Statement(Arc); impl Statement { - pub(crate) fn new( - inner: &Arc, - name: &'static str, - params: Vec, - columns: Vec, - ) -> Statement { - Statement(Arc::new(StatementInner { - client: Arc::downgrade(inner), - name, - params, - columns, - })) - } - - pub(crate) fn new_anonymous(params: Vec, columns: Vec) -> Statement { - Statement(Arc::new(StatementInner { - client: Weak::new(), - name: "", - params, - columns, - })) + pub(crate) fn new(name: &'static str, columns: Vec) -> Statement { + Statement(Arc::new(StatementInner { name, columns })) } pub(crate) fn name(&self) -> &str { self.0.name } - /// Returns the expected types of the statement's parameters. - pub fn params(&self) -> &[Type] { - &self.0.params - } - /// Returns information about the columns returned when the statement is queried. pub fn columns(&self) -> &[Column] { &self.0.columns @@ -78,7 +34,7 @@ impl Statement { /// Information about a column of a query. 
pub struct Column { name: String, - type_: Type, + pub(crate) type_: Type, // raw fields from RowDescription table_oid: Oid, diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs index f32603470f..12fe0737d4 100644 --- a/libs/proxy/tokio-postgres2/src/transaction.rs +++ b/libs/proxy/tokio-postgres2/src/transaction.rs @@ -1,7 +1,3 @@ -use postgres_protocol2::message::frontend; - -use crate::codec::FrontendMessage; -use crate::connection::RequestMessages; use crate::query::RowStream; use crate::{CancelToken, Client, Error, ReadyForQueryStatus}; @@ -20,14 +16,7 @@ impl Drop for Transaction<'_> { return; } - let buf = self.client.inner().with_buf(|buf| { - frontend::query("ROLLBACK", buf).unwrap(); - buf.split().freeze() - }); - let _ = self - .client - .inner() - .send(RequestMessages::Single(FrontendMessage::Raw(buf))); + let _ = self.client.inner_mut().send_simple_query("ROLLBACK"); } } @@ -54,7 +43,11 @@ impl<'a> Transaction<'a> { } /// Like `Client::query_raw_txt`. - pub async fn query_raw_txt(&self, statement: &str, params: I) -> Result + pub async fn query_raw_txt( + &mut self, + statement: &str, + params: I, + ) -> Result where S: AsRef, I: IntoIterator>, diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index a5cddb840f..5363e935e3 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -330,11 +330,18 @@ impl AzureBlobStorage { if let Err(DownloadError::Timeout) = &next_item { timeout_try_cnt += 1; if timeout_try_cnt <= 5 { - continue; + continue 'outer; } } - let next_item = next_item?; + let next_item = match next_item { + Ok(next_item) => next_item, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. 
+ yield Err(e); + continue 'outer; + }, + }; // Log a warning if we saw two timeouts in a row before a successful request if timeout_try_cnt > 2 { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 918d9d5a6b..d98ff552ee 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -657,7 +657,14 @@ impl RemoteStorage for S3Bucket { res = request => Ok(res), _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout), _ = cancel.cancelled() => Err(DownloadError::Cancelled), - }?; + }; + + if let Err(DownloadError::Timeout) = &response { + yield Err(DownloadError::Timeout); + continue 'outer; + } + + let response = response?; // always yield cancellation errors and stop the stream let response = response .context("Failed to list S3 prefixes") diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 51f88625da..8658dc4011 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -299,11 +299,13 @@ pub struct PullTimelineRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub http_hosts: Vec, + pub ignore_tombstone: Option, } #[derive(Debug, Serialize, Deserialize)] pub struct PullTimelineResponse { - // Donor safekeeper host - pub safekeeper_host: String, + /// Donor safekeeper host. + /// None if no pull happened because the timeline already exists. + pub safekeeper_host: Option, // TODO: add more fields? } diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 215fa36df4..45acaf682f 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::fs::{self, File}; use std::io::{self, Write}; -use std::os::fd::AsRawFd; +use std::os::fd::AsFd; use camino::{Utf8Path, Utf8PathBuf}; @@ -210,13 +210,13 @@ pub fn overwrite( /// Syncs the filesystem for the given file descriptor. 
#[cfg_attr(target_os = "macos", allow(unused_variables))] -pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> { +pub fn syncfs(fd: impl AsFd) -> anyhow::Result<()> { // Linux guarantees durability for syncfs. // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync(). #[cfg(target_os = "linux")] { use anyhow::Context; - nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?; + nix::unistd::syncfs(fd).context("syncfs")?; } #[cfg(target_os = "macos")] { diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs index d0c07353d0..c945ecadf0 100644 --- a/libs/utils/src/fs_ext/rename_noreplace.rs +++ b/libs/utils/src/fs_ext/rename_noreplace.rs @@ -11,9 +11,9 @@ pub fn rename_noreplace( #[cfg(all(target_os = "linux", target_env = "gnu"))] { nix::fcntl::renameat2( - None, + nix::fcntl::AT_FDCWD, src, - None, + nix::fcntl::AT_FDCWD, dst, nix::fcntl::RenameFlags::RENAME_NOREPLACE, ) diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 6016c23a01..68cb1f0209 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -295,6 +295,9 @@ pub struct TenantId(Id); id_newtype!(TenantId); +/// If needed, reuse small string from proxy/src/types.rc +pub type EndpointId = String; + // A pair uniquely identifying Neon instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct TenantTimelineId { diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 6aeeeca021..b3c8d74d7d 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -1,6 +1,6 @@ //! A module to create and read lock files. //! -//! File locking is done using [`fcntl::flock`] exclusive locks. +//! File locking is done using [`nix::fcntl::Flock`] exclusive locks. //! The only consumer of this module is currently //! [`pid_file`](crate::pid_file). See the module-level comment //! 
there for potential pitfalls with lock files that are used @@ -9,26 +9,25 @@ use std::fs; use std::io::{Read, Write}; use std::ops::Deref; -use std::os::unix::prelude::AsRawFd; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use nix::errno::Errno::EAGAIN; -use nix::fcntl; +use nix::fcntl::{Flock, FlockArg}; use crate::crashsafe; -/// A handle to an open and unlocked, but not-yet-written lock file. +/// A handle to an open and flocked, but not-yet-written lock file. /// Returned by [`create_exclusive`]. #[must_use] pub struct UnwrittenLockFile { path: Utf8PathBuf, - file: fs::File, + file: Flock, } /// Returned by [`UnwrittenLockFile::write_content`]. #[must_use] -pub struct LockFileGuard(fs::File); +pub struct LockFileGuard(Flock); impl Deref for LockFileGuard { type Target = fs::File; @@ -67,17 +66,14 @@ pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result Ok(UnwrittenLockFile { + Ok(lock_file) => Ok(UnwrittenLockFile { path: lock_file_path.to_owned(), file: lock_file, }), - Err(EAGAIN) => anyhow::bail!("file is already locked"), - Err(e) => Err(e).context("flock error"), + Err((_, EAGAIN)) => anyhow::bail!("file is already locked"), + Err((_, e)) => Err(e).context("flock error"), } } @@ -105,32 +101,37 @@ pub enum LockFileRead { /// Check the [`LockFileRead`] variants for details. pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result { let res = fs::OpenOptions::new().read(true).open(path); - let mut lock_file = match res { + let lock_file = match res { Ok(f) => f, Err(e) => match e.kind() { std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist), _ => return Err(e).context("open lock file"), }, }; - let res = fcntl::flock( - lock_file.as_raw_fd(), - fcntl::FlockArg::LockExclusiveNonblock, - ); + let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock); // We need the content regardless of lock success / failure. // But, read it after flock so that, if it succeeded, the content is consistent. 
- let mut content = String::new(); - lock_file - .read_to_string(&mut content) - .context("read lock file")?; match res { - Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess( - LockFileGuard(lock_file), - content, - )), - Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess { - not_locked_file: lock_file, - content, - }), - Err(e) => Err(e).context("flock error"), + Ok(mut locked_file) => { + let mut content = String::new(); + locked_file + .read_to_string(&mut content) + .context("read lock file")?; + Ok(LockFileRead::NotHeldByAnyProcess( + LockFileGuard(locked_file), + content, + )) + } + Err((mut not_locked_file, EAGAIN)) => { + let mut content = String::new(); + not_locked_file + .read_to_string(&mut content) + .context("read lock file")?; + Ok(LockFileRead::LockedByOtherProcess { + not_locked_file, + content, + }) + } + Err((_, e)) => Err(e).context("flock error"), } } diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs index 945f710b1d..700cd5792b 100644 --- a/libs/utils/src/rate_limit.rs +++ b/libs/utils/src/rate_limit.rs @@ -17,7 +17,7 @@ impl std::fmt::Display for RateLimitStats { } impl RateLimit { - pub fn new(interval: Duration) -> Self { + pub const fn new(interval: Duration) -> Self { Self { last: None, interval, diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index 3d15e08400..857d98b644 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -127,12 +127,12 @@ macro_rules! 
__check_fields_present { match check_fields_present0($extractors) { Ok(FoundEverything) => Ok(()), - Ok(Unconfigured) if cfg!(test) => { + Ok(Unconfigured) if cfg!(feature = "testing") => { // allow unconfigured in tests Ok(()) }, Ok(Unconfigured) => { - panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer") + panic!(r#"utils::tracing_span_assert: outside of #[cfg(feature = "testing")] expected tracing to be configured with tracing_error::ErrorLayer"#) }, Err(missing) => Err(missing) } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 125cf2e483..5500d4ec8d 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -30,6 +30,7 @@ crc32c.workspace = true either.workspace = true fail.workspace = true futures.workspace = true +hashlink.workspace = true hex.workspace = true humantime.workspace = true humantime-serde.workspace = true @@ -100,6 +101,7 @@ strum.workspace = true strum_macros.workspace = true wal_decoder.workspace = true smallvec.workspace = true +twox-hash.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 2836450a0e..eaadfe14ae 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -14,6 +14,7 @@ use pageserver_api::key::Key; use pageserver_api::models::virtual_file::IoMode; use pageserver_api::shard::TenantShardId; use pageserver_api::value::Value; +use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; @@ -244,13 +245,7 @@ fn criterion_benchmark(c: &mut Criterion) { ]; let exploded_parameters = { let mut out = Vec::new(); - for io_mode in [ - IoMode::Buffered, - #[cfg(target_os = "linux")] - IoMode::Direct, - #[cfg(target_os = "linux")] - IoMode::DirectRw, - ] { + for io_mode in IoMode::iter() { for param in 
expect.clone() { let HandPickedParameters { volume_mib, diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4a87a91910..219e63c9d4 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::error::Error as _; +use std::time::Duration; use bytes::Bytes; use detach_ancestor::AncestorDetached; @@ -819,4 +820,25 @@ impl Client { .await .map(|resp| resp.status()) } + + pub async fn activate_post_import( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + activate_timeline_timeout: Duration, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/activate_post_import?timeline_activate_timeout_ms={}", + self.mgmt_api_endpoint, + tenant_shard_id, + timeline_id, + activate_timeline_timeout.as_millis() + ); + + self.request(Method::PUT, uri, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/client_grpc/src/client_cache.rs b/pageserver/client_grpc/src/client_cache.rs index d4e82f98dc..1b82103bec 100644 --- a/pageserver/client_grpc/src/client_cache.rs +++ b/pageserver/client_grpc/src/client_cache.rs @@ -9,17 +9,20 @@ use tokio::{ }; use tonic::transport::{Channel, Endpoint}; -use uuid; -use std::io::{self, Error, ErrorKind}; -use std::{pin::Pin, task::{Context, Poll}}; -use futures::future; -use rand::{Rng, rngs::StdRng, SeedableRng}; -use tower::service_fn; -use http::Uri; -use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; -use hyper_util::rt::TokioIo; -use tokio::net::TcpStream; use bytes::BytesMut; +use futures::future; +use http::Uri; +use hyper_util::rt::TokioIo; +use rand::{Rng, SeedableRng, rngs::StdRng}; +use std::io::{self, Error, ErrorKind}; +use std::{ + pin::Pin, + task::{Context, Poll}, +}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use tokio::net::TcpStream; +use tower::service_fn; +use uuid; /// A pooled gRPC client with capacity tracking and error handling. 
pub struct ConnectionPool { @@ -139,7 +142,6 @@ impl AsyncRead for TokioTcp { this.deadline = Instant::now() + Duration::from_millis(next_ms); } - // 4) Perform actual read into a temporary buffer let mut tmp = [0u8; 4096]; let mut rb = ReadBuf::new(&mut tmp); @@ -192,18 +194,12 @@ impl AsyncWrite for TokioTcp { Pin::new(&mut this.tcp).poll_write(cx, data) } - fn poll_flush( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { + fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let this = self.get_mut(); Pin::new(&mut this.tcp).poll_flush(cx) } - fn poll_shutdown( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let this = self.get_mut(); Pin::new(&mut this.tcp).poll_shutdown(cx) } @@ -288,7 +284,9 @@ impl ConnectionPool { let mut inner = self.inner.lock().await; let now = Instant::now(); inner.entries.retain(|_id, entry| { - if entry.active_consumers == 0 && now.duration_since(entry.last_used) > self.max_idle_duration { + if entry.active_consumers == 0 + && now.duration_since(entry.last_used) > self.max_idle_duration + { // Remove idle connection return false; } @@ -296,7 +294,6 @@ impl ConnectionPool { }); } - async fn acquire_connection(&self) -> (uuid::Uuid, Channel) { loop { // Reuse an existing healthy connection if available @@ -325,7 +322,6 @@ impl ConnectionPool { } async fn create_connection(&self) -> () { - let max_delay_ms = self.max_delay_ms; let drop_rate = self.drop_rate; let hang_rate = self.hang_rate; @@ -362,17 +358,14 @@ impl ConnectionPool { // host + explicit port (Some(host), Some(port)) => format!("{}:{}", host, port.as_str()), // host only (no port) - (Some(host), None) => host.to_string(), + (Some(host), None) => host.to_string(), // neither? 
error out _ => return Err(Error::new(ErrorKind::InvalidInput, "no host or port")), }; //let addr = uri.authority().unwrap().as_str(); let tcp = TcpStream::connect(addr).await?; - let tcpwrapper = TokioTcp::new( - tcp, - max_delay_ms, - ); + let tcpwrapper = TokioTcp::new(tcp, max_delay_ms); Ok(TokioIo::new(tcpwrapper)) } }); @@ -398,7 +391,7 @@ impl ConnectionPool { } { sleep(delay).await; } else { - break // No delay, so we can create a connection + break; // No delay, so we can create a connection } } @@ -414,15 +407,13 @@ impl ConnectionPool { Endpoint::from_shared(self.endpoint.clone()) .expect("invalid endpoint") .timeout(self.connect_timeout) - .connect_with_connector(connector) + .connect_with_connector(connector), ) .await; - match attempt { Ok(Ok(channel)) => { { - let mut inner = self.inner.lock().await; let id = uuid::Uuid::new_v4(); inner.entries.insert( diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs index 743768af96..994135179d 100644 --- a/pageserver/client_grpc/src/lib.rs +++ b/pageserver/client_grpc/src/lib.rs @@ -9,7 +9,7 @@ use std::sync::RwLock; use std::time::Duration; use bytes::Bytes; -use futures::Stream; +use futures::{Stream, StreamExt}; use thiserror::Error; use tonic::metadata::AsciiMetadataValue; @@ -19,7 +19,7 @@ use pageserver_page_api::proto; use pageserver_page_api::proto::PageServiceClient; use utils::shard::ShardIndex; -use std::{fmt::Debug}; +use std::fmt::Debug; mod client_cache; #[derive(Error, Debug)] @@ -33,6 +33,9 @@ pub enum PageserverClientError { #[error("could not perform request: {0}`")] InvalidUri(#[from] http::uri::InvalidUri), + + #[error("could not perform request: {0}`")] + Other(String), } pub struct PageserverClient { @@ -79,13 +82,7 @@ impl PageserverClient { drop_rate: 0.0, hang_rate: 0.0, }; - Self::new_with_config( - tenant_id, - timeline_id, - auth_token, - shard_map, - options, - ) + Self::new_with_config(tenant_id, timeline_id, auth_token, shard_map, options) } pub fn 
new_with_config( tenant_id: &str, @@ -104,9 +101,9 @@ impl PageserverClient { client_cache_options: options, } } - pub async fn process_rel_exists_request( + pub async fn process_check_rel_exists_request( &self, - request: &RelExistsRequest, + request: &CheckRelExistsRequest, ) -> Result { // Current sharding model assumes that all metadata is present only at shard 0. let shard = ShardIndex::unsharded(); @@ -116,8 +113,8 @@ impl PageserverClient { let mut client = PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard)); - let request = proto::RelExistsRequest::from(request); - let response = client.rel_exists(tonic::Request::new(request)).await; + let request = proto::CheckRelExistsRequest::from(request); + let response = client.check_rel_exists(tonic::Request::new(request)).await; match response { Err(status) => { @@ -131,9 +128,9 @@ impl PageserverClient { } } - pub async fn process_rel_size_request( + pub async fn process_get_rel_size_request( &self, - request: &RelSizeRequest, + request: &GetRelSizeRequest, ) -> Result { // Current sharding model assumes that all metadata is present only at shard 0. 
let shard = ShardIndex::unsharded(); @@ -143,8 +140,8 @@ impl PageserverClient { let mut client = PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard)); - let request = proto::RelSizeRequest::from(request); - let response = client.rel_size(tonic::Request::new(request)).await; + let request = proto::GetRelSizeRequest::from(request); + let response = client.get_rel_size(tonic::Request::new(request)).await; match response { Err(status) => { @@ -158,7 +155,13 @@ impl PageserverClient { } } - pub async fn get_page(&self, request: &GetPageRequest) -> Result { + // Request a single batch of pages + // + // TODO: This opens a new gRPC stream for every request, which is extremely inefficient + pub async fn get_page( + &self, + request: &GetPageRequest, + ) -> Result, PageserverClientError> { // FIXME: calculate the shard number correctly let shard = ShardIndex::unsharded(); let pooled_client = self.get_client(shard).await; @@ -168,7 +171,19 @@ impl PageserverClient { PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard)); let request = proto::GetPageRequest::from(request); - let response = client.get_page(tonic::Request::new(request)).await; + + let request_stream = futures::stream::once(std::future::ready(request)); + + let mut response_stream = client + .get_pages(tonic::Request::new(request_stream)) + .await? 
+ .into_inner(); + + let Some(response) = response_stream.next().await else { + return Err(PageserverClientError::Other( + "no response received for getpage request".to_string(), + )); + }; match response { Err(status) => { @@ -177,47 +192,48 @@ impl PageserverClient { } Ok(resp) => { pooled_client.finish(Ok(())).await; // Pass success to finish - let response: GetPageResponse = resp.into_inner().try_into()?; + let response: GetPageResponse = resp.try_into()?; return Ok(response.page_image); } } - } - // TODO: this should use model::GetPageRequest and GetPageResponse + // Open a stream for requesting pages + // + // TODO: This is a pretty low level interface, the caller should not need to be concerned + // with streams. But 'get_page' is currently very naive and inefficient. pub async fn get_pages( &self, - requests: impl Stream + Send + 'static, + requests: impl Stream + Send + 'static, ) -> std::result::Result< tonic::Response>, PageserverClientError, > { // FIXME: calculate the shard number correctly let shard = ShardIndex::unsharded(); - let request = tonic::Request::new(requests); let pooled_client = self.get_client(shard).await; let chan = pooled_client.channel(); let mut client = PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard)); - let response = client.get_pages(request).await; + let response = client.get_pages(tonic::Request::new(requests)).await; match response { Err(status) => { - pooled_client.finish(Err(status.clone())).await; + pooled_client.finish(Err(status.clone())).await; // Pass error to finish return Err(PageserverClientError::RequestError(status)); } Ok(resp) => { - pooled_client.finish(Ok(())).await; // Pass success to finish return Ok(resp); } } } + /// Process a request to get the size of a database. 
- pub async fn process_dbsize_request( + pub async fn process_get_dbsize_request( &self, - request: &DbSizeRequest, + request: &GetDbSizeRequest, ) -> Result { // Current sharding model assumes that all metadata is present only at shard 0. let shard = ShardIndex::unsharded(); @@ -227,8 +243,8 @@ impl PageserverClient { let mut client = PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard)); - let request = proto::DbSizeRequest::from(request); - let response = client.db_size(tonic::Request::new(request)).await; + let request = proto::GetDbSizeRequest::from(request); + let response = client.get_db_size(tonic::Request::new(request)).await; match response { Err(status) => { diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 293c01eff0..79f56a5a51 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -10,6 +10,7 @@ use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer, delta_layer, ima use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; +use pageserver_api::key::Key; use utils::id::{TenantId, TimelineId}; use crate::layer_map_analyzer::parse_filename; @@ -27,6 +28,7 @@ pub(crate) enum LayerCmd { path: PathBuf, tenant: String, timeline: String, + key: Option, }, /// Dump all information of a layer file DumpLayer { @@ -100,6 +102,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { path, tenant, timeline, + key, } => { let timeline_path = path .join(TENANTS_SEGMENT_NAME) @@ -107,21 +110,37 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { .join(TIMELINES_SEGMENT_NAME) .join(timeline); let mut idx = 0; + let mut to_print = Vec::default(); for layer in fs::read_dir(timeline_path)? 
{ let layer = layer?; if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) { - println!( - "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", - idx, - layer_file.key_range.start, - layer_file.key_range.end, - layer_file.lsn_range.start, - layer_file.lsn_range.end, - layer_file.is_delta, - ); + if let Some(key) = key { + if layer_file.key_range.start <= *key && *key < layer_file.key_range.end { + to_print.push((idx, layer_file)); + } + } else { + to_print.push((idx, layer_file)); + } idx += 1; } } + + if key.is_some() { + to_print + .sort_by_key(|(_idx, layer_file)| std::cmp::Reverse(layer_file.lsn_range.end)); + } + + for (idx, layer_file) in to_print { + println!( + "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", + idx, + layer_file.key_range.start, + layer_file.key_range.end, + layer_file.lsn_range.start, + layer_file.lsn_range.end, + layer_file.is_delta, + ); + } Ok(()) } LayerCmd::DumpLayer { diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml index 04b206de65..3a17981a78 100644 --- a/pageserver/page_api/Cargo.toml +++ b/pageserver/page_api/Cargo.toml @@ -1,20 +1,17 @@ [package] name = "pageserver_page_api" version = "0.1.0" -edition = "2024" +edition.workspace = true +license.workspace = true [dependencies] - -# For Lsn. -# -# TODO: move Lsn to separate crate? This draws in a lot more dependencies -utils.workspace = true - bytes.workspace = true prost.workspace = true -smallvec.workspace = true thiserror.workspace = true tonic.workspace = true +utils.workspace = true +workspace_hack.workspace = true + [build-dependencies] tonic-build.workspace = true diff --git a/pageserver/page_api/build.rs b/pageserver/page_api/build.rs index 6739222ed0..e96297f10e 100644 --- a/pageserver/page_api/build.rs +++ b/pageserver/page_api/build.rs @@ -1,7 +1,13 @@ +use std::env; +use std::path::PathBuf; + +/// Generates Rust code from .proto Protobuf schemas, along with a binary file +/// descriptor set for Protobuf schema reflection. 
fn main() -> Result<(), Box> { - // Generate rust code from .proto protobuf. + let out_dir = PathBuf::from(env::var("OUT_DIR")?); tonic_build::configure() .bytes(["."]) + .file_descriptor_set_path(out_dir.join("page_api_descriptor.bin")) .compile_protos(&["proto/page_service.proto"], &["proto"]) .map_err(|err| err.into()) } diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index 9612d8eb91..f6acb3eeeb 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -3,30 +3,47 @@ // This is the compute read path. It primarily serves page versions at given // LSNs, but also base backups, SLRU segments, and relation metadata. // -// Request metadata: +// EXPERIMENTAL: this is still under development and subject to change. +// +// Request metadata headers: // - authorization: JWT token ("Bearer "), if auth is enabled // - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980") // - neon-shard-id: shard ID, as in hex ("0b10" = shard 11 of 16, 0-based) // - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e") // +// The service can be accessed via e.g. grpcurl: +// +// ``` +// grpcurl \ +// -plaintext \ +// -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \ +// -H "neon-shard-id: 0b10" \ +// -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \ +// -H "authorization: Bearer $JWT" \ +// -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}' +// localhost:51051 page_api.PageService/CheckRelExists +// ``` +// +// TODO: consider adding neon-compute-mode ("primary", "static", "replica"). +// However, this will require reconnecting when changing modes. 
+// // TODO: write implementation guidance on // - Health checks // - Tracing, OpenTelemetry // - Compression syntax = "proto3"; -package page_service; +package page_api; service PageService { - // Returns the total size of a database, as # of bytes. - rpc DbSize (DbSizeRequest) returns (DbSizeResponse); + // Returns whether a relation exists. + rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse); // Fetches a base backup. rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk); - // Fetches a page. - // TODO: remove this and use GetPages. Kept for benchmarks. - rpc GetPage (GetPageRequest) returns (GetPageResponse); + // Returns the total size of a database, as # of bytes. + rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse); // Fetches pages. // @@ -34,34 +51,35 @@ service PageService { // requests incur costs for e.g. HTTP/2 stream setup, header parsing, // authentication, and so on -- with streaming, we only pay these costs during // the initial stream setup. This ~doubles throughput in benchmarks. Other - // requests use regular unary requests, since they are not as frequent and + // RPCs use regular unary requests, since they are not as frequent and // performance-critical, and this simplifies implementation. // // NB: a status response (e.g. errors) will terminate the stream. The stream // may be shared by e.g. multiple Postgres backends, so we should avoid this. // Most errors are therefore sent as GetPageResponse.status instead. - rpc GetPages (stream GetPageRequestBatch) returns (stream GetPageResponse); - - // Fetches an SLRU segment. - // - // TODO: can these be significantly larger than 256 KB (8 pages)? If so, - // consider streaming the response instead. - rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse); - - // Returns whether a relation exists. 
- rpc RelExists(RelExistsRequest) returns (RelExistsResponse); + rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse); // Returns the size of a relation, as # of blocks. - rpc RelSize (RelSizeRequest) returns (RelSizeResponse); + rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse); + // Fetches an SLRU segment. + rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse); } -// Common request fields. -message RequestCommon { - // The LSN to read at. +// The LSN a request should read at. +message ReadLsn { + // The request's read LSN. Required. uint64 request_lsn = 1; - // If given, the caller guarantees that the page has not been modified - // since this LSN. + // If given, the caller guarantees that the page has not been modified since + // this LSN. Must be smaller than or equal to request_lsn. This allows the + // Pageserver to serve an old page without waiting for the request LSN to + // arrive. Valid for all request types. + // + // It is undefined behaviour to make a request such that the page was, in + // fact, modified between request_lsn and not_modified_since_lsn. The + // Pageserver might detect it and return an error, or it might return the old + // page version or the new page version. Setting not_modified_since_lsn equal + // to request_lsn is always safe, but can lead to unnecessary waiting. uint64 not_modified_since_lsn = 2; } @@ -73,22 +91,21 @@ message RelTag { uint32 fork_number = 4; } -// Requests the size of a database, as # of bytes. This is only accurate on -// shard 0; other shards will return their view of the database according to -// which pages they have. -message DbSizeRequest { - RequestCommon common = 1; - uint32 db_oid = 2; +// Checks whether a relation exists, at the given LSN. Only valid on shard 0, +// other shards will error. 
+message CheckRelExistsRequest { + ReadLsn read_lsn = 1; + RelTag rel = 2; } -message DbSizeResponse { - uint64 num_bytes = 1; +message CheckRelExistsResponse { + bool exists = 1; } // Requests a base backup at a given LSN. message GetBaseBackupRequest { // The LSN to fetch a base backup at. - RequestCommon common = 1; + ReadLsn read_lsn = 1; // If true, logical replication slots will not be created. bool replica = 2; } @@ -100,19 +117,43 @@ message GetBaseBackupResponseChunk { bytes chunk = 1; } -// Requests a single page. +// Requests the size of a database, as # of bytes. Only valid on shard 0, other +// shards will error. +message GetDbSizeRequest { + ReadLsn read_lsn = 1; + uint32 db_oid = 2; +} + +message GetDbSizeResponse { + uint64 num_bytes = 1; +} + +// Requests one or more pages. message GetPageRequest { // A request ID. Will be included in the response. Should be unique for // in-flight requests on the stream. - uint64 id = 1; - // The LSN to read at. - RequestCommon common = 2; - // The relation to read from. - RelTag rel = 3; - // The page number to read. Must belong to the remote shard. - uint32 block_number = 4; + uint64 request_id = 1; // The request class. - GetPageClass class = 5; + GetPageClass request_class = 2; + // The LSN to read at. + ReadLsn read_lsn = 3; + // The relation to read from. + RelTag rel = 4; + // Page numbers to read. Must belong to the remote shard. + // + // Multiple pages will be executed as a single batch by the Pageserver, + // amortizing layer access costs and parallelizing them. This may increase the + // latency of any individual request, but improves the overall latency and + // throughput of the batch as a whole. + // + // TODO: this causes an allocation in the common single-block case. The sender + // can use a SmallVec to stack-allocate it, but Prost will always deserialize + // into a heap-allocated Vec. Consider optimizing this. 
+ // + // TODO: we might be able to avoid a sort or something if we mandate that these + // are always in order. But we can't currently rely on this on the server, because + // of compatibility with the libpq protocol handler. + repeated uint32 block_number = 5; } // A GetPageRequest class. Primarily intended for observability, but may also be @@ -123,31 +164,27 @@ enum GetPageClass { GET_PAGE_CLASS_UNKNOWN = 0; // A normal request. This is the default. GET_PAGE_CLASS_NORMAL = 1; - // A prefetch request. + // A prefetch request. NB: can only be classified on pg < 18. GET_PAGE_CLASS_PREFETCH = 2; + // A background request (e.g. vacuum). + GET_PAGE_CLASS_BACKGROUND = 3; } -// A batch of GetPage requests. These will be executed as a single batch by the -// Pageserver, amortizing layer access costs and parallelizing them. This may -// increase the latency of any individual request, but improves the overall -// latency and throughput of the batch as a whole. +// A GetPage response. // -// Responses will be emitted individually, as soon as they are ready. They may -// be emitted in a different order than the requests. -message GetPageRequestBatch { - repeated GetPageRequest requests = 1; -} - -// A GetPage response. May be emitted out of order. +// A batch response will contain all of the requested pages. We could eagerly +// emit individual pages as soon as they are ready, but on a readv() Postgres +// holds buffer pool locks on all pages in the batch and we'll only return once +// the entire batch is ready, so no one can make use of the individual pages. message GetPageResponse { // The original request's ID. - uint64 id = 1; + uint64 request_id = 1; // The response status code. GetPageStatus status = 2; // A string describing the status, if any. - optional string reason = 3; - // The 8KB page image. Empty if status != OK. - bytes page_image = 4; + string reason = 3; + // The 8KB page images, in the same order as the request. Empty if status != OK. 
+ repeated bytes page_image = 4; } // A GetPageResponse status code. Since we use a bidirectional stream, we don't @@ -163,46 +200,34 @@ enum GetPageStatus { GET_PAGE_STATUS_NOT_FOUND = 2; // The request was invalid. GET_PAGE_STATUS_INVALID = 3; - // The client is rate limited. Slow down and retry later. - // TODO: should we use this? + // The tenant is rate limited. Slow down and retry later. GET_PAGE_STATUS_SLOW_DOWN = 4; + // TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a + // layer download. This could free up the server task to process other + // requests while the layer download is in progress. } -// Requests an SLRU segment. +// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on +// shard 0, other shards will error. +message GetRelSizeRequest { + ReadLsn read_lsn = 1; + RelTag rel = 2; +} + +message GetRelSizeResponse { + uint32 num_blocks = 1; +} + +// Requests an SLRU segment. Only valid on shard 0, other shards will error. message GetSlruSegmentRequest { - RequestCommon common = 1; + ReadLsn read_lsn = 1; uint32 kind = 2; uint32 segno = 3; } // Returns an SLRU segment. // -// TODO: can these be significantly larger than 256 KB (8 pages)? If so, -// consider chunking and streaming the response instead. +// These are up to 32 pages (256 KB), so we can send them as a single response. message GetSlruSegmentResponse { bytes segment = 1; } - -// Checks whether a relation exists, at the given LSN. This is only accurate on -// shard 0; other shards will return their view of the relation according to -// which pages they have. -message RelExistsRequest { - RequestCommon common = 1; - RelTag rel = 2; -} - -message RelExistsResponse { - bool exists = 1; -} - -// Fetches the size of a relation at a given LSN, as # of blocks. This is only -// accurate on shard 0; other shards will return their view of the relation -// according to which pages they have. 
-message RelSizeRequest { - RequestCommon common = 1; - RelTag rel = 2; -} - -message RelSizeResponse { - uint32 num_blocks = 1; -} \ No newline at end of file diff --git a/pageserver/page_api/src/lib.rs b/pageserver/page_api/src/lib.rs index b87db7eaa2..4cbaf40763 100644 --- a/pageserver/page_api/src/lib.rs +++ b/pageserver/page_api/src/lib.rs @@ -1,19 +1,21 @@ //! This crate provides the Pageserver's page API. It contains: //! +//! * proto/page_service.proto: the Protobuf schema for the page API. //! * proto: auto-generated Protobuf types for gRPC. -//! * model: canonical domain types. Protobuf types are converted into these. -//! -//! See `proto/page_service.proto` for the protocol spec. //! //! This crate is used by both the client and the server. Try to keep it slim. -//! pub mod model; // Code generated by protobuf. pub mod proto { - tonic::include_proto!("page_service"); + tonic::include_proto!("page_api"); + + /// File descriptor set for Protobuf schema reflection. This allows using + /// e.g. grpcurl with the API. + pub const FILE_DESCRIPTOR_SET: &[u8] = + tonic::include_file_descriptor_set!("page_api_descriptor"); pub use page_service_client::PageServiceClient; - pub use page_service_server::PageServiceServer; + pub use page_service_server::{PageService, PageServiceServer}; } diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index bfa766a077..6d24d6e2ba 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -13,13 +13,12 @@ //! instead of being cast into internal mirror types. 
use bytes::Bytes; -use smallvec::{SmallVec, smallvec}; use utils::lsn::Lsn; use crate::proto; #[derive(Clone, Debug)] -pub struct RequestCommon { +pub struct ReadLsn { pub request_lsn: Lsn, pub not_modified_since_lsn: Lsn, } @@ -33,45 +32,44 @@ pub struct RelTag { } #[derive(Clone, Debug)] -pub struct RelExistsRequest { - pub common: RequestCommon, +pub struct CheckRelExistsRequest { + pub read_lsn: ReadLsn, pub rel: RelTag, } #[derive(Clone, Debug)] -pub struct RelSizeRequest { - pub common: RequestCommon, +pub struct GetRelSizeRequest { + pub read_lsn: ReadLsn, pub rel: RelTag, } #[derive(Clone, Debug)] -pub struct RelSizeResponse { +pub struct GetRelSizeResponse { pub num_blocks: u32, } #[derive(Clone, Debug)] pub struct GetPageRequest { - pub id: u64, - pub common: RequestCommon, + pub request_id: u64, + pub request_class: GetPageClass, + pub read_lsn: ReadLsn, pub rel: RelTag, - pub block_number: u32, - pub class: GetPageClass, + pub block_number: Vec, } #[derive(Clone, Debug, PartialEq)] pub enum GetPageClass { Normal, Prefetch, + Background, } -pub type GetPageRequestBatch = SmallVec<[GetPageRequest; 8]>; - #[derive(Clone, Debug)] pub struct GetPageResponse { - pub id: u64, + pub request_id: u64, pub status: GetPageStatus, - pub reason: Option, - pub page_image: Bytes, + pub reason: String, + pub page_image: Vec, } #[derive(Clone, Debug, PartialEq)] @@ -83,25 +81,25 @@ pub enum GetPageStatus { } #[derive(Clone, Debug)] -pub struct DbSizeRequest { - pub common: RequestCommon, +pub struct GetDbSizeRequest { + pub read_lsn: ReadLsn, pub db_oid: u32, } #[derive(Clone, Debug)] -pub struct DbSizeResponse { +pub struct GetDbSizeResponse { pub num_bytes: u64, } #[derive(Clone, Debug)] pub struct GetBaseBackupRequest { - pub common: RequestCommon, + pub read_lsn: ReadLsn, pub replica: bool, } #[derive(Clone, Debug)] pub struct GetSlruSegmentRequest { - pub common: RequestCommon, + pub read_lsn: ReadLsn, pub kind: u8, // TODO: SlruKind pub segno: u32, } @@ -127,28 
+125,6 @@ impl From for tonic::Status { } } -impl From for proto::GetPageRequestBatch { - fn from(value: GetPageRequestBatch) -> proto::GetPageRequestBatch { - proto::GetPageRequestBatch { - requests: value.iter().map(|r| r.into()).collect(), - } - } -} - -impl From for GetPageRequestBatch { - fn from(value: GetPageRequest) -> GetPageRequestBatch { - smallvec![value] - } -} - -impl From for proto::GetPageRequestBatch { - fn from(value: GetPageRequest) -> proto::GetPageRequestBatch { - proto::GetPageRequestBatch { - requests: vec![(&value).into()], - } - } -} - impl From<&RelTag> for proto::RelTag { fn from(value: &RelTag) -> proto::RelTag { proto::RelTag { @@ -175,56 +151,58 @@ impl TryFrom<&proto::RelTag> for RelTag { } } -impl From<&RequestCommon> for proto::RequestCommon { - fn from(value: &RequestCommon) -> proto::RequestCommon { - proto::RequestCommon { +impl From<&ReadLsn> for proto::ReadLsn { + fn from(value: &ReadLsn) -> proto::ReadLsn { + proto::ReadLsn { request_lsn: value.request_lsn.into(), not_modified_since_lsn: value.not_modified_since_lsn.into(), } } } -impl From<&proto::RequestCommon> for RequestCommon { - fn from(value: &proto::RequestCommon) -> RequestCommon { - RequestCommon { +impl From<&proto::ReadLsn> for ReadLsn { + fn from(value: &proto::ReadLsn) -> ReadLsn { + ReadLsn { request_lsn: value.request_lsn.into(), not_modified_since_lsn: value.not_modified_since_lsn.into(), } } } -impl From<&RelExistsRequest> for proto::RelExistsRequest { - fn from(value: &RelExistsRequest) -> proto::RelExistsRequest { - proto::RelExistsRequest { - common: Some((&value.common).into()), +impl From<&CheckRelExistsRequest> for proto::CheckRelExistsRequest { + fn from(value: &CheckRelExistsRequest) -> proto::CheckRelExistsRequest { + proto::CheckRelExistsRequest { + read_lsn: Some((&value.read_lsn).into()), rel: Some((&value.rel).into()), } } } -impl TryFrom<&proto::RelExistsRequest> for RelExistsRequest { +impl TryFrom<&proto::CheckRelExistsRequest> for 
CheckRelExistsRequest { type Error = ProtocolError; - fn try_from(value: &proto::RelExistsRequest) -> Result { - Ok(RelExistsRequest { - common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(), + fn try_from( + value: &proto::CheckRelExistsRequest, + ) -> Result { + Ok(CheckRelExistsRequest { + read_lsn: (&value.read_lsn.ok_or(ProtocolError::Missing("read_lsn"))?).into(), rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?, }) } } -impl From<&RelSizeRequest> for proto::RelSizeRequest { - fn from(value: &RelSizeRequest) -> proto::RelSizeRequest { - proto::RelSizeRequest { - common: Some((&value.common).into()), +impl From<&GetRelSizeRequest> for proto::GetRelSizeRequest { + fn from(value: &GetRelSizeRequest) -> proto::GetRelSizeRequest { + proto::GetRelSizeRequest { + read_lsn: Some((&value.read_lsn).into()), rel: Some((&value.rel).into()), } } } -impl TryFrom<&proto::RelSizeRequest> for RelSizeRequest { +impl TryFrom<&proto::GetRelSizeRequest> for GetRelSizeRequest { type Error = ProtocolError; - fn try_from(value: &proto::RelSizeRequest) -> Result { - Ok(RelSizeRequest { - common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(), + fn try_from(value: &proto::GetRelSizeRequest) -> Result { + Ok(GetRelSizeRequest { + read_lsn: (&value.read_lsn.ok_or(ProtocolError::Missing("read_lsn"))?).into(), rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?, }) } @@ -233,14 +211,15 @@ impl TryFrom<&proto::RelSizeRequest> for RelSizeRequest { impl From<&GetPageRequest> for proto::GetPageRequest { fn from(value: &GetPageRequest) -> proto::GetPageRequest { proto::GetPageRequest { - id: value.id, - class: match value.class { + request_id: value.request_id, + request_class: match value.request_class { GetPageClass::Normal => proto::GetPageClass::Normal as i32, GetPageClass::Prefetch => proto::GetPageClass::Prefetch as i32, + GetPageClass::Background => proto::GetPageClass::Background as i32, }, - common: 
Some((&value.common).into()), + read_lsn: Some((&value.read_lsn).into()), rel: Some((&value.rel).into()), - block_number: value.block_number, + block_number: value.block_number.clone(), } } } @@ -249,11 +228,11 @@ impl TryFrom<&proto::GetPageRequest> for GetPageRequest { fn try_from(value: &proto::GetPageRequest) -> Result { Ok(GetPageRequest { - id: value.id, - common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(), + request_id: value.request_id, + read_lsn: (&value.read_lsn.ok_or(ProtocolError::Missing("read_lsn"))?).into(), rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?, - block_number: value.block_number, - class: proto::GetPageClass::try_from(value.class) + block_number: value.block_number.clone(), + request_class: proto::GetPageClass::try_from(value.request_class) .unwrap_or(proto::GetPageClass::Unknown) .try_into()?, }) @@ -268,6 +247,7 @@ impl TryFrom for GetPageClass { proto::GetPageClass::Unknown => Err(ProtocolError::InvalidValue("class")), proto::GetPageClass::Normal => Ok(GetPageClass::Normal), proto::GetPageClass::Prefetch => Ok(GetPageClass::Prefetch), + proto::GetPageClass::Background => Ok(GetPageClass::Background), } } } @@ -277,6 +257,7 @@ impl From for proto::GetPageClass { match value { GetPageClass::Normal => proto::GetPageClass::Normal, GetPageClass::Prefetch => proto::GetPageClass::Prefetch, + GetPageClass::Background => proto::GetPageClass::Background, } } } @@ -286,7 +267,7 @@ impl TryFrom for GetPageResponse { fn try_from(value: proto::GetPageResponse) -> Result { Ok(GetPageResponse { - id: value.id, + request_id: value.request_id, status: proto::GetPageStatus::try_from(value.status) .unwrap_or(proto::GetPageStatus::Unknown) .try_into()?, @@ -325,21 +306,21 @@ impl From for proto::GetPageStatus { } } -impl From<&DbSizeRequest> for proto::DbSizeRequest { - fn from(value: &DbSizeRequest) -> proto::DbSizeRequest { - proto::DbSizeRequest { - common: Some((&value.common).into()), +impl 
From<&GetDbSizeRequest> for proto::GetDbSizeRequest { + fn from(value: &GetDbSizeRequest) -> proto::GetDbSizeRequest { + proto::GetDbSizeRequest { + read_lsn: Some((&value.read_lsn).into()), db_oid: value.db_oid, } } } -impl TryFrom<&proto::DbSizeRequest> for DbSizeRequest { +impl TryFrom<&proto::GetDbSizeRequest> for GetDbSizeRequest { type Error = ProtocolError; - fn try_from(value: &proto::DbSizeRequest) -> Result { - Ok(DbSizeRequest { - common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(), + fn try_from(value: &proto::GetDbSizeRequest) -> Result { + Ok(GetDbSizeRequest { + read_lsn: (&value.read_lsn.ok_or(ProtocolError::Missing("read_lsn"))?).into(), db_oid: value.db_oid, }) } @@ -348,7 +329,7 @@ impl TryFrom<&proto::DbSizeRequest> for DbSizeRequest { impl From<&GetBaseBackupRequest> for proto::GetBaseBackupRequest { fn from(value: &GetBaseBackupRequest) -> proto::GetBaseBackupRequest { proto::GetBaseBackupRequest { - common: Some((&value.common).into()), + read_lsn: Some((&value.read_lsn).into()), replica: value.replica, } } @@ -361,7 +342,7 @@ impl TryFrom<&proto::GetBaseBackupRequest> for GetBaseBackupRequest { value: &proto::GetBaseBackupRequest, ) -> Result { Ok(GetBaseBackupRequest { - common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(), + read_lsn: (&value.read_lsn.ok_or(ProtocolError::Missing("read_lsn"))?).into(), replica: value.replica, }) } @@ -372,7 +353,7 @@ impl TryFrom<&proto::GetSlruSegmentRequest> for GetSlruSegmentRequest { fn try_from(value: &proto::GetSlruSegmentRequest) -> Result { Ok(GetSlruSegmentRequest { - common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(), + read_lsn: (&value.read_lsn.ok_or(ProtocolError::Missing("read_lsn"))?).into(), kind: value .kind .try_into() diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 327553f9db..af337d0c03 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ 
b/pageserver/pagebench/src/cmd/basebackup.rs @@ -9,7 +9,7 @@ use anyhow::Context; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use pageserver_client::page_service::BasebackupRequest; -use pageserver_page_api::model::{GetBaseBackupRequest, RequestCommon}; +use pageserver_page_api::model::{GetBaseBackupRequest, ReadLsn}; use rand::prelude::*; use tokio::sync::Barrier; @@ -320,7 +320,7 @@ async fn client_grpc( let mut basebackup_stream = client .get_base_backup( &GetBaseBackupRequest { - common: RequestCommon { + read_lsn: ReadLsn { request_lsn: lsn, not_modified_since_lsn: lsn, }, diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index f71419ef56..6d4175867b 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -69,6 +69,9 @@ pub(crate) struct Args { #[clap(long)] set_io_mode: Option, + #[clap(long)] + only_relnode: Option, + /// Queue depth generated in each client. 
#[clap(long, default_value = "1")] queue_depth: NonZeroUsize, @@ -98,7 +101,6 @@ pub(crate) struct Args { percent_hangs: usize, targets: Option>, - } /// State shared by all clients @@ -239,7 +241,12 @@ async fn main_impl( for r in partitioning.keys.ranges.iter() { let mut i = r.start; while i != r.end { - if i.is_rel_block_key() { + let mut include = true; + include &= i.is_rel_block_key(); + if let Some(only_relnode) = args.only_relnode { + include &= i.is_rel_block_of_rel(only_relnode); + } + if include { filtered.add_key(i); } i = i.next(); @@ -495,8 +502,8 @@ async fn client_grpc( connect_backoff: Duration::from_millis(args.pool_connect_backoff.get() as u64), max_idle_duration: Duration::from_millis(args.pool_max_idle_duration.get() as u64), max_delay_ms: args.max_delay_ms as u64, - drop_rate: (args.percent_drops as f64)/100.0, - hang_rate: (args.percent_hangs as f64)/100.0, + drop_rate: (args.percent_drops as f64) / 100.0, + hang_rate: (args.percent_hangs as f64) / 100.0, }; let client = pageserver_client_grpc::PageserverClient::new_with_config( &worker_id.timeline.tenant_id.to_string(), @@ -537,8 +544,9 @@ async fn client_grpc( .to_rel_block() .expect("we filter non-rel-block keys out above"); pageserver_page_api::model::GetPageRequest { - id: 0, // TODO - common: pageserver_page_api::model::RequestCommon { + request_id: 0, // TODO + request_class: GetPageClass::Normal, + read_lsn: pageserver_page_api::model::ReadLsn { request_lsn: if rng.gen_bool(args.req_latest_probability) { Lsn::MAX } else { @@ -552,8 +560,7 @@ async fn client_grpc( rel_number: rel_tag.relnode, fork_number: rel_tag.forknum, }, - block_number: block_no, - class: GetPageClass::Normal, + block_number: vec![block_no], } }; let client_clone = client.clone(); @@ -645,8 +652,9 @@ async fn client_grpc_stream( .to_rel_block() .expect("we filter non-rel-block keys out above"); pageserver_page_api::model::GetPageRequest { - id: 0, // TODO - common: pageserver_page_api::model::RequestCommon { + 
request_id: 0, // TODO + request_class: GetPageClass::Normal, + read_lsn: pageserver_page_api::model::ReadLsn { request_lsn: if rng.gen_bool(args.req_latest_probability) { Lsn::MAX } else { @@ -660,11 +668,10 @@ async fn client_grpc_stream( rel_number: rel_tag.relnode, fork_number: rel_tag.forknum, }, - block_number: block_no, - class: GetPageClass::Normal, + block_number: vec![block_no], } }; - request_tx.send(req.into()).await.unwrap(); + request_tx.send((&req).into()).await.unwrap(); inflight.push_back(start); } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 58520c5d7a..2e6990dbc4 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -144,7 +144,7 @@ where replica, ctx, io_concurrency: IoConcurrency::spawn_from_conf( - timeline.conf, + timeline.conf.get_vectored_concurrent_io, timeline .gate .enter() @@ -347,7 +347,7 @@ where // Gather non-relational files from object storage pages. let slru_partitions = self .timeline - .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) + .get_slru_keyspace(Version::at(self.lsn), self.ctx) .await? .partition( self.timeline.get_shard_identity(), @@ -382,7 +382,7 @@ where // Otherwise only include init forks of unlogged relations. 
let rels = self .timeline - .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) + .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx) .await?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty @@ -521,7 +521,7 @@ where async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> { let nblocks = self .timeline - .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) + .get_rel_size(src, Version::at(self.lsn), self.ctx) .await?; // If the relation is empty, create an empty file @@ -581,7 +581,7 @@ where let relmap_img = if has_relmap_file { let img = self .timeline - .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) + .get_relmap_file(spcnode, dbnode, Version::at(self.lsn), self.ctx) .await?; if img.len() @@ -635,7 +635,7 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) + .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx) .await? 
.is_empty() { diff --git a/pageserver/src/basebackup_cache.rs b/pageserver/src/basebackup_cache.rs new file mode 100644 index 0000000000..3a8ec555f7 --- /dev/null +++ b/pageserver/src/basebackup_cache.rs @@ -0,0 +1,518 @@ +use std::{collections::HashMap, sync::Arc}; + +use async_compression::tokio::write::GzipEncoder; +use camino::{Utf8Path, Utf8PathBuf}; +use metrics::core::{AtomicU64, GenericCounter}; +use pageserver_api::{config::BasebackupCacheConfig, models::TenantState}; +use tokio::{ + io::{AsyncWriteExt, BufWriter}, + sync::mpsc::{UnboundedReceiver, UnboundedSender}, +}; +use tokio_util::sync::CancellationToken; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, + shard::TenantShardId, +}; + +use crate::{ + basebackup::send_basebackup_tarball, + context::{DownloadBehavior, RequestContext}, + metrics::{BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ}, + task_mgr::TaskKind, + tenant::{ + Timeline, + mgr::{TenantManager, TenantSlot}, + }, +}; + +pub struct BasebackupPrepareRequest { + pub tenant_shard_id: TenantShardId, + pub timeline_id: TimelineId, + pub lsn: Lsn, +} + +pub type BasebackupPrepareSender = UnboundedSender; +pub type BasebackupPrepareReceiver = UnboundedReceiver; + +type BasebackupRemoveEntrySender = UnboundedSender; +type BasebackupRemoveEntryReceiver = UnboundedReceiver; + +/// BasebackupCache stores cached basebackup archives for timelines on local disk. +/// +/// The main purpose of this cache is to speed up the startup process of compute nodes +/// after scaling to zero. +/// Thus, the basebackup is stored only for the latest LSN of the timeline and with +/// fixed set of parameters (gzip=true, full_backup=false, replica=false, prev_lsn=none). +/// +/// The cache receives prepare requests through the `BasebackupPrepareSender` channel, +/// generates a basebackup from the timeline in the background, and stores it on disk. +/// +/// Basebackup requests are pretty rare. 
We expect ~thousands of entries in the cache +/// and ~1 RPS for get requests. +pub struct BasebackupCache { + data_dir: Utf8PathBuf, + config: BasebackupCacheConfig, + tenant_manager: Arc, + remove_entry_sender: BasebackupRemoveEntrySender, + + entries: std::sync::Mutex>, + + cancel: CancellationToken, + + read_hit_count: GenericCounter, + read_miss_count: GenericCounter, + read_err_count: GenericCounter, + + prepare_ok_count: GenericCounter, + prepare_skip_count: GenericCounter, + prepare_err_count: GenericCounter, +} + +impl BasebackupCache { + /// Creates a BasebackupCache and spawns the background task. + /// The initialization of the cache is performed in the background and does not + /// block the caller. The cache will return `None` for any get requests until + /// initialization is complete. + pub fn spawn( + runtime_handle: &tokio::runtime::Handle, + data_dir: Utf8PathBuf, + config: Option, + prepare_receiver: BasebackupPrepareReceiver, + tenant_manager: Arc, + cancel: CancellationToken, + ) -> Arc { + let (remove_entry_sender, remove_entry_receiver) = tokio::sync::mpsc::unbounded_channel(); + + let enabled = config.is_some(); + + let cache = Arc::new(BasebackupCache { + data_dir, + config: config.unwrap_or_default(), + tenant_manager, + remove_entry_sender, + + entries: std::sync::Mutex::new(HashMap::new()), + + cancel, + + read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]), + read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]), + read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]), + + prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]), + prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]), + prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]), + }); + + if enabled { + runtime_handle.spawn( + cache + .clone() + .background(prepare_receiver, remove_entry_receiver), + ); + } + + cache + } + + /// Gets a basebackup entry from the cache. 
+ /// If the entry is found, opens a file with the basebackup archive and returns it. + /// The open file descriptor will prevent the file system from deleting the file + /// even if the entry is removed from the cache in the background. + pub async fn get( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Option { + // Fast path. Check if the entry exists using the in-memory state. + let tti = TenantTimelineId::new(tenant_id, timeline_id); + if self.entries.lock().unwrap().get(&tti) != Some(&lsn) { + self.read_miss_count.inc(); + return None; + } + + let path = self.entry_path(tenant_id, timeline_id, lsn); + + match tokio::fs::File::open(path).await { + Ok(file) => { + self.read_hit_count.inc(); + Some(file) + } + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + // We may end up here if the basebackup was concurrently removed by the cleanup task. + self.read_miss_count.inc(); + } else { + self.read_err_count.inc(); + tracing::warn!("Unexpected error opening basebackup cache file: {:?}", e); + } + None + } + } + } + + // Private methods. + + fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String { + // The default format for LSN is 0/ABCDEF. + // The slash is not filename friendly, so serialize it as plain hex. + let lsn = lsn.0; + format!("basebackup_{tenant_id}_{timeline_id}_{lsn:016X}.tar.gz") + } + + fn entry_path(&self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> Utf8PathBuf { + self.data_dir + .join(Self::entry_filename(tenant_id, timeline_id, lsn)) + } + + fn entry_tmp_path( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Utf8PathBuf { + self.data_dir + .join("tmp") + .join(Self::entry_filename(tenant_id, timeline_id, lsn)) + } + + fn parse_entry_filename(filename: &str) -> Option<(TenantId, TimelineId, Lsn)> { + let parts: Vec<&str> = filename + .strip_prefix("basebackup_")? + .strip_suffix(".tar.gz")?
+ .split('_') + .collect(); + if parts.len() != 3 { + return None; + } + let tenant_id = parts[0].parse::().ok()?; + let timeline_id = parts[1].parse::().ok()?; + let lsn = Lsn(u64::from_str_radix(parts[2], 16).ok()?); + + Some((tenant_id, timeline_id, lsn)) + } + + async fn cleanup(&self) -> anyhow::Result<()> { + // Cleanup tmp directory. + let tmp_dir = self.data_dir.join("tmp"); + let mut tmp_dir = tokio::fs::read_dir(&tmp_dir).await?; + while let Some(dir_entry) = tmp_dir.next_entry().await? { + if let Err(e) = tokio::fs::remove_file(dir_entry.path()).await { + tracing::warn!("Failed to remove basebackup cache tmp file: {:#}", e); + } + } + + // Remove outdated entries. + let entries_old = self.entries.lock().unwrap().clone(); + let mut entries_new = HashMap::new(); + for (tenant_shard_id, tenant_slot) in self.tenant_manager.list() { + if !tenant_shard_id.is_shard_zero() { + continue; + } + let TenantSlot::Attached(tenant) = tenant_slot else { + continue; + }; + let tenant_id = tenant_shard_id.tenant_id; + + for timeline in tenant.list_timelines() { + let tti = TenantTimelineId::new(tenant_id, timeline.timeline_id); + if let Some(&entry_lsn) = entries_old.get(&tti) { + if timeline.get_last_record_lsn() <= entry_lsn { + entries_new.insert(tti, entry_lsn); + } + } + } + } + + for (&tti, &lsn) in entries_old.iter() { + if !entries_new.contains_key(&tti) { + self.remove_entry_sender + .send(self.entry_path(tti.tenant_id, tti.timeline_id, lsn)) + .unwrap(); + } + } + + BASEBACKUP_CACHE_ENTRIES.set(entries_new.len() as i64); + *self.entries.lock().unwrap() = entries_new; + + Ok(()) + } + + async fn on_startup(&self) -> anyhow::Result<()> { + // Create data_dir and tmp directory if they do not exist. + tokio::fs::create_dir_all(&self.data_dir.join("tmp")) + .await + .map_err(|e| { + anyhow::anyhow!( + "Failed to create basebackup cache data_dir {:?}: {:?}", + self.data_dir, + e + ) + })?; + + // Read existing entries from the data_dir and add them to in-memory state. 
+ let mut entries = HashMap::new(); + let mut dir = tokio::fs::read_dir(&self.data_dir).await?; + while let Some(dir_entry) = dir.next_entry().await? { + let filename = dir_entry.file_name(); + + if filename == "tmp" { + // Skip the tmp directory. + continue; + } + + let parsed = Self::parse_entry_filename(filename.to_string_lossy().as_ref()); + let Some((tenant_id, timeline_id, lsn)) = parsed else { + tracing::warn!("Invalid basebackup cache file name: {:?}", filename); + continue; + }; + + let tti = TenantTimelineId::new(tenant_id, timeline_id); + + use std::collections::hash_map::Entry::*; + + match entries.entry(tti) { + Occupied(mut entry) => { + let entry_lsn = *entry.get(); + // Leave only the latest entry, remove the old one. + if lsn < entry_lsn { + self.remove_entry_sender.send(self.entry_path( + tenant_id, + timeline_id, + lsn, + ))?; + } else if lsn > entry_lsn { + self.remove_entry_sender.send(self.entry_path( + tenant_id, + timeline_id, + entry_lsn, + ))?; + entry.insert(lsn); + } else { + // Two different filenames parsed to the same timeline_id and LSN. + // Should never happen. + return Err(anyhow::anyhow!( + "Duplicate basebackup cache entry with the same LSN: {:?}", + filename + )); + } + } + Vacant(entry) => { + entry.insert(lsn); + } + } + } + + BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64); + *self.entries.lock().unwrap() = entries; + + Ok(()) + } + + async fn background( + self: Arc, + mut prepare_receiver: BasebackupPrepareReceiver, + mut remove_entry_receiver: BasebackupRemoveEntryReceiver, + ) { + // Panic in the background is a safe fallback. + // It will drop receivers and the cache will be effectively disabled. + self.on_startup() + .await + .expect("Failed to initialize basebackup cache"); + + let mut cleanup_ticker = tokio::time::interval(self.config.cleanup_period); + cleanup_ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + + loop { + tokio::select!
{ + Some(req) = prepare_receiver.recv() => { + if let Err(err) = self.prepare_basebackup( + req.tenant_shard_id, + req.timeline_id, + req.lsn, + ).await { + tracing::info!("Failed to prepare basebackup: {:#}", err); + self.prepare_err_count.inc(); + continue; + } + } + Some(req) = remove_entry_receiver.recv() => { + if let Err(e) = tokio::fs::remove_file(req).await { + tracing::warn!("Failed to remove basebackup cache file: {:#}", e); + } + } + _ = cleanup_ticker.tick() => { + self.cleanup().await.unwrap_or_else(|e| { + tracing::warn!("Failed to clean up basebackup cache: {:#}", e); + }); + } + _ = self.cancel.cancelled() => { + tracing::info!("BasebackupCache background task cancelled"); + break; + } + } + } + } + + /// Prepare a basebackup for the given timeline. + /// + /// If the basebackup already exists with a higher LSN or the timeline already + /// has a higher last_record_lsn, skip the preparation. + /// + /// The basebackup is prepared in a temporary directory and then moved to the final + /// location to make the operation atomic. 
+ async fn prepare_basebackup( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + req_lsn: Lsn, + ) -> anyhow::Result<()> { + tracing::info!( + tenant_id = %tenant_shard_id.tenant_id, + %timeline_id, + %req_lsn, + "Preparing basebackup for timeline", + ); + + let tti = TenantTimelineId::new(tenant_shard_id.tenant_id, timeline_id); + + { + let entries = self.entries.lock().unwrap(); + if let Some(&entry_lsn) = entries.get(&tti) { + if entry_lsn >= req_lsn { + tracing::info!( + %timeline_id, + %req_lsn, + %entry_lsn, + "Basebackup entry already exists for timeline with higher LSN, skipping basebackup", + ); + self.prepare_skip_count.inc(); + return Ok(()); + } + } + + if entries.len() as i64 >= self.config.max_size_entries { + tracing::info!( + %timeline_id, + %req_lsn, + "Basebackup cache is full, skipping basebackup", + ); + self.prepare_skip_count.inc(); + return Ok(()); + } + } + + let tenant = self + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + let tenant_state = tenant.current_state(); + if tenant_state != TenantState::Active { + anyhow::bail!( + "Tenant {} is not active, current state: {:?}", + tenant_shard_id.tenant_id, + tenant_state + ) + } + + let timeline = tenant.get_timeline(timeline_id, true)?; + + let last_record_lsn = timeline.get_last_record_lsn(); + if last_record_lsn > req_lsn { + tracing::info!( + %timeline_id, + %req_lsn, + %last_record_lsn, + "Timeline has a higher LSN than the requested one, skipping basebackup", + ); + self.prepare_skip_count.inc(); + return Ok(()); + } + + let entry_tmp_path = self.entry_tmp_path(tenant_shard_id.tenant_id, timeline_id, req_lsn); + + let res = self + .prepare_basebackup_tmp(&entry_tmp_path, &timeline, req_lsn) + .await; + + if let Err(err) = res { + tracing::info!("Failed to prepare basebackup tmp file: {:#}", err); + // Try to clean up tmp file. If we fail, the background clean up task will take care of it. 
+ match tokio::fs::remove_file(&entry_tmp_path).await { + Ok(_) => {} + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => { + tracing::info!("Failed to remove basebackup tmp file: {:?}", e); + } + } + return Err(err); + } + + // Move the tmp file to the final location atomically. + let entry_path = self.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn); + tokio::fs::rename(&entry_tmp_path, &entry_path).await?; + + let mut entries = self.entries.lock().unwrap(); + if let Some(old_lsn) = entries.insert(tti, req_lsn) { + // Remove the old entry if it exists. + self.remove_entry_sender + .send(self.entry_path(tenant_shard_id.tenant_id, timeline_id, old_lsn)) + .unwrap(); + } + BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64); + + self.prepare_ok_count.inc(); + Ok(()) + } + + /// Prepares a basebackup in a temporary file. + async fn prepare_basebackup_tmp( + &self, + emptry_tmp_path: &Utf8Path, + timeline: &Arc, + req_lsn: Lsn, + ) -> anyhow::Result<()> { + let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download); + let ctx = ctx.with_scope_timeline(timeline); + + let file = tokio::fs::File::create(emptry_tmp_path).await?; + let mut writer = BufWriter::new(file); + + let mut encoder = GzipEncoder::with_quality( + &mut writer, + // Level::Best because compression is not on the hot path of basebackup requests. + // The decompression is almost not affected by the compression level. + async_compression::Level::Best, + ); + + // We may receive a request before the WAL record is applied to the timeline. + // Wait for the requested LSN to be applied. 
+ timeline + .wait_lsn( + req_lsn, + crate::tenant::timeline::WaitLsnWaiter::BaseBackupCache, + crate::tenant::timeline::WaitLsnTimeout::Default, + &ctx, + ) + .await?; + + send_basebackup_tarball( + &mut encoder, + timeline, + Some(req_lsn), + None, + false, + false, + &ctx, + ) + .await?; + + encoder.shutdown().await?; + writer.flush().await?; + writer.into_inner().sync_all().await?; + + Ok(()) + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9b764b8f83..81cd339624 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -16,6 +16,7 @@ use http_utils::tls_certs::ReloadingCertificateResolver; use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric}; use metrics::set_build_info_metric; use nix::sys::socket::{setsockopt, sockopt}; +use pageserver::basebackup_cache::BasebackupCache; use pageserver::compute_service; use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields}; use pageserver::controller_upcall_client::StorageControllerUpcallClient; @@ -505,7 +506,7 @@ fn start_pageserver( // Set up deletion queue let (deletion_queue, deletion_workers) = DeletionQueue::new( remote_storage.clone(), - StorageControllerUpcallClient::new(conf, &shutdown_pageserver)?, + StorageControllerUpcallClient::new(conf, &shutdown_pageserver), conf, ); deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); @@ -542,6 +543,8 @@ fn start_pageserver( pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); // Scan the local 'tenants/' directory and start loading the tenants + let (basebackup_prepare_sender, basebackup_prepare_receiver) = + tokio::sync::mpsc::unbounded_channel(); let deletion_queue_client = deletion_queue.new_client(); let background_purges = mgr::BackgroundPurges::default(); let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( @@ -552,12 +555,22 @@ fn start_pageserver( remote_storage: remote_storage.clone(), 
deletion_queue_client, l0_flush_global_state, + basebackup_prepare_sender, }, order, shutdown_pageserver.clone(), ))?; let tenant_manager = Arc::new(tenant_manager); + let basebackup_cache = BasebackupCache::spawn( + BACKGROUND_RUNTIME.handle(), + conf.basebackup_cache_dir(), + conf.basebackup_cache_config.clone(), + basebackup_prepare_receiver, + Arc::clone(&tenant_manager), + shutdown_pageserver.child_token(), + ); + BACKGROUND_RUNTIME.spawn({ let shutdown_pageserver = shutdown_pageserver.clone(); let drive_init = async move { @@ -764,6 +777,7 @@ fn start_pageserver( } else { None }, + basebackup_cache, ); // All started up! Now just sit and wait for shutdown signal. diff --git a/pageserver/src/compute_service.rs b/pageserver/src/compute_service.rs index 952089bee7..94323162f2 100644 --- a/pageserver/src/compute_service.rs +++ b/pageserver/src/compute_service.rs @@ -29,6 +29,7 @@ use tracing::*; use utils::auth::SwappableJwtAuth; use utils::sync::gate::{Gate, GateGuard}; +use crate::basebackup_cache::BasebackupCache; use crate::compute_service_grpc::launch_compute_service_grpc_server; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; @@ -83,6 +84,7 @@ pub fn spawn( perf_trace_dispatch: Option, tcp_listener: tokio::net::TcpListener, tls_config: Option>, + basebackup_cache: Arc, ) -> Listener { let cancel = CancellationToken::new(); let libpq_ctx = RequestContext::todo_child( @@ -105,6 +107,7 @@ pub fn spawn( conf.pg_auth_type, tls_config, conf.page_service_pipelining.clone(), + basebackup_cache, libpq_ctx, cancel.clone(), ) @@ -139,6 +142,7 @@ pub async fn compute_connection_listener_main( auth_type: AuthType, tls_config: Option>, pipelining_config: PageServicePipeliningConfig, + basebackup_cache: Arc, listener_ctx: RequestContext, listener_cancel: CancellationToken, ) -> Connections { @@ -199,6 +203,7 @@ pub async fn compute_connection_listener_main( auth_type, tls_config.clone(), 
pipelining_config.clone(), + Arc::clone(&basebackup_cache), connection_ctx, connections_cancel.child_token(), gate_guard, @@ -235,6 +240,7 @@ pub async fn page_service_conn_main( auth_type: AuthType, tls_config: Option>, pipelining_config: PageServicePipeliningConfig, + basebackup_cache: Arc, connection_ctx: RequestContext, cancel: CancellationToken, gate_guard: GateGuard, @@ -277,6 +283,7 @@ pub async fn page_service_conn_main( auth_type, tls_config, pipelining_config, + basebackup_cache, connection_ctx, cancel, gate_guard, diff --git a/pageserver/src/compute_service_grpc.rs b/pageserver/src/compute_service_grpc.rs index 6c99a58d8a..6e556ef04d 100644 --- a/pageserver/src/compute_service_grpc.rs +++ b/pageserver/src/compute_service_grpc.rs @@ -149,16 +149,16 @@ impl PageService for PageServiceService { type GetPagesStream = Pin> + Send>>; - async fn rel_exists( + async fn check_rel_exists( &self, - request: tonic::Request, - ) -> std::result::Result, tonic::Status> { + request: tonic::Request, + ) -> std::result::Result, tonic::Status> { let ttid = self.extract_ttid(request.metadata())?; let shard = self.extract_shard(request.metadata())?; - let req: model::RelExistsRequest = request.get_ref().try_into()?; + let req: model::CheckRelExistsRequest = request.get_ref().try_into()?; let rel = convert_reltag(&req.rel); - let span = tracing::info_span!("rel_exists", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.common.request_lsn); + let span = tracing::info_span!("check_rel_exists", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.read_lsn.request_lsn); async { let timeline = self.get_timeline(ttid, shard).await?; @@ -166,34 +166,34 @@ impl PageService for PageServiceService { let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( &timeline, - req.common.request_lsn, - req.common.not_modified_since_lsn, + req.read_lsn.request_lsn, + 
req.read_lsn.not_modified_since_lsn, &latest_gc_cutoff_lsn, &ctx, ) .await?; - let exists = timeline - .get_rel_exists(rel, Version::Lsn(lsn), &ctx) - .await?; + let exists = timeline.get_rel_exists(rel, Version::at(lsn), &ctx).await?; - Ok(tonic::Response::new(proto::RelExistsResponse { exists })) + Ok(tonic::Response::new(proto::CheckRelExistsResponse { + exists, + })) } .instrument(span) .await } /// Returns size of a relation, as # of blocks - async fn rel_size( + async fn get_rel_size( &self, - request: tonic::Request, - ) -> std::result::Result, tonic::Status> { + request: tonic::Request, + ) -> std::result::Result, tonic::Status> { let ttid = self.extract_ttid(request.metadata())?; let shard = self.extract_shard(request.metadata())?; - let req: model::RelSizeRequest = request.get_ref().try_into()?; + let req: model::GetRelSizeRequest = request.get_ref().try_into()?; let rel = convert_reltag(&req.rel); - let span = tracing::info_span!("rel_size", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.common.request_lsn); + let span = tracing::info_span!("get_rel_size", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.read_lsn.request_lsn); async { let timeline = self.get_timeline(ttid, shard).await?; @@ -201,71 +201,17 @@ impl PageService for PageServiceService { let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( &timeline, - req.common.request_lsn, - req.common.not_modified_since_lsn, + req.read_lsn.request_lsn, + req.read_lsn.not_modified_since_lsn, &latest_gc_cutoff_lsn, &ctx, ) .await?; - let num_blocks = timeline.get_rel_size(rel, Version::Lsn(lsn), &ctx).await?; + let num_blocks = timeline.get_rel_size(rel, Version::at(lsn), &ctx).await?; - Ok(tonic::Response::new(proto::RelSizeResponse { num_blocks })) - } - .instrument(span) - .await - } - - async fn get_page( - &self, - request: tonic::Request, - ) -> std::result::Result, 
tonic::Status> { - let ttid = self.extract_ttid(request.metadata())?; - let shard = self.extract_shard(request.metadata())?; - let req: model::GetPageRequest = request.get_ref().try_into()?; - - let rel = convert_reltag(&req.rel); - let timeline = self.get_timeline(ttid, shard).await?; - - let ctx = self.ctx.with_scope_timeline(&timeline); - let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn( - &timeline, - req.common.request_lsn, - req.common.not_modified_since_lsn, - &latest_gc_cutoff_lsn, - &ctx, - ) - .await?; - - let shard_id = timeline.tenant_shard_id.shard_number; - let span = tracing::info_span!("get_page", tenant_id = %ttid.tenant_id, shard_id = %shard_id, timeline_id = %ttid.timeline_id, rel = %rel, block_number = %req.block_number, req_lsn = %req.common.request_lsn); - - async { - let gate_guard = match timeline.gate.enter() { - Ok(guard) => guard, - Err(_) => { - return Err(tonic::Status::unavailable("timeline is shutting down")); - } - }; - - let io_concurrency = IoConcurrency::spawn_from_conf(self.conf, gate_guard); - - let page_image = timeline - .get_rel_page_at_lsn( - rel, - req.block_number, - Version::Lsn(lsn), - &ctx, - io_concurrency, - ) - .await?; - - Ok(tonic::Response::new(proto::GetPageResponse { - id: req.id, - status: proto::GetPageStatus::Ok as i32, - reason: None, - page_image, + Ok(tonic::Response::new(proto::GetRelSizeResponse { + num_blocks, })) } .instrument(span) @@ -275,7 +221,7 @@ impl PageService for PageServiceService { // TODO: take and emit model types async fn get_pages( &self, - request: tonic::Request>, + request: tonic::Request>, ) -> Result, tonic::Status> { let ttid = self.extract_ttid(request.metadata())?; let shard = self.extract_shard(request.metadata())?; @@ -286,44 +232,56 @@ impl PageService for PageServiceService { let mut request_stream = request.into_inner(); let response_stream = try_stream! { - while let Some(batch) = request_stream.message().await? 
{ + while let Some(request) = request_stream.message().await? { - // TODO: implement batching - for request in batch.requests { - let guard = timeline + let guard = timeline .gate .enter() .or(Err(tonic::Status::unavailable("timeline is shutting down")))?; - let request: model::GetPageRequest = (&request).try_into()?; - let rel = convert_reltag(&request.rel); + let request: model::GetPageRequest = (&request).try_into()?; + let rel = convert_reltag(&request.rel); + + let span = tracing::info_span!("get_pages", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, shard_id = %shard, rel = %rel, req_lsn = %request.read_lsn.request_lsn); + let result: Result, tonic::Status> = async { let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( &timeline, - request.common.request_lsn, - request.common.not_modified_since_lsn, + request.read_lsn.request_lsn, + request.read_lsn.not_modified_since_lsn, &latest_gc_cutoff_lsn, &ctx, ) - .await?; - - let page_image = timeline - .get_rel_page_at_lsn( - rel, - request.block_number, - Version::Lsn(lsn), - &ctx, - IoConcurrency::spawn_from_conf(conf, guard), - ) .await?; - yield proto::GetPageResponse { - id: request.id, - status: proto::GetPageStatus::Ok as i32, - reason: None, - page_image, - }; + let io_concurrency = IoConcurrency::spawn_from_conf(conf.get_vectored_concurrent_io, guard); + + // TODO: use get_rel_page_at_lsn_batched + let mut page_images = Vec::with_capacity(request.block_number.len()); + for blkno in request.block_number { + let page_image = timeline + .get_rel_page_at_lsn( + rel, + blkno, + Version::at(lsn), + &ctx, + io_concurrency.clone(), + ) + .await?; + + page_images.push(page_image); + } + Ok(page_images) } + .instrument(span) + .await; + let page_images = result?; + yield proto::GetPageResponse { + request_id: request.request_id, + status: proto::GetPageStatus::Ok as i32, + reason: "".to_string(), + page_image: page_images, + }; } }; @@ -332,15 +290,15 
@@ impl PageService for PageServiceService { )) } - async fn db_size( + async fn get_db_size( &self, - request: tonic::Request, - ) -> Result, tonic::Status> { + request: tonic::Request, + ) -> Result, tonic::Status> { let ttid = self.extract_ttid(request.metadata())?; let shard = self.extract_shard(request.metadata())?; - let req: model::DbSizeRequest = request.get_ref().try_into()?; + let req: model::GetDbSizeRequest = request.get_ref().try_into()?; - let span = tracing::info_span!("get_page", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, db_oid = %req.db_oid, req_lsn = %req.common.request_lsn); + let span = tracing::info_span!("get_db_size", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, db_oid = %req.db_oid, req_lsn = %req.read_lsn.request_lsn); async { let timeline = self.get_timeline(ttid, shard).await?; @@ -348,18 +306,18 @@ impl PageService for PageServiceService { let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( &timeline, - req.common.request_lsn, - req.common.not_modified_since_lsn, + req.read_lsn.request_lsn, + req.read_lsn.not_modified_since_lsn, &latest_gc_cutoff_lsn, &ctx, ) .await?; let total_blocks = timeline - .get_db_size(DEFAULTTABLESPACE_OID, req.db_oid, Version::Lsn(lsn), &ctx) + .get_db_size(DEFAULTTABLESPACE_OID, req.db_oid, Version::at(lsn), &ctx) .await?; - Ok(tonic::Response::new(proto::DbSizeResponse { + Ok(tonic::Response::new(proto::GetDbSizeResponse { num_bytes: total_blocks as u64 * BLCKSZ as u64, })) } @@ -381,14 +339,14 @@ impl PageService for PageServiceService { let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( &timeline, - req.common.request_lsn, - req.common.not_modified_since_lsn, + req.read_lsn.request_lsn, + req.read_lsn.not_modified_since_lsn, &latest_gc_cutoff_lsn, &ctx, ) .await?; - let span = tracing::info_span!("get_base_backup", tenant_id = %ttid.tenant_id, timeline_id = 
%ttid.timeline_id, req_lsn = %req.common.request_lsn); + let span = tracing::info_span!("get_base_backup", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, req_lsn = %req.read_lsn.request_lsn); tracing::info!("starting basebackup"); @@ -515,7 +473,7 @@ impl PageService for PageServiceService { let shard = self.extract_shard(request.metadata())?; let req: model::GetSlruSegmentRequest = request.get_ref().try_into()?; - let span = tracing::info_span!("get_slru_segment", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, kind = %req.kind, segno = %req.segno, req_lsn = %req.common.request_lsn); + let span = tracing::info_span!("get_slru_segment", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, kind = %req.kind, segno = %req.segno, req_lsn = %req.read_lsn.request_lsn); async { let timeline = self.get_timeline(ttid, shard).await?; @@ -523,8 +481,8 @@ impl PageService for PageServiceService { let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( &timeline, - req.common.request_lsn, - req.common.not_modified_since_lsn, + req.read_lsn.request_lsn, + req.read_lsn.not_modified_since_lsn, &latest_gc_cutoff_lsn, &ctx, ) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 95143e58b7..e8b3b7b3ab 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -150,7 +150,7 @@ pub struct PageServerConf { /// not terrible. pub background_task_maximum_delay: Duration, - pub control_plane_api: Option, + pub control_plane_api: Url, /// JWT token for use with the control plane API. pub control_plane_api_token: Option, @@ -230,6 +230,10 @@ pub struct PageServerConf { /// such as authentication requirements for HTTP and PostgreSQL APIs. /// This is insecure and should only be used in development environments. 
pub dev_mode: bool, + + pub timeline_import_config: pageserver_api::config::TimelineImportConfig, + + pub basebackup_cache_config: Option, } /// Token for authentication to safekeepers @@ -259,6 +263,10 @@ impl PageServerConf { self.workdir.join("metadata.json") } + pub fn basebackup_cache_dir(&self) -> Utf8PathBuf { + self.workdir.join("basebackup_cache") + } + pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf { // Encode a version in the filename, so that if we ever switch away from JSON we can // increment this. @@ -404,6 +412,8 @@ impl PageServerConf { tracing, enable_tls_page_service_api, dev_mode, + timeline_import_config, + basebackup_cache_config, } = config_toml; let mut conf = PageServerConf { @@ -438,7 +448,8 @@ impl PageServerConf { test_remote_failures, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, - control_plane_api, + control_plane_api: control_plane_api + .ok_or_else(|| anyhow::anyhow!("`control_plane_api` must be set"))?, control_plane_emergency_mode, heatmap_upload_concurrency, secondary_download_concurrency, @@ -456,6 +467,8 @@ impl PageServerConf { tracing, enable_tls_page_service_api, dev_mode, + timeline_import_config, + basebackup_cache_config, // ------------------------------------------------------------ // fields that require additional validation or custom handling @@ -539,6 +552,23 @@ impl PageServerConf { ratio.numerator, ratio.denominator ) ); + + let url = Url::parse(&tracing_config.export_config.endpoint) + .map_err(anyhow::Error::msg) + .with_context(|| { + format!( + "tracing endpoint URL is invalid : {}", + tracing_config.export_config.endpoint + ) + })?; + + ensure!( + url.scheme() == "http" || url.scheme() == "https", + format!( + "tracing endpoint URL must start with http:// or https://: {}", + tracing_config.export_config.endpoint + ) + ); } IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance) @@ -573,6 +603,7 @@ impl PageServerConf { 
background_task_maximum_delay: Duration::ZERO, load_previous_heatmap: Some(true), generate_unarchival_heatmap: Some(true), + control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()), ..Default::default() }; PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() @@ -641,9 +672,12 @@ mod tests { use super::PageServerConf; #[test] - fn test_empty_config_toml_is_valid() { - // we use Default impl of everything in this situation + fn test_minimal_config_toml_is_valid() { + // The minimal valid config for running a pageserver: + // - control_plane_api is mandatory, as pageservers cannot run in isolation + // - we use Default impl of everything else in this situation let input = r#" + control_plane_api = "http://localhost:6666" "#; let config_toml = toml_edit::de::from_str::(input) .expect("empty config is valid"); @@ -651,4 +685,25 @@ mod tests { PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) .expect("parse_and_validate"); } + + #[test] + fn test_config_tracing_endpoint_is_invalid() { + let input = r#" + control_plane_api = "http://localhost:6666" + + [tracing] + + sampling_ratio = { numerator = 1, denominator = 0 } + + [tracing.export_config] + endpoint = "localhost:4317" + protocol = "http-binary" + timeout = "1ms" + "#; + let config_toml = toml_edit::de::from_str::(input) + .expect("config has valid fields"); + let workdir = Utf8PathBuf::from("/nonexistent"); + PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) + .expect_err("parse_and_validate should fail for endpoint without scheme"); + } } diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 08ab69f349..698390f719 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -18,21 +18,31 @@ use crate::tenant::timeline::logical_size::CurrentLogicalSize; // management. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] pub(super) enum Name { - /// Timeline last_record_lsn, absolute + /// Timeline last_record_lsn, absolute. #[serde(rename = "written_size")] WrittenSize, /// Timeline last_record_lsn, incremental #[serde(rename = "written_data_bytes_delta")] WrittenSizeDelta, + /// Written bytes only on this timeline (not including ancestors): + /// written_size - ancestor_lsn + /// + /// On the root branch, this is equivalent to `written_size`. + #[serde(rename = "written_size_since_parent")] + WrittenSizeSinceParent, + /// PITR history size only on this timeline (not including ancestors): + /// last_record_lsn - max(pitr_cutoff, ancestor_lsn). + /// + /// On the root branch, this is its entire PITR history size. Not emitted if GC hasn't computed + /// the PITR cutoff yet. 0 if PITR is disabled. + #[serde(rename = "pitr_history_size_since_parent")] + PitrHistorySizeSinceParent, /// Timeline logical size #[serde(rename = "timeline_logical_size")] LogicalSize, /// Tenant remote size #[serde(rename = "remote_storage_size")] RemoteSize, - /// Tenant resident size - #[serde(rename = "resident_size")] - ResidentSize, /// Tenant synthetic size #[serde(rename = "synthetic_storage_size")] SyntheticSize, @@ -160,6 +170,32 @@ impl MetricsKey { .incremental_values() } + /// `written_size` - `ancestor_lsn`. + const fn written_size_since_parent( + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> AbsoluteValueFactory { + MetricsKey { + tenant_id, + timeline_id: Some(timeline_id), + metric: Name::WrittenSizeSinceParent, + } + .absolute_values() + } + + /// `written_size` - max(`pitr_cutoff`, `ancestor_lsn`). 
+ const fn pitr_history_size_since_parent( + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> AbsoluteValueFactory { + MetricsKey { + tenant_id, + timeline_id: Some(timeline_id), + metric: Name::PitrHistorySizeSinceParent, + } + .absolute_values() + } + /// Exact [`Timeline::get_current_logical_size`]. /// /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size @@ -187,18 +223,6 @@ impl MetricsKey { .absolute_values() } - /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`. - /// - /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size - const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory { - MetricsKey { - tenant_id, - timeline_id: None, - metric: Name::ResidentSize, - } - .absolute_values() - } - /// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`]. /// /// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size @@ -261,10 +285,7 @@ where let mut tenants = std::pin::pin!(tenants); while let Some((tenant_id, tenant)) = tenants.next().await { - let mut tenant_resident_size = 0; - let timelines = tenant.list_timelines(); - let timelines_len = timelines.len(); for timeline in timelines { let timeline_id = timeline.timeline_id; @@ -287,16 +308,9 @@ where continue; } } - - tenant_resident_size += timeline.resident_physical_size(); } - if timelines_len == 0 { - // Force set it to 1 byte to avoid not being reported -- all timelines are offloaded. - tenant_resident_size = 1; - } - - let snap = TenantSnapshot::collect(&tenant, tenant_resident_size); + let snap = TenantSnapshot::collect(&tenant); snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics); } @@ -305,19 +319,14 @@ where /// In-between abstraction to allow testing metrics without actual Tenants. 
struct TenantSnapshot { - resident_size: u64, remote_size: u64, synthetic_size: u64, } impl TenantSnapshot { /// Collect tenant status to have metrics created out of it. - /// - /// `resident_size` is calculated of the timelines we had access to for other metrics, so we - /// cannot just list timelines here. - fn collect(t: &Arc, resident_size: u64) -> Self { + fn collect(t: &Arc) -> Self { TenantSnapshot { - resident_size, remote_size: t.remote_size(), // Note that this metric is calculated in a separate bgworker // Here we only use cached value, which may lag behind the real latest one @@ -334,8 +343,6 @@ impl TenantSnapshot { ) { let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size); - let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size); - let synthetic_size = { let factory = MetricsKey::synthetic_size(tenant_id); let mut synthetic_size = self.synthetic_size; @@ -355,11 +362,7 @@ impl TenantSnapshot { } }; - metrics.extend( - [Some(remote_size), Some(resident_size), synthetic_size] - .into_iter() - .flatten(), - ); + metrics.extend([Some(remote_size), synthetic_size].into_iter().flatten()); } } @@ -370,7 +373,13 @@ impl TenantSnapshot { struct TimelineSnapshot { loaded_at: (Lsn, SystemTime), last_record_lsn: Lsn, + ancestor_lsn: Lsn, current_exact_logical_size: Option, + /// Whether PITR is enabled (pitr_interval > 0). + pitr_enabled: bool, + /// The PITR cutoff LSN. None if not yet initialized. If PITR is disabled, this is approximately + /// Some(last_record_lsn), but may lag behind it since it's computed periodically. 
+ pitr_cutoff: Option, } impl TimelineSnapshot { @@ -390,6 +399,9 @@ impl TimelineSnapshot { } else { let loaded_at = t.loaded_at; let last_record_lsn = t.get_last_record_lsn(); + let ancestor_lsn = t.get_ancestor_lsn(); + let pitr_enabled = !t.get_pitr_interval().is_zero(); + let pitr_cutoff = t.gc_info.read().unwrap().cutoffs.time; let current_exact_logical_size = { let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id); @@ -409,7 +421,10 @@ impl TimelineSnapshot { Ok(Some(TimelineSnapshot { loaded_at, last_record_lsn, + ancestor_lsn, current_exact_logical_size, + pitr_enabled, + pitr_cutoff, })) } } @@ -460,6 +475,8 @@ impl TimelineSnapshot { let up_to = now; + let written_size_last = written_size_now.value.max(prev.1); // don't regress + if let Some(delta) = written_size_now.value.checked_sub(prev.1) { let key_value = written_size_delta_key.from_until(prev.0, up_to, delta); // written_size_delta @@ -477,6 +494,27 @@ impl TimelineSnapshot { }); } + // Compute the branch-local written size. + let written_size_since_parent_key = + MetricsKey::written_size_since_parent(tenant_id, timeline_id); + metrics.push( + written_size_since_parent_key + .at(now, written_size_last.saturating_sub(self.ancestor_lsn.0)), + ); + + // Compute the branch-local PITR history size. Not emitted if GC hasn't yet computed the + // PITR cutoff. 0 if PITR is disabled. 
+ let pitr_history_size_since_parent_key = + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id); + if !self.pitr_enabled { + metrics.push(pitr_history_size_since_parent_key.at(now, 0)); + } else if let Some(pitr_cutoff) = self.pitr_cutoff { + metrics.push(pitr_history_size_since_parent_key.at( + now, + written_size_last.saturating_sub(pitr_cutoff.max(self.ancestor_lsn).0), + )); + } + { let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id); let current_or_previous = self diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 52b4fb8680..3379395b87 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -12,12 +12,17 @@ fn startup_collected_timeline_metrics_before_advancing() { let cache = HashMap::new(); let initdb_lsn = Lsn(0x10000); + let pitr_cutoff = Lsn(0x11000); let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + let logical_size = 0x42000; let snap = TimelineSnapshot { loaded_at: (disk_consistent_lsn, SystemTime::now()), last_record_lsn: disk_consistent_lsn, - current_exact_logical_size: Some(0x42000), + ancestor_lsn: Lsn(0), + current_exact_logical_size: Some(logical_size), + pitr_enabled: true, + pitr_cutoff: Some(pitr_cutoff), }; let now = DateTime::::from(SystemTime::now()); @@ -33,7 +38,11 @@ fn startup_collected_timeline_metrics_before_advancing() { 0 ), MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), - MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) + MetricsKey::written_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0 - pitr_cutoff.0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size) ] ); } @@ -49,7 +58,9 @@ fn startup_collected_timeline_metrics_second_round() { 
let before = DateTime::::from(before); let initdb_lsn = Lsn(0x10000); + let pitr_cutoff = Lsn(0x11000); let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + let logical_size = 0x42000; let mut metrics = Vec::new(); let cache = HashMap::from([MetricsKey::written_size(tenant_id, timeline_id) @@ -59,7 +70,10 @@ fn startup_collected_timeline_metrics_second_round() { let snap = TimelineSnapshot { loaded_at: (disk_consistent_lsn, init), last_record_lsn: disk_consistent_lsn, - current_exact_logical_size: Some(0x42000), + ancestor_lsn: Lsn(0), + current_exact_logical_size: Some(logical_size), + pitr_enabled: true, + pitr_cutoff: Some(pitr_cutoff), }; snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); @@ -69,7 +83,11 @@ fn startup_collected_timeline_metrics_second_round() { &[ MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), - MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) + MetricsKey::written_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0 - pitr_cutoff.0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size) ] ); } @@ -86,7 +104,9 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() { let before = DateTime::::from(before); let initdb_lsn = Lsn(0x10000); + let pitr_cutoff = Lsn(0x11000); let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + let logical_size = 0x42000; let mut metrics = Vec::new(); let cache = HashMap::from([ @@ -103,7 +123,10 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() { let snap = TimelineSnapshot { loaded_at: (disk_consistent_lsn, init), last_record_lsn: disk_consistent_lsn, - current_exact_logical_size: Some(0x42000), + ancestor_lsn: Lsn(0), + current_exact_logical_size: Some(logical_size), + pitr_enabled: 
true, + pitr_cutoff: Some(pitr_cutoff), }; snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); @@ -113,16 +136,18 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() { &[ MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0), MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), - MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) + MetricsKey::written_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0 - pitr_cutoff.0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size) ] ); } +/// Tests that written sizes do not regress across restarts. #[test] fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { - // it can happen that we lose the inmemorylayer but have previously sent metrics and we - // should never go backwards - let tenant_id = TenantId::generate(); let timeline_id = TimelineId::generate(); @@ -140,7 +165,10 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { let snap = TimelineSnapshot { loaded_at: (Lsn(50), at_restart), last_record_lsn: Lsn(50), + ancestor_lsn: Lsn(0), current_exact_logical_size: None, + pitr_enabled: true, + pitr_cutoff: Some(Lsn(20)), }; let mut cache = HashMap::from([ @@ -169,6 +197,8 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { 0 ), MetricsKey::written_size(tenant_id, timeline_id).at(now, 100), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 100), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 80), ] ); @@ -183,6 +213,157 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { &[ MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0), MetricsKey::written_size(tenant_id, timeline_id).at(later, 100), + 
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 100), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 80), + ] + ); +} + +/// Tests that written sizes do not regress across restarts, even on child branches. +#[test] +fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let [later, now, at_restart] = time_backwards(); + + // FIXME: tests would be so much easier if we did not need to juggle back and forth + // SystemTime and DateTime:: ... Could do the conversion only at upload time? + let now = DateTime::::from(now); + let later = DateTime::::from(later); + let before_restart = at_restart - std::time::Duration::from_secs(5 * 60); + let way_before = before_restart - std::time::Duration::from_secs(10 * 60); + let before_restart = DateTime::::from(before_restart); + let way_before = DateTime::::from(way_before); + + let snap = TimelineSnapshot { + loaded_at: (Lsn(50), at_restart), + last_record_lsn: Lsn(50), + ancestor_lsn: Lsn(40), + current_exact_logical_size: None, + pitr_enabled: true, + pitr_cutoff: Some(Lsn(20)), + }; + + let mut cache = HashMap::from([ + MetricsKey::written_size(tenant_id, timeline_id) + .at(before_restart, 100) + .to_kv_pair(), + MetricsKey::written_size_delta(tenant_id, timeline_id) + .from_until( + way_before, + before_restart, + // not taken into account, but the timestamps are important + 999_999_999, + ) + .to_kv_pair(), + ]); + + let mut metrics = Vec::new(); + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( + before_restart, + now, + 0 + ), + MetricsKey::written_size(tenant_id, timeline_id).at(now, 100), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 60), + MetricsKey::pitr_history_size_since_parent(tenant_id, 
timeline_id).at(now, 60), + ] + ); + + // now if we cache these metrics, and re-run while "still in recovery" + cache.extend(metrics.drain(..).map(|x| x.to_kv_pair())); + + // "still in recovery", because our snapshot did not change + snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0), + MetricsKey::written_size(tenant_id, timeline_id).at(later, 100), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 60), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60), + ] + ); +} + +/// Tests that written sizes do not regress across restarts, even on child branches and +/// with a PITR cutoff after the branch point. +#[test] +fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn_and_pitr_cutoff() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let [later, now, at_restart] = time_backwards(); + + // FIXME: tests would be so much easier if we did not need to juggle back and forth + // SystemTime and DateTime:: ... Could do the conversion only at upload time? 
+ let now = DateTime::::from(now); + let later = DateTime::::from(later); + let before_restart = at_restart - std::time::Duration::from_secs(5 * 60); + let way_before = before_restart - std::time::Duration::from_secs(10 * 60); + let before_restart = DateTime::::from(before_restart); + let way_before = DateTime::::from(way_before); + + let snap = TimelineSnapshot { + loaded_at: (Lsn(50), at_restart), + last_record_lsn: Lsn(50), + ancestor_lsn: Lsn(30), + current_exact_logical_size: None, + pitr_enabled: true, + pitr_cutoff: Some(Lsn(40)), + }; + + let mut cache = HashMap::from([ + MetricsKey::written_size(tenant_id, timeline_id) + .at(before_restart, 100) + .to_kv_pair(), + MetricsKey::written_size_delta(tenant_id, timeline_id) + .from_until( + way_before, + before_restart, + // not taken into account, but the timestamps are important + 999_999_999, + ) + .to_kv_pair(), + ]); + + let mut metrics = Vec::new(); + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( + before_restart, + now, + 0 + ), + MetricsKey::written_size(tenant_id, timeline_id).at(now, 100), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 70), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 60), + ] + ); + + // now if we cache these metrics, and re-run while "still in recovery" + cache.extend(metrics.drain(..).map(|x| x.to_kv_pair())); + + // "still in recovery", because our snapshot did not change + snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0), + MetricsKey::written_size(tenant_id, timeline_id).at(later, 100), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 70), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60), ] ); } @@ -201,7 
+382,10 @@ fn post_restart_current_exact_logical_size_uses_cached() { let snap = TimelineSnapshot { loaded_at: (Lsn(50), at_restart), last_record_lsn: Lsn(50), + ancestor_lsn: Lsn(0), current_exact_logical_size: None, + pitr_enabled: true, + pitr_cutoff: None, }; let cache = HashMap::from([MetricsKey::timeline_logical_size(tenant_id, timeline_id) @@ -224,7 +408,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() { let tenant_id = TenantId::generate(); let ts = TenantSnapshot { - resident_size: 1000, remote_size: 1000, // not yet calculated synthetic_size: 0, @@ -245,7 +428,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() { metrics, &[ MetricsKey::remote_storage_size(tenant_id).at(now, 1000), - MetricsKey::resident_size(tenant_id).at(now, 1000), MetricsKey::synthetic_size(tenant_id).at(now, 1000), ] ); @@ -256,7 +438,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() { let tenant_id = TenantId::generate(); let ts = TenantSnapshot { - resident_size: 1000, remote_size: 1000, // not yet calculated synthetic_size: 0, @@ -274,7 +455,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() { metrics, &[ MetricsKey::remote_storage_size(tenant_id).at(now, 1000), - MetricsKey::resident_size(tenant_id).at(now, 1000), // no synthetic size here ] ); @@ -290,19 +470,103 @@ fn time_backwards() -> [std::time::SystemTime; N] { times } +/// Tests that disabled PITR history does not yield any history size, even when the PITR cutoff +/// indicates otherwise. 
+#[test] +fn pitr_disabled_yields_no_history_size() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let mut metrics = Vec::new(); + let cache = HashMap::new(); + + let initdb_lsn = Lsn(0x10000); + let pitr_cutoff = Lsn(0x11000); + let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + + let snap = TimelineSnapshot { + loaded_at: (disk_consistent_lsn, SystemTime::now()), + last_record_lsn: disk_consistent_lsn, + ancestor_lsn: Lsn(0), + current_exact_logical_size: None, + pitr_enabled: false, + pitr_cutoff: Some(pitr_cutoff), + }; + + let now = DateTime::::from(SystemTime::now()); + + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( + snap.loaded_at.1.into(), + now, + 0 + ), + MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), + MetricsKey::written_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0), + ] + ); +} + +/// Tests that uninitialized PITR cutoff does not emit any history size metric at all. 
+#[test] +fn pitr_uninitialized_does_not_emit_history_size() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let mut metrics = Vec::new(); + let cache = HashMap::new(); + + let initdb_lsn = Lsn(0x10000); + let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + + let snap = TimelineSnapshot { + loaded_at: (disk_consistent_lsn, SystemTime::now()), + last_record_lsn: disk_consistent_lsn, + ancestor_lsn: Lsn(0), + current_exact_logical_size: None, + pitr_enabled: true, + pitr_cutoff: None, + }; + + let now = DateTime::::from(SystemTime::now()); + + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( + snap.loaded_at.1.into(), + now, + 0 + ), + MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), + MetricsKey::written_size_since_parent(tenant_id, timeline_id) + .at(now, disk_consistent_lsn.0), + ] + ); +} + pub(crate) const fn metric_examples_old( tenant_id: TenantId, timeline_id: TimelineId, now: DateTime, before: DateTime, -) -> [RawMetric; 6] { +) -> [RawMetric; 7] { [ MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id) .from_until_old_format(before, now, 0), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0), MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0), - MetricsKey::resident_size(tenant_id).at_old_format(now, 0), MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1), ] } @@ -312,13 +576,14 @@ pub(crate) const fn metric_examples( timeline_id: TimelineId, now: DateTime, before: DateTime, -) -> [NewRawMetric; 6] { +) -> [NewRawMetric; 7] { [ MetricsKey::written_size(tenant_id, 
timeline_id).at(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), + MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 0), + MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0), MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0), MetricsKey::remote_storage_size(tenant_id).at(now, 0), - MetricsKey::resident_size(tenant_id).at(now, 0), MetricsKey::synthetic_size(tenant_id).at(now, 1), ] } diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 59e0145a5b..eba773272a 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -513,6 +513,14 @@ mod tests { line!(), r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#, ), + ( + line!(), + r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#, + ), + ( + line!(), + r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"pitr_history_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#, + ), ( line!(), r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#, @@ -521,10 +529,6 @@ mod 
tests { line!(), r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#, ), - ( - line!(), - r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#, - ), ( line!(), r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#, @@ -564,7 +568,7 @@ mod tests { assert_eq!(upgraded_samples, new_samples); } - fn metric_samples_old() -> [RawMetric; 6] { + fn metric_samples_old() -> [RawMetric; 7] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); @@ -576,7 +580,7 @@ mod tests { super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before) } - fn metric_samples() -> [NewRawMetric; 6] { + fn metric_samples() -> [NewRawMetric; 7] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 59c94f1549..dc38ea616c 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -7,7 +7,7 @@ use pageserver_api::models::ShardImportStatus; use pageserver_api::shard::TenantShardId; use pageserver_api::upcall_api::{ PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, - ValidateRequest, ValidateRequestTenant, ValidateResponse, + TimelineImportStatusRequest, ValidateRequest, ValidateRequestTenant, ValidateResponse, }; use reqwest::Certificate; use serde::Serialize; @@ -51,21 +51,22 @@ pub trait StorageControllerUpcallApi { 
&self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + generation: Generation, status: ShardImportStatus, ) -> impl Future> + Send; + fn get_timeline_import_status( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + generation: Generation, + ) -> impl Future> + Send; } impl StorageControllerUpcallClient { /// A None return value indicates that the input `conf` object does not have control /// plane API enabled. - pub fn new( - conf: &'static PageServerConf, - cancel: &CancellationToken, - ) -> Result, reqwest::Error> { - let mut url = match conf.control_plane_api.as_ref() { - Some(u) => u.clone(), - None => return Ok(None), - }; + pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Self { + let mut url = conf.control_plane_api.clone(); if let Ok(mut segs) = url.path_segments_mut() { // This ensures that `url` ends with a slash if it doesn't already. @@ -85,15 +86,17 @@ impl StorageControllerUpcallClient { } for cert in &conf.ssl_ca_certs { - client = client.add_root_certificate(Certificate::from_der(cert.contents())?); + client = client.add_root_certificate( + Certificate::from_der(cert.contents()).expect("Invalid certificate in config"), + ); } - Ok(Some(Self { - http_client: client.build()?, + Self { + http_client: client.build().expect("Failed to construct HTTP client"), base_url: url, node_id: conf.id, cancel: cancel.clone(), - })) + } } #[tracing::instrument(skip_all)] @@ -101,6 +104,7 @@ impl StorageControllerUpcallClient { &self, url: &url::Url, request: R, + method: reqwest::Method, ) -> Result where R: Serialize, @@ -110,7 +114,7 @@ impl StorageControllerUpcallClient { || async { let response = self .http_client - .post(url.clone()) + .request(method.clone(), url.clone()) .json(&request) .send() .await?; @@ -219,7 +223,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { register: register.clone(), }; - let response: ReAttachResponse = self.retry_http_forever(&url, request).await?; + 
let response: ReAttachResponse = self + .retry_http_forever(&url, request, reqwest::Method::POST) + .await?; tracing::info!( "Received re-attach response with {} tenants (node {}, register: {:?})", response.tenants.len(), @@ -272,7 +278,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { return Err(RetryForeverError::ShuttingDown); } - let response: ValidateResponse = self.retry_http_forever(&url, request).await?; + let response: ValidateResponse = self + .retry_http_forever(&url, request, reqwest::Method::POST) + .await?; for rt in response.tenants { result.insert(rt.id, rt.valid); } @@ -291,6 +299,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + generation: Generation, status: ShardImportStatus, ) -> Result<(), RetryForeverError> { let url = self @@ -301,9 +310,35 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { let request = PutTimelineImportStatusRequest { tenant_shard_id, timeline_id, + generation, status, }; - self.retry_http_forever(&url, request).await + self.retry_http_forever(&url, request, reqwest::Method::POST) + .await + } + + #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context + async fn get_timeline_import_status( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + generation: Generation, + ) -> Result { + let url = self + .base_url + .join("timeline_import_status") + .expect("Failed to build path"); + + let request = TimelineImportStatusRequest { + tenant_shard_id, + timeline_id, + generation, + }; + + let response: ShardImportStatus = self + .retry_http_forever(&url, request, reqwest::Method::GET) + .await?; + Ok(response) } } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 6dd7d741c1..7854fd9e36 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -585,7 +585,7 @@ impl DeletionQueue 
{ /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice. pub fn new( remote_storage: GenericRemoteStorage, - controller_upcall_client: Option, + controller_upcall_client: C, conf: &'static PageServerConf, ) -> (Self, DeletionQueueWorkers) where @@ -663,6 +663,7 @@ mod test { use camino::Utf8Path; use hex_literal::hex; use pageserver_api::key::Key; + use pageserver_api::models::ShardImportStatus; use pageserver_api::shard::ShardIndex; use pageserver_api::upcall_api::ReAttachResponseTenant; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; @@ -701,7 +702,7 @@ mod test { async fn restart(&mut self) { let (deletion_queue, workers) = DeletionQueue::new( self.storage.clone(), - Some(self.mock_control_plane.clone()), + self.mock_control_plane.clone(), self.harness.conf, ); @@ -792,10 +793,20 @@ mod test { &self, _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, + _generation: Generation, _status: pageserver_api::models::ShardImportStatus, ) -> Result<(), RetryForeverError> { unimplemented!() } + + async fn get_timeline_import_status( + &self, + _tenant_shard_id: TenantShardId, + _timeline_id: TimelineId, + _generation: Generation, + ) -> Result { + unimplemented!() + } } async fn setup(test_name: &str) -> anyhow::Result { @@ -821,11 +832,8 @@ mod test { let mock_control_plane = MockStorageController::new(); - let (deletion_queue, worker) = DeletionQueue::new( - storage.clone(), - Some(mock_control_plane.clone()), - harness.conf, - ); + let (deletion_queue, worker) = + DeletionQueue::new(storage.clone(), mock_control_plane.clone(), harness.conf); let worker_join = worker.spawn_with(&tokio::runtime::Handle::current()); diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index 4e775f15eb..363b1427f5 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -53,7 +53,7 @@ where tx: tokio::sync::mpsc::Sender, // Client 
for calling into control plane API for validation of deletes - controller_upcall_client: Option, + controller_upcall_client: C, // DeletionLists which are waiting generation validation. Not safe to // execute until [`validate`] has processed them. @@ -86,7 +86,7 @@ where conf: &'static PageServerConf, rx: tokio::sync::mpsc::Receiver, tx: tokio::sync::mpsc::Sender, - controller_upcall_client: Option, + controller_upcall_client: C, lsn_table: Arc>, cancel: CancellationToken, ) -> Self { @@ -137,20 +137,16 @@ where return Ok(()); } - let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client { - match controller_upcall_client - .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) - .await - { - Ok(tenants) => tenants, - Err(RetryForeverError::ShuttingDown) => { - // The only way a validation call returns an error is when the cancellation token fires - return Err(DeletionQueueError::ShuttingDown); - } + let tenants_valid = match self + .controller_upcall_client + .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect()) + .await + { + Ok(tenants) => tenants, + Err(RetryForeverError::ShuttingDown) => { + // The only way a validation call returns an error is when the cancellation token fires + return Err(DeletionQueueError::ShuttingDown); } - } else { - // Control plane API disabled. In legacy mode we consider everything valid. - tenant_generations.keys().map(|k| (*k, true)).collect() }; let mut validated_sequence: Option = None; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8b6500b020..0d6791cddd 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -449,7 +449,7 @@ async fn build_timeline_info_common( // Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we // actually trimmed data to), which can pass each other when PITR is changed. 
let min_readable_lsn = std::cmp::max( - timeline.get_gc_cutoff_lsn(), + timeline.get_gc_cutoff_lsn().unwrap_or_default(), *timeline.get_applied_gc_cutoff_lsn(), ); @@ -3199,7 +3199,7 @@ async fn list_aux_files( .await?; let io_concurrency = IoConcurrency::spawn_from_conf( - state.conf, + state.conf.get_vectored_concurrent_io, timeline.gate.enter().map_err(|_| ApiError::Cancelled)?, ); @@ -3500,6 +3500,107 @@ async fn put_tenant_timeline_import_wal( }.instrument(span).await } +/// Activate a timeline after its import has completed +/// +/// The endpoint is idempotent and callers are expected to retry all +/// errors until a successful response. +async fn activate_post_import_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + const DEFAULT_ACTIVATE_TIMEOUT: Duration = Duration::from_secs(1); + let activate_timeout = parse_query_param(&request, "timeline_activate_timeout_ms")? + .map(Duration::from_millis) + .unwrap_or(DEFAULT_ACTIVATE_TIMEOUT); + + let span = info_span!( + "activate_post_import_handler", + tenant_id=%tenant_shard_id.tenant_id, + timeline_id=%timeline_id, + shard_id=%tenant_shard_id.shard_slug() + ); + + async move { + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + tenant + .finalize_importing_timeline(timeline_id) + .await + .map_err(ApiError::InternalServerError)?; + + match tenant.get_timeline(timeline_id, false) { + Ok(_timeline) => { + // Timeline is already visible. Reset not required: fall through. + } + Err(GetTimelineError::NotFound { .. 
}) => { + // This is crude: we reset the whole tenant such that the new timeline is detected + // and activated. We can come up with something more granular in the future. + // + // Note that we only reset the tenant if required: when the timeline is + // not present in [`Tenant::timelines`]. + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + state + .tenant_manager + .reset_tenant(tenant_shard_id, false, &ctx) + .await + .map_err(ApiError::InternalServerError)?; + } + Err(GetTimelineError::ShuttingDown) => { + return Err(ApiError::ShuttingDown); + } + Err(GetTimelineError::NotActive { .. }) => { + unreachable!("Called get_timeline with active_only=false"); + } + } + + let timeline = tenant.get_timeline(timeline_id, false)?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn) + .with_scope_timeline(&timeline); + + let result = + tokio::time::timeout(activate_timeout, timeline.wait_to_become_active(&ctx)).await; + match result { + Ok(Ok(())) => { + // fallthrough + } + // Timeline reached some other state that's not active + // TODO(vlad): if the tenant is broken, return a permananet error + Ok(Err(_timeline_state)) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Timeline activation failed" + ))); + } + // Activation timed out + Err(_) => { + return Err(ApiError::Timeout("Timeline activation timed out".into())); + } + } + + let timeline_info = build_timeline_info( + &timeline, false, // include_non_incremental_logical_size, + false, // force_await_initial_logical_size + &ctx, + ) + .await + .context("get local timeline info") + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, timeline_info) + } + .instrument(span) + .await +} + /// Read the end of a tar archive. /// /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. 
@@ -3924,5 +4025,9 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal", |r| api_handler(r, put_tenant_timeline_import_wal), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import", + |r| api_handler(r, activate_post_import_handler), + ) .any(handler_404)) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ea161fc739..72405a0a84 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -3,6 +3,7 @@ mod auth; pub mod basebackup; +pub mod basebackup_cache; pub mod config; pub mod consumption_metrics; pub mod context; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9a6c3f2378..3076c7f1d6 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -497,6 +497,24 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy = Lazy::n .expect("failed to define a metric") }); +pub(crate) static ONDEMAND_DOWNLOAD_BYTES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_ondemand_download_bytes_total", + "Total bytes of layers on-demand downloaded", + &["task_kind"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static ONDEMAND_DOWNLOAD_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_ondemand_download_count", + "Total count of layers on-demand downloaded", + &["task_kind"] + ) + .expect("failed to define a metric") +}); + pub(crate) mod wait_ondemand_download_time { use super::*; const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[ @@ -825,23 +843,50 @@ pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy = Lazy::new(| .expect("failed to define a metric") }); -pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy = Lazy::new(|| { +pub(crate) static RELSIZE_LATEST_CACHE_ENTRIES: Lazy = Lazy::new(|| { register_uint_gauge!( - "pageserver_relsize_cache_entries", - "Number of entries in the relation size cache", + "pageserver_relsize_latest_cache_entries", + "Number of entries in the latest relation size cache", ) 
.expect("failed to define a metric") }); -pub(crate) static RELSIZE_CACHE_HITS: Lazy = Lazy::new(|| { - register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",) - .expect("failed to define a metric") +pub(crate) static RELSIZE_LATEST_CACHE_HITS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_relsize_latest_cache_hits", + "Latest relation size cache hits", + ) + .expect("failed to define a metric") }); -pub(crate) static RELSIZE_CACHE_MISSES: Lazy = Lazy::new(|| { +pub(crate) static RELSIZE_LATEST_CACHE_MISSES: Lazy = Lazy::new(|| { register_int_counter!( - "pageserver_relsize_cache_misses", - "Relation size cache misses", + "pageserver_relsize_latest_cache_misses", + "Relation size latest cache misses", + ) + .expect("failed to define a metric") +}); + +pub(crate) static RELSIZE_SNAPSHOT_CACHE_ENTRIES: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_relsize_snapshot_cache_entries", + "Number of entries in the pitr relation size cache", + ) + .expect("failed to define a metric") +}); + +pub(crate) static RELSIZE_SNAPSHOT_CACHE_HITS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_relsize_snapshot_cache_hits", + "Pitr relation size cache hits", + ) + .expect("failed to define a metric") +}); + +pub(crate) static RELSIZE_SNAPSHOT_CACHE_MISSES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_relsize_snapshot_cache_misses", + "Relation size snapshot cache misses", ) .expect("failed to define a metric") }); @@ -1021,6 +1066,15 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(| .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric") }); +pub(crate) static TENANT_OFFLOADED_TIMELINES: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tenant_offloaded_timelines", + "Number of offloaded timelines of a tenant", + &["tenant_id", "shard_id"] + ) + .expect("Failed to register pageserver_tenant_offloaded_timelines metric") +}); + pub(crate) 
static EVICTION_ITERATION_DURATION: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_eviction_iteration_duration_seconds_global", @@ -2180,6 +2234,10 @@ impl BasebackupQueryTimeOngoingRecording<'_> { // If you want to change categorize of a specific error, also change it in `log_query_error`. let metric = match res { Ok(_) => &self.parent.ok, + Err(QueryError::Shutdown) => { + // Do not observe ok/err for shutdown + return; + } Err(QueryError::Disconnected(ConnectionError::Io(io_error))) if is_expected_io_error(io_error) => { @@ -3502,11 +3560,14 @@ impl TimelineMetrics { } pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { + let tid = tenant_shard_id.tenant_id.to_string(); + let shard_id = tenant_shard_id.shard_slug().to_string(); + // Only shard zero deals in synthetic sizes if tenant_shard_id.is_shard_zero() { - let tid = tenant_shard_id.tenant_id.to_string(); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); } + let _ = TENANT_OFFLOADED_TIMELINES.remove_label_values(&[&tid, &shard_id]); tenant_throttling::remove_tenant_metrics(tenant_shard_id); @@ -4298,6 +4359,42 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { .set(u64::try_from(num_threads.get()).unwrap()); } +pub(crate) static BASEBACKUP_CACHE_READ: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_basebackup_cache_read_total", + "Number of read accesses to the basebackup cache grouped by hit/miss/error", + &["result"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static BASEBACKUP_CACHE_PREPARE: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_basebackup_cache_prepare_total", + "Number of prepare requests processed by the basebackup cache grouped by ok/skip/error", + &["result"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy = Lazy::new(|| { + register_int_gauge!( + "pageserver_basebackup_cache_entries_total", + "Number of 
entries in the basebackup cache" + ) + .expect("failed to define a metric") +}); + +// FIXME: Support basebackup cache size metrics. +#[allow(dead_code)] +pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy = Lazy::new(|| { + register_int_gauge!( + "pageserver_basebackup_cache_size_bytes", + "Total size of all basebackup cache entries on disk in bytes" + ) + .expect("failed to define a metric") +}); + static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_config_ignored_items", diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index ddea8aab6f..7412750d65 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -9,7 +9,6 @@ use std::sync::Arc; use std::time::{Duration, Instant, SystemTime}; use std::{io, str}; -use crate::PERF_TRACE_TARGET; use anyhow::{Context, bail}; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; @@ -17,7 +16,7 @@ use itertools::Itertools; use jsonwebtoken::TokenData; use once_cell::sync::OnceCell; use pageserver_api::config::{ - PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, + GetVectoredConcurrentIo, PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy, }; use pageserver_api::key::rel_block_to_key; @@ -50,15 +49,17 @@ use utils::simple_rcu::RcuReadGuard; use utils::sync::gate::GateGuard; use utils::sync::spsc_fold; +use crate::PERF_TRACE_TARGET; use crate::auth::check_permission; use crate::basebackup::BasebackupError; +use crate::basebackup_cache::BasebackupCache; use crate::config::PageServerConf; use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder}; use crate::metrics::{ self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS, SmgrOpTimer, TimelineMetrics, }; -use crate::pgdatadir_mapping::Version; +use 
crate::pgdatadir_mapping::{LsnRange, Version}; use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, @@ -102,6 +103,7 @@ pub async fn libpq_page_service_conn_main( auth_type: AuthType, tls_config: Option>, pipelining_config: PageServicePipeliningConfig, + basebackup_cache: Arc, connection_ctx: RequestContext, cancel: CancellationToken, gate_guard: GateGuard, @@ -158,11 +160,12 @@ pub async fn libpq_page_service_conn_main( // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. let mut conn_handler = PageServerHandler::new( - conf, tenant_manager, auth, pipelining_config, + conf.get_vectored_concurrent_io, perf_span_fields, + basebackup_cache, connection_ctx, cancel.clone(), gate_guard, @@ -198,7 +201,6 @@ pub async fn libpq_page_service_conn_main( } struct PageServerHandler { - conf: &'static PageServerConf, auth: Option>, claims: Option, @@ -216,6 +218,9 @@ struct PageServerHandler { timeline_handles: Option, pipelining_config: PageServicePipeliningConfig, + get_vectored_concurrent_io: GetVectoredConcurrentIo, + + basebackup_cache: Arc, gate_guard: GateGuard, } @@ -469,7 +474,7 @@ impl std::fmt::Display for BatchedPageStreamError { struct BatchedGetPageRequest { req: PagestreamGetPageRequest, timer: SmgrOpTimer, - effective_request_lsn: Lsn, + lsn_range: LsnRange, ctx: RequestContext, } @@ -591,12 +596,12 @@ impl BatchedFeMessage { match batching_strategy { PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => { if let Some(last_in_batch) = accum_pages.last() { - if last_in_batch.effective_request_lsn - != this_pages[0].effective_request_lsn + if last_in_batch.lsn_range.effective_lsn + != this_pages[0].lsn_range.effective_lsn { trace!( - accum_lsn = %last_in_batch.effective_request_lsn, - this_lsn = %this_pages[0].effective_request_lsn, + accum_lsn = 
%last_in_batch.lsn_range.effective_lsn, + this_lsn = %this_pages[0].lsn_range.effective_lsn, "stopping batching because LSN changed" ); @@ -611,15 +616,15 @@ impl BatchedFeMessage { let same_page_different_lsn = accum_pages.iter().any(|batched| { batched.req.rel == this_pages[0].req.rel && batched.req.blkno == this_pages[0].req.blkno - && batched.effective_request_lsn - != this_pages[0].effective_request_lsn + && batched.lsn_range.effective_lsn + != this_pages[0].lsn_range.effective_lsn }); if same_page_different_lsn { trace!( rel=%this_pages[0].req.rel, blkno=%this_pages[0].req.blkno, - lsn=%this_pages[0].effective_request_lsn, + lsn=%this_pages[0].lsn_range.effective_lsn, "stopping batching because same page was requested at different LSNs" ); @@ -671,17 +676,17 @@ impl BatchedFeMessage { impl PageServerHandler { #[allow(clippy::too_many_arguments)] pub fn new( - conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, pipelining_config: PageServicePipeliningConfig, + get_vectored_concurrent_io: GetVectoredConcurrentIo, perf_span_fields: ConnectionPerfSpanFields, + basebackup_cache: Arc, connection_ctx: RequestContext, cancel: CancellationToken, gate_guard: GateGuard, ) -> Self { PageServerHandler { - conf, auth, claims: None, connection_ctx, @@ -689,6 +694,8 @@ impl PageServerHandler { timeline_handles: Some(TimelineHandles::new(tenant_manager)), cancel, pipelining_config, + get_vectored_concurrent_io, + basebackup_cache, gate_guard, } } @@ -862,10 +869,27 @@ impl PageServerHandler { // avoid a somewhat costly Span::record() by constructing the entire span in one go. macro_rules! 
mkspan { (before shard routing) => {{ - tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn) + tracing::info_span!( + parent: &parent_span, + "handle_get_page_request", + request_id = %req.hdr.reqid, + rel = %req.rel, + blkno = %req.blkno, + req_lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since, + ) }}; ($shard_id:expr) => {{ - tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id) + tracing::info_span!( + parent: &parent_span, + "handle_get_page_request", + request_id = %req.hdr.reqid, + rel = %req.rel, + blkno = %req.blkno, + req_lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since, + shard_id = %$shard_id, + ) }}; } @@ -929,6 +953,7 @@ impl PageServerHandler { shard_id = %shard.get_shard_identity().shard_slug(), timeline_id = %timeline_id, lsn = %req.hdr.request_lsn, + not_modified_since_lsn = %req.hdr.not_modified_since, request_id = %req.hdr.reqid, key = %key, ) @@ -967,7 +992,7 @@ impl PageServerHandler { .await?; // We're holding the Handle - let effective_request_lsn = match Self::effective_request_lsn( + let effective_lsn = match Self::effective_request_lsn( &shard, shard.get_last_record_lsn(), req.hdr.request_lsn, @@ -986,7 +1011,10 @@ impl PageServerHandler { pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, - effective_request_lsn, + lsn_range: LsnRange { + effective_lsn, + request_lsn: req.hdr.request_lsn + }, ctx, }], // The executor grabs the batch when it becomes idle. 
@@ -1087,7 +1115,7 @@ impl PageServerHandler { } #[instrument(level = tracing::Level::DEBUG, skip_all)] - async fn pagesteam_handle_batched_message( + async fn pagestream_handle_batched_message( &mut self, pgb_writer: &mut PostgresBackend, batch: BatchedFeMessage, @@ -1432,7 +1460,7 @@ impl PageServerHandler { } let io_concurrency = IoConcurrency::spawn_from_conf( - self.conf, + self.get_vectored_concurrent_io, match self.gate_guard.try_clone() { Ok(guard) => guard, Err(_) => { @@ -1542,7 +1570,7 @@ impl PageServerHandler { }; let result = self - .pagesteam_handle_batched_message( + .pagestream_handle_batched_message( pgb_writer, msg, io_concurrency.clone(), @@ -1718,7 +1746,7 @@ impl PageServerHandler { return Err(e); } }; - self.pagesteam_handle_batched_message( + self.pagestream_handle_batched_message( pgb_writer, batch, io_concurrency.clone(), @@ -1936,7 +1964,14 @@ impl PageServerHandler { .await?; let exists = timeline - .get_rel_exists(req.rel, Version::Lsn(lsn), ctx) + .get_rel_exists( + req.rel, + Version::LsnRange(LsnRange { + effective_lsn: lsn, + request_lsn: req.hdr.request_lsn, + }), + ctx, + ) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -1963,7 +1998,14 @@ impl PageServerHandler { .await?; let n_blocks = timeline - .get_rel_size(req.rel, Version::Lsn(lsn), ctx) + .get_rel_size( + req.rel, + Version::LsnRange(LsnRange { + effective_lsn: lsn, + request_lsn: req.hdr.request_lsn, + }), + ctx, + ) .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { @@ -1990,7 +2032,15 @@ impl PageServerHandler { .await?; let total_blocks = timeline - .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx) + .get_db_size( + DEFAULTTABLESPACE_OID, + req.dbnode, + Version::LsnRange(LsnRange { + effective_lsn: lsn, + request_lsn: req.hdr.request_lsn, + }), + ctx, + ) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -2023,7 +2073,7 @@ impl PageServerHandler { // Ignore error (trace buffer may be full or 
tracer may have disconnected). _ = page_trace.try_send(PageTraceEvent { key, - effective_lsn: batch.effective_request_lsn, + effective_lsn: batch.lsn_range.effective_lsn, time, }); } @@ -2038,7 +2088,7 @@ impl PageServerHandler { perf_instrument = true; } - req.effective_request_lsn + req.lsn_range.effective_lsn }) .max() .expect("batch is never empty"); @@ -2092,7 +2142,7 @@ impl PageServerHandler { ( &p.req.rel, &p.req.blkno, - p.effective_request_lsn, + p.lsn_range, p.ctx.attached_child(), ) }), @@ -2277,6 +2327,8 @@ impl PageServerHandler { .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &self.cancel).await?; + let mut from_cache = false; + // Send a tarball of the latest layer on the timeline. Compress if not // fullbackup. TODO Compress in that case too (tests need to be updated) if full_backup { @@ -2294,7 +2346,33 @@ impl PageServerHandler { .map_err(map_basebackup_error)?; } else { let mut writer = BufWriter::new(pgb.copyout_writer()); - if gzip { + + let cached = { + // Basebackup is cached only for this combination of parameters. 
+ if timeline.is_basebackup_cache_enabled() + && gzip + && lsn.is_some() + && prev_lsn.is_none() + { + self.basebackup_cache + .get(tenant_id, timeline_id, lsn.unwrap()) + .await + } else { + None + } + }; + + if let Some(mut cached) = cached { + from_cache = true; + tokio::io::copy(&mut cached, &mut writer) + .await + .map_err(|e| { + map_basebackup_error(BasebackupError::Client( + e, + "handle_basebackup_request,cached,copy", + )) + })?; + } else if gzip { let mut encoder = GzipEncoder::with_quality( &mut writer, // NOTE using fast compression because it's on the critical path @@ -2353,6 +2431,7 @@ impl PageServerHandler { info!( lsn_await_millis = lsn_awaited_after.as_millis(), basebackup_millis = basebackup_after.as_millis(), + %from_cache, "basebackup complete" ); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 81e548a095..c6f3929257 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -40,10 +40,12 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; -use crate::context::{PerfInstrumentFutureExt, RequestContext}; +use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder}; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::metrics::{ - RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD, + RELSIZE_CACHE_MISSES_OLD, RELSIZE_LATEST_CACHE_ENTRIES, RELSIZE_LATEST_CACHE_HITS, + RELSIZE_LATEST_CACHE_MISSES, RELSIZE_SNAPSHOT_CACHE_ENTRIES, RELSIZE_SNAPSHOT_CACHE_HITS, + RELSIZE_SNAPSHOT_CACHE_MISSES, }; use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, @@ -90,6 +92,28 @@ pub enum LsnForTimestamp { NoData(Lsn), } +/// Each request to page server contains LSN range: `not_modified_since..request_lsn`. +/// See comments libs/pageserver_api/src/models.rs. 
+/// Based on this range and `last_record_lsn` PS calculates `effective_lsn`. +/// But to distinguish requests from primary and replicas we need also to pass `request_lsn`. +#[derive(Debug, Clone, Copy, Default)] +pub struct LsnRange { + pub effective_lsn: Lsn, + pub request_lsn: Lsn, +} + +impl LsnRange { + pub fn at(lsn: Lsn) -> LsnRange { + LsnRange { + effective_lsn: lsn, + request_lsn: lsn, + } + } + pub fn is_latest(&self) -> bool { + self.request_lsn == Lsn::MAX + } +} + #[derive(Debug, thiserror::Error)] pub(crate) enum CalculateLogicalSizeError { #[error("cancelled")] @@ -202,13 +226,13 @@ impl Timeline { io_concurrency: IoConcurrency, ) -> Result { match version { - Version::Lsn(effective_lsn) => { + Version::LsnRange(lsns) => { let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)]; let res = self .get_rel_page_at_lsn_batched( - pages.iter().map(|(tag, blknum)| { - (tag, blknum, effective_lsn, ctx.attached_child()) - }), + pages + .iter() + .map(|(tag, blknum)| (tag, blknum, lsns, ctx.attached_child())), io_concurrency.clone(), ctx, ) @@ -246,7 +270,7 @@ impl Timeline { /// The ordering of the returned vec corresponds to the ordering of `pages`. 
pub(crate) async fn get_rel_page_at_lsn_batched( &self, - pages: impl ExactSizeIterator, + pages: impl ExactSizeIterator, io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> Vec> { @@ -265,7 +289,7 @@ impl Timeline { let mut req_keyspaces: HashMap = HashMap::with_capacity(pages.len()); - for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() { + for (response_slot_idx, (tag, blknum, lsns, ctx)) in pages.enumerate() { if tag.relnode == 0 { result_slots[response_slot_idx].write(Err(PageReconstructError::Other( RelationError::InvalidRelnode.into(), @@ -274,25 +298,31 @@ impl Timeline { slots_filled += 1; continue; } + let lsn = lsns.effective_lsn; + let nblocks = { + let ctx = RequestContextBuilder::from(&ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_REL_SIZE", + reltag=%tag, + lsn=%lsn, + ) + }) + .attached_child(); - let nblocks = match self - .get_rel_size(*tag, Version::Lsn(lsn), &ctx) - .maybe_perf_instrument(&ctx, |crnt_perf_span| { - info_span!( - target: PERF_TRACE_TARGET, - parent: crnt_perf_span, - "GET_REL_SIZE", - reltag=%tag, - lsn=%lsn, - ) - }) - .await - { - Ok(nblocks) => nblocks, - Err(err) => { - result_slots[response_slot_idx].write(Err(err)); - slots_filled += 1; - continue; + match self + .get_rel_size(*tag, Version::LsnRange(lsns), &ctx) + .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .await + { + Ok(nblocks) => nblocks, + Err(err) => { + result_slots[response_slot_idx].write(Err(err)); + slots_filled += 1; + continue; + } } }; @@ -308,6 +338,17 @@ impl Timeline { let key = rel_block_to_key(*tag, *blknum); + let ctx = RequestContextBuilder::from(&ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "GET_BATCH", + batch_size = %page_count, + ) + }) + .attached_child(); + let key_slots = keys_slots.entry(key).or_default(); key_slots.push((response_slot_idx, ctx)); @@ -323,14 +364,7 
@@ impl Timeline { let query = VersionedKeySpaceQuery::scattered(query); let res = self .get_vectored(query, io_concurrency, ctx) - .maybe_perf_instrument(ctx, |current_perf_span| { - info_span!( - target: PERF_TRACE_TARGET, - parent: current_perf_span, - "GET_BATCH", - batch_size = %page_count, - ) - }) + .maybe_perf_instrument(ctx, |current_perf_span| current_perf_span.clone()) .await; match res { @@ -460,7 +494,7 @@ impl Timeline { )); } - if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { + if let Some(nblocks) = self.get_cached_rel_size(&tag, version) { return Ok(nblocks); } @@ -478,7 +512,7 @@ impl Timeline { let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); - self.update_cached_rel_size(tag, version.get_lsn(), nblocks); + self.update_cached_rel_size(tag, version, nblocks); Ok(nblocks) } @@ -500,7 +534,7 @@ impl Timeline { } // first try to lookup relation in cache - if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { + if let Some(_nblocks) = self.get_cached_rel_size(&tag, version) { return Ok(true); } // then check if the database was already initialized. 
@@ -576,7 +610,7 @@ impl Timeline { // scan directory listing (new), merge with the old results let key_range = rel_tag_sparse_key_range(spcnode, dbnode); let io_concurrency = IoConcurrency::spawn_from_conf( - self.conf, + self.conf.get_vectored_concurrent_io, self.gate .enter() .map_err(|_| PageReconstructError::Cancelled)?, @@ -622,7 +656,7 @@ impl Timeline { ) -> Result { assert!(self.tenant_shard_id.is_shard_zero()); let n_blocks = self - .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) + .get_slru_segment_size(kind, segno, Version::at(lsn), ctx) .await?; let keyspace = KeySpace::single( @@ -635,7 +669,7 @@ impl Timeline { ); let io_concurrency = IoConcurrency::spawn_from_conf( - self.conf, + self.conf.get_vectored_concurrent_io, self.gate .enter() .map_err(|_| PageReconstructError::Cancelled)?, @@ -857,11 +891,11 @@ impl Timeline { mut f: impl FnMut(TimestampTz) -> ControlFlow, ) -> Result { for segno in self - .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx) + .list_slru_segments(SlruKind::Clog, Version::at(probe_lsn), ctx) .await? { let nblocks = self - .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx) + .get_slru_segment_size(SlruKind::Clog, segno, Version::at(probe_lsn), ctx) .await?; let keyspace = KeySpace::single( @@ -875,7 +909,7 @@ impl Timeline { ); let io_concurrency = IoConcurrency::spawn_from_conf( - self.conf, + self.conf.get_vectored_concurrent_io, self.gate .enter() .map_err(|_| PageReconstructError::Cancelled)?, @@ -1084,8 +1118,17 @@ impl Timeline { let mut result = HashMap::new(); for (k, v) in kv { let v = v?; + if v.is_empty() { + // This is a tombstone -- we can skip it. + // Originally, the replorigin code uses `Lsn::INVALID` to represent a tombstone. However, as it part of + // the sparse keyspace and the sparse keyspace uses an empty image to universally represent a tombstone, + // we also need to consider that. 
Such tombstones might be written on the detach ancestor code path to + // avoid the value going into the child branch. (See [`crate::tenant::timeline::detach_ancestor::generate_tombstone_image_layer`] for more details.) + continue; + } let origin_id = k.field6 as RepOriginId; - let origin_lsn = Lsn::des(&v).unwrap(); + let origin_lsn = Lsn::des(&v) + .with_context(|| format!("decode replorigin value for {}: {v:?}", origin_id))?; if origin_lsn != Lsn::INVALID { result.insert(origin_id, origin_lsn); } @@ -1118,7 +1161,7 @@ impl Timeline { let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self - .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx) + .list_rels(*spcnode, *dbnode, Version::at(lsn), ctx) .await? { if self.cancel.is_cancelled() { @@ -1193,7 +1236,7 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx) + .list_rels(spcnode, dbnode, Version::at(lsn), ctx) .await? .into_iter() .collect(); @@ -1310,59 +1353,75 @@ impl Timeline { Ok((dense_keyspace, sparse_keyspace)) } - /// Get cached size of relation if it not updated after specified LSN - pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { - let rel_size_cache = self.rel_size_cache.read().unwrap(); - if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) { - if lsn >= *cached_lsn { - RELSIZE_CACHE_HITS.inc(); - return Some(*nblocks); + /// Get cached size of relation. There are two caches: one for primary updates, it captures the latest state of + /// of the timeline and snapshot cache, which key includes LSN and so can be used by replicas to get relation size + /// at the particular LSN (snapshot). 
+ pub fn get_cached_rel_size(&self, tag: &RelTag, version: Version<'_>) -> Option { + let lsn = version.get_lsn(); + { + let rel_size_cache = self.rel_size_latest_cache.read().unwrap(); + if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { + if lsn >= *cached_lsn { + RELSIZE_LATEST_CACHE_HITS.inc(); + return Some(*nblocks); + } + RELSIZE_CACHE_MISSES_OLD.inc(); } - RELSIZE_CACHE_MISSES_OLD.inc(); } - RELSIZE_CACHE_MISSES.inc(); + { + let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap(); + if let Some(nblock) = rel_size_cache.get(&(lsn, *tag)) { + RELSIZE_SNAPSHOT_CACHE_HITS.inc(); + return Some(*nblock); + } + } + if version.is_latest() { + RELSIZE_LATEST_CACHE_MISSES.inc(); + } else { + RELSIZE_SNAPSHOT_CACHE_MISSES.inc(); + } None } /// Update cached relation size if there is no more recent update - pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { - let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - - if lsn < rel_size_cache.complete_as_of { - // Do not cache old values. It's safe to cache the size on read, as long as - // the read was at an LSN since we started the WAL ingestion. Reasoning: we - // never evict values from the cache, so if the relation size changed after - // 'lsn', the new value is already in the cache. 
- return; - } - - match rel_size_cache.map.entry(tag) { - hash_map::Entry::Occupied(mut entry) => { - let cached_lsn = entry.get_mut(); - if lsn >= cached_lsn.0 { - *cached_lsn = (lsn, nblocks); + pub fn update_cached_rel_size(&self, tag: RelTag, version: Version<'_>, nblocks: BlockNumber) { + let lsn = version.get_lsn(); + if version.is_latest() { + let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap(); + match rel_size_cache.entry(tag) { + hash_map::Entry::Occupied(mut entry) => { + let cached_lsn = entry.get_mut(); + if lsn >= cached_lsn.0 { + *cached_lsn = (lsn, nblocks); + } + } + hash_map::Entry::Vacant(entry) => { + entry.insert((lsn, nblocks)); + RELSIZE_LATEST_CACHE_ENTRIES.inc(); } } - hash_map::Entry::Vacant(entry) => { - entry.insert((lsn, nblocks)); - RELSIZE_CACHE_ENTRIES.inc(); + } else { + let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap(); + if rel_size_cache.capacity() != 0 { + rel_size_cache.insert((lsn, tag), nblocks); + RELSIZE_SNAPSHOT_CACHE_ENTRIES.set(rel_size_cache.len() as u64); } } } /// Store cached relation size pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { - let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() { - RELSIZE_CACHE_ENTRIES.inc(); + let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap(); + if rel_size_cache.insert(tag, (lsn, nblocks)).is_none() { + RELSIZE_LATEST_CACHE_ENTRIES.inc(); } } /// Remove cached relation size pub fn remove_cached_rel_size(&self, tag: &RelTag) { - let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - if rel_size_cache.map.remove(tag).is_some() { - RELSIZE_CACHE_ENTRIES.dec(); + let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap(); + if rel_size_cache.remove(tag).is_some() { + RELSIZE_LATEST_CACHE_ENTRIES.dec(); } } } @@ -1566,7 +1625,10 @@ impl DatadirModification<'_> { // check the cache too. 
This is because eagerly checking the cache results in // less work overall and 10% better performance. It's more work on cache miss // but cache miss is rare. - if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) { + if let Some(nblocks) = self + .tline + .get_cached_rel_size(&rel, Version::Modified(self)) + { Ok(nblocks) } else if !self .tline @@ -2578,6 +2640,11 @@ impl DatadirModification<'_> { } } + #[cfg(test)] + pub fn put_for_unit_test(&mut self, key: Key, val: Value) { + self.put(key, val); + } + fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) @@ -2643,7 +2710,7 @@ pub struct DatadirModificationStats { /// timeline to not miss the latest updates. #[derive(Clone, Copy)] pub enum Version<'a> { - Lsn(Lsn), + LsnRange(LsnRange), Modified(&'a DatadirModification<'a>), } @@ -2655,7 +2722,7 @@ impl Version<'_> { ctx: &RequestContext, ) -> Result { match self { - Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await, + Version::LsnRange(lsns) => timeline.get(key, lsns.effective_lsn, ctx).await, Version::Modified(modification) => modification.get(key, ctx).await, } } @@ -2677,12 +2744,26 @@ impl Version<'_> { } } - fn get_lsn(&self) -> Lsn { + pub fn is_latest(&self) -> bool { match self { - Version::Lsn(lsn) => *lsn, + Version::LsnRange(lsns) => lsns.is_latest(), + Version::Modified(_) => true, + } + } + + pub fn get_lsn(&self) -> Lsn { + match self { + Version::LsnRange(lsns) => lsns.effective_lsn, Version::Modified(modification) => modification.lsn, } } + + pub fn at(lsn: Lsn) -> Self { + Version::LsnRange(LsnRange { + effective_lsn: lsn, + request_lsn: lsn, + }) + } } //--- Metadata structs stored in key-value pairs in the repository. 
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index d4873e60a1..55272b2125 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -380,6 +380,10 @@ pub enum TaskKind { DetachAncestor, ImportPgdata, + + /// Background task of [`crate::basebackup_cache::BasebackupCache`]. + /// Prepares basebackups and clears outdated entries. + BasebackupCache, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 698579e8fb..bf3f71e35a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -50,6 +50,7 @@ use remote_timeline_client::{ use secondary::heatmap::{HeatMapTenant, HeatMapTimeline}; use storage_broker::BrokerClientChannel; use timeline::compaction::{CompactionOutcome, GcCompactionQueue}; +use timeline::import_pgdata::ImportingTimeline; use timeline::offload::{OffloadError, offload_timeline}; use timeline::{ CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata, @@ -77,6 +78,7 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit use self::timeline::{ EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError, }; +use crate::basebackup_cache::BasebackupPrepareSender; use crate::config::PageServerConf; use crate::context; use crate::context::RequestContextBuilder; @@ -85,8 +87,8 @@ use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::l0_flush::L0FlushGlobalState; use crate::metrics::{ BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS, - INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC, - TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics, + INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES, + TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics, }; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationMode; @@ 
-156,6 +158,7 @@ pub struct TenantSharedResources { pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, pub l0_flush_global_state: L0FlushGlobalState, + pub basebackup_prepare_sender: BasebackupPrepareSender, } /// A [`TenantShard`] is really an _attached_ tenant. The configuration @@ -284,6 +287,19 @@ pub struct TenantShard { /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` timelines_offloaded: Mutex>>, + /// Tracks the timelines that are currently importing into this tenant shard. + /// + /// Note that importing timelines are also present in [`Self::timelines_creating`]. + /// Keep this in mind when ordering lock acquisition. + /// + /// Lifetime: + /// * An imported timeline is created while scanning the bucket on tenant attach + /// if the index part contains an `import_pgdata` entry and said field marks the import + /// as in progress. + /// * Imported timelines are removed when the storage controller calls the post timeline + /// import activation endpoint. + timelines_importing: std::sync::Mutex>, + /// The last tenant manifest known to be in remote storage. None if the manifest has not yet /// been either downloaded or uploaded. Always Some after tenant attach. /// @@ -303,12 +319,15 @@ pub struct TenantShard { gc_cs: tokio::sync::Mutex<()>, walredo_mgr: Option>, - // provides access to timeline data sitting in the remote storage + /// Provides access to timeline data sitting in the remote storage. pub(crate) remote_storage: GenericRemoteStorage, - // Access to global deletion queue for when this tenant wants to schedule a deletion + /// Access to global deletion queue for when this tenant wants to schedule a deletion. deletion_queue_client: DeletionQueueClient, + /// A channel to send async requests to prepare a basebackup for the basebackup cache. 
+ basebackup_prepare_sender: BasebackupPrepareSender, + /// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`]. cached_logical_sizes: tokio::sync::Mutex>, cached_synthetic_tenant_size: Arc, @@ -923,19 +942,10 @@ enum StartCreatingTimelineResult { #[allow(clippy::large_enum_variant, reason = "TODO")] enum TimelineInitAndSyncResult { - ReadyToActivate(Arc), + ReadyToActivate, NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata), } -impl TimelineInitAndSyncResult { - fn ready_to_activate(self) -> Option> { - match self { - Self::ReadyToActivate(timeline) => Some(timeline), - _ => None, - } - } -} - #[must_use] struct TimelineInitAndSyncNeedsSpawnImportPgdata { timeline: Arc, @@ -1012,10 +1022,6 @@ enum CreateTimelineCause { enum LoadTimelineCause { Attach, Unoffload, - ImportPgdata { - create_guard: TimelineCreateGuard, - activate: ActivateTimelineArgs, - }, } #[derive(thiserror::Error, Debug)] @@ -1097,7 +1103,7 @@ impl TenantShard { self: &Arc, timeline_id: TimelineId, resources: TimelineResources, - mut index_part: IndexPart, + index_part: IndexPart, metadata: TimelineMetadata, previous_heatmap: Option, ancestor: Option>, @@ -1106,7 +1112,7 @@ impl TenantShard { ) -> anyhow::Result { let tenant_id = self.tenant_shard_id; - let import_pgdata = index_part.import_pgdata.take(); + let import_pgdata = index_part.import_pgdata.clone(); let idempotency = match &import_pgdata { Some(import_pgdata) => { CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata { @@ -1127,7 +1133,7 @@ impl TenantShard { } }; - let (timeline, timeline_ctx) = self.create_timeline_struct( + let (timeline, _timeline_ctx) = self.create_timeline_struct( timeline_id, &metadata, previous_heatmap, @@ -1197,14 +1203,6 @@ impl TenantShard { match import_pgdata { Some(import_pgdata) if !import_pgdata.is_done() => { - match cause { - LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), - LoadTimelineCause::ImportPgdata { .. 
} => { - unreachable!( - "ImportPgdata should not be reloading timeline import is done and persisted as such in s3" - ) - } - } let mut guard = self.timelines_creating.lock().unwrap(); if !guard.insert(timeline_id) { // We should never try and load the same timeline twice during startup @@ -1260,26 +1258,7 @@ impl TenantShard { "Timeline has no ancestor and no layer files" ); - match cause { - LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), - LoadTimelineCause::ImportPgdata { - create_guard, - activate, - } => { - // TODO: see the comment in the task code above how I'm not so certain - // it is safe to activate here because of concurrent shutdowns. - match activate { - ActivateTimelineArgs::Yes { broker_client } => { - info!("activating timeline after reload from pgdata import task"); - timeline.activate(self.clone(), broker_client, None, &timeline_ctx); - } - ActivateTimelineArgs::No => (), - } - drop(create_guard); - } - } - - Ok(TimelineInitAndSyncResult::ReadyToActivate(timeline)) + Ok(TimelineInitAndSyncResult::ReadyToActivate) } } } @@ -1312,6 +1291,7 @@ impl TenantShard { remote_storage, deletion_queue_client, l0_flush_global_state, + basebackup_prepare_sender, } = resources; let attach_mode = attached_conf.location.attach_mode; @@ -1327,6 +1307,7 @@ impl TenantShard { remote_storage.clone(), deletion_queue_client, l0_flush_global_state, + basebackup_prepare_sender, )); // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if @@ -1768,7 +1749,7 @@ impl TenantShard { })?; match effect { - TimelineInitAndSyncResult::ReadyToActivate(_) => { + TimelineInitAndSyncResult::ReadyToActivate => { // activation happens later, on Tenant::activate } TimelineInitAndSyncResult::NeedsSpawnImportPgdata( @@ -1778,13 +1759,24 @@ impl TenantShard { guard, }, ) => { - tokio::task::spawn(self.clone().create_timeline_import_pgdata_task( - timeline, - import_pgdata, - ActivateTimelineArgs::No, - guard, - 
ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), - )); + let timeline_id = timeline.timeline_id; + let import_task_handle = + tokio::task::spawn(self.clone().create_timeline_import_pgdata_task( + timeline.clone(), + import_pgdata, + guard, + ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), + )); + + let prev = self.timelines_importing.lock().unwrap().insert( + timeline_id, + ImportingTimeline { + timeline: timeline.clone(), + import_task_handle, + }, + ); + + assert!(prev.is_none()); } } } @@ -2678,14 +2670,7 @@ impl TenantShard { .await? } CreateTimelineParams::ImportPgdata(params) => { - self.create_timeline_import_pgdata( - params, - ActivateTimelineArgs::Yes { - broker_client: broker_client.clone(), - }, - ctx, - ) - .await? + self.create_timeline_import_pgdata(params, ctx).await? } }; @@ -2759,7 +2744,6 @@ impl TenantShard { async fn create_timeline_import_pgdata( self: &Arc, params: CreateTimelineParamsImportPgdata, - activate: ActivateTimelineArgs, ctx: &RequestContext, ) -> Result { let CreateTimelineParamsImportPgdata { @@ -2840,24 +2824,71 @@ impl TenantShard { let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself(); - tokio::spawn(self.clone().create_timeline_import_pgdata_task( + let import_task_handle = tokio::spawn(self.clone().create_timeline_import_pgdata_task( timeline.clone(), index_part, - activate, timeline_create_guard, timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), )); + let prev = self.timelines_importing.lock().unwrap().insert( + timeline.timeline_id, + ImportingTimeline { + timeline: timeline.clone(), + import_task_handle, + }, + ); + + // Idempotency is enforced higher up the stack + assert!(prev.is_none()); + // NB: the timeline doesn't exist in self.timelines at this point Ok(CreateTimelineResult::ImportSpawned(timeline)) } + /// Finalize the import of a timeline on this shard by marking it complete in + /// the index part. 
If the import task hasn't finished yet, returns an error. + /// + /// This method is idempotent. If the import was finalized once, the next call + /// will be a no-op. + pub(crate) async fn finalize_importing_timeline( + &self, + timeline_id: TimelineId, + ) -> anyhow::Result<()> { + let timeline = { + let locked = self.timelines_importing.lock().unwrap(); + match locked.get(&timeline_id) { + Some(importing_timeline) => { + if !importing_timeline.import_task_handle.is_finished() { + return Err(anyhow::anyhow!("Import task not done yet")); + } + + importing_timeline.timeline.clone() + } + None => { + return Ok(()); + } + } + }; + + timeline + .remote_client + .schedule_index_upload_for_import_pgdata_finalize()?; + timeline.remote_client.wait_completion().await?; + + self.timelines_importing + .lock() + .unwrap() + .remove(&timeline_id); + + Ok(()) + } + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))] async fn create_timeline_import_pgdata_task( self: Arc, timeline: Arc, index_part: import_pgdata::index_part_format::Root, - activate: ActivateTimelineArgs, timeline_create_guard: TimelineCreateGuard, ctx: RequestContext, ) { @@ -2869,7 +2900,6 @@ impl TenantShard { .create_timeline_import_pgdata_task_impl( timeline, index_part, - activate, timeline_create_guard, ctx, ) @@ -2885,60 +2915,15 @@ impl TenantShard { self: Arc, timeline: Arc, index_part: import_pgdata::index_part_format::Root, - activate: ActivateTimelineArgs, - timeline_create_guard: TimelineCreateGuard, + _timeline_create_guard: TimelineCreateGuard, ctx: RequestContext, ) -> Result<(), anyhow::Error> { info!("importing pgdata"); + let ctx = ctx.with_scope_timeline(&timeline); import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone()) .await .context("import")?; - info!("import done"); - - // - // Reload timeline from remote. 
- // This proves that the remote state is attachable, and it reuses the code. - // - // TODO: think about whether this is safe to do with concurrent TenantShard::shutdown. - // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit. - // But our activate() call might launch new background tasks after TenantShard::shutdown - // already went past shutting down the TenantShard::timelines, which this timeline here is no part of. - // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting - // down while bootstrapping/branching + activating), but, the race condition is much more likely - // to manifest because of the long runtime of this import task. - - // in theory this shouldn't even .await anything except for coop yield - info!("shutting down timeline"); - timeline.shutdown(ShutdownMode::Hard).await; - info!("timeline shut down, reloading from remote"); - // TODO: we can't do the following check because create_timeline_import_pgdata must return an Arc - // let Some(timeline) = Arc::into_inner(timeline) else { - // anyhow::bail!("implementation error: timeline that we shut down was still referenced from somewhere"); - // }; - let timeline_id = timeline.timeline_id; - - // load from object storage like TenantShard::attach does - let resources = self.build_timeline_resources(timeline_id); - let index_part = resources - .remote_client - .download_index_file(&self.cancel) - .await?; - let index_part = match index_part { - MaybeDeletedIndexPart::Deleted(_) => { - // likely concurrent delete call, cplane should prevent this - anyhow::bail!( - "index part says deleted but we are not done creating yet, this should not happen but" - ) - } - MaybeDeletedIndexPart::IndexPart(p) => p, - }; - let metadata = index_part.metadata.clone(); - self - .load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{ - create_guard: timeline_create_guard, activate, }, 
&ctx) - .await? - .ready_to_activate() - .context("implementation error: reloaded timeline still needs import after import reported success")?; + info!("import done - waiting for activation"); anyhow::Ok(()) } @@ -3370,6 +3355,13 @@ impl TenantShard { activated_timelines += 1; } + let tid = self.tenant_shard_id.tenant_id.to_string(); + let shard_id = self.tenant_shard_id.shard_slug().to_string(); + let offloaded_timeline_count = timelines_offloaded_accessor.len(); + TENANT_OFFLOADED_TIMELINES + .with_label_values(&[&tid, &shard_id]) + .set(offloaded_timeline_count as u64); + self.state.send_modify(move |current_state| { assert!( matches!(current_state, TenantState::Activating(_)), @@ -3475,6 +3467,14 @@ impl TenantShard { timeline.defuse_for_tenant_drop(); }); } + { + let mut timelines_importing = self.timelines_importing.lock().unwrap(); + timelines_importing + .drain() + .for_each(|(_timeline_id, importing_timeline)| { + importing_timeline.shutdown(); + }); + } // test_long_timeline_create_then_tenant_delete is leaning on this message tracing::info!("Waiting for timelines..."); while let Some(res) = js.join_next().await { @@ -3949,13 +3949,6 @@ where Ok(result) } -enum ActivateTimelineArgs { - Yes { - broker_client: storage_broker::BrokerClientChannel, - }, - No, -} - impl TenantShard { pub fn tenant_specific_overrides(&self) -> pageserver_api::models::TenantConfig { self.tenant_conf.load().tenant_conf.clone() @@ -4253,10 +4246,9 @@ impl TenantShard { remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, l0_flush_global_state: L0FlushGlobalState, + basebackup_prepare_sender: BasebackupPrepareSender, ) -> TenantShard { - debug_assert!( - !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none() - ); + assert!(!attached_conf.location.generation.is_none()); let (state, mut rx) = watch::channel(state); @@ -4324,6 +4316,7 @@ impl TenantShard { timelines: Mutex::new(HashMap::new()), timelines_creating: 
Mutex::new(HashSet::new()), timelines_offloaded: Mutex::new(HashMap::new()), + timelines_importing: Mutex::new(HashMap::new()), remote_tenant_manifest: Default::default(), gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, @@ -4357,6 +4350,7 @@ impl TenantShard { ongoing_timeline_detach: std::sync::Mutex::default(), gc_block: Default::default(), l0_flush_global_state, + basebackup_prepare_sender, } } @@ -4609,7 +4603,7 @@ impl TenantShard { target.cutoffs = GcCutoffs { space: space_cutoff, - time: Lsn::INVALID, + time: None, }; } } @@ -4693,7 +4687,7 @@ impl TenantShard { if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { target.within_ancestor_pitr = - timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time; + Some(timeline.get_ancestor_lsn()) >= ancestor_gc_cutoffs.time; } } @@ -4706,13 +4700,15 @@ impl TenantShard { } else { 0 }); - timeline.metrics.pitr_history_size.set( - timeline - .get_last_record_lsn() - .checked_sub(target.cutoffs.time) - .unwrap_or(Lsn(0)) - .0, - ); + if let Some(time_cutoff) = target.cutoffs.time { + timeline.metrics.pitr_history_size.set( + timeline + .get_last_record_lsn() + .checked_sub(time_cutoff) + .unwrap_or_default() + .0, + ); + } // Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline? 
// - this timeline was created while we were finding cutoffs @@ -4721,8 +4717,8 @@ impl TenantShard { let original_cutoffs = target.cutoffs.clone(); // GC cutoffs should never go back target.cutoffs = GcCutoffs { - space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)), - time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)), + space: cutoffs.space.max(original_cutoffs.space), + time: cutoffs.time.max(original_cutoffs.time), } } } @@ -5274,6 +5270,7 @@ impl TenantShard { pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), l0_compaction_trigger: self.l0_compaction_trigger.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), + basebackup_prepare_sender: self.basebackup_prepare_sender.clone(), } } @@ -5582,6 +5579,14 @@ impl TenantShard { } } + // Update metrics + let tid = self.tenant_shard_id.to_string(); + let shard_id = self.tenant_shard_id.shard_slug().to_string(); + let set_key = &[tid.as_str(), shard_id.as_str()][..]; + TENANT_OFFLOADED_TIMELINES + .with_label_values(set_key) + .set(manifest.offloaded_timelines.len() as u64); + // Upload the manifest. Remote storage does no retries internally, so retry here. 
match backoff::retry( || async { @@ -5848,6 +5853,8 @@ pub(crate) mod harness { ) -> anyhow::Result> { let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); + let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel(); + let tenant = Arc::new(TenantShard::new( TenantState::Attaching, self.conf, @@ -5865,6 +5872,7 @@ pub(crate) mod harness { self.deletion_queue.new_client(), // TODO: ideally we should run all unit tests with both configs L0FlushGlobalState::new(L0FlushConfig::default()), + basebackup_requst_sender, )); let preload = tenant @@ -5949,7 +5957,9 @@ mod tests { use itertools::Itertools; #[cfg(feature = "testing")] use models::CompactLsnRange; - use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; + use pageserver_api::key::{ + AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX, repl_origin_key, + }; use pageserver_api::keyspace::KeySpace; #[cfg(feature = "testing")] use pageserver_api::keyspace::KeySpaceRandomAccum; @@ -8185,6 +8195,54 @@ mod tests { assert_eq!(files.get("pg_logical/mappings/test2"), None); } + #[tokio::test] + async fn test_repl_origin_tombstones() { + let harness = TenantHarness::create("test_repl_origin_tombstones") + .await + .unwrap(); + + let (tenant, ctx) = harness.load().await; + let io_concurrency = IoConcurrency::spawn_for_test(); + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + let repl_lsn = Lsn(0x10); + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification.put_for_unit_test(repl_origin_key(2), Value::Image(Bytes::new())); + modification.set_replorigin(1, repl_lsn).await.unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // we can read everything from the storage + let repl_origins = tline + .get_replorigins(lsn, &ctx, io_concurrency.clone()) + .await + .unwrap(); + assert_eq!(repl_origins.len(), 1); + 
assert_eq!(repl_origins[&1], lsn); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification.put_for_unit_test( + repl_origin_key(3), + Value::Image(Bytes::copy_from_slice(b"cannot_decode_this")), + ); + modification.commit(&ctx).await.unwrap(); + } + let result = tline + .get_replorigins(lsn, &ctx, io_concurrency.clone()) + .await; + assert!(result.is_err()); + } + #[tokio::test] async fn test_metadata_image_creation() -> anyhow::Result<()> { let harness = TenantHarness::create("test_metadata_image_creation").await?; @@ -8568,8 +8626,10 @@ mod tests { lsn: Lsn, ctx: &RequestContext, ) -> Result, GetVectoredError> { - let io_concurrency = - IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap()); + let io_concurrency = IoConcurrency::spawn_from_conf( + tline.conf.get_vectored_concurrent_io, + tline.gate.enter().unwrap(), + ); let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn); let mut res = tline @@ -8907,7 +8967,7 @@ mod tests { .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); - guard.cutoffs.time = Lsn(0x30); + guard.cutoffs.time = Some(Lsn(0x30)); guard.cutoffs.space = Lsn(0x30); } @@ -9015,7 +9075,7 @@ mod tests { .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); - guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.time = Some(Lsn(0x40)); guard.cutoffs.space = Lsn(0x40); } tline @@ -9433,7 +9493,7 @@ mod tests { *guard = GcInfo { retain_lsns: vec![], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), @@ -9517,7 +9577,7 @@ mod tests { .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); - guard.cutoffs.time = Lsn(0x40); + guard.cutoffs.time = Some(Lsn(0x40)); guard.cutoffs.space = Lsn(0x40); } tline @@ -9988,7 +10048,7 @@ mod tests { (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), 
], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), @@ -10051,7 +10111,7 @@ mod tests { let verify_result = || async { let gc_horizon = { let gc_info = tline.gc_info.read().unwrap(); - gc_info.cutoffs.time + gc_info.cutoffs.time.unwrap_or_default() }; for idx in 0..10 { assert_eq!( @@ -10129,7 +10189,7 @@ mod tests { .await; // Update GC info let mut guard = tline.gc_info.write().unwrap(); - guard.cutoffs.time = Lsn(0x38); + guard.cutoffs.time = Some(Lsn(0x38)); guard.cutoffs.space = Lsn(0x38); } tline @@ -10237,7 +10297,7 @@ mod tests { (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), @@ -10300,7 +10360,7 @@ mod tests { let verify_result = || async { let gc_horizon = { let gc_info = tline.gc_info.read().unwrap(); - gc_info.cutoffs.time + gc_info.cutoffs.time.unwrap_or_default() }; for idx in 0..10 { assert_eq!( @@ -10486,7 +10546,7 @@ mod tests { *guard = GcInfo { retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { - time: Lsn(0x10), + time: Some(Lsn(0x10)), space: Lsn(0x10), }, leases: Default::default(), @@ -10506,7 +10566,7 @@ mod tests { *guard = GcInfo { retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { - time: Lsn(0x50), + time: Some(Lsn(0x50)), space: Lsn(0x50), }, leases: Default::default(), @@ -11227,7 +11287,7 @@ mod tests { *guard = GcInfo { retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), @@ -11616,7 +11676,7 @@ mod tests { (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), @@ -11679,7 +11739,7 @@ mod tests { let verify_result 
= || async { let gc_horizon = { let gc_info = tline.gc_info.read().unwrap(); - gc_info.cutoffs.time + gc_info.cutoffs.time.unwrap_or_default() }; for idx in 0..10 { assert_eq!( @@ -11868,7 +11928,7 @@ mod tests { (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), @@ -11931,7 +11991,7 @@ mod tests { let verify_result = || async { let gc_horizon = { let gc_info = tline.gc_info.read().unwrap(); - gc_info.cutoffs.time + gc_info.cutoffs.time.unwrap_or_default() }; for idx in 0..10 { assert_eq!( @@ -12194,7 +12254,7 @@ mod tests { *guard = GcInfo { retain_lsns: vec![], cutoffs: GcCutoffs { - time: Lsn(0x30), + time: Some(Lsn(0x30)), space: Lsn(0x30), }, leases: Default::default(), diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 8cf3c548c9..ed541c4f12 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -94,10 +94,23 @@ impl Header { pub enum WriteBlobError { #[error(transparent)] Flush(FlushTaskError), - #[error("blob too large ({len} bytes)")] - BlobTooLarge { len: usize }, #[error(transparent)] - WriteBlobRaw(anyhow::Error), + Other(anyhow::Error), +} + +impl WriteBlobError { + pub fn is_cancel(&self) -> bool { + match self { + WriteBlobError::Flush(e) => e.is_cancel(), + WriteBlobError::Other(_) => false, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + WriteBlobError::Flush(e) => e.into_anyhow(), + WriteBlobError::Other(e) => e, + } + } } impl BlockCursor<'_> { @@ -327,7 +340,9 @@ where return ( ( io_buf.slice_len(), - Err(WriteBlobError::BlobTooLarge { len }), + Err(WriteBlobError::Other(anyhow::anyhow!( + "blob too large ({len} bytes)" + ))), ), srcbuf, ); @@ -391,7 +406,7 @@ where // Verify the header, to ensure we don't write invalid/corrupt data. 
let header = match Header::decode(&raw_with_header) .context("decoding blob header") - .map_err(WriteBlobError::WriteBlobRaw) + .map_err(WriteBlobError::Other) { Ok(header) => header, Err(err) => return (raw_with_header, Err(err)), @@ -401,7 +416,7 @@ where let raw_len = raw_with_header.len(); return ( raw_with_header, - Err(WriteBlobError::WriteBlobRaw(anyhow::anyhow!( + Err(WriteBlobError::Other(anyhow::anyhow!( "header length mismatch: {header_total_len} != {raw_len}" ))), ); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 2ae7e1e875..86aef9b42c 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -346,7 +346,8 @@ async fn init_load_generations( "Emergency mode! Tenants will be attached unsafely using their last known generation" ); emergency_generations(tenant_confs) - } else if let Some(client) = StorageControllerUpcallClient::new(conf, cancel)? { + } else { + let client = StorageControllerUpcallClient::new(conf, cancel); info!("Calling {} API to re-attach tenants", client.base_url()); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. match client.re_attach(conf).await { @@ -360,9 +361,6 @@ async fn init_load_generations( anyhow::bail!("Shut down while waiting for control plane re-attach response") } } - } else { - info!("Control plane API not configured, tenant generations are disabled"); - return Ok(None); }; // The deletion queue needs to know about the startup attachment state to decide which (if any) stored @@ -1153,17 +1151,8 @@ impl TenantManager { // Testing hack: if we are configured with no control plane, then drop the generation // from upserts. This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API. 
- let attached_conf = if cfg!(feature = "testing") { - let mut conf = AttachedTenantConf::try_from(new_location_config) - .map_err(UpsertLocationError::BadRequest)?; - if self.conf.control_plane_api.is_none() { - conf.location.generation = Generation::none(); - } - conf - } else { - AttachedTenantConf::try_from(new_location_config) - .map_err(UpsertLocationError::BadRequest)? - }; + let attached_conf = AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)?; let tenant = tenant_spawn( self.conf, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index ea29f51956..21d68495f7 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -949,6 +949,35 @@ impl RemoteTimelineClient { Ok(()) } + /// If the `import_pgdata` field marks the timeline as having an import in progress, + /// launch an index-file upload operation that transitions it to done in the background + pub(crate) fn schedule_index_upload_for_import_pgdata_finalize( + self: &Arc, + ) -> anyhow::Result<()> { + use import_pgdata::index_part_format; + + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + let to_update = match &upload_queue.dirty.import_pgdata { + Some(import) if !import.is_done() => Some(import), + Some(_) | None => None, + }; + + if let Some(old) = to_update { + let new = + index_part_format::Root::V1(index_part_format::V1::Done(index_part_format::Done { + idempotency_key: old.idempotency_key().clone(), + started_at: *old.started_at(), + finished_at: chrono::Utc::now().naive_utc(), + })); + + upload_queue.dirty.import_pgdata = Some(new); + self.schedule_index_upload(upload_queue); + } + + Ok(()) + } + /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field. 
pub(crate) fn schedule_index_upload_for_gc_compaction_state_update( self: &Arc, diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index c26b7626ef..dd49c843f3 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -668,7 +668,9 @@ impl From for UpdateError { impl From for UpdateError { fn from(value: std::io::Error) -> Self { - if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) { + if let Some(nix::errno::Errno::ENOSPC) = + value.raw_os_error().map(nix::errno::Errno::from_raw) + { UpdateError::NoSpace } else if value .get_ref() diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index bf5d9bc87a..d1020cff96 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -235,7 +235,7 @@ pub(super) async fn gather_inputs( // than our internal space cutoff. This means that if someone drops a database and waits for their // PITR interval, they will see synthetic size decrease, even if we are still storing data inside // the space cutoff. - let mut next_pitr_cutoff = gc_info.cutoffs.time; + let mut next_pitr_cutoff = gc_info.cutoffs.time.unwrap_or_default(); // TODO: handle None // If the caller provided a shorter retention period, use that instead of the GC cutoff. 
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 796ad01e54..9d15e7c4de 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -2,6 +2,7 @@ pub mod batch_split_writer; pub mod delta_layer; +pub mod errors; pub mod filter_iterator; pub mod image_layer; pub mod inmemory_layer; @@ -30,6 +31,7 @@ pub use inmemory_layer::InMemoryLayer; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; +use pageserver_api::config::GetVectoredConcurrentIo; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::record::NeonWalRecord; @@ -42,7 +44,6 @@ use self::inmemory_layer::InMemoryLayerFileId; use super::PageReconstructError; use super::layer_map::InMemoryLayerDesc; use super::timeline::{GetVectoredError, ReadPath}; -use crate::config::PageServerConf; use crate::context::{ AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, }; @@ -317,11 +318,10 @@ impl IoConcurrency { } pub(crate) fn spawn_from_conf( - conf: &'static PageServerConf, + conf: GetVectoredConcurrentIo, gate_guard: GateGuard, ) -> IoConcurrency { - use pageserver_api::config::GetVectoredConcurrentIo; - let selected = match conf.get_vectored_concurrent_io { + let selected = match conf { GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential, GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard), }; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 39cd02d101..51f2e909a2 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ 
b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -10,6 +10,7 @@ use utils::id::TimelineId; use utils::lsn::Lsn; use utils::shard::TenantShardId; +use super::errors::PutError; use super::layer::S3_UPLOAD_LIMIT; use super::{ DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, @@ -235,7 +236,7 @@ impl<'a> SplitImageLayerWriter<'a> { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { // The current estimation is an upper bound of the space that the key/image could take // because we did not consider compression in this estimation. The resulting image layer // could be smaller than the target size. @@ -253,7 +254,8 @@ impl<'a> SplitImageLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(PutError::Other)?; let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); self.batches.add_unfinished_image_writer( prev_image_writer, @@ -346,7 +348,7 @@ impl<'a> SplitDeltaLayerWriter<'a> { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate // number, and therefore the final layer size could be a little bit larger or smaller than the target. // @@ -366,7 +368,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?, + .await + .map_err(PutError::Other)?, )); } let (_, inner) = self.inner.as_mut().unwrap(); @@ -386,7 +389,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(PutError::Other)?; let (start_key, prev_delta_writer) = self.inner.replace((key, next_delta_writer)).unwrap(); self.batches.add_unfinished_delta_writer( @@ -396,11 +400,11 @@ impl<'a> SplitDeltaLayerWriter<'a> { ); } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { // We have to produce a very large file b/c a key is updated too often. 
- anyhow::bail!( + return Err(PutError::Other(anyhow::anyhow!( "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced", key, inner.estimated_size() - ); + ))); } } self.last_key_written = key; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 607b0d513c..2c1b27c8d5 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -55,6 +55,7 @@ use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use super::errors::PutError; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, @@ -477,12 +478,15 @@ impl DeltaLayerWriterInner { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { let (_, res) = self .put_value_bytes( key, lsn, - Value::ser(&val)?.slice_len(), + Value::ser(&val) + .map_err(anyhow::Error::new) + .map_err(PutError::Other)? 
+ .slice_len(), val.will_init(), ctx, ) @@ -497,7 +501,7 @@ impl DeltaLayerWriterInner { val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (FullSlice, anyhow::Result<()>) + ) -> (FullSlice, Result<(), PutError>) where Buf: IoBuf + Send, { @@ -513,19 +517,24 @@ impl DeltaLayerWriterInner { .blob_writer .write_blob_maybe_compressed(val, ctx, compression) .await; + let res = res.map_err(PutError::WriteBlob); let off = match res { Ok((off, _)) => off, - Err(e) => return (val, Err(anyhow::anyhow!(e))), + Err(e) => return (val, Err(e)), }; let blob_ref = BlobRef::new(off, will_init); let delta_key = DeltaKey::from_key_lsn(&key, lsn); - let res = self.tree.append(&delta_key.0, blob_ref.0); + let res = self + .tree + .append(&delta_key.0, blob_ref.0) + .map_err(anyhow::Error::new) + .map_err(PutError::Other); self.num_keys += 1; - (val, res.map_err(|e| anyhow::anyhow!(e))) + (val, res) } fn size(&self) -> u64 { @@ -694,7 +703,7 @@ impl DeltaLayerWriter { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { self.inner .as_mut() .unwrap() @@ -709,7 +718,7 @@ impl DeltaLayerWriter { val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (FullSlice, anyhow::Result<()>) + ) -> (FullSlice, Result<(), PutError>) where Buf: IoBuf + Send, { @@ -1441,14 +1450,6 @@ impl DeltaLayerInner { offset } - pub fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { - self.iter_with_options( - ctx, - 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. - 1024, // The default value. 
Unit tests might use a different value - ) - } - pub fn iter_with_options<'a>( &'a self, ctx: &'a RequestContext, @@ -1634,7 +1635,6 @@ pub(crate) mod test { use crate::tenant::disk_btree::tests::TestDisk; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; - use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{TenantShard, Timeline}; /// Construct an index for a fictional delta layer and and then @@ -2311,8 +2311,7 @@ pub(crate) mod test { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} max_read_size={max_read_size}"); // Test if the batch size is correctly determined - let mut iter = delta_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size); let mut num_items = 0; for _ in 0..3 { iter.next_batch().await.unwrap(); @@ -2329,8 +2328,7 @@ pub(crate) mod test { iter.key_values_batch.clear(); } // Test if the result is correct - let mut iter = delta_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size); assert_delta_iter_equal(&mut iter, &test_deltas).await; } } diff --git a/pageserver/src/tenant/storage_layer/errors.rs b/pageserver/src/tenant/storage_layer/errors.rs new file mode 100644 index 0000000000..591e489faa --- /dev/null +++ b/pageserver/src/tenant/storage_layer/errors.rs @@ -0,0 +1,24 @@ +use crate::tenant::blob_io::WriteBlobError; + +#[derive(Debug, thiserror::Error)] +pub enum PutError { + #[error(transparent)] + WriteBlob(WriteBlobError), + #[error(transparent)] + Other(anyhow::Error), +} + +impl PutError { + pub fn is_cancel(&self) -> bool { + match self { + PutError::WriteBlob(e) => e.is_cancel(), + PutError::Other(_) => false, + } + } + pub fn into_anyhow(self) -> 
anyhow::Error { + match self { + PutError::WriteBlob(e) => e.into_anyhow(), + PutError::Other(e) => e, + } + } +} diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index 8d172a1c19..1a330ecfc2 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -157,7 +157,7 @@ mod tests { .await .unwrap(); - let merge_iter = MergeIterator::create( + let merge_iter = MergeIterator::create_for_testing( &[resident_layer_1.get_as_delta(&ctx).await.unwrap()], &[], &ctx, @@ -182,7 +182,7 @@ mod tests { result.extend(test_deltas1[90..100].iter().cloned()); assert_filter_iter_equal(&mut filter_iter, &result).await; - let merge_iter = MergeIterator::create( + let merge_iter = MergeIterator::create_for_testing( &[resident_layer_1.get_as_delta(&ctx).await.unwrap()], &[], &ctx, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 2f7c5715bb..740f53f928 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -53,6 +53,7 @@ use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use super::errors::PutError; use super::layer_name::ImageLayerName; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, @@ -684,14 +685,6 @@ impl ImageLayerInner { } } - pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { - self.iter_with_options( - ctx, - 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. - 1024, // The default value. 
Unit tests might use a different value - ) - } - pub(crate) fn iter_with_options<'a>( &'a self, ctx: &'a RequestContext, @@ -850,8 +843,14 @@ impl ImageLayerWriterInner { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { - ensure!(self.key_range.contains(&key)); + ) -> Result<(), PutError> { + if !self.key_range.contains(&key) { + return Err(PutError::Other(anyhow::anyhow!( + "key {:?} not in range {:?}", + key, + self.key_range + ))); + } let compression = self.conf.image_compression; let uncompressed_len = img.len() as u64; self.uncompressed_bytes += uncompressed_len; @@ -861,7 +860,7 @@ impl ImageLayerWriterInner { .write_blob_maybe_compressed(img.slice_len(), ctx, compression) .await; // TODO: re-use the buffer for `img` further upstack - let (off, compression_info) = res?; + let (off, compression_info) = res.map_err(PutError::WriteBlob)?; if compression_info.compressed_size.is_some() { // The image has been considered for compression at least self.uncompressed_bytes_eligible += uncompressed_len; @@ -873,7 +872,10 @@ impl ImageLayerWriterInner { let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); - self.tree.append(&keybuf, off)?; + self.tree + .append(&keybuf, off) + .map_err(anyhow::Error::new) + .map_err(PutError::Other)?; #[cfg(feature = "testing")] { @@ -1093,7 +1095,7 @@ impl ImageLayerWriter { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { self.inner.as_mut().unwrap().put_image(key, img, ctx).await } @@ -1240,7 +1242,6 @@ mod test { use crate::context::RequestContext; use crate::tenant::harness::{TIMELINE_ID, TenantHarness}; use crate::tenant::storage_layer::{Layer, ResidentLayer}; - use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; use crate::tenant::{TenantShard, Timeline}; #[tokio::test] @@ -1507,8 +1508,7 @@ mod test { for batch_size in [1, 2, 4, 8, 3, 7, 13] { println!("running with batch_size={batch_size} 
max_read_size={max_read_size}"); // Test if the batch size is correctly determined - let mut iter = img_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size); let mut num_items = 0; for _ in 0..3 { iter.next_batch().await.unwrap(); @@ -1525,8 +1525,7 @@ mod test { iter.key_values_batch.clear(); } // Test if the result is correct - let mut iter = img_layer.iter(&ctx); - iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size); assert_img_iter_equal(&mut iter, &test_imgs, Lsn(0x10)).await; } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5d558e66cc..200beba115 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -63,7 +63,28 @@ pub struct InMemoryLayer { opened_at: Instant, - /// The above fields never change, except for `end_lsn`, which is only set once. + /// All versions of all pages in the layer are kept here. Indexed + /// by block number and LSN. The [`IndexEntry`] is an offset into the + /// ephemeral file where the page version is stored. + /// + /// We use a separate lock for the index to reduce the critical section + /// during which reads cannot be planned. + /// + /// If you need access to both the index and the underlying file at the same time, + /// respect the following locking order to avoid deadlocks: + /// 1. [`InMemoryLayer::inner`] + /// 2. [`InMemoryLayer::index`] + /// + /// Note that the file backing [`InMemoryLayer::inner`] is append-only, + /// so it is not necessary to hold simultaneous locks on index. + /// This avoids holding index locks across IO, and is crucial for avoiding read tail latency. + /// In particular: + /// 1. 
It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`]. + /// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`]. + index: RwLock>>, + + /// The above fields never change, except for `end_lsn`, which is only set once, + /// and `index` (see rationale there). /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, @@ -81,11 +102,6 @@ impl std::fmt::Debug for InMemoryLayer { } pub struct InMemoryLayerInner { - /// All versions of all pages in the layer are kept here. Indexed - /// by block number and LSN. The [`IndexEntry`] is an offset into the - /// ephemeral file where the page version is stored. - index: BTreeMap>, - /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. /// PerSeg::page_versions map stores offsets into this file. @@ -105,7 +121,7 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = { trailing_ones }; -/// See [`InMemoryLayerInner::index`]. +/// See [`InMemoryLayer::index`]. /// /// For memory efficiency, the data is packed into a u64. 
/// @@ -425,7 +441,7 @@ impl InMemoryLayer { .page_content_kind(PageContentKind::InMemoryLayer) .attached_child(); - let inner = self.inner.read().await; + let index = self.index.read().await; struct ValueRead { entry_lsn: Lsn, @@ -435,10 +451,7 @@ impl InMemoryLayer { let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default(); for range in keyspace.ranges.iter() { - for (key, vec_map) in inner - .index - .range(range.start.to_compact()..range.end.to_compact()) - { + for (key, vec_map) in index.range(range.start.to_compact()..range.end.to_compact()) { let key = Key::from_compact(*key); let slice = vec_map.slice_range(lsn_range.clone()); @@ -466,7 +479,7 @@ impl InMemoryLayer { } } } - drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below + drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below let read_from = Arc::clone(self); let read_ctx = ctx.attached_child(); reconstruct_state @@ -573,8 +586,8 @@ impl InMemoryLayer { start_lsn, end_lsn: OnceLock::new(), opened_at: Instant::now(), + index: RwLock::new(BTreeMap::new()), inner: RwLock::new(InMemoryLayerInner { - index: BTreeMap::new(), file, resource_units: GlobalResourceUnits::new(), }), @@ -592,31 +605,39 @@ impl InMemoryLayer { serialized_batch: SerializedValueBatch, ctx: &RequestContext, ) -> anyhow::Result<()> { - let mut inner = self.inner.write().await; - self.assert_writable(); + let (base_offset, metadata) = { + let mut inner = self.inner.write().await; + self.assert_writable(); - let base_offset = inner.file.len(); + let base_offset = inner.file.len(); - let SerializedValueBatch { - raw, - metadata, - max_lsn: _, - len: _, - } = serialized_batch; + let SerializedValueBatch { + raw, + metadata, + max_lsn: _, + len: _, + } = serialized_batch; - // Write the batch to the file - inner.file.write_raw(&raw, ctx).await?; - let new_size = inner.file.len(); + // Write 
the batch to the file + inner.file.write_raw(&raw, ctx).await?; + let new_size = inner.file.len(); - let expected_new_len = base_offset - .checked_add(raw.len().into_u64()) - // write_raw would error if we were to overflow u64. - // also IndexEntry and higher levels in - //the code don't allow the file to grow that large - .unwrap(); - assert_eq!(new_size, expected_new_len); + let expected_new_len = base_offset + .checked_add(raw.len().into_u64()) + // write_raw would error if we were to overflow u64. + // also IndexEntry and higher levels in + //the code don't allow the file to grow that large + .unwrap(); + assert_eq!(new_size, expected_new_len); + + inner.resource_units.maybe_publish_size(new_size); + + (base_offset, metadata) + }; // Update the index with the new entries + let mut index = self.index.write().await; + for meta in metadata { let SerializedValueMeta { key, @@ -639,7 +660,7 @@ impl InMemoryLayer { will_init, })?; - let vec_map = inner.index.entry(key).or_default(); + let vec_map = index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0; if old.is_some() { // This should not break anything, but is unexpected: ingestion code aims to filter out @@ -658,8 +679,6 @@ impl InMemoryLayer { ); } - inner.resource_units.maybe_publish_size(new_size); - Ok(()) } @@ -680,6 +699,18 @@ impl InMemoryLayer { /// Records the end_lsn for non-dropped layers. /// `end_lsn` is exclusive + /// + /// A note on locking: + /// The current API of [`InMemoryLayer`] does not ensure that there's no ongoing + /// writes while freezing the layer. This is enforced at a higher level via + /// [`crate::tenant::Timeline::write_lock`]. Freeze might be called via two code paths: + /// 1. Via the active [`crate::tenant::timeline::TimelineWriter`]. This holds the + /// Timeline::write_lock for its lifetime. The rolling is handled in + /// [`crate::tenant::timeline::TimelineWriter::put_batch`]. 
It's a &mut self function + /// so can't be called from different threads. + /// 2. In the background via [`crate::tenant::Timeline::maybe_freeze_ephemeral_layer`]. + /// This only proceeds if try_lock on Timeline::write_lock succeeds (i.e. there's no active writer), + /// hence there can be no concurrent writes pub async fn freeze(&self, end_lsn: Lsn) { assert!( self.start_lsn < end_lsn, @@ -700,8 +731,8 @@ impl InMemoryLayer { #[cfg(debug_assertions)] { - let inner = self.inner.write().await; - for vec_map in inner.index.values() { + let index = self.index.read().await; + for vec_map in index.values() { for (lsn, _) in vec_map.as_slice() { assert!(*lsn < end_lsn); } @@ -724,14 +755,11 @@ impl InMemoryLayer { ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the - // write lock on it, so we shouldn't block anyone. There's one exception - // though: another thread might have grabbed a reference to this layer - // in `get_layer_for_write' just before the checkpointer called - // `freeze`, and then `write_to_disk` on it. When the thread gets the - // lock, it will see that it's not writeable anymore and retry, but it - // would have to wait until we release it. That race condition is very - // rare though, so we just accept the potential latency hit for now. + // write lock on it, so we shouldn't block anyone. See the comment on + // [`InMemoryLayer::freeze`] to understand how locking between the append path + // and layer flushing works. 
let inner = self.inner.read().await; + let index = self.index.read().await; use l0_flush::Inner; let _concurrency_permit = match l0_flush_global_state { @@ -743,13 +771,9 @@ impl InMemoryLayer { let key_count = if let Some(key_range) = key_range { let key_range = key_range.start.to_compact()..key_range.end.to_compact(); - inner - .index - .iter() - .filter(|(k, _)| key_range.contains(k)) - .count() + index.iter().filter(|(k, _)| key_range.contains(k)).count() } else { - inner.index.len() + index.len() }; if key_count == 0 { return Ok(None); @@ -772,7 +796,7 @@ impl InMemoryLayer { let file_contents = inner.file.load_to_io_buf(ctx).await?; let file_contents = file_contents.freeze(); - for (key, vec_map) in inner.index.iter() { + for (key, vec_map) in index.iter() { // Write all page versions for (lsn, entry) in vec_map .as_slice() diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b7f6e5dc77..3d55972017 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -4,6 +4,7 @@ use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime}; use crate::PERF_TRACE_TARGET; +use crate::metrics::{ONDEMAND_DOWNLOAD_BYTES, ONDEMAND_DOWNLOAD_COUNT}; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; @@ -22,7 +23,7 @@ use super::{ LayerVisibilityHint, PerfInstrumentFutureExt, PersistentLayerDesc, ValuesReconstructState, }; use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; +use crate::context::{RequestContext, RequestContextBuilder}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::task_mgr::TaskKind; use crate::tenant::Timeline; @@ -1075,24 +1076,17 @@ impl LayerInner { return Err(DownloadError::DownloadRequired); } - let ctx = if ctx.has_perf_span() { - let dl_ctx = RequestContextBuilder::from(ctx) - 
.task_kind(TaskKind::LayerDownload) - .download_behavior(DownloadBehavior::Download) - .root_perf_span(|| { - info_span!( - target: PERF_TRACE_TARGET, - "DOWNLOAD_LAYER", - layer = %self, - reason = %reason - ) - }) - .detached_child(); - ctx.perf_follows_from(&dl_ctx); - dl_ctx - } else { - ctx.attached_child() - }; + let ctx = RequestContextBuilder::from(ctx) + .perf_span(|crnt_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: crnt_perf_span, + "DOWNLOAD_LAYER", + layer = %self, + reason = %reason, + ) + }) + .attached_child(); async move { tracing::info!(%reason, "downloading on-demand"); @@ -1100,7 +1094,7 @@ impl LayerInner { let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); let res = self .download_init_and_wait(timeline, permit, ctx.attached_child()) - .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) + .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone()) .await?; scopeguard::ScopeGuard::into_inner(init_cancelled); @@ -1255,6 +1249,14 @@ impl LayerInner { self.access_stats.record_residence_event(); + let task_kind: &'static str = ctx.task_kind().into(); + ONDEMAND_DOWNLOAD_BYTES + .with_label_values(&[task_kind]) + .inc_by(self.desc.file_size); + ONDEMAND_DOWNLOAD_COUNT + .with_label_values(&[task_kind]) + .inc(); + Ok(self.initialize_after_layer_is_on_disk(permit)) } Err(e) => { @@ -1700,7 +1702,7 @@ impl DownloadError { } } -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Copy, Clone)] pub(crate) enum NeedsDownload { NotFound, NotFile(std::fs::FileType), diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index e084e3d567..ea3dea50c3 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -19,14 +19,6 @@ pub(crate) enum LayerRef<'a> { } impl<'a> LayerRef<'a> { - #[allow(dead_code)] - fn iter(self, ctx: &'a 
RequestContext) -> LayerIterRef<'a> { - match self { - Self::Image(x) => LayerIterRef::Image(x.iter(ctx)), - Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)), - } - } - fn iter_with_options( self, ctx: &'a RequestContext, @@ -322,6 +314,28 @@ impl MergeIteratorItem for ((Key, Lsn, Value), Arc) { } impl<'a> MergeIterator<'a> { + #[cfg(test)] + pub(crate) fn create_for_testing( + deltas: &[&'a DeltaLayerInner], + images: &[&'a ImageLayerInner], + ctx: &'a RequestContext, + ) -> Self { + Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024) + } + + /// Create a new merge iterator with custom options. + /// + /// Adjust `max_read_size` and `max_batch_size` to trade memory usage for performance. The size should scale + /// with the number of layers to compact. If there are a lot of layers, consider reducing the values, so that + /// the buffer does not take too much memory. + /// + /// The default options for L0 compactions are: + /// - max_read_size: 1024 * 8192 (8MB) + /// - max_batch_size: 1024 + /// + /// The default options for gc-compaction are: + /// - max_read_size: 128 * 8192 (1MB) + /// - max_batch_size: 128 pub fn create_with_options( deltas: &[&'a DeltaLayerInner], images: &[&'a ImageLayerInner], @@ -351,14 +365,6 @@ impl<'a> MergeIterator<'a> { } } - pub fn create( - deltas: &[&'a DeltaLayerInner], - images: &[&'a ImageLayerInner], - ctx: &'a RequestContext, - ) -> Self { - Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024) - } - pub(crate) async fn next_inner(&mut self) -> anyhow::Result> { while let Some(mut iter) = self.heap.peek_mut() { if !iter.is_loaded() { @@ -477,7 +483,7 @@ mod tests { let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) .await .unwrap(); - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_2.get_as_delta(&ctx).await.unwrap(), resident_layer_1.get_as_delta(&ctx).await.unwrap(), @@ -549,7 +555,7 @@ mod 
tests { let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) .await .unwrap(); - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_1.get_as_delta(&ctx).await.unwrap(), resident_layer_2.get_as_delta(&ctx).await.unwrap(), @@ -670,7 +676,7 @@ mod tests { // Test with different layer order for MergeIterator::create to ensure the order // is stable. - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_4.get_as_delta(&ctx).await.unwrap(), resident_layer_1.get_as_delta(&ctx).await.unwrap(), @@ -682,7 +688,7 @@ mod tests { ); assert_merge_iter_equal(&mut merge_iter, &expect).await; - let mut merge_iter = MergeIterator::create( + let mut merge_iter = MergeIterator::create_for_testing( &[ resident_layer_1.get_as_delta(&ctx).await.unwrap(), resident_layer_4.get_as_delta(&ctx).await.unwrap(), diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 1112a5330b..4709a6d616 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -340,7 +340,7 @@ pub(crate) fn log_compaction_error( } else { match level { Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"), - Level::ERROR => error!("Compaction failed: {err:#}"), + Level::ERROR => error!("Compaction failed: {err:?}"), Level::INFO => info!("Compaction failed: {err:#}"), level => unimplemented!("unexpected level {level:?}"), } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index cfeab77598..54dc3b2d0b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,6 +14,7 @@ pub mod span; pub mod uninit; mod walreceiver; +use hashlink::LruCache; use std::array; use std::cmp::{max, min}; use std::collections::btree_map::Entry; @@ -23,8 +24,6 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as 
AtomicOrdering}; use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::PERF_TRACE_TARGET; -use crate::walredo::RedoAttemptType; use anyhow::{Context, Result, anyhow, bail, ensure}; use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; @@ -93,10 +92,12 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; use super::tasks::log_compaction_error; use super::upload_queue::NotInitialized; use super::{ - AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded, + AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded, debug_assert_current_span_has_tenant_and_timeline_id, }; +use crate::PERF_TRACE_TARGET; use crate::aux_file::AuxFileSizeEstimator; +use crate::basebackup_cache::BasebackupPrepareRequest; use crate::config::PageServerConf; use crate::context::{ DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, @@ -130,6 +131,7 @@ use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use crate::walingest::WalLagCooldown; +use crate::walredo::RedoAttemptType; use crate::{ZERO_PAGE, task_mgr, walredo}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] @@ -195,16 +197,7 @@ pub struct TimelineResources { pub pagestream_throttle_metrics: Arc, pub l0_compaction_trigger: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, -} - -/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL -/// ingestion considerably, because WAL ingestion needs to check on most records if the record -/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end -/// of the timeline (disk_consistent_lsn). It's used on reads of relation sizes to check if the -/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`]. 
-pub(crate) struct RelSizeCache { - pub(crate) complete_as_of: Lsn, - pub(crate) map: HashMap, + pub basebackup_prepare_sender: BasebackupPrepareSender, } pub struct Timeline { @@ -365,7 +358,8 @@ pub struct Timeline { pub walreceiver: Mutex>, /// Relation size cache - pub(crate) rel_size_cache: RwLock, + pub(crate) rel_size_latest_cache: RwLock>, + pub(crate) rel_size_snapshot_cache: Mutex>, download_all_remote_layers_task_info: RwLock>, @@ -447,6 +441,9 @@ pub struct Timeline { pub(crate) rel_size_v2_status: ArcSwapOption, wait_lsn_log_slow: tokio::sync::Semaphore, + + /// A channel to send async requests to prepare a basebackup for the basebackup cache. + basebackup_prepare_sender: BasebackupPrepareSender, } pub(crate) enum PreviousHeatmap { @@ -537,29 +534,24 @@ impl GcInfo { /// The `GcInfo` component describing which Lsns need to be retained. Functionally, this /// is a single number (the oldest LSN which we must retain), but it internally distinguishes /// between time-based and space-based retention for observability and consumption metrics purposes. -#[derive(Debug, Clone)] +#[derive(Clone, Debug, Default)] pub(crate) struct GcCutoffs { /// Calculated from the [`pageserver_api::models::TenantConfig::gc_horizon`], this LSN indicates how much /// history we must keep to retain a specified number of bytes of WAL. pub(crate) space: Lsn, - /// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates how much - /// history we must keep to enable reading back at least the PITR interval duration. - pub(crate) time: Lsn, -} - -impl Default for GcCutoffs { - fn default() -> Self { - Self { - space: Lsn::INVALID, - time: Lsn::INVALID, - } - } + /// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates + /// how much history we must keep to enable reading back at least the PITR interval duration. + /// + /// None indicates that the PITR cutoff has not been computed. 
A PITR interval of 0 will yield + /// Some(last_record_lsn). + pub(crate) time: Option, } impl GcCutoffs { fn select_min(&self) -> Lsn { - std::cmp::min(self.space, self.time) + // NB: if we haven't computed the PITR cutoff yet, we can't GC anything. + self.space.min(self.time.unwrap_or_default()) } } @@ -987,6 +979,16 @@ impl From for CreateImageLayersError { } } +impl From for CreateImageLayersError { + fn from(e: super::storage_layer::errors::PutError) -> Self { + if e.is_cancel() { + CreateImageLayersError::Cancelled + } else { + CreateImageLayersError::Other(e.into_anyhow()) + } + } +} + impl From for CreateImageLayersError { fn from(e: GetVectoredError) -> Self { match e { @@ -1031,6 +1033,7 @@ pub(crate) enum WaitLsnWaiter<'a> { Tenant, PageService, HttpEndpoint, + BaseBackupCache, } /// Argument to [`Timeline::shutdown`]. @@ -1086,11 +1089,14 @@ impl Timeline { /// Get the bytes written since the PITR cutoff on this branch, and /// whether this branch's ancestor_lsn is within its parent's PITR. pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { + // TODO: for backwards compatibility, we return the full history back to 0 when the PITR + // cutoff has not yet been initialized. This should return None instead, but this is exposed + // in external HTTP APIs and callers may not handle a null value. let gc_info = self.gc_info.read().unwrap(); let history = self .get_last_record_lsn() - .checked_sub(gc_info.cutoffs.time) - .unwrap_or(Lsn(0)) + .checked_sub(gc_info.cutoffs.time.unwrap_or_default()) + .unwrap_or_default() .0; (history, gc_info.within_ancestor_pitr) } @@ -1100,9 +1106,10 @@ impl Timeline { self.applied_gc_cutoff_lsn.read() } - /// Read timeline's planned GC cutoff: this is the logical end of history that users - /// are allowed to read (based on configured PITR), even if physically we have more history. 
- pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn { + /// Read timeline's planned GC cutoff: this is the logical end of history that users are allowed + /// to read (based on configured PITR), even if physically we have more history. Returns None + /// if the PITR cutoff has not yet been initialized. + pub(crate) fn get_gc_cutoff_lsn(&self) -> Option { self.gc_info.read().unwrap().cutoffs.time } @@ -1553,7 +1560,8 @@ impl Timeline { } WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService - | WaitLsnWaiter::HttpEndpoint => unreachable!( + | WaitLsnWaiter::HttpEndpoint + | WaitLsnWaiter::BaseBackupCache => unreachable!( "tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind() ), @@ -2117,22 +2125,14 @@ impl Timeline { debug_assert_current_span_has_tenant_and_timeline_id(); // Regardless of whether we're going to try_freeze_and_flush - // or not, stop ingesting any more data. Walreceiver only provides - // cancellation but no "wait until gone", because it uses the Timeline::gate. - // So, only after the self.gate.close() below will we know for sure that - // no walreceiver tasks are left. - // For `try_freeze_and_flush=true`, this means that we might still be ingesting - // data during the call to `self.freeze_and_flush()` below. - // That's not ideal, but, we don't have the concept of a ChildGuard, - // which is what we'd need to properly model early shutdown of the walreceiver - // task sub-tree before the other Timeline task sub-trees. + // or not, stop ingesting any more data. let walreceiver = self.walreceiver.lock().unwrap().take(); tracing::debug!( is_some = walreceiver.is_some(), "Waiting for WalReceiverManager..." ); if let Some(walreceiver) = walreceiver { - walreceiver.cancel(); + walreceiver.shutdown().await; } // ... and inform any waiters for newer LSNs that there won't be any. 
self.last_record_lsn.shutdown(); @@ -2466,6 +2466,41 @@ impl Timeline { false } } + + pub(crate) fn is_basebackup_cache_enabled(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .basebackup_cache_enabled + .unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled) + } + + /// Prepare basebackup for the given LSN and store it in the basebackup cache. + /// The method is asynchronous and returns immediately. + /// The actual basebackup preparation is performed in the background + /// by the basebackup cache on a best-effort basis. + pub(crate) fn prepare_basebackup(&self, lsn: Lsn) { + if !self.is_basebackup_cache_enabled() { + return; + } + if !self.tenant_shard_id.is_shard_zero() { + // In theory we should never get here, but just in case check it. + // Preparing basebackup doesn't make sense for shards other than shard zero. + return; + } + + let res = self + .basebackup_prepare_sender + .send(BasebackupPrepareRequest { + tenant_shard_id: self.tenant_shard_id, + timeline_id: self.timeline_id, + lsn, + }); + if let Err(e) = res { + // May happen during shutdown, it's not critical. + info!("Failed to send basebackup prepare request: {e:#}"); + } + } } /// Number of times we will compute partition within a checkpoint distance.
@@ -2543,6 +2578,13 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } + pub(crate) fn get_pitr_interval(&self) -> Duration { + let tenant_conf = &self.tenant_conf.load().tenant_conf; + tenant_conf + .pitr_interval + .unwrap_or(self.conf.default_tenant_conf.pitr_interval) + } + fn get_compaction_period(&self) -> Duration { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -2818,6 +2860,13 @@ impl Timeline { self.remote_client.update_config(&new_conf.location); + let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap(); + if let Some(new_capacity) = new_conf.tenant_conf.relsize_snapshot_cache_capacity { + if new_capacity != rel_size_cache.capacity() { + rel_size_cache.set_capacity(new_capacity); + } + } + self.metrics .evictions_with_low_residence_duration .write() @@ -2876,6 +2925,14 @@ impl Timeline { ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded); } + let relsize_snapshot_cache_capacity = { + let loaded_tenant_conf = tenant_conf.load(); + loaded_tenant_conf + .tenant_conf + .relsize_snapshot_cache_capacity + .unwrap_or(conf.default_tenant_conf.relsize_snapshot_cache_capacity) + }; + Arc::new_cyclic(|myself| { let metrics = Arc::new(TimelineMetrics::new( &tenant_shard_id, @@ -2967,10 +3024,8 @@ impl Timeline { last_image_layer_creation_check_instant: Mutex::new(None), last_received_wal: Mutex::new(None), - rel_size_cache: RwLock::new(RelSizeCache { - complete_as_of: disk_consistent_lsn, - map: HashMap::new(), - }), + rel_size_latest_cache: RwLock::new(HashMap::new()), + rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)), download_all_remote_layers_task_info: RwLock::new(None), @@ -3015,6 +3070,8 @@ impl Timeline { rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status), wait_lsn_log_slow: tokio::sync::Semaphore::new(1), + + basebackup_prepare_sender: resources.basebackup_prepare_sender, }; 
result.repartition_threshold = @@ -3528,7 +3585,7 @@ impl Timeline { }; let io_concurrency = IoConcurrency::spawn_from_conf( - self_ref.conf, + self_ref.conf.get_vectored_concurrent_io, self_ref .gate .enter() @@ -5557,7 +5614,7 @@ impl Timeline { }); let io_concurrency = IoConcurrency::spawn_from_conf( - self.conf, + self.conf.get_vectored_concurrent_io, self.gate .enter() .map_err(|_| CreateImageLayersError::Cancelled)?, @@ -5923,6 +5980,16 @@ impl From for CompactionError { } } +impl From for CompactionError { + fn from(e: super::storage_layer::errors::PutError) -> Self { + if e.is_cancel() { + CompactionError::ShuttingDown + } else { + CompactionError::Other(e.into_anyhow()) + } + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); @@ -6218,14 +6285,12 @@ impl Timeline { pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); - if cfg!(test) { + if cfg!(test) && pitr == Duration::ZERO { // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup - if pitr == Duration::ZERO { - return Ok(GcCutoffs { - time: self.get_last_record_lsn(), - space: space_cutoff, - }); - } + return Ok(GcCutoffs { + time: Some(self.get_last_record_lsn()), + space: space_cutoff, + }); } // Calculate a time-based limit on how much to retain: @@ -6239,14 +6304,14 @@ impl Timeline { // PITR is not set. Retain the size-based limit, or the default time retention, // whichever requires less data. GcCutoffs { - time: self.get_last_record_lsn(), + time: Some(self.get_last_record_lsn()), space: std::cmp::max(time_cutoff, space_cutoff), } } (Duration::ZERO, None) => { // PITR is not set, and time lookup failed GcCutoffs { - time: self.get_last_record_lsn(), + time: Some(self.get_last_record_lsn()), space: space_cutoff, } } @@ -6254,7 +6319,7 @@ impl Timeline { // PITR interval is set & we didn't look up a timestamp successfully. 
Conservatively assume PITR // cannot advance beyond what was already GC'd, and respect space-based retention GcCutoffs { - time: *self.get_applied_gc_cutoff_lsn(), + time: Some(*self.get_applied_gc_cutoff_lsn()), space: space_cutoff, } } @@ -6262,7 +6327,7 @@ impl Timeline { // PITR interval is set and we looked up timestamp successfully. Ignore // size based retention and make time cutoff authoritative GcCutoffs { - time: time_cutoff, + time: Some(time_cutoff), space: time_cutoff, } } @@ -6315,7 +6380,7 @@ impl Timeline { ) }; - let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff); + let mut new_gc_cutoff = space_cutoff.min(time_cutoff.unwrap_or_default()); let standby_horizon = self.standby_horizon.load(); // Hold GC for the standby, but as a safety guard do it only within some // reasonable lag. @@ -6364,7 +6429,7 @@ impl Timeline { async fn gc_timeline( &self, space_cutoff: Lsn, - time_cutoff: Lsn, + time_cutoff: Option, // None if uninitialized retain_lsns: Vec, max_lsn_with_valid_lease: Option, new_gc_cutoff: Lsn, @@ -6383,6 +6448,12 @@ impl Timeline { return Ok(result); } + let Some(time_cutoff) = time_cutoff else { + // The GC cutoff should have been computed by now, but let's be defensive. + info!("Nothing to GC: time_cutoff not yet computed"); + return Ok(result); + }; + // We need to ensure that no one tries to read page versions or create // branches at a point before latest_gc_cutoff_lsn. See branch_timeline() // for details. This will block until the old value is no longer in use. diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9086d29d50..0e4b14c3e4 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1277,6 +1277,8 @@ impl Timeline { return Ok(CompactionOutcome::YieldForL0); } + let gc_cutoff = *self.applied_gc_cutoff_lsn.read(); + // 2. 
Repartition and create image layers if necessary match self .repartition( @@ -1287,7 +1289,7 @@ impl Timeline { ) .await { - Ok(((dense_partitioning, sparse_partitioning), lsn)) => { + Ok(((dense_partitioning, sparse_partitioning), lsn)) if lsn >= gc_cutoff => { // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them let image_ctx = RequestContextBuilder::from(ctx) .access_stats_behavior(AccessStatsBehavior::Skip) @@ -1341,6 +1343,10 @@ impl Timeline { } } + Ok(_) => { + info!("skipping repartitioning due to image compaction LSN being below GC cutoff"); + } + // Suppress errors when cancelled. Err(_) if self.cancel.is_cancelled() => {} Err(err) if err.is_cancel() => {} @@ -1520,7 +1526,7 @@ impl Timeline { info!( "starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \ checked {layers_checked}/{layers_total} layers \ - (latest_gc_cutoff={} pitr_cutoff={})", + (latest_gc_cutoff={} pitr_cutoff={:?})", layers_to_rewrite.len(), drop_layers.len(), *latest_gc_cutoff, @@ -1994,7 +2000,13 @@ impl Timeline { let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; deltas.push(l); } - MergeIterator::create(&deltas, &[], ctx) + MergeIterator::create_with_options( + &deltas, + &[], + ctx, + 1024 * 8192, /* 8 MiB buffer per layer iterator */ + 1024, + ) }; // This iterator walks through all keys and is needed to calculate size used by each key @@ -2198,8 +2210,7 @@ impl Timeline { .as_mut() .unwrap() .put_value(key, lsn, value, ctx) - .await - .map_err(CompactionError::Other)?; + .await?; } else { let owner = self.shard_identity.get_shard_number(&key); @@ -2828,7 +2839,7 @@ impl Timeline { Ok(()) } - /// Check if the memory usage is within the limit. + /// Check to bail out of gc compaction early if it would use too much memory. 
async fn check_memory_usage( self: &Arc, layer_selection: &[Layer], @@ -2841,7 +2852,8 @@ impl Timeline { let layer_desc = layer.layer_desc(); if layer_desc.is_delta() { // Delta layers at most have 1MB buffer; 3x to make it safe (there're deltas as large as 16KB). - // Multiply the layer size so that tests can pass. + // Scale it by target_layer_size_bytes so that tests can pass (some tests, e.g., `test_pageserver_gc_compaction_preempt`, + // use 3MB layer size and we need to account for that). estimated_memory_usage_mb += 3.0 * (layer_desc.file_size / target_layer_size_bytes) as f64; num_delta_layers += 1; @@ -3423,6 +3435,7 @@ impl Timeline { // Step 2: Produce images+deltas. let mut accumulated_values = Vec::new(); + let mut accumulated_values_estimated_size = 0; let mut last_key: Option = None; // Only create image layers when there is no ancestor branches. TODO: create covering image layer @@ -3599,7 +3612,18 @@ impl Timeline { if last_key.is_none() { last_key = Some(key); } + accumulated_values_estimated_size += val.estimated_size(); accumulated_values.push((key, lsn, val)); + + // Accumulated values should never exceed 512MB.
+ if accumulated_values_estimated_size >= 1024 * 1024 * 512 { + return Err(CompactionError::Other(anyhow!( + "too many values for a single key: {} for key {}, {} items", + accumulated_values_estimated_size, + key, + accumulated_values.len() + ))); + } } else { let last_key: &mut Key = last_key.as_mut().unwrap(); stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction @@ -3632,6 +3656,7 @@ impl Timeline { .map_err(CompactionError::Other)?; accumulated_values.clear(); *last_key = key; + accumulated_values_estimated_size = val.estimated_size(); accumulated_values.push((key, lsn, val)); } } diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 8e95c3a8ff..40eda8c785 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -178,7 +178,7 @@ impl Attempt { } } -async fn generate_tombstone_image_layer( +pub(crate) async fn generate_tombstone_image_layer( detached: &Arc, ancestor: &Arc, ancestor_lsn: Lsn, @@ -188,7 +188,7 @@ async fn generate_tombstone_image_layer( "removing non-inherited keys by writing an image layer with tombstones at the detach LSN" ); let io_concurrency = IoConcurrency::spawn_from_conf( - detached.conf, + detached.conf.get_vectored_concurrent_io, detached.gate.enter().map_err(|_| Error::ShuttingDown)?, ); let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index b917fdbfd8..658d867c18 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -1,8 +1,10 @@ use std::sync::Arc; use anyhow::{Context, bail}; +use importbucket_client::{ControlFile, RemoteStorageWrapper}; use pageserver_api::models::ShardImportStatus; use remote_storage::RemotePath; +use tokio::task::JoinHandle; use 
tokio_util::sync::CancellationToken; use tracing::info; use utils::lsn::Lsn; @@ -17,6 +19,17 @@ mod importbucket_client; mod importbucket_format; pub(crate) mod index_part_format; +pub(crate) struct ImportingTimeline { + pub import_task_handle: JoinHandle<()>, + pub timeline: Arc, +} + +impl ImportingTimeline { + pub(crate) fn shutdown(self) { + self.import_task_handle.abort(); + } +} + pub async fn doit( timeline: &Arc, index_part: index_part_format::Root, @@ -26,181 +39,225 @@ pub async fn doit( let index_part_format::Root::V1(v1) = index_part; let index_part_format::InProgress { location, - idempotency_key, - started_at, + idempotency_key: _, + started_at: _, } = match v1 { index_part_format::V1::Done(_) => return Ok(()), index_part_format::V1::InProgress(in_progress) => in_progress, }; - let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; + let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel); - let status_prefix = RemotePath::from_string("status").unwrap(); - - // - // See if shard is done. - // TODO: incorporate generations into status key for split brain safety. Figure out together with checkpointing. - // - let shard_status_key = - status_prefix.join(format!("shard-{}", timeline.tenant_shard_id.shard_slug())); - let shard_status: Option = - storage.get_json(&shard_status_key).await?; - info!(?shard_status, "peeking shard status"); - if shard_status.map(|st| st.done).unwrap_or(false) { - info!("shard status indicates that the shard is done, skipping import"); - } else { - // TODO: checkpoint the progress into the IndexPart instead of restarting - // from the beginning. - - // - // Wipe the slate clean - the flow does not allow resuming. - // We can implement resuming in the future by checkpointing the progress into the IndexPart. - // - info!("wipe the slate clean"); - { - // TODO: do we need to hold GC lock for this? 
- let mut guard = timeline.layers.write().await; - assert!( - guard.layer_map()?.open_layer.is_none(), - "while importing, there should be no in-memory layer" // this just seems like a good place to assert it - ); - let all_layers_keys = guard.all_persistent_layers(); - let all_layers: Vec<_> = all_layers_keys - .iter() - .map(|key| guard.get_from_key(key)) - .collect(); - let open = guard.open_mut().context("open_mut")?; - - timeline.remote_client.schedule_gc_update(&all_layers)?; - open.finish_gc_timeline(&all_layers); - } - - // - // Wait for pgdata to finish uploading - // - info!("wait for pgdata to reach status 'done'"); - let pgdata_status_key = status_prefix.join("pgdata"); - loop { - let res = async { - let pgdata_status: Option = storage - .get_json(&pgdata_status_key) - .await - .context("get pgdata status")?; - info!(?pgdata_status, "peeking pgdata status"); - if pgdata_status.map(|st| st.done).unwrap_or(false) { - Ok(()) - } else { - Err(anyhow::anyhow!("pgdata not done yet")) - } - } - .await; - match res { - Ok(_) => break, - Err(err) => { - info!(?err, "indefinitely waiting for pgdata to finish"); - if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) - .await - .is_ok() - { - bail!("cancelled while waiting for pgdata"); - } - } - } - } - - // - // Do the import - // - info!("do the import"); - let control_file = storage.get_control_file().await?; - let base_lsn = control_file.base_lsn(); - - info!("update TimelineMetadata based on LSNs from control file"); - { - let pg_version = control_file.pg_version(); - let _ctx: &RequestContext = ctx; - async move { - // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the - // checkpoint record, and prev_record_lsn should point to its beginning. - // We should read the real end of the record from the WAL, but here we - // just fake it. 
- let disk_consistent_lsn = Lsn(base_lsn.0 + 8); - let prev_record_lsn = base_lsn; - let metadata = TimelineMetadata::new( - disk_consistent_lsn, - Some(prev_record_lsn), - None, // no ancestor - Lsn(0), // no ancestor lsn - base_lsn, // latest_gc_cutoff_lsn - base_lsn, // initdb_lsn - pg_version, - ); - - let _start_lsn = disk_consistent_lsn + 1; - - timeline - .remote_client - .schedule_index_upload_for_full_metadata_update(&metadata)?; - - timeline.remote_client.wait_completion().await?; - - anyhow::Ok(()) - } - } - .await?; - - flow::run( - timeline.clone(), - base_lsn, - control_file, - storage.clone(), - ctx, + let shard_status = storcon_client + .get_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, ) - .await?; + .await + .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?; - // - // Communicate that shard is done. - // Ensure at-least-once delivery of the upcall to storage controller - // before we mark the task as done and never come here again. - // - let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel)? - .expect("storcon configured"); - storcon_client - .put_timeline_import_status( - timeline.tenant_shard_id, - timeline.timeline_id, - // TODO(vlad): What about import errors? - ShardImportStatus::Done, - ) - .await - .map_err(|_err| anyhow::anyhow!("Shut down while putting timeline import status"))?; + info!(?shard_status, "peeking shard status"); + match shard_status { + ShardImportStatus::InProgress(maybe_progress) => { + let storage = + importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; - storage - .put_json( - &shard_status_key, - &importbucket_format::ShardStatus { done: true }, + let control_file_res = if maybe_progress.is_none() { + // Only prepare the import once when there's no progress. 
+ prepare_import(timeline, storage.clone(), &cancel).await + } else { + storage.get_control_file().await + }; + + let control_file = match control_file_res { + Ok(cf) => cf, + Err(err) => { + return Err( + terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await, + ); + } + }; + + let res = flow::run( + timeline.clone(), + control_file, + storage.clone(), + maybe_progress, + ctx, ) - .await - .context("put shard status")?; + .await; + if let Err(err) = res { + return Err( + terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await, + ); + } + + // Communicate that shard is done. + // Ensure at-least-once delivery of the upcall to storage controller + // before we mark the task as done and never come here again. + // + // Note that we do not mark the import complete in the index part now. + // This happens in [`Tenant::finalize_importing_timeline`] in response + // to the storage controller calling + // `/v1/tenant/:tenant_id/timeline/:timeline_id/activate_post_import`. + storcon_client + .put_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, + ShardImportStatus::Done, + ) + .await + .map_err(|_err| { + anyhow::anyhow!("Shut down while putting timeline import status") + })?; + } + ShardImportStatus::Error(err) => { + info!( + "shard status indicates that the shard is done (error), skipping import {}", + err + ); + } + ShardImportStatus::Done => { + info!("shard status indicates that the shard is done (success), skipping import"); + } } - // - // Mark as done in index_part. - // This makes subsequent timeline loads enter the normal load code path - // instead of spawning the import task and calling this here function. 
- // - info!("mark import as complete in index part"); - timeline - .remote_client - .schedule_index_upload_for_import_pgdata_state_update(Some(index_part_format::Root::V1( - index_part_format::V1::Done(index_part_format::Done { - idempotency_key, - started_at, - finished_at: chrono::Utc::now().naive_utc(), - }), - )))?; - - timeline.remote_client.wait_completion().await?; - Ok(()) } + +async fn prepare_import( + timeline: &Arc, + storage: RemoteStorageWrapper, + cancel: &CancellationToken, +) -> anyhow::Result { + // Wipe the slate clean before starting the import as a precaution. + // This method is only called when there's no recorded checkpoint for the import + // in the storage controller. + // + // Note that this is split-brain safe (two imports for same timeline shards running in + // different generations) because we go through the usual deletion path, including deletion queue. + info!("wipe the slate clean"); + { + // TODO: do we need to hold GC lock for this? + let mut guard = timeline.layers.write().await; + assert!( + guard.layer_map()?.open_layer.is_none(), + "while importing, there should be no in-memory layer" // this just seems like a good place to assert it + ); + let all_layers_keys = guard.all_persistent_layers(); + let all_layers: Vec<_> = all_layers_keys + .iter() + .map(|key| guard.get_from_key(key)) + .collect(); + let open = guard.open_mut().context("open_mut")?; + + timeline.remote_client.schedule_gc_update(&all_layers)?; + open.finish_gc_timeline(&all_layers); + } + + // + // Wait for pgdata to finish uploading + // + info!("wait for pgdata to reach status 'done'"); + let status_prefix = RemotePath::from_string("status").unwrap(); + let pgdata_status_key = status_prefix.join("pgdata"); + loop { + let res = async { + let pgdata_status: Option = storage + .get_json(&pgdata_status_key) + .await + .context("get pgdata status")?; + info!(?pgdata_status, "peeking pgdata status"); + if pgdata_status.map(|st| st.done).unwrap_or(false) { + Ok(()) + 
} else { + Err(anyhow::anyhow!("pgdata not done yet")) + } + } + .await; + match res { + Ok(_) => break, + Err(err) => { + info!(?err, "indefinitely waiting for pgdata to finish"); + if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) + .await + .is_ok() + { + bail!("cancelled while waiting for pgdata"); + } + } + } + } + + let control_file = storage.get_control_file().await?; + let base_lsn = control_file.base_lsn(); + + info!("update TimelineMetadata based on LSNs from control file"); + { + let pg_version = control_file.pg_version(); + async move { + // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the + // checkpoint record, and prev_record_lsn should point to its beginning. + // We should read the real end of the record from the WAL, but here we + // just fake it. + let disk_consistent_lsn = Lsn(base_lsn.0 + 8); + let prev_record_lsn = base_lsn; + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + Some(prev_record_lsn), + None, // no ancestor + Lsn(0), // no ancestor lsn + base_lsn, // latest_gc_cutoff_lsn + base_lsn, // initdb_lsn + pg_version, + ); + + let _start_lsn = disk_consistent_lsn + 1; + + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; + + timeline.remote_client.wait_completion().await?; + + anyhow::Ok(()) + } + } + .await?; + + Ok(control_file) +} + +async fn terminate_flow_with_error( + timeline: &Arc, + error: anyhow::Error, + storcon_client: &StorageControllerUpcallClient, + cancel: &CancellationToken, +) -> anyhow::Error { + // The import task is aborted on tenant shutdown, so in principle, it should + // never be cancelled. To be on the safe side, check the cancellation tokens + // before marking the import as failed. 
+ if !(cancel.is_cancelled() || timeline.cancel.is_cancelled()) { + let notify_res = storcon_client + .put_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, + ShardImportStatus::Error(format!("{error:#}")), + ) + .await; + + if let Err(_notify_error) = notify_res { + // The [`StorageControllerUpcallClient::put_timeline_import_status`] retries + // forever internally, so errors returned by it can only be due to cancellation. + info!("failed to notify storcon about permanent import error"); + } + + // Will be logged by [`Tenant::create_timeline_import_pgdata_task`] + error + } else { + anyhow::anyhow!("Import task cancelled") + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index c6d2944769..3e10a4e6d6 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -29,71 +29,127 @@ //! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest) use std::collections::HashSet; +use std::hash::{Hash, Hasher}; use std::ops::Range; use std::sync::Arc; -use anyhow::{bail, ensure}; +use anyhow::ensure; use bytes::Bytes; +use futures::stream::FuturesOrdered; use itertools::Itertools; +use pageserver_api::config::TimelineImportConfig; use pageserver_api::key::{ CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, Key, TWOPHASEDIR_KEY, rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, }; use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range}; +use pageserver_api::models::{ShardImportProgress, ShardImportProgressV1, ShardImportStatus}; use pageserver_api::reltag::{RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; use postgres_ffi::relfile_utils::parse_relfilename; use postgres_ffi::{BLCKSZ, pg_constants}; use remote_storage::RemotePath; 
-use tokio::task::JoinSet; -use tracing::{Instrument, debug, info_span, instrument}; +use tokio::sync::Semaphore; +use tokio_stream::StreamExt; +use tracing::{debug, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; +use utils::pausable_failpoint; use super::Timeline; use super::importbucket_client::{ControlFile, RemoteStorageWrapper}; use crate::assert_u64_eq_usize::UsizeIsU64; use crate::context::{DownloadBehavior, RequestContext}; +use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient}; use crate::pgdatadir_mapping::{ DbDirectory, RelDirectory, SlruSegmentDirectory, TwoPhaseDirectory, }; use crate::task_mgr::TaskKind; -use crate::tenant::storage_layer::{ImageLayerWriter, Layer}; +use crate::tenant::storage_layer::{AsLayerDesc, ImageLayerWriter, Layer}; pub async fn run( timeline: Arc, - pgdata_lsn: Lsn, control_file: ControlFile, storage: RemoteStorageWrapper, + import_progress: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { - Flow { - timeline, - pgdata_lsn, - control_file, - tasks: Vec::new(), - storage, + // Match how we run the import based on the progress version. + // If there's no import progress, it means that this is a new import + // and we can use whichever version we want. 
+ match import_progress { + Some(ShardImportProgress::V1(progress)) => { + run_v1(timeline, control_file, storage, Some(progress), ctx).await + } + None => run_v1(timeline, control_file, storage, None, ctx).await, } - .run(ctx) - .await } -struct Flow { +async fn run_v1( timeline: Arc, - pgdata_lsn: Lsn, control_file: ControlFile, - tasks: Vec, storage: RemoteStorageWrapper, + import_progress: Option, + ctx: &RequestContext, +) -> anyhow::Result<()> { + let planner = Planner { + control_file, + storage: storage.clone(), + shard: timeline.shard_identity, + tasks: Vec::default(), + }; + + let import_config = &timeline.conf.timeline_import_config; + let plan = planner.plan(import_config).await?; + + // Hash the plan and compare with the hash of the plan we got back from the storage controller. + // If the two match, it means that the planning stage had the same output. + // + // This is not intended to be a cryptographically secure hash. + const SEED: u64 = 42; + let mut hasher = twox_hash::XxHash64::with_seed(SEED); + plan.hash(&mut hasher); + let plan_hash = hasher.finish(); + + if let Some(progress) = &import_progress { + if plan_hash != progress.import_plan_hash { + anyhow::bail!("Import plan does not match storcon metadata"); + } + + // Handle collisions on jobs of unequal length + if progress.jobs != plan.jobs.len() { + anyhow::bail!("Import plan job length does not match storcon metadata") + } + } + + pausable_failpoint!("import-timeline-pre-execute-pausable"); + + let start_from_job_idx = import_progress.map(|progress| progress.completed); + plan.execute(timeline, start_from_job_idx, plan_hash, import_config, ctx) + .await } -impl Flow { - /// Perform the ingestion into [`Self::timeline`]. - /// Assumes the timeline is empty (= no layers). 
- pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); +struct Planner { + control_file: ControlFile, + storage: RemoteStorageWrapper, + shard: ShardIdentity, + tasks: Vec, +} - self.pgdata_lsn = pgdata_lsn; +#[derive(Hash)] +struct Plan { + jobs: Vec, + // Included here such that it ends up in the hash for the plan + shard: ShardIdentity, +} + +impl Planner { + /// Creates an import plan + /// + /// This function is and must remain pure: given the same input, it will generate the same import plan. + async fn plan(mut self, import_config: &TimelineImportConfig) -> anyhow::Result { + let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); let datadir = PgDataDir::new(&self.storage).await?; @@ -115,7 +171,7 @@ impl Flow { } // Import SLRUs - if self.timeline.tenant_shard_id.is_shard_zero() { + if self.shard.is_shard_zero() { // pg_xact (01:00 keyspace) self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) .await?; @@ -166,14 +222,16 @@ impl Flow { let mut last_end_key = Key::MIN; let mut current_chunk = Vec::new(); let mut current_chunk_size: usize = 0; - let mut parallel_jobs = Vec::new(); + let mut jobs = Vec::new(); for task in std::mem::take(&mut self.tasks).into_iter() { - if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 { + if current_chunk_size + task.total_size() + > import_config.import_job_soft_size_limit.into() + { let key_range = last_end_key..task.key_range().start; - parallel_jobs.push(ChunkProcessingJob::new( + jobs.push(ChunkProcessingJob::new( key_range.clone(), std::mem::take(&mut current_chunk), - &self, + pgdata_lsn, )); last_end_key = key_range.end; current_chunk_size = 0; @@ -181,45 +239,16 @@ impl Flow { current_chunk_size += task.total_size(); current_chunk.push(task); } - parallel_jobs.push(ChunkProcessingJob::new( + jobs.push(ChunkProcessingJob::new( last_end_key..Key::MAX, 
current_chunk, - &self, + pgdata_lsn, )); - // Start all jobs simultaneosly - let mut work = JoinSet::new(); - // TODO: semaphore? - for job in parallel_jobs { - let ctx: RequestContext = - ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); - work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job"))); - } - let mut results = Vec::new(); - while let Some(result) = work.join_next().await { - match result { - Ok(res) => { - results.push(res); - } - Err(_joinset_err) => { - results.push(Err(anyhow::anyhow!( - "parallel job panicked or cancelled, check pageserver logs" - ))); - } - } - } - - if results.iter().all(|r| r.is_ok()) { - Ok(()) - } else { - let mut msg = String::new(); - for result in results { - if let Err(err) = result { - msg.push_str(&format!("{err:?}\n\n")); - } - } - bail!("Some parallel jobs failed:\n\n{msg}"); - } + Ok(Plan { + jobs, + shard: self.shard, + }) } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))] @@ -266,7 +295,7 @@ impl Flow { let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32); self.tasks .push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new( - *self.timeline.get_shard_identity(), + self.shard, start_key..end_key, &file.path, self.storage.clone(), @@ -289,7 +318,7 @@ impl Flow { } async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { - assert!(self.timeline.tenant_shard_id.is_shard_zero()); + assert!(self.shard.is_shard_zero()); let segments = self.storage.listfilesindir(path).await?; let segments: Vec<(String, u32, usize)> = segments @@ -344,6 +373,100 @@ impl Flow { } } +impl Plan { + async fn execute( + self, + timeline: Arc, + start_after_job_idx: Option, + import_plan_hash: u64, + import_config: &TimelineImportConfig, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let storcon_client = StorageControllerUpcallClient::new(timeline.conf, 
&timeline.cancel); + + let mut work = FuturesOrdered::new(); + let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into())); + + let jobs_in_plan = self.jobs.len(); + + let mut jobs = self + .jobs + .into_iter() + .enumerate() + .map(|(idx, job)| (idx + 1, job)) + .filter(|(idx, _job)| { + // Filter out any jobs that have been done already + if let Some(start_after) = start_after_job_idx { + *idx > start_after + } else { + true + } + }) + .peekable(); + + let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0); + let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into(); + + // Run import jobs concurrently up to the limit specified by the pageserver configuration. + // Note that we process completed futures in the order of insertion. This will be the + // building block for resuming imports across pageserver restarts or tenant migrations. + while last_completed_job_idx < jobs_in_plan { + tokio::select! { + permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => { + let permit = permit.expect("never closed"); + let (job_idx, job) = jobs.next().expect("we peeked"); + + let job_timeline = timeline.clone(); + let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); + + work.push_back(tokio::task::spawn(async move { + let _permit = permit; + let res = job.run(job_timeline, &ctx).await; + (job_idx, res) + })); + }, + maybe_complete_job_idx = work.next() => { + match maybe_complete_job_idx { + Some(Ok((job_idx, res))) => { + assert!(last_completed_job_idx.checked_add(1).unwrap() == job_idx); + + res?; + last_completed_job_idx = job_idx; + + if last_completed_job_idx % checkpoint_every == 0 { + let progress = ShardImportProgressV1 { + jobs: jobs_in_plan, + completed: last_completed_job_idx, + import_plan_hash, + }; + + storcon_client.put_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, + 
ShardImportStatus::InProgress(Some(ShardImportProgress::V1(progress))) + ) + .await + .map_err(|_err| { + anyhow::anyhow!("Shut down while putting timeline import status") + })?; + } + }, + Some(Err(_)) => { + anyhow::bail!( + "import job panicked or cancelled" + ); + } + None => {} + } + } + } + } + + Ok(()) + } +} + // // dbdir iteration tools // @@ -512,6 +635,15 @@ struct ImportSingleKeyTask { buf: Bytes, } +impl Hash for ImportSingleKeyTask { + fn hash(&self, state: &mut H) { + let ImportSingleKeyTask { key, buf } = self; + + key.hash(state); + buf.hash(state); + } +} + impl ImportSingleKeyTask { fn new(key: Key, buf: Bytes) -> Self { ImportSingleKeyTask { key, buf } @@ -540,6 +672,20 @@ struct ImportRelBlocksTask { storage: RemoteStorageWrapper, } +impl Hash for ImportRelBlocksTask { + fn hash(&self, state: &mut H) { + let ImportRelBlocksTask { + shard_identity: _, + key_range, + path, + storage: _, + } = self; + + key_range.hash(state); + path.hash(state); + } +} + impl ImportRelBlocksTask { fn new( shard_identity: ShardIdentity, @@ -624,6 +770,19 @@ struct ImportSlruBlocksTask { storage: RemoteStorageWrapper, } +impl Hash for ImportSlruBlocksTask { + fn hash(&self, state: &mut H) { + let ImportSlruBlocksTask { + key_range, + path, + storage: _, + } = self; + + key_range.hash(state); + path.hash(state); + } +} + impl ImportSlruBlocksTask { fn new(key_range: Range, path: &RemotePath, storage: RemoteStorageWrapper) -> Self { ImportSlruBlocksTask { @@ -666,6 +825,7 @@ impl ImportTask for ImportSlruBlocksTask { } } +#[derive(Hash)] enum AnyImportTask { SingleKey(ImportSingleKeyTask), RelBlocks(ImportRelBlocksTask), @@ -712,8 +872,8 @@ impl From for AnyImportTask { } } +#[derive(Hash)] struct ChunkProcessingJob { - timeline: Arc, range: Range, tasks: Vec, @@ -721,25 +881,24 @@ struct ChunkProcessingJob { } impl ChunkProcessingJob { - fn new(range: Range, tasks: Vec, env: &Flow) -> Self { - assert!(env.pgdata_lsn.is_valid()); + fn new(range: Range, tasks: Vec, 
pgdata_lsn: Lsn) -> Self { + assert!(pgdata_lsn.is_valid()); Self { - timeline: env.timeline.clone(), range, tasks, - pgdata_lsn: env.pgdata_lsn, + pgdata_lsn, } } - async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> { + async fn run(self, timeline: Arc, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = ImageLayerWriter::new( - self.timeline.conf, - self.timeline.timeline_id, - self.timeline.tenant_shard_id, + timeline.conf, + timeline.timeline_id, + timeline.tenant_shard_id, &self.range, self.pgdata_lsn, - &self.timeline.gate, - self.timeline.cancel.clone(), + &timeline.gate, + timeline.cancel.clone(), ctx, ) .await?; @@ -751,24 +910,54 @@ impl ChunkProcessingJob { let resident_layer = if nimages > 0 { let (desc, path) = writer.finish(ctx).await?; - Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)? + + { + let guard = timeline.layers.read().await; + let existing_layer = guard.try_get_from_key(&desc.key()); + if let Some(layer) = existing_layer { + if layer.metadata().generation != timeline.generation { + return Err(anyhow::anyhow!( + "Import attempted to rewrite layer file in the same generation: {}", + layer.local_path() + )); + } + } + } + + Layer::finish_creating(timeline.conf, &timeline, desc, &path)? } else { // dropping the writer cleans up return Ok(()); }; - // this is sharing the same code as create_image_layers - let mut guard = self.timeline.layers.write().await; - guard - .open_mut()? - .track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics); + // The same import job might run multiple times since not each job is checkpointed. + // Hence, we must support the cases where the layer already exists. We cannot be + // certain that the existing layer is identical to the new one, so in that case + // we replace the old layer with the one we just generated. 
+ + let mut guard = timeline.layers.write().await; + + let existing_layer = guard + .try_get_from_key(&resident_layer.layer_desc().key()) + .cloned(); + match existing_layer { + Some(existing) => { + guard.open_mut()?.rewrite_layers( + &[(existing.clone(), resident_layer.clone())], + &[], + &timeline.metrics, + ); + } + None => { + guard + .open_mut()? + .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics); + } + } + crate::tenant::timeline::drop_wlock(guard); - // Schedule the layer for upload but don't add barriers such as - // wait for completion or index upload, so we don't inhibit upload parallelism. - // TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?) - // TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level. - self.timeline + timeline .remote_client .schedule_layer_file_upload(resident_layer)?; diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index e7aa8f6038..34313748b7 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -190,31 +190,6 @@ impl RemoteStorageWrapper { Ok(Some(res)) } - #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] - pub async fn put_json(&self, path: &RemotePath, value: &T) -> anyhow::Result<()> - where - T: serde::Serialize, - { - let buf = serde_json::to_vec(value)?; - let bytes = Bytes::from(buf); - utils::backoff::retry( - || async { - let size = bytes.len(); - let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone()))); - self.storage - .upload_storage_object(bytes, size, path, &self.cancel) - .await - }, - remote_storage::TimeoutOrCancel::caused_by_cancel, - 1, - u32::MAX, - &format!("put json {path}"), - &self.cancel, - ) - .await - .expect("practically infinite retries") - } - #[instrument(level = 
tracing::Level::DEBUG, skip_all, fields(%path))] pub async fn get_range( &self, diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs index 57c647cc7f..d9f4da4748 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs @@ -5,9 +5,3 @@ pub struct PgdataStatus { pub done: bool, // TODO: remaining fields } - -#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] -pub struct ShardStatus { - pub done: bool, - // TODO: remaining fields -} diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs index ea7a41b25f..371fc857dc 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs @@ -64,4 +64,12 @@ impl Root { }, } } + pub fn started_at(&self) -> &chrono::NaiveDateTime { + match self { + Root::V1(v1) => match v1 { + V1::InProgress(in_progress) => &in_progress.started_at, + V1::Done(done) => &done.started_at, + }, + } + } } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 4f80073cc3..0f73eb839b 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -63,6 +63,7 @@ pub struct WalReceiver { /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token. /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`. 
cancel: CancellationToken, + task: tokio::task::JoinHandle<()>, } impl WalReceiver { @@ -79,7 +80,7 @@ impl WalReceiver { let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); let cancel = timeline.cancel.child_token(); - WALRECEIVER_RUNTIME.spawn({ + let task = WALRECEIVER_RUNTIME.spawn({ let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -120,14 +121,25 @@ impl WalReceiver { Self { manager_status, cancel, + task, } } #[instrument(skip_all, level = tracing::Level::DEBUG)] - pub fn cancel(&self) { + pub async fn shutdown(self) { debug_assert_current_span_has_tenant_and_timeline_id(); debug!("cancelling walreceiver tasks"); self.cancel.cancel(); + match self.task.await { + Ok(()) => debug!("Shutdown success"), + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // already logged by panic hook + } + Err(je) => { + error!("shutdown walreceiver task join error: {je}") + } + } } pub(crate) fn status(&self) -> Option { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 58953407b1..45b6e44c54 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -14,8 +14,6 @@ use std::fs::File; use std::io::{Error, ErrorKind}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -#[cfg(target_os = "linux")] -use std::os::unix::fs::OpenOptionsExt; use std::sync::LazyLock; use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; @@ -76,6 +74,8 @@ pub struct VirtualFile { impl VirtualFile { /// Open a file in read-only mode. Like File::open. + /// + /// Insensitive to `virtual_file_io_mode` setting. 
pub async fn open>( path: P, ctx: &RequestContext, @@ -97,36 +97,20 @@ impl VirtualFile { Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await } + /// `O_DIRECT` will be enabled based on `virtual_file_io_mode`. 
pub async fn open_with_options_v2>( path: P, - open_options: &OpenOptions, + mut open_options: OpenOptions, ctx: &RequestContext, ) -> Result { let mode = get_io_mode(); - let set_o_direct = match (mode, open_options.is_write()) { + let direct = match (mode, open_options.is_write()) { (IoMode::Buffered, _) => false, - #[cfg(target_os = "linux")] (IoMode::Direct, false) => true, - #[cfg(target_os = "linux")] (IoMode::Direct, true) => false, - #[cfg(target_os = "linux")] (IoMode::DirectRw, _) => true, }; - let open_options = open_options.clone(); - let open_options = if set_o_direct { - #[cfg(target_os = "linux")] - { - let mut open_options = open_options; - open_options.custom_flags(nix::libc::O_DIRECT); - open_options - } - #[cfg(not(target_os = "linux"))] - unreachable!( - "O_DIRECT is not supported on this platform, IoMode's that result in set_o_direct=true shouldn't even be defined" - ); - } else { - open_options - }; + open_options = open_options.direct(direct); let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; Ok(VirtualFile { inner, _mode: mode }) } @@ -424,7 +408,7 @@ impl OpenFiles { /// error types may be elegible for retry. pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool { use nix::errno::Errno::*; - match e.raw_os_error().map(nix::errno::from_i32) { + match e.raw_os_error().map(nix::errno::Errno::from_raw) { Some(EIO) => { // Terminate on EIO because we no longer trust the device to store // data safely, or to uphold persistence guarantees on fsync. @@ -530,7 +514,7 @@ impl VirtualFileInner { path: P, ctx: &RequestContext, ) -> Result { - Self::open_with_options(path.as_ref(), OpenOptions::new().read(true).clone(), ctx).await + Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await } /// Open a file with given options. 
@@ -558,10 +542,11 @@ impl VirtualFileInner { // It would perhaps be nicer to check just for the read and write flags // explicitly, but OpenOptions doesn't contain any functions to read flags, // only to set them. - let mut reopen_options = open_options.clone(); - reopen_options.create(false); - reopen_options.create_new(false); - reopen_options.truncate(false); + let reopen_options = open_options + .clone() + .create(false) + .create_new(false) + .truncate(false); let vfile = VirtualFileInner { handle: RwLock::new(handle), @@ -797,6 +782,12 @@ impl VirtualFileInner { where Buf: tokio_epoll_uring::IoBufMut + Send, { + self.validate_direct_io( + Slice::stable_ptr(&buf).addr(), + Slice::bytes_total(&buf), + offset, + ); + let file_guard = match self .lock_file() .await @@ -822,6 +813,8 @@ impl VirtualFileInner { offset: u64, ctx: &RequestContext, ) -> (FullSlice, Result) { + self.validate_direct_io(buf.as_ptr().addr(), buf.len(), offset); + let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, Err(e) => return (buf, Err(e)), @@ -836,6 +829,64 @@ impl VirtualFileInner { (buf, result) }) } + + /// Validate all reads and writes to adhere to the O_DIRECT requirements of our production systems. + /// + /// Validating it in userspace sets a consistent bar, independent of what actual OS/filesystem/block device is in use. + fn validate_direct_io(&self, addr: usize, size: usize, offset: u64) { + // TODO: eventually enable validation in the builds we use in real environments like staging, preprod, and prod. + if !(cfg!(feature = "testing") || cfg!(test)) { + return; + } + if !self.open_options.is_direct() { + return; + } + + // Validate buffer memory alignment. + // + // What practically matters as of Linux 6.1 is bdev_dma_alignment() + // which is practically between 512 and 4096. + // On our production systems, the value is 512. + // The IoBuffer/IoBufferMut hard-code that value. 
+ // + // Because the allocator might return _more_ aligned addresses than requested, + // there is a chance that testing would not catch violations of a runtime requirement stricter than 512. + { + let requirement = 512; + let remainder = addr % requirement; + assert!( + remainder == 0, + "Direct I/O buffer must be aligned: buffer_addr=0x{addr:x} % 0x{requirement:x} = 0x{remainder:x}" + ); + } + + // Validate offset alignment. + // + // We hard-code 512 throughout the code base. + // So enforce just that and not anything more restrictive. + // Even the shallowest testing will expose more restrictive requirements if those ever arise. + { + let requirement = 512; + let remainder = offset % requirement; + assert!( + remainder == 0, + "Direct I/O offset must be aligned: offset=0x{offset:x} % 0x{requirement:x} = 0x{remainder:x}" + ); + } + + // Validate buffer size multiple requirement. + // + // The requirement in Linux 6.1 is bdev_logical_block_size(). + // On our production systems, that is 512. 
+ { + let requirement = 512; + let remainder = size % requirement; + assert!( + remainder == 0, + "Direct I/O buffer size must be a multiple of {requirement}: size=0x{size:x} % 0x{requirement:x} = 0x{remainder:x}" + ); + } + } } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 @@ -1224,7 +1275,6 @@ mod tests { use std::sync::Arc; use owned_buffers_io::io_buf_ext::IoBufExt; - use owned_buffers_io::slice::SliceMutExt; use rand::seq::SliceRandom; use rand::{Rng, thread_rng}; @@ -1232,208 +1282,85 @@ mod tests { use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - enum MaybeVirtualFile { - VirtualFile(VirtualFile), - File(File), - } - - impl From for MaybeVirtualFile { - fn from(vf: VirtualFile) -> Self { - MaybeVirtualFile::VirtualFile(vf) - } - } - - impl MaybeVirtualFile { - async fn read_exact_at( - &self, - mut slice: tokio_epoll_uring::Slice, - offset: u64, - ctx: &RequestContext, - ) -> Result, Error> { - match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await, - MaybeVirtualFile::File(file) => { - let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed(); - file.read_exact_at(rust_slice, offset).map(|()| slice) - } - } - } - async fn write_all_at( - &self, - buf: FullSlice, - offset: u64, - ctx: &RequestContext, - ) -> Result<(), Error> { - match self { - MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all_at(buf, offset, ctx).await; - res - } - MaybeVirtualFile::File(file) => file.write_all_at(&buf[..], offset), - } - } - - // Helper function to slurp a portion of a file into a string - async fn read_string_at( - &mut self, - pos: u64, - len: usize, - ctx: &RequestContext, - ) -> Result { - let slice = IoBufferMut::with_capacity(len).slice_full(); - assert_eq!(slice.bytes_total(), len); - let slice = self.read_exact_at(slice, pos, ctx).await?; - let buf = slice.into_inner(); - assert_eq!(buf.len(), len); - - 
Ok(String::from_utf8(buf.to_vec()).unwrap()) - } - } - #[tokio::test] async fn test_virtual_files() -> anyhow::Result<()> { - // The real work is done in the test_files() helper function. This - // allows us to run the same set of tests against a native File, and - // VirtualFile. We trust the native Files and wouldn't need to test them, - // but this allows us to verify that the operations return the same - // results with VirtualFiles as with native Files. (Except that with - // native files, you will run out of file descriptors if the ulimit - // is low enough.) - struct A; - - impl Adapter for A { - async fn open( - path: Utf8PathBuf, - opts: OpenOptions, - ctx: &RequestContext, - ) -> Result { - let vf = VirtualFile::open_with_options_v2(&path, &opts, ctx).await?; - Ok(MaybeVirtualFile::VirtualFile(vf)) - } - } - test_files::("virtual_files").await - } - - #[tokio::test] - async fn test_physical_files() -> anyhow::Result<()> { - struct B; - - impl Adapter for B { - async fn open( - path: Utf8PathBuf, - opts: OpenOptions, - _ctx: &RequestContext, - ) -> Result { - Ok(MaybeVirtualFile::File({ - let owned_fd = opts.open(path.as_std_path()).await?; - File::from(owned_fd) - })) - } - } - - test_files::("physical_files").await - } - - /// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition - /// 2024 is not yet out with new lifetime capture or outlives rules, this is a async function - /// in trait which benefits from the new lifetime capture rules already. 
- trait Adapter { - async fn open( - path: Utf8PathBuf, - opts: OpenOptions, - ctx: &RequestContext, - ) -> Result; - } - - async fn test_files(testname: &str) -> anyhow::Result<()> - where - A: Adapter, - { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); - let testdir = crate::config::PageServerConf::test_repo_dir(testname); + let testdir = crate::config::PageServerConf::test_repo_dir("test_virtual_files"); std::fs::create_dir_all(&testdir)?; + let zeropad512 = |content: &[u8]| { + let mut buf = IoBufferMut::with_capacity_zeroed(512); + buf[..content.len()].copy_from_slice(content); + buf.freeze().slice_len() + }; + let path_a = testdir.join("file_a"); - let mut file_a = A::open( + let file_a = VirtualFile::open_with_options_v2( path_a.clone(), OpenOptions::new() + .read(true) .write(true) + // set create & truncate flags to ensure when we trigger a reopen later in this test, + // the reopen_options must have masked out those flags; if they don't, then + // the after reopen we will fail to read the `content_a` that we write here. .create(true) - .truncate(true) - .to_owned(), + .truncate(true), &ctx, ) .await?; + let (_, res) = file_a.write_all_at(zeropad512(b"content_a"), 0, &ctx).await; + res?; - file_a - .write_all_at(IoBuffer::from(b"foobar").slice_len(), 0, &ctx) - .await?; - - // cannot read from a file opened in write-only mode - let _ = file_a.read_string_at(0, 1, &ctx).await.unwrap_err(); - - // Close the file and re-open for reading - let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?; - - // cannot write to a file opened in read-only mode - let _ = file_a - .write_all_at(IoBuffer::from(b"bar").slice_len(), 0, &ctx) - .await - .unwrap_err(); - - // Try simple read - assert_eq!("foobar", file_a.read_string_at(0, 6, &ctx).await?); - - // Create another test file, and try FileExt functions on it. 
let path_b = testdir.join("file_b"); - let mut file_b = A::open( + let file_b = VirtualFile::open_with_options_v2( path_b.clone(), OpenOptions::new() .read(true) .write(true) .create(true) - .truncate(true) - .to_owned(), + .truncate(true), &ctx, ) .await?; - file_b - .write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx) - .await?; - file_b - .write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx) - .await?; + let (_, res) = file_b.write_all_at(zeropad512(b"content_b"), 0, &ctx).await; + res?; - assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); + let assert_first_512_eq = async |vfile: &VirtualFile, expect: &[u8]| { + let buf = vfile + .read_exact_at(IoBufferMut::with_capacity_zeroed(512).slice_full(), 0, &ctx) + .await + .unwrap(); + assert_eq!(&buf[..], &zeropad512(expect)[..]); + }; - // Open a lot of files, enough to cause some evictions. (Or to be precise, - // open the same file many times. The effect is the same.) + // Open a lot of file descriptors / VirtualFile instances. + // Enough to cause some evictions in the fd cache. - let mut vfiles = Vec::new(); + let mut file_b_dupes = Vec::new(); for _ in 0..100 { - let mut vfile = A::open( + let vfile = VirtualFile::open_with_options_v2( path_b.clone(), - OpenOptions::new().read(true).to_owned(), + OpenOptions::new().read(true), &ctx, ) .await?; - assert_eq!("FOOBAR", vfile.read_string_at(0, 6, &ctx).await?); - vfiles.push(vfile); + assert_first_512_eq(&vfile, b"content_b").await; + file_b_dupes.push(vfile); } // make sure we opened enough files to definitely cause evictions. - assert!(vfiles.len() > TEST_MAX_FILE_DESCRIPTORS * 2); + assert!(file_b_dupes.len() > TEST_MAX_FILE_DESCRIPTORS * 2); // The underlying file descriptor for 'file_a' should be closed now. Try to read - // from it again. - assert_eq!("foobar", file_a.read_string_at(0, 6, &ctx).await?); + // from it again. The VirtualFile reopens the file internally. 
+ assert_first_512_eq(&file_a, b"content_a").await; // Check that all the other FDs still work too. Use them in random order for // good measure. - vfiles.as_mut_slice().shuffle(&mut thread_rng()); - for vfile in vfiles.iter_mut() { - assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?); + file_b_dupes.as_mut_slice().shuffle(&mut thread_rng()); + for vfile in file_b_dupes.iter_mut() { + assert_first_512_eq(vfile, b"content_b").await; } Ok(()) @@ -1464,9 +1391,9 @@ mod tests { // Open the file many times. let mut files = Vec::new(); for _ in 0..VIRTUAL_FILES { - let f = VirtualFileInner::open_with_options( + let f = VirtualFile::open_with_options_v2( &test_file_path, - OpenOptions::new().read(true).clone(), + OpenOptions::new().read(true), &ctx, ) .await?; @@ -1509,8 +1436,6 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_basic() { - let ctx = - RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1520,26 +1445,22 @@ mod tests { VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); - let post = file.read_string_at(0, 3, &ctx).await.unwrap(); + + let post = std::fs::read_to_string(&path).unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); - drop(file); VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); - let post = file.read_string_at(0, 3, &ctx).await.unwrap(); + + let post = std::fs::read_to_string(&path).unwrap(); assert_eq!(post, "bar"); assert!(!tmp_path.exists()); - drop(file); } #[tokio::test] async fn test_atomic_overwrite_preexisting_tmp() { - let ctx = - 
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1554,10 +1475,8 @@ mod tests { .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); - let post = file.read_string_at(0, 3, &ctx).await.unwrap(); + let post = std::fs::read_to_string(&path).unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); - drop(file); } } diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index dd04fb561a..3cde34eda7 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -111,18 +111,20 @@ pub(crate) fn get() -> IoEngine { use std::os::unix::prelude::FileExt; use std::sync::atomic::{AtomicU8, Ordering}; +#[cfg(target_os = "linux")] +use {std::time::Duration, tracing::info}; use super::owned_buffers_io::io_buf_ext::FullSlice; use super::owned_buffers_io::slice::SliceMutExt; use super::{FileGuard, Metadata}; #[cfg(target_os = "linux")] -fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { +pub(super) fn epoll_uring_error_to_std( + e: tokio_epoll_uring::Error, +) -> std::io::Error { match e { tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } + tokio_epoll_uring::Error::System(system) => std::io::Error::other(system), } } @@ -149,7 +151,11 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.read(file_guard, offset, slice).await; + let (resources, res) = + retry_ecanceled_once((file_guard, slice), |(file_guard, slice)| async { + system.read(file_guard, offset, slice).await + }) + .await; (resources, 
res.map_err(epoll_uring_error_to_std)) } } @@ -164,7 +170,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.fsync(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.fsync(file_guard).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -182,7 +191,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.fdatasync(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.fdatasync(file_guard).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -201,7 +213,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.statx(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.statx(file_guard).await + }) + .await; ( resources, res.map_err(epoll_uring_error_to_std).map(Metadata::from), @@ -224,6 +239,7 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { // TODO: ftruncate op for tokio-epoll-uring + // Don't forget to use retry_ecanceled_once let res = file_guard.with_std_file(|std_file| std_file.set_len(len)); (file_guard, res) } @@ -245,8 +261,11 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let ((file_guard, slice), res) = - system.write(file_guard, offset, buf.into_raw_slice()).await; + let ((file_guard, slice), res) = retry_ecanceled_once( + (file_guard, buf.into_raw_slice()), + async |(file_guard, buf)| system.write(file_guard, offset, buf).await, + ) + .await; ( (file_guard, 
FullSlice::must_new(slice)), res.map_err(epoll_uring_error_to_std), @@ -282,6 +301,56 @@ impl IoEngine { } } +/// We observe in tests that stop pageserver with SIGTERM immediately after it was ingesting data, +/// occasionally buffered writers fail (and get retried by BufferedWriter) with ECANCELED. +/// The problem is believed to be a race condition in how io_uring handles punted async work (io-wq) and signals. +/// Investigation ticket: +/// +/// This function retries the operation once if it fails with ECANCELED. +/// ONLY USE FOR IDEMPOTENT [`super::VirtualFile`] operations. +#[cfg(target_os = "linux")] +pub(super) async fn retry_ecanceled_once( + resources: T, + f: F, +) -> (T, Result>) +where + F: Fn(T) -> Fut, + Fut: std::future::Future>)>, + T: Send, + V: Send, +{ + let (resources, res) = f(resources).await; + let Err(e) = res else { + return (resources, res); + }; + let tokio_epoll_uring::Error::Op(err) = e else { + return (resources, Err(e)); + }; + if err.raw_os_error() != Some(nix::libc::ECANCELED) { + return (resources, Err(tokio_epoll_uring::Error::Op(err))); + } + { + static RATE_LIMIT: std::sync::Mutex = + std::sync::Mutex::new(utils::rate_limit::RateLimit::new(Duration::from_secs(1))); + let mut guard = RATE_LIMIT.lock().unwrap(); + guard.call2(|rate_limit_stats| { + info!( + %rate_limit_stats, "ECANCELED observed, assuming it is due to a signal being received by the submitting thread, retrying after a delay; this message is rate-limited" + ); + }); + drop(guard); + } + tokio::time::sleep(Duration::from_millis(100)).await; // something big enough to beat even heavily overcommitted CI runners + let (resources, res) = f(resources).await; + (resources, res) +} + +pub(super) fn panic_operation_must_be_idempotent() { + panic!( + "unsupported; io_engine may retry operations internally and thus needs them to be idempotent (retry_ecanceled_once)" + ) +} + pub enum FeatureTestResult { PlatformPreferred(IoEngineKind), Worse { diff --git 
a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs index 7d323f3d8f..7d478f3600 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -1,13 +1,20 @@ //! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; use std::os::fd::OwnedFd; +use std::os::unix::fs::OpenOptionsExt; use std::path::Path; use super::io_engine::IoEngine; #[derive(Debug, Clone)] pub struct OpenOptions { + /// We keep a copy of the write() flag we pass to the `inner`` `OptionOptions` + /// to support [`Self::is_write`]. write: bool, + /// We don't expose + pass through a raw `custom_flags()` style API. + /// The only custom flag we support is `O_DIRECT`, which we track here + /// and map to `custom_flags()` in the [`Self::open`] method. + direct: bool, inner: Inner, } #[derive(Debug, Clone)] @@ -29,6 +36,7 @@ impl Default for OpenOptions { }; Self { write: false, + direct: false, inner, } } @@ -43,7 +51,11 @@ impl OpenOptions { self.write } - pub fn read(&mut self, read: bool) -> &mut OpenOptions { + pub(super) fn is_direct(&self) -> bool { + self.direct + } + + pub fn read(mut self, read: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.read(read); @@ -56,7 +68,7 @@ impl OpenOptions { self } - pub fn write(&mut self, write: bool) -> &mut OpenOptions { + pub fn write(mut self, write: bool) -> Self { self.write = write; match &mut self.inner { Inner::StdFs(x) => { @@ -70,7 +82,7 @@ impl OpenOptions { self } - pub fn create(&mut self, create: bool) -> &mut OpenOptions { + pub fn create(mut self, create: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.create(create); @@ -83,7 +95,7 @@ impl OpenOptions { self } - pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions { + pub fn create_new(mut self, create_new: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.create_new(create_new); @@ -96,7 +108,7 @@ 
impl OpenOptions { self } - pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions { + pub fn truncate(mut self, truncate: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.truncate(truncate); @@ -109,25 +121,53 @@ impl OpenOptions { self } + /// Don't use, `O_APPEND` is not supported. + pub fn append(&mut self, _append: bool) { + super::io_engine::panic_operation_must_be_idempotent(); + } + pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result { - match &self.inner { - Inner::StdFs(x) => x.open(path).map(|file| file.into()), + #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] + let mut custom_flags = 0; + if self.direct { #[cfg(target_os = "linux")] - Inner::TokioEpollUring(x) => { + { + custom_flags |= nix::libc::O_DIRECT; + } + #[cfg(not(target_os = "linux"))] + { + // Other platforms may be used for development but don't necessarily have a 1:1 equivalent to Linux's O_DIRECT (macOS!). + // Just don't set the flag; to catch alignment bugs typical for O_DIRECT, + // we have a runtime validation layer inside `VirtualFile::write_at` and `VirtualFile::read_at`. 
+ static WARNING: std::sync::Once = std::sync::Once::new(); + WARNING.call_once(|| { + let span = tracing::info_span!(parent: None, "open_options"); + let _enter = span.enter(); + tracing::warn!("your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs; this warning is logged once per process"); + }); + } + } + + match self.inner.clone() { + Inner::StdFs(mut x) => x + .custom_flags(custom_flags) + .open(path) + .map(|file| file.into()), + #[cfg(target_os = "linux")] + Inner::TokioEpollUring(mut x) => { + x.custom_flags(custom_flags); let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; - system.open(path, x).await.map_err(|e| match e { - tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } + let (_, res) = super::io_engine::retry_ecanceled_once((), |()| async { + let res = system.open(path, &x).await; + ((), res) }) + .await; + res.map_err(super::io_engine::epoll_uring_error_to_std) } } } -} -impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { - fn mode(&mut self, mode: u32) -> &mut OpenOptions { + pub fn mode(mut self, mode: u32) -> Self { match &mut self.inner { Inner::StdFs(x) => { let _ = x.mode(mode); @@ -140,16 +180,8 @@ impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { self } - fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions { - match &mut self.inner { - Inner::StdFs(x) => { - let _ = x.custom_flags(flags); - } - #[cfg(target_os = "linux")] - Inner::TokioEpollUring(x) => { - let _ = x.custom_flags(flags); - } - } + pub fn direct(mut self, direct: bool) -> Self { + self.direct = direct; self } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index b41a9f6cd2..ac9867e8b4 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ 
b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -247,6 +247,19 @@ pub enum FlushTaskError { Cancelled, } +impl FlushTaskError { + pub fn is_cancel(&self) -> bool { + match self { + FlushTaskError::Cancelled => true, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + FlushTaskError::Cancelled => anyhow::anyhow!(self), + } + } +} + impl FlushBackgroundTask where Buf: IoBufAligned + Send + Sync, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e60c590f87..c1a3b79915 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1316,6 +1316,10 @@ impl WalIngest { } }); + if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { + modification.tline.prepare_basebackup(lsn); + } + Ok(()) } @@ -1684,31 +1688,31 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx) .await?, false ); assert!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx) .await .is_err() ); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx) .await?, 3 ); @@ -1719,7 +1723,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 0, - Version::Lsn(Lsn(0x20)), + Version::at(Lsn(0x20)), &ctx, io_concurrency.clone() ) @@ -1733,7 +1737,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 0, - Version::Lsn(Lsn(0x30)), + Version::at(Lsn(0x30)), &ctx, io_concurrency.clone() ) @@ -1747,7 +1751,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 0, 
- Version::Lsn(Lsn(0x40)), + Version::at(Lsn(0x40)), &ctx, io_concurrency.clone() ) @@ -1760,7 +1764,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 1, - Version::Lsn(Lsn(0x40)), + Version::at(Lsn(0x40)), &ctx, io_concurrency.clone() ) @@ -1774,7 +1778,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 0, - Version::Lsn(Lsn(0x50)), + Version::at(Lsn(0x50)), &ctx, io_concurrency.clone() ) @@ -1787,7 +1791,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 1, - Version::Lsn(Lsn(0x50)), + Version::at(Lsn(0x50)), &ctx, io_concurrency.clone() ) @@ -1800,7 +1804,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 2, - Version::Lsn(Lsn(0x50)), + Version::at(Lsn(0x50)), &ctx, io_concurrency.clone() ) @@ -1820,7 +1824,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx) .await?, 2 ); @@ -1829,7 +1833,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 0, - Version::Lsn(Lsn(0x60)), + Version::at(Lsn(0x60)), &ctx, io_concurrency.clone() ) @@ -1842,7 +1846,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 1, - Version::Lsn(Lsn(0x60)), + Version::at(Lsn(0x60)), &ctx, io_concurrency.clone() ) @@ -1854,7 +1858,7 @@ mod tests { // should still see the truncated block with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx) .await?, 3 ); @@ -1863,7 +1867,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 2, - Version::Lsn(Lsn(0x50)), + Version::at(Lsn(0x50)), &ctx, io_concurrency.clone() ) @@ -1880,7 +1884,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x68)), &ctx) .await?, 0 ); @@ -1893,7 +1897,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx) + .get_rel_size(TESTREL_A, 
Version::at(Lsn(0x70)), &ctx) .await?, 2 ); @@ -1902,7 +1906,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 0, - Version::Lsn(Lsn(0x70)), + Version::at(Lsn(0x70)), &ctx, io_concurrency.clone() ) @@ -1915,7 +1919,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 1, - Version::Lsn(Lsn(0x70)), + Version::at(Lsn(0x70)), &ctx, io_concurrency.clone() ) @@ -1932,7 +1936,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx) .await?, 1501 ); @@ -1942,7 +1946,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, blk, - Version::Lsn(Lsn(0x80)), + Version::at(Lsn(0x80)), &ctx, io_concurrency.clone() ) @@ -1956,7 +1960,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, 1500, - Version::Lsn(Lsn(0x80)), + Version::at(Lsn(0x80)), &ctx, io_concurrency.clone() ) @@ -1990,13 +1994,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx) .await?, 1 ); @@ -2011,7 +2015,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x30)), &ctx) .await?, false ); @@ -2029,13 +2033,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x40)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x40)), &ctx) .await?, 1 ); @@ -2077,26 +2081,26 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx) .await?, false ); assert!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx) .await .is_err() ); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx) .await?, relsize ); @@ -2110,7 +2114,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, blkno, - Version::Lsn(lsn), + Version::at(lsn), &ctx, io_concurrency.clone() ) @@ -2131,7 +2135,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx) .await?, 1 ); @@ -2144,7 +2148,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, blkno, - Version::Lsn(Lsn(0x60)), + Version::at(Lsn(0x60)), &ctx, io_concurrency.clone() ) @@ -2157,7 +2161,7 @@ mod tests { // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx) .await?, relsize ); @@ -2169,7 +2173,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, blkno, - Version::Lsn(Lsn(0x50)), + Version::at(Lsn(0x50)), &ctx, io_concurrency.clone() ) @@ -2193,13 +2197,13 @@ mod tests { assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_exists(TESTREL_A, Version::at(Lsn(0x80)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx) .await?, relsize ); @@ -2212,7 +2216,7 @@ mod tests { .get_rel_page_at_lsn( TESTREL_A, blkno, - Version::Lsn(Lsn(0x80)), + 
Version::at(Lsn(0x80)), &ctx, io_concurrency.clone() ) @@ -2250,7 +2254,7 @@ mod tests { assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE + 1 ); @@ -2264,7 +2268,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE ); @@ -2279,7 +2283,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE - 1 ); @@ -2297,7 +2301,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) + .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx) .await?, size as BlockNumber ); diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 90bdff32a9..8216b7b355 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -37,6 +37,8 @@ DATA = \ neon--1.2--1.3.sql \ neon--1.3--1.4.sql \ neon--1.4--1.5.sql \ + neon--1.5--1.6.sql \ + neon--1.6--1.5.sql \ neon--1.5--1.4.sql \ neon--1.4--1.3.sql \ neon--1.3--1.2.sql \ diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index 61bb3206e7..2655a45bcc 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -425,15 +425,12 @@ compact_prefetch_buffers(void) * point inside and outside PostgreSQL. * * This still does throw errors when it receives malformed responses from PS. - * - * When we're not called from CHECK_FOR_INTERRUPTS (indicated by - * IsHandlingInterrupts) we also report we've ended prefetch receive work, - * just in case state tracking was lost due to an error in the sync getPage - * response code. 
*/ void -communicator_prefetch_pump_state(bool IsHandlingInterrupts) +communicator_prefetch_pump_state(void) { + START_PREFETCH_RECEIVE_WORK(); + while (MyPState->ring_receive != MyPState->ring_flush) { NeonResponse *response; @@ -482,9 +479,7 @@ communicator_prefetch_pump_state(bool IsHandlingInterrupts) } } - /* We never pump the prefetch state while handling other pages */ - if (!IsHandlingInterrupts) - END_PREFETCH_RECEIVE_WORK(); + END_PREFETCH_RECEIVE_WORK(); communicator_reconfigure_timeout_if_needed(); } @@ -672,9 +667,10 @@ prefetch_wait_for(uint64 ring_index) Assert(MyPState->ring_unused > ring_index); + START_PREFETCH_RECEIVE_WORK(); + while (MyPState->ring_receive <= ring_index) { - START_PREFETCH_RECEIVE_WORK(); entry = GetPrfSlot(MyPState->ring_receive); Assert(entry->status == PRFS_REQUESTED); @@ -683,12 +679,19 @@ prefetch_wait_for(uint64 ring_index) result = false; break; } - - END_PREFETCH_RECEIVE_WORK(); CHECK_FOR_INTERRUPTS(); } + if (result) + { + /* Check that slot is actually received (server can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS) */ + PrefetchRequest *slot = GetPrfSlot(ring_index); + result = slot->status == PRFS_RECEIVED; + } + END_PREFETCH_RECEIVE_WORK(); + return result; +; } /* @@ -714,6 +717,7 @@ prefetch_read(PrefetchRequest *slot) Assert(slot->status == PRFS_REQUESTED); Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_receive); + Assert(readpage_reentrant_guard || AmPrewarmWorker); if (slot->status != PRFS_REQUESTED || slot->response != NULL || @@ -796,6 +800,7 @@ communicator_prefetch_receive(BufferTag tag) PrfHashEntry *entry; PrefetchRequest hashkey; + Assert(readpage_reentrant_guard || AmPrewarmWorker); /* do not pump prefetch state in prewarm worker */ hashkey.buftag = tag; entry = prfh_lookup(MyPState->prf_hash, &hashkey); if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index)) @@ -815,8 +820,12 @@ communicator_prefetch_receive(BufferTag tag) void 
prefetch_on_ps_disconnect(void) { + bool save_readpage_reentrant_guard = readpage_reentrant_guard; MyPState->ring_flush = MyPState->ring_unused; + /* Prohibit calling of prefetch_pump_state */ + START_PREFETCH_RECEIVE_WORK(); + while (MyPState->ring_receive < MyPState->ring_unused) { PrefetchRequest *slot; @@ -845,6 +854,9 @@ prefetch_on_ps_disconnect(void) MyNeonCounters->getpage_prefetch_discards_total += 1; } + /* Restore guard */ + readpage_reentrant_guard = save_readpage_reentrant_guard; + /* * We can have gone into retry due to network error, so update stats with * the latest available @@ -2438,6 +2450,7 @@ void communicator_reconfigure_timeout_if_needed(void) { bool needs_set = MyPState->ring_receive != MyPState->ring_unused && + !AmPrewarmWorker && /* do not pump prefetch state in prewarm worker */ readahead_getpage_pull_timeout_ms > 0; if (needs_set != timeout_set) @@ -2503,7 +2516,7 @@ communicator_processinterrupts(void) if (timeout_signaled) { if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) - communicator_prefetch_pump_state(true); + communicator_prefetch_pump_state(); timeout_signaled = false; communicator_reconfigure_timeout_if_needed(); diff --git a/pgxn/neon/communicator.h b/pgxn/neon/communicator.h index f55c4b10f1..5376c9b839 100644 --- a/pgxn/neon/communicator.h +++ b/pgxn/neon/communicator.h @@ -44,7 +44,7 @@ extern int communicator_read_slru_segment(SlruKind kind, int64 segno, void *buffer); extern void communicator_reconfigure_timeout_if_needed(void); -extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts); +extern void communicator_prefetch_pump_state(void); #endif diff --git a/pgxn/neon/communicator/src/backend_interface.rs b/pgxn/neon/communicator/src/backend_interface.rs index a5cc976bc5..224680d136 100644 --- a/pgxn/neon/communicator/src/backend_interface.rs +++ b/pgxn/neon/communicator/src/backend_interface.rs @@ -1,6 +1,8 @@ //! This code runs in each backend process. 
That means that launching Rust threads, panicking //! etc. is forbidden! +use std::os::fd::OwnedFd; + use crate::backend_comms::NeonIOHandle; use crate::init::CommunicatorInitStruct; use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess}; @@ -17,7 +19,7 @@ pub struct CommunicatorBackendStruct<'t> { neon_request_slots: &'t [NeonIOHandle], - submission_pipe_write_fd: std::ffi::c_int, + submission_pipe_write_fd: OwnedFd, pending_cache_read_op: Option>, @@ -169,7 +171,8 @@ impl<'t> CommunicatorBackendStruct<'t> { // // If it does block very briefly, that's not too serious. let idxbuf = request_idx.to_ne_bytes(); - let _res = nix::unistd::write(self.submission_pipe_write_fd, &idxbuf); + + let _res = nix::unistd::write(&self.submission_pipe_write_fd, &idxbuf); // FIXME: check result, return any errors } diff --git a/pgxn/neon/communicator/src/init.rs b/pgxn/neon/communicator/src/init.rs index 6a9b9b0b7d..1c66d287ff 100644 --- a/pgxn/neon/communicator/src/init.rs +++ b/pgxn/neon/communicator/src/init.rs @@ -21,6 +21,7 @@ use std::ffi::c_int; use std::mem; use std::mem::MaybeUninit; +use std::os::fd::OwnedFd; use neonart::allocator::r#static::alloc_array_from_slice; @@ -36,8 +37,8 @@ pub struct CommunicatorInitStruct { #[allow(dead_code)] pub max_procs: u32, - pub submission_pipe_read_fd: std::ffi::c_int, - pub submission_pipe_write_fd: std::ffi::c_int, + pub submission_pipe_read_fd: OwnedFd, + pub submission_pipe_write_fd: OwnedFd, // Shared memory data structures pub num_neon_request_slots_per_backend: u32, @@ -111,6 +112,14 @@ pub extern "C" fn rcommunicator_shmem_init( let integrated_cache_init_struct = IntegratedCacheInitStruct::shmem_init(max_procs, remaining_area); + let (submission_pipe_read_fd, submission_pipe_write_fd) = unsafe { + use std::os::fd::FromRawFd; + ( + OwnedFd::from_raw_fd(submission_pipe_read_fd), + OwnedFd::from_raw_fd(submission_pipe_write_fd), + ) + }; + let cis: &'static mut CommunicatorInitStruct = 
Box::leak(Box::new(CommunicatorInitStruct { max_procs, submission_pipe_read_fd, diff --git a/pgxn/neon/communicator/src/worker_process/main_loop.rs b/pgxn/neon/communicator/src/worker_process/main_loop.rs index 8c3498ab7e..44700fe0c1 100644 --- a/pgxn/neon/communicator/src/worker_process/main_loop.rs +++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs @@ -1,4 +1,6 @@ use std::collections::HashMap; +use std::os::fd::AsRawFd; +use std::os::fd::OwnedFd; use std::path::PathBuf; use std::sync::atomic::{AtomicU64, Ordering}; @@ -31,7 +33,7 @@ pub struct CommunicatorWorkerProcessStruct<'a> { pub(crate) cache: IntegratedCacheWriteAccess<'a>, - submission_pipe_read_raw_fd: i32, + submission_pipe_read_fd: OwnedFd, next_request_id: AtomicU64, @@ -139,7 +141,7 @@ pub(super) async fn init( neon_request_slots: cis.neon_request_slots, pageserver_client, cache, - submission_pipe_read_raw_fd: cis.submission_pipe_read_fd, + submission_pipe_read_fd: cis.submission_pipe_read_fd, next_request_id: AtomicU64::new(1), in_progress_table: RequestInProgressTable::new(), @@ -173,8 +175,7 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> { let mut idxbuf: [u8; 4] = [0; 4]; let mut submission_pipe_read = - PipeRead::from_raw_fd_checked(self.submission_pipe_read_raw_fd) - .expect("invalid pipe fd"); + PipeRead::try_from(self.submission_pipe_read_fd.as_raw_fd()).expect("invalid pipe fd"); loop { // Wait for a backend to ring the doorbell @@ -218,8 +219,8 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> { } } - fn request_common(&self, not_modified_since_lsn: Lsn) -> model::RequestCommon { - model::RequestCommon { + fn request_lsns(&self, not_modified_since_lsn: Lsn) -> model::ReadLsn { + model::ReadLsn { request_lsn: get_request_lsn(), not_modified_since_lsn, } @@ -246,8 +247,8 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> { match self .pageserver_client - .process_rel_exists_request(&model::RelExistsRequest { - common: self.request_common(not_modified_since), + 
.process_check_rel_exists_request(&model::CheckRelExistsRequest { + read_lsn: self.request_lsns(not_modified_since), rel, }) .await @@ -277,11 +278,11 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> { CacheResult::NotFound(lsn) => lsn, }; - let common = self.request_common(not_modified_since); + let read_lsn = self.request_lsns(not_modified_since); match self .pageserver_client - .process_rel_size_request(&model::RelSizeRequest { - common: common.clone(), + .process_get_rel_size_request(&model::GetRelSizeRequest { + read_lsn, rel: rel.clone(), }) .await @@ -333,8 +334,8 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> { match self .pageserver_client - .process_dbsize_request(&model::DbSizeRequest { - common: self.request_common(not_modified_since), + .process_get_dbsize_request(&model::GetDbSizeRequest { + read_lsn: self.request_lsns(not_modified_since), db_oid: req.db_oid, }) .await @@ -457,17 +458,19 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> { match self .pageserver_client .get_page(&model::GetPageRequest { - id: self.next_request_id.fetch_add(1, Ordering::Relaxed), - common: self.request_common(not_modified_since), + request_id: self.next_request_id.fetch_add(1, Ordering::Relaxed), + request_class: model::GetPageClass::Normal, + read_lsn: self.request_lsns(not_modified_since), rel: rel.clone(), - block_number: *blkno, - class: model::GetPageClass::Normal, + block_number: vec![*blkno], }) .await { - Ok(page_image) => { + Ok(page_images) => { // Write the received page image directly to the shared memory location // that the backend requested. 
+ assert!(page_images.len() == 1); + let page_image = page_images[0].clone(); let src: &[u8] = page_image.as_ref(); let len = std::cmp::min(src.len(), dest.bytes_total() as usize); unsafe { @@ -533,19 +536,21 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> { match self .pageserver_client .get_page(&model::GetPageRequest { - id: self.next_request_id.fetch_add(1, Ordering::Relaxed), - common: self.request_common(not_modified_since), + request_id: self.next_request_id.fetch_add(1, Ordering::Relaxed), + request_class: model::GetPageClass::Prefetch, + read_lsn: self.request_lsns(not_modified_since), rel: rel.clone(), - block_number: *blkno, - class: model::GetPageClass::Prefetch, + block_number: vec![*blkno], }) .await { - Ok(page_image) => { + Ok(page_images) => { trace!( "prefetch completed, remembering blk {} in rel {:?} in LFC", *blkno, rel ); + assert!(page_images.len() == 1); + let page_image = page_images[0].clone(); self.cache .remember_page(&rel, *blkno, page_image, not_modified_since, false) .await; diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 9f06fb4da8..c930753dc0 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -98,7 +98,6 @@ #define MB ((uint64)1024*1024) #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> lfc_chunk_size_log)) - #define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (lfc_blocks_per_chunk-1)) /* @@ -135,6 +134,15 @@ typedef struct FileCacheEntry #define N_COND_VARS 64 #define CV_WAIT_TIMEOUT 10 +#define MAX_PREWARM_WORKERS 8 + +typedef struct PrewarmWorkerState +{ + uint32 prewarmed_pages; + uint32 skipped_pages; + TimestampTz completed; +} PrewarmWorkerState; + typedef struct FileCacheControl { uint64 generation; /* generation is needed to handle correct hash @@ -156,25 +164,45 @@ typedef struct FileCacheControl dlist_head holes; /* double linked list of punched holes */ HyperLogLogState wss_estimation; /* estimation of working set size */ ConditionVariable cv[N_COND_VARS]; /* turnstile of 
condition variables */ + PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS]; + size_t n_prewarm_workers; + size_t n_prewarm_entries; + size_t total_prewarm_pages; + size_t prewarm_batch; + bool prewarm_active; + bool prewarm_canceled; + dsm_handle prewarm_lfc_state_handle; } FileCacheControl; -bool lfc_store_prefetch_result; +#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc + +#define FILE_CACHE_STATE_BITMAP(fcs) ((uint8*)&(fcs)->chunks[(fcs)->n_chunks]) +#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * lfc_blocks_per_chunk)+7)/8) +#define FILE_CACHE_STATE_SIZE(fcs) (sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8) static HTAB *lfc_hash; static int lfc_desc = -1; static LWLockId lfc_lock; static int lfc_max_size; int lfc_size_limit; +static int lfc_prewarm_limit; +static int lfc_prewarm_batch; static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG; static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK; char *lfc_path; static uint64 lfc_generation; static FileCacheControl *lfc_ctl; +static bool lfc_do_prewarm; static shmem_startup_hook_type prev_shmem_startup_hook; #if PG_VERSION_NUM>=150000 static shmem_request_hook_type prev_shmem_request_hook; #endif +bool lfc_store_prefetch_result; +bool lfc_prewarm_update_ws_estimation; + +bool AmPrewarmWorker; + #define LFC_ENABLED() (lfc_ctl->limit != 0) /* @@ -500,6 +528,17 @@ lfc_init(void) NULL, NULL); + DefineCustomBoolVariable("neon.prewarm_update_ws_estimation", + "Consider prewarmed pages for working set estimation", + NULL, + &lfc_prewarm_update_ws_estimation, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + DefineCustomIntVariable("neon.max_file_cache_size", "Maximal size of Neon local file cache", NULL, @@ -550,6 +589,32 @@ lfc_init(void) lfc_change_chunk_size, NULL); + DefineCustomIntVariable("neon.file_cache_prewarm_limit", + "Maximal number of prewarmed chunks", + NULL, + 
&lfc_prewarm_limit, + INT_MAX, /* no limit by default */ + 0, + INT_MAX, + PGC_SIGHUP, + 0, + NULL, + NULL, + NULL); + + DefineCustomIntVariable("neon.file_cache_prewarm_batch", + "Number of pages retrivied by prewarm from page server", + NULL, + &lfc_prewarm_batch, + 64, + 1, + INT_MAX, + PGC_SIGHUP, + 0, + NULL, + NULL, + NULL); + if (lfc_max_size == 0) return; @@ -563,6 +628,357 @@ lfc_init(void) #endif } +FileCacheState* +lfc_get_state(size_t max_entries) +{ + FileCacheState* fcs = NULL; + + if (lfc_maybe_disabled() || max_entries == 0) /* fast exit if file cache is disabled */ + return NULL; + + LWLockAcquire(lfc_lock, LW_SHARED); + + if (LFC_ENABLED()) + { + dlist_iter iter; + size_t i = 0; + uint8* bitmap; + size_t n_pages = 0; + size_t n_entries = Min(max_entries, lfc_ctl->used - lfc_ctl->pinned); + size_t state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries); + fcs = (FileCacheState*)palloc0(state_size); + SET_VARSIZE(fcs, state_size); + fcs->magic = FILE_CACHE_STATE_MAGIC; + fcs->chunk_size_log = lfc_chunk_size_log; + fcs->n_chunks = n_entries; + bitmap = FILE_CACHE_STATE_BITMAP(fcs); + + dlist_reverse_foreach(iter, &lfc_ctl->lru) + { + FileCacheEntry *entry = dlist_container(FileCacheEntry, list_node, iter.cur); + fcs->chunks[i] = entry->key; + for (int j = 0; j < lfc_blocks_per_chunk; j++) + { + if (GET_STATE(entry, j) != UNAVAILABLE) + { + BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j); + n_pages += 1; + } + } + if (++i == n_entries) + break; + } + Assert(i == n_entries); + fcs->n_pages = n_pages; + Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages); + elog(LOG, "LFC: save state of %d chunks %d pages", (int)n_entries, (int)n_pages); + } + + LWLockRelease(lfc_lock); + + return fcs; +} + +/* + * Prewarm LFC cache to the specified state. It uses lfc_prefetch function to load prewarmed pages without holding shared buffer lock + * and avoid race conditions with other backends. 
+ */ +void +lfc_prewarm(FileCacheState* fcs, uint32 n_workers) +{ + size_t fcs_chunk_size_log; + size_t n_entries; + size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size); + size_t fcs_size; + dsm_segment *seg; + BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS]; + + + if (!lfc_ensure_opened()) + return; + + if (prewarm_batch == 0 || lfc_prewarm_limit == 0 || n_workers == 0) + { + elog(LOG, "LFC: prewarm is disabled"); + return; + } + + if (n_workers > MAX_PREWARM_WORKERS) + { + elog(ERROR, "LFC: Too much prewarm workers, maximum is %d", MAX_PREWARM_WORKERS); + } + + if (fcs == NULL || fcs->n_chunks == 0) + { + elog(LOG, "LFC: nothing to prewarm"); + return; + } + + if (fcs->magic != FILE_CACHE_STATE_MAGIC) + { + elog(ERROR, "LFC: Invalid file cache state magic: %X", fcs->magic); + } + + fcs_size = VARSIZE(fcs); + if (FILE_CACHE_STATE_SIZE(fcs) != fcs_size) + { + elog(ERROR, "LFC: Invalid file cache state size: %u vs. %u", (unsigned)FILE_CACHE_STATE_SIZE(fcs), VARSIZE(fcs)); + } + + fcs_chunk_size_log = fcs->chunk_size_log; + if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG) + { + elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log); + } + + n_entries = Min(fcs->n_chunks, lfc_prewarm_limit); + Assert(n_entries != 0); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + /* Do not prewarm more entries than LFC limit */ + if (lfc_ctl->limit <= lfc_ctl->size) + { + elog(LOG, "LFC: skip prewarm because LFC is already filled"); + LWLockRelease(lfc_lock); + return; + } + + if (lfc_ctl->prewarm_active) + { + LWLockRelease(lfc_lock); + elog(ERROR, "LFC: skip prewarm because another prewarm is still active"); + } + lfc_ctl->n_prewarm_entries = n_entries; + lfc_ctl->n_prewarm_workers = n_workers; + lfc_ctl->prewarm_active = true; + lfc_ctl->prewarm_canceled = false; + lfc_ctl->prewarm_batch = prewarm_batch; + memset(lfc_ctl->prewarm_workers, 0, n_workers*sizeof(PrewarmWorkerState)); + + LWLockRelease(lfc_lock); + + /* Calculate total number of 
pages to be prewarmed */ + lfc_ctl->total_prewarm_pages = fcs->n_pages; + + seg = dsm_create(fcs_size, 0); + memcpy(dsm_segment_address(seg), fcs, fcs_size); + lfc_ctl->prewarm_lfc_state_handle = dsm_segment_handle(seg); + + /* Spawn background workers */ + for (uint32 i = 0; i < n_workers; i++) + { + BackgroundWorker worker = {0}; + + worker.bgw_flags = BGWORKER_SHMEM_ACCESS; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + worker.bgw_restart_time = BGW_NEVER_RESTART; + strcpy(worker.bgw_library_name, "neon"); + strcpy(worker.bgw_function_name, "lfc_prewarm_main"); + snprintf(worker.bgw_name, BGW_MAXLEN, "LFC prewarm worker %d", i+1); + strcpy(worker.bgw_type, "LFC prewarm worker"); + worker.bgw_main_arg = Int32GetDatum(i); + /* must set notify PID to wait for shutdown */ + worker.bgw_notify_pid = MyProcPid; + + if (!RegisterDynamicBackgroundWorker(&worker, &bgw_handle[i])) + { + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("LFC: registering dynamic bgworker prewarm failed"), + errhint("Consider increasing the configuration parameter \"%s\".", "max_worker_processes"))); + n_workers = i; + lfc_ctl->prewarm_canceled = true; + break; + } + } + + for (uint32 i = 0; i < n_workers; i++) + { + bool interrupted; + do + { + interrupted = false; + PG_TRY(); + { + BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]); + if (status != BGWH_STOPPED && status != BGWH_POSTMASTER_DIED) + { + elog(LOG, "LFC: Unexpected status of prewarm worker termination: %d", status); + } + } + PG_CATCH(); + { + elog(LOG, "LFC: cancel prewarm"); + lfc_ctl->prewarm_canceled = true; + interrupted = true; + } + PG_END_TRY(); + } while (interrupted); + + if (!lfc_ctl->prewarm_workers[i].completed) + { + /* Background worker doesn't set completion time: it means that it was abnormally terminated */ + elog(LOG, "LFC: prewarm worker %d failed", i+1); + /* Set completion time to prevent get_prewarm_info from considering this worker as active */ + 
lfc_ctl->prewarm_workers[i].completed = GetCurrentTimestamp(); + } + } + dsm_detach(seg); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + lfc_ctl->prewarm_active = false; + LWLockRelease(lfc_lock); +} + +void +lfc_prewarm_main(Datum main_arg) +{ + size_t snd_idx = 0, rcv_idx = 0; + size_t n_sent = 0, n_received = 0; + size_t fcs_chunk_size_log; + size_t max_prefetch_pages; + size_t prewarm_batch; + size_t n_workers; + dsm_segment *seg; + FileCacheState* fcs; + uint8* bitmap; + BufferTag tag; + PrewarmWorkerState* ws; + uint32 worker_id = DatumGetInt32(main_arg); + + AmPrewarmWorker = true; + + pqsignal(SIGTERM, die); + BackgroundWorkerUnblockSignals(); + + seg = dsm_attach(lfc_ctl->prewarm_lfc_state_handle); + if (seg == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not map dynamic shared memory segment"))); + + fcs = (FileCacheState*) dsm_segment_address(seg); + prewarm_batch = lfc_ctl->prewarm_batch; + fcs_chunk_size_log = fcs->chunk_size_log; + n_workers = lfc_ctl->n_prewarm_workers; + max_prefetch_pages = lfc_ctl->n_prewarm_entries << fcs_chunk_size_log; + ws = &lfc_ctl->prewarm_workers[worker_id]; + bitmap = FILE_CACHE_STATE_BITMAP(fcs); + + /* enable prefetch in LFC */ + lfc_store_prefetch_result = true; + lfc_do_prewarm = true; /* Flag for lfc_prefetch preventing replacement of existed entries if LFC cache is full */ + + elog(LOG, "LFC: worker %d start prewarming", worker_id); + while (!lfc_ctl->prewarm_canceled) + { + if (snd_idx < max_prefetch_pages) + { + if ((snd_idx >> fcs_chunk_size_log) % n_workers != worker_id) + { + /* If there are multiple workers, split chunks between them */ + snd_idx += 1 << fcs_chunk_size_log; + } + else + { + if (BITMAP_ISSET(bitmap, snd_idx)) + { + tag = fcs->chunks[snd_idx >> fcs_chunk_size_log]; + tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1); + if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum)) + { + 
(void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); + n_sent += 1; + } + else + { + ws->skipped_pages += 1; + BITMAP_CLR(bitmap, snd_idx); + } + } + snd_idx += 1; + } + } + if (n_sent >= n_received + prewarm_batch || snd_idx == max_prefetch_pages) + { + if (n_received == n_sent && snd_idx == max_prefetch_pages) + { + break; + } + if ((rcv_idx >> fcs_chunk_size_log) % n_workers != worker_id) + { + /* Skip chunks processed by other workers */ + rcv_idx += 1 << fcs_chunk_size_log; + continue; + } + + /* Locate next block to prefetch */ + while (!BITMAP_ISSET(bitmap, rcv_idx)) + { + rcv_idx += 1; + } + tag = fcs->chunks[rcv_idx >> fcs_chunk_size_log]; + tag.blockNum += rcv_idx & ((1 << fcs_chunk_size_log) - 1); + if (communicator_prefetch_receive(tag)) + { + ws->prewarmed_pages += 1; + } + else + { + ws->skipped_pages += 1; + } + rcv_idx += 1; + n_received += 1; + } + } + /* No need to perform prefetch cleanup here because prewarm worker will be terminated and + * connection to PS dropped just after return from this function. 
+ */ + Assert(n_sent == n_received || lfc_ctl->prewarm_canceled); + elog(LOG, "LFC: worker %d complete prewarming: loaded %ld pages", worker_id, (long)n_received); + lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp(); +} + +void +lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks) +{ + BufferTag tag; + FileCacheEntry *entry; + uint32 hash; + + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + return; + + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + if (LFC_ENABLED()) + { + for (BlockNumber blkno = 0; blkno < nblocks; blkno += lfc_blocks_per_chunk) + { + tag.blockNum = blkno; + hash = get_hash_value(lfc_hash, &tag); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + if (entry != NULL) + { + for (int i = 0; i < lfc_blocks_per_chunk; i++) + { + if (GET_STATE(entry, i) == AVAILABLE) + { + lfc_ctl->used_pages -= 1; + SET_STATE(entry, i, UNAVAILABLE); + } + } + } + } + } + LWLockRelease(lfc_lock); +} + /* * Check if page is present in the cache. * Returns true if page is found in local cache. @@ -1001,8 +1417,11 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) * If we can't (e.g. because all other slots are being accessed) * then we will remove this entry from the hash and continue * on to the next chunk, as we may not exceed the limit. + * + * While prewarming LFC we do not want to replace existing entries, + * so we just stop prewarm if LFC cache is full. 
*/ - else if (!dlist_is_empty(&lfc_ctl->lru)) + else if (!dlist_is_empty(&lfc_ctl->lru) && !lfc_do_prewarm) { /* Cache overflow: evict least recently used chunk */ FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, @@ -1026,6 +1445,7 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash) /* Can't add this chunk - we don't have the space for it */ hash_search_with_hash_value(lfc_hash, &entry->key, hash, HASH_REMOVE, NULL); + lfc_ctl->prewarm_canceled = true; /* cancel prewarm if LFC limit is reached */ return false; } @@ -1112,9 +1532,11 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - tag.blockNum = blkno; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); - + if (lfc_prewarm_update_ws_estimation) + { + tag.blockNum = blkno; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } if (found) { state = GET_STATE(entry, chunk_offs); @@ -1748,3 +2170,82 @@ approximate_working_set_size(PG_FUNCTION_ARGS) } PG_RETURN_NULL(); } + +PG_FUNCTION_INFO_V1(get_local_cache_state); + +Datum +get_local_cache_state(PG_FUNCTION_ARGS) +{ + size_t max_entries = PG_ARGISNULL(0) ? 
lfc_prewarm_limit : PG_GETARG_INT32(0); + FileCacheState* fcs = lfc_get_state(max_entries); + if (fcs != NULL) + PG_RETURN_BYTEA_P((bytea*)fcs); + else + PG_RETURN_NULL(); +} + +PG_FUNCTION_INFO_V1(prewarm_local_cache); + +Datum +prewarm_local_cache(PG_FUNCTION_ARGS) +{ + bytea* state = PG_GETARG_BYTEA_P(0); /* need an unpacked 4-byte varlena header: the value is overlaid on FileCacheState and read with VARSIZE() */ + uint32 n_workers = PG_GETARG_INT32(1); + FileCacheState* fcs = (FileCacheState*)state; + + lfc_prewarm(fcs, n_workers); + + PG_RETURN_NULL(); +} + +PG_FUNCTION_INFO_V1(get_prewarm_info); + +Datum +get_prewarm_info(PG_FUNCTION_ARGS) +{ + Datum values[4]; + bool nulls[4]; + TupleDesc tupdesc; + uint32 prewarmed_pages = 0; + uint32 skipped_pages = 0; + uint32 active_workers = 0; + uint32 total_pages; + size_t n_workers; + + if (lfc_size_limit == 0) + PG_RETURN_NULL(); + + LWLockAcquire(lfc_lock, LW_SHARED); + if (!lfc_ctl || lfc_ctl->n_prewarm_workers == 0) + { + LWLockRelease(lfc_lock); + PG_RETURN_NULL(); + } + n_workers = lfc_ctl->n_prewarm_workers; + total_pages = lfc_ctl->total_prewarm_pages; + for (size_t i = 0; i < n_workers; i++) + { + PrewarmWorkerState* ws = &lfc_ctl->prewarm_workers[i]; + prewarmed_pages += ws->prewarmed_pages; + skipped_pages += ws->skipped_pages; + active_workers += ws->completed == 0; /* completed stays 0 until the worker records its completion time, so 0 means still active */ + } + LWLockRelease(lfc_lock); + + tupdesc = CreateTemplateTupleDesc(4); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_pages", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prewarmed_pages", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "skipped_pages", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "active_workers", INT4OID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(total_pages); + values[1] = Int32GetDatum(prewarmed_pages); + values[2] = Int32GetDatum(skipped_pages); + values[3] = Int32GetDatum(active_workers); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + diff --git 
a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h index 8c31738484..a392063862 100644 --- a/pgxn/neon/file_cache.h +++ b/pgxn/neon/file_cache.h @@ -13,12 +13,24 @@ #include "neon_pgversioncompat.h" +typedef struct FileCacheState +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + uint32 magic; + uint32 n_chunks; + uint32 n_pages; + uint16 chunk_size_log; + BufferTag chunks[FLEXIBLE_ARRAY_MEMBER]; + /* followed by bitmap */ +} FileCacheState; + /* GUCs */ extern bool lfc_store_prefetch_result; extern int lfc_size_limit; extern char *lfc_path; /* functions for local file cache */ +extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks); extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *const *buffers, BlockNumber nblocks); @@ -34,7 +46,10 @@ extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, extern void lfc_init(void); extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, const void* buffer, XLogRecPtr lsn); +extern FileCacheState* lfc_get_state(size_t max_entries); +extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers); +PGDLLEXPORT void lfc_prewarm_main(Datum main_arg); static inline bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 38172d2a11..902f471dd9 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -26,6 +26,7 @@ #include "portability/instr_time.h" #include "postmaster/interrupt.h" #include "storage/buf_internals.h" +#include "storage/fd.h" #include "storage/ipc.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" @@ -79,6 +80,7 @@ int neon_protocol_version = 3; static int neon_compute_mode = 0; static int max_reconnect_attempts = 60; static int stripe_size; +static int max_sockets; static int pageserver_response_log_timeout = 10000; /* 2.5 minutes. 
A bit higher than highest default TCP retransmission timeout */ @@ -384,6 +386,13 @@ load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p) pageserver_disconnect(i); } pagestore_local_counter = end_update_counter; + + /* Reserve file descriptors for sockets */ + while (max_sockets < num_shards) + { + max_sockets += 1; + ReserveExternalFD(); + } } if (num_shards_p) @@ -472,7 +481,6 @@ pageserver_connect(shardno_t shard_no, int elevel) now = GetCurrentTimestamp(); us_since_last_attempt = (int64) (now - shard->last_reconnect_time); - shard->last_reconnect_time = now; /* * Make sure we don't do exponential backoff with a constant multiplier @@ -486,14 +494,23 @@ pageserver_connect(shardno_t shard_no, int elevel) /* * If we did other tasks between reconnect attempts, then we won't * need to wait as long as a full delay. + * + * This is a loop to protect against interrupted sleeps. */ - if (us_since_last_attempt < shard->delay_us) + while (us_since_last_attempt < shard->delay_us) { pg_usleep(shard->delay_us - us_since_last_attempt); + + /* At least we should handle cancellations here */ + CHECK_FOR_INTERRUPTS(); + + now = GetCurrentTimestamp(); + us_since_last_attempt = (int64) (now - shard->last_reconnect_time); } /* update the delay metric */ shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC); + shard->last_reconnect_time = now; /* * Connect using the connection string we got from the @@ -784,8 +801,8 @@ pageserver_connect(shardno_t shard_no, int elevel) default: neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state); } - /* This shouldn't be hit */ - Assert(false); + + pg_unreachable(); } static void @@ -925,6 +942,7 @@ retry: int port; int sndbuf; int recvbuf; + uint64* max_wait; get_local_port(PQsocket(pageserver_conn), &port); get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf); @@ -935,7 +953,10 @@ retry: shard->nrequests_sent, shard->nresponses_received, port, sndbuf, 
recvbuf, pageserver_conn->inStart, pageserver_conn->inEnd); shard->receive_last_log_time = now; + MyNeonCounters->compute_getpage_stuck_requests_total += !shard->receive_logged; shard->receive_logged = true; + max_wait = &MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms; + *max_wait = Max(*max_wait, INSTR_TIME_GET_MILLISEC(since_start)); } /* @@ -958,6 +979,7 @@ retry: get_local_port(PQsocket(pageserver_conn), &port); neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting (socket port=%d)", INSTR_TIME_GET_DOUBLE(since_start), port); + MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0; pageserver_disconnect(shard_no); return -1; } @@ -981,6 +1003,7 @@ retry: INSTR_TIME_SET_ZERO(shard->receive_start_time); INSTR_TIME_SET_ZERO(shard->receive_last_log_time); shard->receive_logged = false; + MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0; return ret; } diff --git a/pgxn/neon/neon--1.5--1.6.sql b/pgxn/neon/neon--1.5--1.6.sql new file mode 100644 index 0000000000..c05f0f87aa --- /dev/null +++ b/pgxn/neon/neon--1.5--1.6.sql @@ -0,0 +1,22 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.6'" to load this file. 
\quit + +CREATE FUNCTION get_prewarm_info(out total_pages integer, out prewarmed_pages integer, out skipped_pages integer, out active_workers integer) +RETURNS record +AS 'MODULE_PATHNAME', 'get_prewarm_info' +LANGUAGE C STRICT +PARALLEL SAFE; + +CREATE FUNCTION get_local_cache_state(max_chunks integer default null) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_local_cache_state' +LANGUAGE C +PARALLEL UNSAFE; + +CREATE FUNCTION prewarm_local_cache(state bytea, n_workers integer default 1) +RETURNS void +AS 'MODULE_PATHNAME', 'prewarm_local_cache' +LANGUAGE C STRICT +PARALLEL UNSAFE; + + + diff --git a/pgxn/neon/neon--1.6--1.5.sql b/pgxn/neon/neon--1.6--1.5.sql new file mode 100644 index 0000000000..57512980f5 --- /dev/null +++ b/pgxn/neon/neon--1.6--1.5.sql @@ -0,0 +1,7 @@ +DROP FUNCTION IF EXISTS get_prewarm_info(out total_pages integer, out prewarmed_pages integer, out skipped_pages integer, out active_workers integer); + +DROP FUNCTION IF EXISTS get_local_cache_state(max_chunks integer); + +DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea, n_workers integer); + + diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 792e9fa2ff..149ed5ebed 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -24,6 +24,8 @@ extern int wal_acceptor_connection_timeout; extern int readahead_getpage_pull_timeout_ms; extern bool disable_wal_prev_lsn_checks; +extern bool AmPrewarmWorker; + #if PG_MAJORVERSION_NUM >= 17 extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; extern uint32 WAIT_EVENT_NEON_LFC_READ; diff --git a/pgxn/neon/neon_lwlsncache.c b/pgxn/neon/neon_lwlsncache.c index 6959da55cb..a8cfa0f825 100644 --- a/pgxn/neon/neon_lwlsncache.c +++ b/pgxn/neon/neon_lwlsncache.c @@ -4,6 +4,7 @@ #include "miscadmin.h" #include "access/xlog.h" +#include "access/xlog_internal.h" #include "storage/ipc.h" #include "storage/shmem.h" #include "storage/buf_internals.h" @@ -396,9 +397,10 @@ SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn, XLogRecPtr 
neon_set_lwlsn_block_range(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks) { - if (lsn < FirstNormalUnloggedLSN || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0) + if (lsn == InvalidXLogRecPtr || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0) return lsn; + Assert(lsn >= WalSegMinSize); LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); lsn = SetLastWrittenLSNForBlockRangeInternal(lsn, rlocator, forknum, from, n_blocks); LWLockRelease(LastWrittenLsnLock); @@ -435,7 +437,6 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, NInfoGetRelNumber(relfilenode) == InvalidOid) return InvalidXLogRecPtr; - BufTagInit(key, relNumber, forknum, blockno, spcOid, dbOid); LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); @@ -444,6 +445,10 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode, { XLogRecPtr lsn = lsns[i]; + if (lsn == InvalidXLogRecPtr) + continue; + + Assert(lsn >= WalSegMinSize); key.blockNum = blockno + i; entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); if (found) diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index 05db187076..c77d99d636 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -148,7 +148,7 @@ histogram_to_metrics(IOHistogram histogram, static metric_t * neon_perf_counters_to_metrics(neon_per_backend_counters *counters) { -#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10) +#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 12) metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); int i = 0; @@ -166,6 +166,8 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters) APPEND_METRIC(getpage_prefetch_requests_total); APPEND_METRIC(getpage_sync_requests_total); + APPEND_METRIC(compute_getpage_stuck_requests_total); + APPEND_METRIC(compute_getpage_max_inflight_stuck_time_ms); APPEND_METRIC(getpage_prefetch_misses_total); 
APPEND_METRIC(getpage_prefetch_discards_total); APPEND_METRIC(pageserver_requests_sent_total); @@ -294,6 +296,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) totals.file_cache_hits_total += counters->file_cache_hits_total; histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); + + totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total; + totals.compute_getpage_max_inflight_stuck_time_ms = Max( + totals.compute_getpage_max_inflight_stuck_time_ms, + counters->compute_getpage_max_inflight_stuck_time_ms); } metrics = neon_perf_counters_to_metrics(&totals); diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 5f5330bb69..10cf094d4a 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -57,6 +57,18 @@ typedef struct uint64 getpage_prefetch_requests_total; uint64 getpage_sync_requests_total; + /* + * Total number of Getpage requests left without an answer for more than + * pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout + */ + uint64 compute_getpage_stuck_requests_total; + + /* + * Longest waiting time for active stuck requests. 
If a stuck request gets a + * response or disconnects, this metric is updated + */ + uint64 compute_getpage_max_inflight_stuck_time_ms; + /* * Total number of readahead misses; consisting of either prefetches that * don't satisfy the LSN bounds, or cases where no readahead was issued diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index f2d6292768..b27b80e5d7 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -90,7 +90,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, #define InvalidRelFileNumber InvalidOid -#define SMgrRelGetRelInfo(reln) \ +#define SMgrRelGetRelInfo(reln) \ (reln->smgr_rnode.node) #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers @@ -152,6 +152,12 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers #endif +#define NRelFileInfoInvalidate(rinfo) do { \ + NInfoGetSpcOid(rinfo) = InvalidOid; \ + NInfoGetDbOid(rinfo) = InvalidOid; \ + NInfoGetRelNumber(rinfo) = InvalidRelFileNumber; \ + } while (0) + #if PG_MAJORVERSION_NUM < 17 #define ProcNumber BackendId #define INVALID_PROC_NUMBER InvalidBackendId diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index be2c4ddf79..d5e3a38dbb 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -150,7 +150,7 @@ NeonWALReaderFree(NeonWALReader *state) * fetched from timeline 'tli'. * * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error - * occurs, in which case 'err' has the desciption. Error always closes remote + * occurs, in which case 'err' has the description. Error always closes remote * connection, if there was any, so socket subscription should be removed. 
* * NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 85f49ab593..445f1e9ac8 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -105,7 +105,7 @@ typedef enum UNLOGGED_BUILD_NOT_PERMANENT } UnloggedBuildPhase; -static SMgrRelation unlogged_build_rel = NULL; +static NRelFileInfo unlogged_build_rel_info; static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); @@ -927,16 +927,19 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { case 0: neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdextend(reln, forkNum, blkno, buffer, skipFsync); + return; + } break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdextend(reln, forkNum, blkno, buffer, skipFsync); - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer); return; default: @@ -1027,21 +1030,19 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block, { case 0: neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); + return; + } break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - { - for (int i = 0; i < nblocks; i++) - { - lfc_write(InfoFromSMgrRel(reln), 
forkNum, blocknum + i, buffer.data); - } - } return; default: @@ -1217,7 +1218,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } if (!neon_enable_new_communicator) - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); return false; } @@ -1260,7 +1261,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); if (!neon_enable_new_communicator) - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); return false; } @@ -1305,7 +1306,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, neon_log(SmgrTrace, "writeback noop"); if (!neon_enable_new_communicator) - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1331,105 +1332,24 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); } -#if PG_MAJORVERSION_NUM < 17 -/* - * neon_read() -- Read the specified block from a relation. 
- */ -#if PG_MAJORVERSION_NUM < 16 -static void -neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) -#else -static void -neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) -#endif -{ - neon_request_lsns request_lsns; - bits8 present; - void *bufferp; - bool prefetch_hit; - - switch (reln->smgr_relpersistence) - { - case 0: - neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdread(reln, forkNum, blkno, buffer); - return; - - default: - neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - present = 0; - bufferp = buffer; - - if (neon_enable_new_communicator) - { - communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forkNum, blkno, - (void *) &buffer, 1); - } - else - { - prefetch_hit = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present); - if (prefetch_hit) - { - /* Prefetch hit */ - return; - } - - /* Try to read from local file cache */ - if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) - { - MyNeonCounters->file_cache_hits_total++; - return; - } - - /* - * Try to receive prefetch results once again just to make sure we - * don't leave the smgr code while the OS might still have buffered - * bytes. 
- */ - communicator_prefetch_pump_state(false); - - neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); - - prefetch_hit = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present); - - if (prefetch_hit) - { - /* Prefetch hit */ - return; - } - - /* Try to read from local file cache */ - if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) - { - MyNeonCounters->file_cache_hits_total++; - return; - } - - neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); - - /* - * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. - */ - communicator_prefetch_pump_state(false); - } - #ifdef DEBUG_COMPARE_LOCAL +static void +compare_with_local(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void* buffer, XLogRecPtr request_lsn) +{ if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { char pageserver_masked[BLCKSZ]; PGIOAlignedBlock mdbuf; PGIOAlignedBlock mdbuf_masked; - XLogRecPtr request_lsn = request_lsns.request_lsn; +#if PG_MAJORVERSION_NUM >= 17 + { + void* mdbuffers[1] = { mdbuf.data }; + mdreadv(reln, forkNum, blkno, mdbuffers, 1); + } +#else mdread(reln, forkNum, blkno, mdbuf.data); +#endif memcpy(pageserver_masked, buffer, BLCKSZ); memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); @@ -1493,11 +1413,119 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } } } +} +#endif + + +#if PG_MAJORVERSION_NUM < 17 + +/* + * neon_read() -- Read the specified block from a relation. 
+ */ +#if PG_MAJORVERSION_NUM < 16 +static void +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) +#else +static void +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) +#endif +{ + neon_request_lsns request_lsns; + bits8 present; + void *bufferp; + + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdread(reln, forkNum, blkno, buffer); + return; + } + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdread(reln, forkNum, blkno, buffer); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (neon_enable_new_communicator) + { + communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forkNum, blkno, + (void *) &buffer, 1); + } + else + { + /* Try to read PS results if they are available */ + communicator_prefetch_pump_state(); + + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); + + present = 0; + bufferp = buffer; + if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present)) + { + /* Prefetch hit */ +#ifdef DEBUG_COMPARE_LOCAL + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); +#else + return; +#endif + } + + /* Try to read from local file cache */ + if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) + { + MyNeonCounters->file_cache_hits_total++; +#ifdef DEBUG_COMPARE_LOCAL + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); +#else + return; +#endif + } + + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); + + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
+ */ + communicator_prefetch_pump_state(); + } + +#ifdef DEBUG_COMPARE_LOCAL + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); #endif } #endif /* PG_MAJORVERSION_NUM <= 16 */ #if PG_MAJORVERSION_NUM >= 17 + +#ifdef DEBUG_COMPARE_LOCAL +static void +compare_with_localv(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void** buffers, BlockNumber nblocks, neon_request_lsns* request_lsns, bits8* read_pages) +{ + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + { + for (BlockNumber i = 0; i < nblocks; i++) + { + if (BITMAP_ISSET(read_pages, i)) + { + compare_with_local(reln, forkNum, blkno + i, buffers[i], request_lsns[i].request_lsn); + } + } + } +} +#endif + + static void neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks) @@ -1511,8 +1539,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { case 0: neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdreadv(reln, forknum, blocknum, buffers, nblocks); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1530,7 +1564,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* Try to read PS results if they are available */ if (!neon_enable_new_communicator) - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); memset(read_pages, 0, sizeof(read_pages)); @@ -1548,8 +1582,13 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, blocknum, request_lsns, nblocks, buffers, read_pages); +#ifdef DEBUG_COMPARE_LOCAL + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); + memset(read_pages, 0, sizeof(read_pages)); +#else if (prefetch_result == nblocks) return; +#endif /* Try to read from local file cache */ lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), 
forknum, blocknum, buffers, @@ -1558,9 +1597,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (lfc_result > 0) MyNeonCounters->file_cache_hits_total += lfc_result; +#ifdef DEBUG_COMPARE_LOCAL + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); + memset(read_pages, 0, sizeof(read_pages)); +#else /* Read all blocks from LFC, so we're done */ if (prefetch_result + lfc_result == nblocks) return; +#endif communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, buffers, nblocks, read_pages); @@ -1568,95 +1612,12 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. */ - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); } #ifdef DEBUG_COMPARE_LOCAL - if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) - { - char pageserver_masked[BLCKSZ]; - PGIOAlignedBlock mdbuf; - PGIOAlignedBlock mdbuf_masked; - XLogRecPtr request_lsn = request_lsns->request_lsn; - - for (int i = 0; i < nblocks; i++) - { - BlockNumber blkno = blocknum + i; - if (!BITMAP_ISSET(read_pages, i)) - continue; - -#if PG_MAJORVERSION_NUM >= 17 - { - void* mdbuffers[1] = { mdbuf.data }; - mdreadv(reln, forknum, blkno, mdbuffers, 1); - } -#else - mdread(reln, forknum, blkno, mdbuf.data); -#endif - - memcpy(pageserver_masked, buffers[i], BLCKSZ); - memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); - - if (PageIsNew((Page) mdbuf.data)) - { - if (!PageIsNew((Page) pageserver_masked)) - { - neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(buffers[i])); - } - } - else if (PageIsNew((Page) buffers[i])) - { - neon_log(PANIC, "page is new in 
Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf.data)); - } - else if (PageGetSpecialSize(mdbuf.data) == 0) - { - /* assume heap */ - RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked.data, blkno); - RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - - if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) - { - neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked.data), - hexdump_page(pageserver_masked)); - } - } - else if (PageGetSpecialSize(mdbuf.data) == MAXALIGN(sizeof(BTPageOpaqueData))) - { - if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf.data))->btpo_cycleid < MAX_BT_CYCLE_ID) - { - /* assume btree */ - RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked.data, blkno); - RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - - if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) - { - neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked.data), - hexdump_page(pageserver_masked)); - } - } - } - } - } + memset(read_pages, 0xFF, sizeof(read_pages)); + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); #endif } #endif @@ -1727,6 +1688,15 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { +#if 
PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); +#else + mdwrite(reln, forknum, blocknum, buffer, skipFsync); +#endif + return; + } break; case RELPERSISTENCE_TEMP: @@ -1736,9 +1706,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo #else mdwrite(reln, forknum, blocknum, buffer, skipFsync); #endif - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); @@ -1760,7 +1727,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo { lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); } #ifdef DEBUG_COMPARE_LOCAL @@ -1806,14 +1773,16 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); + return; + } break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); @@ -1834,7 +1803,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, { lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); } #ifdef DEBUG_COMPARE_LOCAL @@ -1861,6 +1830,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) 
break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + return mdnblocks(reln, forknum); + } break; case RELPERSISTENCE_TEMP: @@ -1942,6 +1915,11 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdtruncate(reln, forknum, old_blocks, nblocks); + return; + } break; case RELPERSISTENCE_TEMP: @@ -2032,7 +2010,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); if (!neon_enable_new_communicator) - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -2090,7 +2068,6 @@ neon_start_unlogged_build(SMgrRelation reln) */ if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) neon_log(ERROR, "unlogged relation build is already in progress"); - Assert(unlogged_build_rel == NULL); ereport(SmgrTrace, (errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u", @@ -2107,7 +2084,7 @@ neon_start_unlogged_build(SMgrRelation reln) case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: - unlogged_build_rel = reln; + unlogged_build_rel_info = InfoFromSMgrRel(reln); unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; #ifdef DEBUG_COMPARE_LOCAL if (!IsParallelWorker()) @@ -2119,15 +2096,18 @@ neon_start_unlogged_build(SMgrRelation reln) neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } +#if PG_MAJORVERSION_NUM >= 17 + /* + * We have to disable this check for pg14-16 because sorted build of GIST index requires + * to perform unlogged build several times + */ if (smgrnblocks(reln, MAIN_FORKNUM) != 0) neon_log(ERROR, "cannot perform unlogged index build, index is not empty "); +#endif - unlogged_build_rel = reln; + unlogged_build_rel_info = InfoFromSMgrRel(reln); unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; 
- /* Make the relation look like it's unlogged */ - reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; - /* * Create the local file. In a parallel build, the leader is expected to * call this first and do it. @@ -2154,17 +2134,16 @@ neon_start_unlogged_build(SMgrRelation reln) static void neon_finish_unlogged_build_phase_1(SMgrRelation reln) { - Assert(unlogged_build_rel == reln); + Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))); ereport(SmgrTrace, (errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u", - RelFileInfoFmt(InfoFromSMgrRel(reln))))); + RelFileInfoFmt((unlogged_build_rel_info))))); if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) return; Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); - Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); /* * In a parallel build, (only) the leader process performs the 2nd @@ -2172,7 +2151,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) */ if (IsParallelWorker()) { - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } else @@ -2193,11 +2172,11 @@ neon_end_unlogged_build(SMgrRelation reln) { NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln); - Assert(unlogged_build_rel == reln); + Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))); ereport(SmgrTrace, (errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u", - RelFileInfoFmt(InfoFromNInfoB(rinfob))))); + RelFileInfoFmt(unlogged_build_rel_info)))); if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) { @@ -2205,7 +2184,6 @@ neon_end_unlogged_build(SMgrRelation reln) BlockNumber nblocks; Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); - Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); /* * Update the last-written LSN cache. 
@@ -2226,9 +2204,6 @@ neon_end_unlogged_build(SMgrRelation reln) InfoFromNInfoB(rinfob), MAIN_FORKNUM); - /* Make the relation look permanent again */ - reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; - /* Remove local copy */ for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) { @@ -2237,6 +2212,8 @@ neon_end_unlogged_build(SMgrRelation reln) forknum); forget_cached_relsize(InfoFromNInfoB(rinfob), forknum); + lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks); + mdclose(reln, forknum); #ifndef DEBUG_COMPARE_LOCAL /* use isRedo == true, so that we drop it immediately */ @@ -2247,7 +2224,7 @@ neon_end_unlogged_build(SMgrRelation reln) mdunlink(rinfob, INIT_FORKNUM, true); #endif } - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } @@ -2323,7 +2300,7 @@ AtEOXact_neon(XactEvent event, void *arg) * Forget about any build we might have had in progress. The local * file will be unlinked by smgrDoPendingDeletes() */ - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; break; @@ -2335,7 +2312,7 @@ AtEOXact_neon(XactEvent event, void *arg) case XACT_EVENT_PRE_PREPARE: if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) { - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index b95b1451e4..3befb42030 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -124,6 +124,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } else { + wp->safekeepers_generation = INVALID_GENERATION; host = wp->config->safekeepers_list; } wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation); @@ -756,7 +757,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) 
{ SafekeeperId *sk_id = &wp->mconf.members.m[i]; - if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId) + if (sk_id->node_id == sk->greetResponse.nodeId) { /* * If mconf or list of safekeepers to connect to changed (the @@ -781,7 +782,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) { SafekeeperId *sk_id = &wp->mconf.new_members.m[i]; - if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId) + if (sk_id->node_id == sk->greetResponse.nodeId) { if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk) { @@ -836,7 +837,7 @@ TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf { uint32 n_greeted = 0; - for (uint32 i = 0; i < wp->mconf.members.len; i++) + for (uint32 i = 0; i < mset->len; i++) { Safekeeper *sk = msk[i]; @@ -1071,7 +1072,6 @@ RecvVoteResponse(Safekeeper *sk) /* ready for elected message */ sk->state = SS_WAIT_ELECTED; - wp->n_votes++; /* Are we already elected? */ if (wp->state == WPS_CAMPAIGN) { @@ -1106,7 +1106,7 @@ VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf { uint32 n_votes = 0; - for (uint32 i = 0; i < wp->mconf.members.len; i++) + for (uint32 i = 0; i < mset->len; i++) { Safekeeper *sk = msk[i]; diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 648b0015ad..83ef72d3d7 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -845,9 +845,6 @@ typedef struct WalProposer /* timeline globally starts at this LSN */ XLogRecPtr timelineStartLsn; - /* number of votes collected from safekeepers */ - int n_votes; - /* number of successful connections over the lifetime of walproposer */ int n_connected; diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index a061639815..17582405db 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,7 +63,7 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 
10000; -int safekeeper_proto_version = 2; +int safekeeper_proto_version = 3; /* Set to true in the walproposer bgw. */ static bool am_walproposer; @@ -228,7 +228,7 @@ nwp_register_gucs(void) "Version of compute <-> safekeeper protocol.", "Used while migrating from 2 to 3.", &safekeeper_proto_version, - 2, 0, INT_MAX, + 3, 0, INT_MAX, PGC_POSTMASTER, 0, NULL, NULL, NULL); diff --git a/poetry.lock b/poetry.lock index 1a772d3415..21a2664555 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -1145,18 +1145,19 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-cors" -version = "5.0.0" -description = "A Flask extension adding a decorator for CORS support" +version = "6.0.0" +description = "A Flask extension simplifying CORS support" optional = false -python-versions = "*" +python-versions = "<4.0,>=3.9" groups = ["main"] files = [ - {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"}, - {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"}, + {file = "flask_cors-6.0.0-py3-none-any.whl", hash = "sha256:6332073356452343a8ccddbfec7befdc3fdd040141fe776ec9b94c262f058657"}, + {file = "flask_cors-6.0.0.tar.gz", hash = "sha256:4592c1570246bf7beee96b74bc0adbbfcb1b0318f6ba05c412e8909eceec3393"}, ] [package.dependencies] -Flask = ">=0.9" +flask = ">=0.9" +Werkzeug = ">=0.7" [[package]] name = "frozenlist" @@ -3169,19 +3170,24 @@ pbr = "*" [[package]] name = "setuptools" -version = "70.0.0" +version = "78.1.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["main"] files = [ - {file = 
"setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"}, - {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"}, + {file = "setuptools-78.1.1-py3-none-any.whl", hash = "sha256:c3a9c4211ff4c309edb8b8c4f1cbfa7ae324c4ba9f91ff254e3d305b9fd54561"}, + {file = "setuptools-78.1.1.tar.gz", hash = "sha256:fcc17fd9cd898242f6b4adfaca46137a9edef687f43e6f78469692a5e70d851d"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov ; platform_python_implementation != \"PyPy\"", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""] +core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"] +cover = ["pytest-cov"] +doc = 
["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"] [[package]] name = "six" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 2cec510d82..ce8610be24 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -127,3 +127,4 @@ rstest.workspace = true walkdir.workspace = true rand_distr = "0.4" tokio-postgres.workspace = true +tracing-test = "0.2" \ No newline at end of file diff --git a/proxy/README.md b/proxy/README.md index 1156bfd352..583db36f28 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -32,7 +32,7 @@ To play with it locally one may start proxy over a local postgres installation (see end of this page on how to generate certs with openssl): ``` -./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444 +LOGFMT=text ./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444 ``` If both postgres and proxy 
are running you may send a SQL query: @@ -130,7 +130,7 @@ openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key Then we need to build proxy with 'testing' feature and run, e.g.: ```sh -RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key +RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key ``` Now from client you can start a new session: diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 44a6a42665..a48f67199a 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -409,14 +409,22 @@ impl JwkCacheEntryLock { if let Some(exp) = payload.expiration { if now >= exp + CLOCK_SKEW_LEEWAY { - return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired)); + return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired( + exp.duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ))); } } if let Some(nbf) = payload.not_before { if nbf >= now + CLOCK_SKEW_LEEWAY { return Err(JwtError::InvalidClaims( - JwtClaimsError::JwtTokenNotYetReadyToUse, + JwtClaimsError::JwtTokenNotYetReadyToUse( + nbf.duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ), )); } } @@ -534,10 +542,10 @@ struct JwtPayload<'a> { #[serde(rename = "aud", default)] audience: OneOrMany, /// Expiration - Time after which the JWT expires - #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)] + #[serde(rename = "exp", deserialize_with = "numeric_date_opt", default)] expiration: Option, - /// Not before - Time after which the JWT expires - #[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)] + /// Not before - Time before which 
the JWT is not valid + #[serde(rename = "nbf", deserialize_with = "numeric_date_opt", default)] not_before: Option, // the following entries are only extracted for the sake of debug logging. @@ -609,8 +617,15 @@ impl<'de> Deserialize<'de> for OneOrMany { } fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { - let d = >::deserialize(d)?; - Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n))) + >::deserialize(d)? + .map(|t| { + SystemTime::UNIX_EPOCH + .checked_add(Duration::from_secs(t)) + .ok_or_else(|| { + serde::de::Error::custom(format_args!("timestamp out of bounds: {t}")) + }) + }) + .transpose() } struct JwkRenewalPermit<'a> { @@ -746,11 +761,11 @@ pub enum JwtClaimsError { #[error("invalid JWT token audience")] InvalidJwtTokenAudience, - #[error("JWT token has expired")] - JwtTokenHasExpired, + #[error("JWT token has expired (exp={0})")] + JwtTokenHasExpired(u64), - #[error("JWT token is not yet ready to use")] - JwtTokenNotYetReadyToUse, + #[error("JWT token is not yet ready to use (nbf={0})")] + JwtTokenNotYetReadyToUse(u64), } #[allow(dead_code, reason = "Debug use only")] @@ -1233,14 +1248,14 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL "nbf": now + 60, "aud": "neon", }}, - error: JwtClaimsError::JwtTokenNotYetReadyToUse, + error: JwtClaimsError::JwtTokenNotYetReadyToUse(now + 60), }, Test { body: json! {{ "exp": now - 60, "aud": ["neon"], }}, - error: JwtClaimsError::JwtTokenHasExpired, + error: JwtClaimsError::JwtTokenHasExpired(now - 60), }, Test { body: json! 
{{ diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 83feed5094..6e5c0a3954 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -80,10 +80,22 @@ impl std::fmt::Display for Backend<'_, ()> { .field(&endpoint.url()) .finish(), #[cfg(any(test, feature = "testing"))] - ControlPlaneClient::PostgresMock(endpoint) => fmt - .debug_tuple("ControlPlane::PostgresMock") - .field(&endpoint.url()) - .finish(), + ControlPlaneClient::PostgresMock(endpoint) => { + let url = endpoint.url(); + match url::Url::parse(url) { + Ok(mut url) => { + let _ = url.set_password(Some("_redacted_")); + let url = url.as_str(); + fmt.debug_tuple("ControlPlane::PostgresMock") + .field(&url) + .finish() + } + Err(_) => fmt + .debug_tuple("ControlPlane::PostgresMock") + .field(&url) + .finish(), + } + } #[cfg(test)] ControlPlaneClient::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(), }, diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index c55af325e3..526d0df7f2 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -12,9 +12,9 @@ use tracing::{debug, warn}; use crate::auth::password_hack::parse_endpoint_param; use crate::context::RequestContext; use crate::error::{ReportableError, UserFacingError}; -use crate::metrics::{Metrics, SniKind}; +use crate::metrics::{Metrics, SniGroup, SniKind}; use crate::proxy::NeonOptions; -use crate::serverless::SERVERLESS_DRIVER_SNI; +use crate::serverless::{AUTH_BROKER_SNI, SERVERLESS_DRIVER_SNI}; use crate::types::{EndpointId, RoleName}; #[derive(Debug, Error, PartialEq, Eq, Clone)] @@ -32,12 +32,6 @@ pub(crate) enum ComputeUserInfoParseError { option: EndpointId, }, - #[error( - "Common name inferred from SNI ('{}') is not known", - .cn, - )] - UnknownCommonName { cn: String }, - #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] MalformedProjectName(EndpointId), } @@ -66,22 +60,15 @@ impl 
ComputeUserInfoMaybeEndpoint { } } -pub(crate) fn endpoint_sni( - sni: &str, - common_names: &HashSet, -) -> Result, ComputeUserInfoParseError> { - let Some((subdomain, common_name)) = sni.split_once('.') else { - return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() }); - }; +pub(crate) fn endpoint_sni(sni: &str, common_names: &HashSet) -> Option { + let (subdomain, common_name) = sni.split_once('.')?; if !common_names.contains(common_name) { - return Err(ComputeUserInfoParseError::UnknownCommonName { - cn: common_name.into(), - }); + return None; } - if subdomain == SERVERLESS_DRIVER_SNI { - return Ok(None); + if subdomain == SERVERLESS_DRIVER_SNI || subdomain == AUTH_BROKER_SNI { + return None; } - Ok(Some(EndpointId::from(subdomain))) + Some(EndpointId::from(subdomain)) } impl ComputeUserInfoMaybeEndpoint { @@ -113,15 +100,8 @@ impl ComputeUserInfoMaybeEndpoint { }) .map(|name| name.into()); - let endpoint_from_domain = if let Some(sni_str) = sni { - if let Some(cn) = common_names { - endpoint_sni(sni_str, cn)? - } else { - None - } - } else { - None - }; + let endpoint_from_domain = + sni.and_then(|sni_str| common_names.and_then(|cn| endpoint_sni(sni_str, cn))); let endpoint = match (endpoint_option, endpoint_from_domain) { // Invariant: if we have both project name variants, they should match. 
@@ -148,22 +128,23 @@ impl ComputeUserInfoMaybeEndpoint { let metrics = Metrics::get(); debug!(%user, "credentials"); - if sni.is_some() { + + let protocol = ctx.protocol(); + let kind = if sni.is_some() { debug!("Connection with sni"); - metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); + SniKind::Sni } else if endpoint.is_some() { - metrics - .proxy - .accepted_connections_by_sni - .inc(SniKind::NoSni); debug!("Connection without sni"); + SniKind::NoSni } else { - metrics - .proxy - .accepted_connections_by_sni - .inc(SniKind::PasswordHack); debug!("Connection with password hack"); - } + SniKind::PasswordHack + }; + + metrics + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); let options = NeonOptions::parse_params(params); @@ -424,21 +405,34 @@ mod tests { } #[test] - fn parse_inconsistent_sni() { + fn parse_unknown_sni() { let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("project.localhost"); let common_names = Some(["example.com".into()].into()); let ctx = RequestContext::test(); - let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) - .expect_err("should fail"); - match err { - UnknownCommonName { cn } => { - assert_eq!(cn, "localhost"); - } - _ => panic!("bad error: {err:?}"), - } + let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .unwrap(); + + assert!(info.endpoint_id.is_none()); + } + + #[test] + fn parse_unknown_sni_with_options() { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("options", "endpoint=foo-bar-baz-1234"), + ]); + + let sni = Some("project.localhost"); + let common_names = Some(["example.com".into()].into()); + + let ctx = RequestContext::test(); + let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref()) + .unwrap(); + + assert_eq!(info.endpoint_id.as_deref(), Some("foo-bar-baz-1234")); } #[test] diff --git a/proxy/src/bin/proxy.rs 
b/proxy/src/bin/proxy.rs index 7d4b44841d..d60d32eb3b 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,6 +1,10 @@ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +#[allow(non_upper_case_globals)] +#[unsafe(export_name = "malloc_conf")] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; + #[tokio::main] async fn main() -> anyhow::Result<()> { proxy::binary::proxy::run().await diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index ee7f6ffcd7..a566383390 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -423,8 +423,8 @@ async fn refresh_config_inner( if let Some(tls_config) = data.tls { let tls_config = tokio::task::spawn_blocking(move || { crate::tls::server_config::configure_tls( - &tls_config.key_path, - &tls_config.cert_path, + tls_config.key_path.as_ref(), + tls_config.cert_path.as_ref(), None, false, ) diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 19be058ac3..3e87538ae7 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -1,8 +1,10 @@ -/// A stand-alone program that routes connections, e.g. from -/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. -/// -/// This allows connecting to pods/services running in the same Kubernetes cluster from -/// the outside. Similar to an ingress controller for HTTPS. +//! A stand-alone program that routes connections, e.g. from +//! `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. +//! +//! This allows connecting to pods/services running in the same Kubernetes cluster from +//! the outside. Similar to an ingress controller for HTTPS. 
+ +use std::path::Path; use std::{net::SocketAddr, sync::Arc}; use anyhow::{Context, anyhow, bail, ensure}; @@ -86,46 +88,7 @@ pub async fn run() -> anyhow::Result<()> { args.get_one::("tls-key"), args.get_one::("tls-cert"), ) { - (Some(key_path), Some(cert_path)) => { - let key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - - let mut keys = - rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); - - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - PrivateKeyDer::Pkcs8( - keys.pop() - .expect("keys should not be empty") - .context(format!("Failed to read TLS keys at '{key_path}'"))?, - ) - }; - - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain: Vec<_> = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? - }; - - // needed for channel bindings - let first_cert = cert_chain.first().context("missing certificate")?; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - - let tls_config = - rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("ring should support TLS1.2 and TLS1.3")? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? 
- .into(); - - (tls_config, tls_server_end_point) - } + (Some(key_path), Some(cert_path)) => parse_tls(key_path.as_ref(), cert_path.as_ref())?, _ => bail!("tls-key and tls-cert must be specified"), }; @@ -188,7 +151,58 @@ pub async fn run() -> anyhow::Result<()> { match signal {} } -async fn task_main( +pub(super) fn parse_tls( + key_path: &Path, + cert_path: &Path, +) -> anyhow::Result<(Arc, TlsServerEndPoint)> { + let key = { + let key_bytes = std::fs::read(key_path).context("TLS key file")?; + + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); + + ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); + PrivateKeyDer::Pkcs8( + keys.pop() + .expect("keys should not be empty") + .context(format!( + "Failed to read TLS keys at '{}'", + key_path.display() + ))?, + ) + }; + + let cert_chain_bytes = std::fs::read(cert_path).context(format!( + "Failed to read TLS cert file at '{}.'", + cert_path.display() + ))?; + + let cert_chain: Vec<_> = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + format!( + "Failed to read TLS certificate chain from bytes from file at '{}'.", + cert_path.display() + ) + })? + }; + + // needed for channel bindings + let first_cert = cert_chain.first().context("missing certificate")?; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let tls_config = + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("ring should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_single_cert(cert_chain, key)? 
+ .into(); + + Ok((tls_config, tls_server_end_point)) +} + +pub(super) async fn task_main( dest_suffix: Arc, tls_config: Arc, compute_tls_config: Option>, @@ -380,6 +394,7 @@ async fn handle_client( } } +#[allow(clippy::large_enum_variant)] enum Connection { Raw(tokio::net::TcpStream), Tls(tokio_rustls::client::TlsStream), diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index e03f2f33d9..5f24940985 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -1,9 +1,14 @@ +#[cfg(any(test, feature = "testing"))] +use std::env; use std::net::SocketAddr; +use std::path::PathBuf; use std::pin::pin; use std::sync::Arc; use std::time::Duration; -use anyhow::bail; +#[cfg(any(test, feature = "testing"))] +use anyhow::Context; +use anyhow::{bail, ensure}; use arc_swap::ArcSwapOption; use futures::future::Either; use remote_storage::RemoteStorageConfig; @@ -34,6 +39,8 @@ use crate::scram::threadpool::ThreadPool; use crate::serverless::GlobalConnPoolOptions; use crate::serverless::cancel_set::CancelSet; use crate::tls::client_config::compute_client_config_with_root_certs; +#[cfg(any(test, feature = "testing"))] +use crate::url::ApiUrl; use crate::{auth, control_plane, http, serverless, usage_metrics}; project_git_version!(GIT_VERSION); @@ -42,11 +49,12 @@ project_build_tag!(BUILD_TAG); use clap::{Parser, ValueEnum}; #[derive(Clone, Debug, ValueEnum)] +#[clap(rename_all = "kebab-case")] enum AuthBackendType { - #[value(name("cplane-v1"), alias("control-plane"))] - ControlPlaneV1, + #[clap(alias("cplane-v1"))] + ControlPlane, - #[value(name("link"), alias("control-redirect"))] + #[clap(alias("link"))] ConsoleRedirect, #[cfg(any(test, feature = "testing"))] @@ -62,18 +70,18 @@ struct ProxyCliArgs { region: String, /// listen for incoming client connections on ip:port #[clap(short, long, default_value = "127.0.0.1:4432")] - proxy: String, + proxy: SocketAddr, #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] 
auth_backend: AuthBackendType, /// listen for management callback connection on ip:port #[clap(short, long, default_value = "127.0.0.1:7000")] - mgmt: String, + mgmt: SocketAddr, /// listen for incoming http connections (metrics, etc) on ip:port #[clap(long, default_value = "127.0.0.1:7001")] - http: String, + http: SocketAddr, /// listen for incoming wss connections on ip:port #[clap(long)] - wss: Option, + wss: Option, /// redirect unauthenticated users to the given uri in case of console redirect auth #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] uri: String, @@ -99,18 +107,18 @@ struct ProxyCliArgs { /// /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir #[clap(short = 'k', long, alias = "ssl-key")] - tls_key: Option, + tls_key: Option, /// path to TLS cert for client postgres connections /// /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir #[clap(short = 'c', long, alias = "ssl-cert")] - tls_cert: Option, + tls_cert: Option, /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. 
#[clap(long, alias = "allow-ssl-keylogfile")] allow_tls_keylogfile: bool, /// path to directory with TLS certificates for client postgres connections #[clap(long)] - certs_dir: Option, + certs_dir: Option, /// timeout for the TLS handshake #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] handshake_timeout: tokio::time::Duration, @@ -159,8 +167,11 @@ struct ProxyCliArgs { #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] redis_rps_limit: Vec, /// Cancellation channel size (max queue size for redis kv client) - #[clap(long, default_value = "1024")] + #[clap(long, default_value_t = 1024)] cancellation_ch_size: usize, + /// Cancellation ops batch size for redis + #[clap(long, default_value_t = 8)] + cancellation_batch_size: usize, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -229,6 +240,9 @@ struct ProxyCliArgs { // TODO: rename to `console_redirect_confirmation_timeout`. 
#[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] webauth_confirmation_timeout: std::time::Duration, + + #[clap(flatten)] + pg_sni_router: PgSniRouterArgs, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -277,6 +291,25 @@ struct SqlOverHttpArgs { sql_over_http_max_response_size_bytes: usize, } +#[derive(clap::Args, Clone, Debug)] +struct PgSniRouterArgs { + /// listen for incoming client connections on ip:port + #[clap(id = "sni-router-listen", long, default_value = "127.0.0.1:4432")] + listen: SocketAddr, + /// listen for incoming client connections on ip:port, requiring TLS to compute + #[clap(id = "sni-router-listen-tls", long, default_value = "127.0.0.1:4433")] + listen_tls: SocketAddr, + /// path to TLS key for client postgres connections + #[clap(id = "sni-router-tls-key", long)] + tls_key: Option, + /// path to TLS cert for client postgres connections + #[clap(id = "sni-router-tls-cert", long)] + tls_cert: Option, + /// append this domain zone to the SNI hostname to get the destination address + #[clap(id = "sni-router-destination", long)] + dest: Option, +} + pub async fn run() -> anyhow::Result<()> { let _logging_guard = crate::logging::init().await?; let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); @@ -307,73 +340,51 @@ pub async fn run() -> anyhow::Result<()> { Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), } info!("Using region: {}", args.aws_region); - - // TODO: untangle the config args - let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { - ("plain", redis_url) => match redis_url { - None => { - bail!("plain auth requires redis_notifications to be set"); - } - Some(url) => { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone())) - } - }, - ("irsa", _) => match (&args.redis_host, args.redis_port) { - (Some(host), Some(port)) => Some( - 
ConnectionWithCredentialsProvider::new_with_credentials_provider( - host.to_string(), - port, - elasticache::CredentialsProvider::new( - args.aws_region, - args.redis_cluster_name, - args.redis_user_id, - ) - .await, - ), - ), - (None, None) => { - warn!( - "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client" - ); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }, - _ => { - bail!("unknown auth type given"); - } - }; - - let redis_notifications_client = if let Some(url) = args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) - } else { - regional_redis_client.clone() - }; + let (regional_redis_client, redis_notifications_client) = configure_redis(&args).await?; // Check that we can bind to address before further initialization - let http_address: SocketAddr = args.http.parse()?; - info!("Starting http on {http_address}"); - let http_listener = TcpListener::bind(http_address).await?.into_std()?; + info!("Starting http on {}", args.http); + let http_listener = TcpListener::bind(args.http).await?.into_std()?; - let mgmt_address: SocketAddr = args.mgmt.parse()?; - info!("Starting mgmt on {mgmt_address}"); - let mgmt_listener = TcpListener::bind(mgmt_address).await?; + info!("Starting mgmt on {}", args.mgmt); + let mgmt_listener = TcpListener::bind(args.mgmt).await?; let proxy_listener = if args.is_auth_broker { None } else { - let proxy_address: SocketAddr = args.proxy.parse()?; - info!("Starting proxy on {proxy_address}"); + info!("Starting proxy on {}", args.proxy); + Some(TcpListener::bind(args.proxy).await?) + }; - Some(TcpListener::bind(proxy_address).await?) 
+ let sni_router_listeners = { + let args = &args.pg_sni_router; + if args.dest.is_some() { + ensure!( + args.tls_key.is_some(), + "sni-router-tls-key must be provided" + ); + ensure!( + args.tls_cert.is_some(), + "sni-router-tls-cert must be provided" + ); + + info!( + "Starting pg-sni-router on {} and {}", + args.listen, args.listen_tls + ); + + Some(( + TcpListener::bind(args.listen).await?, + TcpListener::bind(args.listen_tls).await?, + )) + } else { + None + } }; // TODO: rename the argument to something like serverless. // It now covers more than just websockets, it also covers SQL over HTTP. let serverless_listener = if let Some(serverless_address) = args.wss { - let serverless_address: SocketAddr = serverless_address.parse()?; info!("Starting wss on {serverless_address}"); Some(TcpListener::bind(serverless_address).await?) } else if args.is_auth_broker { @@ -458,6 +469,37 @@ pub async fn run() -> anyhow::Result<()> { } } + // spawn pg-sni-router mode. + if let Some((listen, listen_tls)) = sni_router_listeners { + let args = args.pg_sni_router; + let dest = args.dest.expect("already asserted it is set"); + let key_path = args.tls_key.expect("already asserted it is set"); + let cert_path = args.tls_cert.expect("already asserted it is set"); + + let (tls_config, tls_server_end_point) = + super::pg_sni_router::parse_tls(&key_path, &cert_path)?; + + let dest = Arc::new(dest); + + client_tasks.spawn(super::pg_sni_router::task_main( + dest.clone(), + tls_config.clone(), + None, + tls_server_end_point, + listen, + cancellation_token.clone(), + )); + + client_tasks.spawn(super::pg_sni_router::task_main( + dest, + tls_config, + Some(config.connect_to_compute.tls.clone()), + tls_server_end_point, + listen_tls, + cancellation_token.clone(), + )); + } + client_tasks.spawn(crate::context::parquet::worker( cancellation_token.clone(), args.parquet_upload, @@ -509,7 +551,12 @@ pub async fn run() -> anyhow::Result<()> { if let Some(mut redis_kv_client) = redis_kv_client { 
maintenance_tasks.spawn(async move { redis_kv_client.try_connect().await?; - handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?; + handle_cancel_messages( + &mut redis_kv_client, + rx_cancel, + args.cancellation_batch_size, + ) + .await?; drop(redis_kv_client); @@ -565,7 +612,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { (Some(key_path), Some(cert_path)) => Some(config::configure_tls( key_path, cert_path, - args.certs_dir.as_ref(), + args.certs_dir.as_deref(), args.allow_tls_keylogfile, )?), (None, None) => None, @@ -675,7 +722,7 @@ fn build_auth_backend( args: &ProxyCliArgs, ) -> anyhow::Result, &'static ConsoleRedirectBackend>> { match &args.auth_backend { - AuthBackendType::ControlPlaneV1 => { + AuthBackendType::ControlPlane => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; @@ -736,7 +783,13 @@ fn build_auth_backend( #[cfg(any(test, feature = "testing"))] AuthBackendType::Postgres => { - let url = args.auth_endpoint.parse()?; + let mut url: ApiUrl = args.auth_endpoint.parse()?; + if url.password().is_none() { + let password = env::var("PGPASSWORD") + .with_context(|| "auth-endpoint does not contain a password and environment variable `PGPASSWORD` is not set")?; + url.set_password(Some(&password)) + .expect("Failed to set password"); + } let api = control_plane::client::mock::MockControlPlane::new( url, !args.is_private_access_proxy, @@ -811,6 +864,60 @@ fn build_auth_backend( } } +async fn configure_redis( + args: &ProxyCliArgs, +) -> anyhow::Result<( + Option, + Option, +)> { + // TODO: untangle the config args + let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { + ("plain", redis_url) => match redis_url { + None => { + bail!("plain auth requires redis_notifications to be set"); + } + Some(url) => { + 
Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone())) + } + }, + ("irsa", _) => match (&args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host.clone(), + port, + elasticache::CredentialsProvider::new( + args.aws_region.clone(), + args.redis_cluster_name.clone(), + args.redis_user_id.clone(), + ) + .await, + ), + ), + (None, None) => { + // todo: upgrade to error? + warn!( + "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client" + ); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }, + _ => { + bail!("unknown auth type given"); + } + }; + + let redis_notifications_client = if let Some(url) = &args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(&**url)) + } else { + regional_redis_client.clone() + }; + + Ok((regional_redis_client, redis_notifications_client)) +} + #[cfg(test)] mod tests { use std::time::Duration; diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index c5ba04eb8c..a6e7bf85a0 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -6,12 +6,12 @@ use ipnet::{IpNet, Ipv4Net, Ipv6Net}; use postgres_client::CancelToken; use postgres_client::tls::MakeTlsConnect; use pq_proto::CancelKeyData; -use redis::{FromRedisValue, Pipeline, Value, pipe}; +use redis::{Cmd, FromRedisValue, Value}; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::{mpsc, oneshot}; -use tracing::{debug, info, warn}; +use tracing::{debug, error, info, warn}; use crate::auth::backend::ComputeUserInfo; use crate::auth::{AuthError, check_peer_addr_is_in_list}; @@ -30,8 +30,6 @@ use crate::tls::postgres_rustls::MakeRustlsConnect; type IpSubnetKey = IpNet; const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time -const REDIS_SEND_TIMEOUT: 
std::time::Duration = std::time::Duration::from_millis(10); -const BATCH_SIZE: usize = 8; // Message types for sending through mpsc channel pub enum CancelKeyOp { @@ -56,8 +54,70 @@ pub enum CancelKeyOp { }, } +pub struct Pipeline { + inner: redis::Pipeline, + replies: Vec, +} + +impl Pipeline { + fn with_capacity(n: usize) -> Self { + Self { + inner: redis::Pipeline::with_capacity(n), + replies: Vec::with_capacity(n), + } + } + + async fn execute(&mut self, client: &mut RedisKVClient) { + let responses = self.replies.len(); + let batch_size = self.inner.len(); + + match client.query(&self.inner).await { + // for each reply, we expect that many values. + Ok(Value::Array(values)) if values.len() == responses => { + debug!( + batch_size, + responses, "successfully completed cancellation jobs", + ); + for (value, reply) in std::iter::zip(values, self.replies.drain(..)) { + reply.send_value(value); + } + } + Ok(value) => { + error!(batch_size, ?value, "unexpected redis return value"); + for reply in self.replies.drain(..) { + reply.send_err(anyhow!("incorrect response type from redis")); + } + } + Err(err) => { + for reply in self.replies.drain(..) 
{ + reply.send_err(anyhow!("could not send cmd to redis: {err}")); + } + } + } + + self.inner.clear(); + self.replies.clear(); + } + + fn add_command_with_reply(&mut self, cmd: Cmd, reply: CancelReplyOp) { + self.inner.add_command(cmd); + self.replies.push(reply); + } + + fn add_command_no_reply(&mut self, cmd: Cmd) { + self.inner.add_command(cmd).ignore(); + } + + fn add_command(&mut self, cmd: Cmd, reply: Option) { + match reply { + Some(reply) => self.add_command_with_reply(cmd, reply), + None => self.add_command_no_reply(cmd), + } + } +} + impl CancelKeyOp { - fn register(self, pipe: &mut Pipeline) -> Option { + fn register(self, pipe: &mut Pipeline) { #[allow(clippy::used_underscore_binding)] match self { CancelKeyOp::StoreCancelKey { @@ -68,18 +128,18 @@ impl CancelKeyOp { _guard, expire, } => { - pipe.hset(&key, field, value); - pipe.expire(key, expire); - let resp_tx = resp_tx?; - Some(CancelReplyOp::StoreCancelKey { resp_tx, _guard }) + let reply = + resp_tx.map(|resp_tx| CancelReplyOp::StoreCancelKey { resp_tx, _guard }); + pipe.add_command(Cmd::hset(&key, field, value), reply); + pipe.add_command_no_reply(Cmd::expire(key, expire)); } CancelKeyOp::GetCancelData { key, resp_tx, _guard, } => { - pipe.hgetall(key); - Some(CancelReplyOp::GetCancelData { resp_tx, _guard }) + let reply = CancelReplyOp::GetCancelData { resp_tx, _guard }; + pipe.add_command_with_reply(Cmd::hgetall(key), reply); } CancelKeyOp::RemoveCancelKey { key, @@ -87,9 +147,9 @@ impl CancelKeyOp { resp_tx, _guard, } => { - pipe.hdel(key, field); - let resp_tx = resp_tx?; - Some(CancelReplyOp::RemoveCancelKey { resp_tx, _guard }) + let reply = + resp_tx.map(|resp_tx| CancelReplyOp::RemoveCancelKey { resp_tx, _guard }); + pipe.add_command(Cmd::hdel(key, field), reply); } } } @@ -169,12 +229,13 @@ impl CancelReplyOp { pub async fn handle_cancel_messages( client: &mut RedisKVClient, mut rx: mpsc::Receiver, + batch_size: usize, ) -> anyhow::Result<()> { - let mut batch = Vec::new(); - let mut 
replies = vec![]; + let mut batch = Vec::with_capacity(batch_size); + let mut pipeline = Pipeline::with_capacity(batch_size); loop { - if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 { + if rx.recv_many(&mut batch, batch_size).await == 0 { warn!("shutting down cancellation queue"); break Ok(()); } @@ -182,42 +243,11 @@ pub async fn handle_cancel_messages( let batch_size = batch.len(); debug!(batch_size, "running cancellation jobs"); - let mut pipe = pipe(); for msg in batch.drain(..) { - if let Some(reply) = msg.register(&mut pipe) { - replies.push(reply); - } else { - pipe.ignore(); - } + msg.register(&mut pipeline); } - let responses = replies.len(); - - match client.query(pipe).await { - // for each reply, we expect that many values. - Ok(Value::Array(values)) if values.len() == responses => { - debug!( - batch_size, - responses, "successfully completed cancellation jobs", - ); - for (value, reply) in std::iter::zip(values, replies.drain(..)) { - reply.send_value(value); - } - } - Ok(value) => { - debug!(?value, "unexpected redis return value"); - for reply in replies.drain(..) { - reply.send_err(anyhow!("incorrect response type from redis")); - } - } - Err(err) => { - for reply in replies.drain(..) 
{ - reply.send_err(anyhow!("could not send cmd to redis: {err}")); - } - } - } - - replies.clear(); + pipeline.execute(client).await; } } @@ -336,8 +366,7 @@ impl CancellationHandler { return Err(CancelError::InternalError); }; - tx.send_timeout(op, REDIS_SEND_TIMEOUT) - .await + tx.try_send(op) .map_err(|e| { tracing::warn!("failed to send GetCancelData for {key}: {e}"); }) @@ -539,7 +568,7 @@ impl Session { } // Send the store key op to the cancellation handler and set TTL for the key - pub(crate) async fn write_cancel_key( + pub(crate) fn write_cancel_key( &self, cancel_closure: CancelClosure, ) -> Result<(), CancelError> { @@ -565,14 +594,14 @@ impl Session { expire: CANCEL_KEY_TTL, }; - let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let _ = tx.try_send(op).map_err(|e| { let key = self.key; tracing::warn!("failed to send StoreCancelKey for {key}: {e}"); }); Ok(()) } - pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> { + pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> { let Some(tx) = &self.cancellation_handler.tx else { tracing::warn!("cancellation handler is not available"); return Err(CancelError::InternalError); @@ -588,7 +617,7 @@ impl Session { .guard(RedisMsgKind::HDel), }; - let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let _ = tx.try_send(op).map_err(|e| { let key = self.key; tracing::warn!("failed to send RemoveCancelKey for {key}: {e}"); }); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 0f2c3def0d..e3184e20d1 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -244,9 +244,7 @@ pub(crate) async fn handle_client( let cancellation_handler_clone = Arc::clone(&cancellation_handler); let session = cancellation_handler_clone.get_key(); - session - .write_cancel_key(node.cancel_closure.clone()) - .await?; + session.write_cancel_key(node.cancel_closure.clone())?; 
prepare_client_connection(&node, *session.key(), &mut stream).await?; diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 5f649d2b21..79aaf22990 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -78,7 +78,7 @@ struct RequestContextInner { #[derive(Clone, Debug)] pub(crate) enum AuthMethod { - // aka passwordless, fka link + // aka link ConsoleRedirect, ScramSha256, ScramSha256Plus, diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 5278fe2a3e..b0b5a598d1 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -3,7 +3,7 @@ use std::net::TcpListener; use std::sync::{Arc, Mutex}; use anyhow::{anyhow, bail}; -use http_utils::endpoint::{self, request_span}; +use http_utils::endpoint::{self, profile_cpu_handler, profile_heap_handler, request_span}; use http_utils::error::ApiError; use http_utils::json::json_response; use http_utils::{RouterBuilder, RouterService}; @@ -33,6 +33,12 @@ fn make_router(metrics: AppMetrics) -> RouterBuilder { request_span(r, move |b| prometheus_metrics_handler(b, state)) }) .get("/v1/status", status_handler) + .get("/profile/cpu", move |r| { + request_span(r, profile_cpu_handler) + }) + .get("/profile/heap", move |r| { + request_span(r, profile_heap_handler) + }) } pub async fn task_main( diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index b83b03bc4f..a58b55a704 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,13 +1,11 @@ -use std::cell::{Cell, RefCell}; +use std::cell::RefCell; use std::collections::HashMap; -use std::hash::BuildHasher; +use std::sync::Arc; use std::sync::atomic::{AtomicU32, Ordering}; -use std::{array, env, fmt, io}; +use std::{env, io}; use chrono::{DateTime, Utc}; -use indexmap::IndexSet; use opentelemetry::trace::TraceContextExt; -use scopeguard::defer; use serde::ser::{SerializeMap, Serializer}; use tracing::subscriber::Interest; use tracing::{Event, Metadata, Span, Subscriber, callsite, 
span}; @@ -19,7 +17,6 @@ use tracing_subscriber::fmt::{FormatEvent, FormatFields}; use tracing_subscriber::layer::{Context, Layer}; use tracing_subscriber::prelude::*; use tracing_subscriber::registry::{LookupSpan, SpanRef}; -use try_lock::TryLock; /// Initialize logging and OpenTelemetry tracing and exporter. /// @@ -55,7 +52,7 @@ pub async fn init() -> anyhow::Result { StderrWriter { stderr: std::io::stderr(), }, - ["request_id", "session_id", "conn_id"], + &["request_id", "session_id", "conn_id"], )) } else { None @@ -132,11 +129,10 @@ impl Drop for LoggingGuard { } } -// TODO: make JSON the default #[derive(Copy, Clone, PartialEq, Eq, Default, Debug)] enum LogFormat { + Text, #[default] - Text = 1, Json, } @@ -184,50 +180,65 @@ impl Clock for RealClock { /// Name of the field used by tracing crate to store the event message. const MESSAGE_FIELD: &str = "message"; +/// Tracing used to enforce that spans/events have no more than 32 fields. +/// It seems this is no longer the case, but it's still documented in some places. +/// Generally, we shouldn't expect more than 32 fields anyway, so we can try and +/// rely on it for some (minor) performance gains. +const MAX_TRACING_FIELDS: usize = 32; + thread_local! { - /// Protects against deadlocks and double panics during log writing. - /// The current panic handler will use tracing to log panic information. - static REENTRANCY_GUARD: Cell = const { Cell::new(false) }; /// Thread-local instance with per-thread buffer for log writing. - static EVENT_FORMATTER: RefCell = RefCell::new(EventFormatter::new()); + static EVENT_FORMATTER: RefCell = const { RefCell::new(EventFormatter::new()) }; /// Cached OS thread ID. static THREAD_ID: u64 = gettid::gettid(); } +/// Map for values fixed at callsite registration. +// We use papaya here because registration rarely happens post-startup. +// papaya is good for read-heavy workloads. 
+// +// We use rustc_hash here because callsite::Identifier will always be an integer with low-bit entropy, +// since it's always a pointer to static mutable data. rustc_hash was designed for low-bit entropy. +type CallsiteMap = + papaya::HashMap>; + /// Implements tracing layer to handle events specific to logging. -struct JsonLoggingLayer { +struct JsonLoggingLayer { clock: C, - skipped_field_indices: papaya::HashMap, - callsite_ids: papaya::HashMap, writer: W, - // We use a const generic and arrays to bypass one heap allocation. - extract_fields: IndexSet<&'static str>, - _marker: std::marker::PhantomData<[&'static str; F]>, + + /// tracks which fields of each **event** are duplicates + skipped_field_indices: CallsiteMap, + + span_info: CallsiteMap, + + /// Fields we want to keep track of in a separate json object. + extract_fields: &'static [&'static str], } -impl JsonLoggingLayer { - fn new(clock: C, writer: W, extract_fields: [&'static str; F]) -> Self { +impl JsonLoggingLayer { + fn new(clock: C, writer: W, extract_fields: &'static [&'static str]) -> Self { JsonLoggingLayer { clock, - skipped_field_indices: papaya::HashMap::default(), - callsite_ids: papaya::HashMap::default(), + skipped_field_indices: CallsiteMap::default(), + span_info: CallsiteMap::default(), writer, - extract_fields: IndexSet::from_iter(extract_fields), - _marker: std::marker::PhantomData, + extract_fields, } } #[inline] - fn callsite_id(&self, cs: callsite::Identifier) -> CallsiteId { - *self - .callsite_ids + fn span_info(&self, metadata: &'static Metadata<'static>) -> CallsiteSpanInfo { + self.span_info .pin() - .get_or_insert_with(cs, CallsiteId::next) + .get_or_insert_with(metadata.callsite(), || { + CallsiteSpanInfo::new(metadata, self.extract_fields) + }) + .clone() } } -impl Layer - for JsonLoggingLayer +impl Layer for JsonLoggingLayer where S: Subscriber + for<'a> LookupSpan<'a>, { @@ -238,35 +249,25 @@ where // early, before OTel machinery, and add as event extension. 
let now = self.clock.now(); - let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| { - if entered.get() { - let mut formatter = EventFormatter::new(); - formatter.format::( - now, - event, - &ctx, - &self.skipped_field_indices, - &self.callsite_ids, - &self.extract_fields, - )?; - self.writer.make_writer().write_all(formatter.buffer()) - } else { - entered.set(true); - defer!(entered.set(false);); + let res: io::Result<()> = EVENT_FORMATTER.with(|f| { + let mut borrow = f.try_borrow_mut(); + let formatter = match borrow.as_deref_mut() { + Ok(formatter) => formatter, + // If the thread local formatter is borrowed, + // then we likely hit an edge case were we panicked during formatting. + // We allow the logging to proceed with an uncached formatter. + Err(_) => &mut EventFormatter::new(), + }; - EVENT_FORMATTER.with_borrow_mut(move |formatter| { - formatter.reset(); - formatter.format::( - now, - event, - &ctx, - &self.skipped_field_indices, - &self.callsite_ids, - &self.extract_fields, - )?; - self.writer.make_writer().write_all(formatter.buffer()) - }) - } + formatter.reset(); + formatter.format( + now, + event, + &ctx, + &self.skipped_field_indices, + self.extract_fields, + )?; + self.writer.make_writer().write_all(formatter.buffer()) }); // In case logging fails we generate a simpler JSON object. @@ -288,50 +289,48 @@ where /// Registers a SpanFields instance as span extension. fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) { let span = ctx.span(id).expect("span must exist"); - let fields = SpanFields::default(); - fields.record_fields(attrs); - // This could deadlock when there's a panic somewhere in the tracing - // event handling and a read or write guard is still held. This includes - // the OTel subscriber. 
- let mut exts = span.extensions_mut(); + let mut fields = SpanFields::new(self.span_info(span.metadata())); + attrs.record(&mut fields); - exts.insert(fields); + // This is a new span: the extensions should not be locked + // unless some layer spawned a thread to process this span. + // I don't think any layers do that. + span.extensions_mut().insert(fields); } fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) { let span = ctx.span(id).expect("span must exist"); - let ext = span.extensions(); - if let Some(data) = ext.get::() { - data.record_fields(values); + + // assumption: `on_record` is rarely called. + // assumption: a span being updated by one thread, + // and formatted by another thread is even rarer. + let mut ext = span.extensions_mut(); + if let Some(fields) = ext.get_mut::() { + values.record(fields); } } - /// Called (lazily) whenever a new log call is executed. We quickly check - /// for duplicate field names and record duplicates as skippable. Last one - /// wins. + /// Called (lazily) roughly once per event/span instance. We quickly check + /// for duplicate field names and record duplicates as skippable. Last field wins. fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest { + debug_assert!( + metadata.fields().len() <= MAX_TRACING_FIELDS, + "callsite {metadata:?} has too many fields." + ); + if !metadata.is_event() { - self.callsite_id(metadata.callsite()); + // register the span info. + self.span_info(metadata); // Must not be never because we wouldn't get trace and span data. 
return Interest::always(); } let mut field_indices = SkippedFieldIndices::default(); - let mut seen_fields = HashMap::<&'static str, usize>::new(); + let mut seen_fields = HashMap::new(); for field in metadata.fields() { - use std::collections::hash_map::Entry; - match seen_fields.entry(field.name()) { - Entry::Vacant(entry) => { - // field not seen yet - entry.insert(field.index()); - } - Entry::Occupied(mut entry) => { - // replace currently stored index - let old_index = entry.insert(field.index()); - // ... and append it to list of skippable indices - field_indices.push(old_index); - } + if let Some(old_index) = seen_fields.insert(field.name(), field.index()) { + field_indices.set(old_index); } } @@ -345,110 +344,113 @@ where } } -#[derive(Copy, Clone, Debug, Default)] -#[repr(transparent)] -struct CallsiteId(u32); +/// Any span info that is fixed to a particular callsite. Not variable between span instances. +#[derive(Clone)] +struct CallsiteSpanInfo { + /// index of each field to extract. usize::MAX if not found. + extract: Arc<[usize]>, -impl CallsiteId { - #[inline] - fn next() -> Self { - // Start at 1 to reserve 0 for default. - static COUNTER: AtomicU32 = AtomicU32::new(1); - CallsiteId(COUNTER.fetch_add(1, Ordering::Relaxed)) - } + /// tracks the fixed "callsite ID" for each span. + /// note: this is not stable between runs. + normalized_name: Arc, } -impl fmt::Display for CallsiteId { - #[inline] - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.0.fmt(f) +impl CallsiteSpanInfo { + fn new(metadata: &'static Metadata<'static>, extract_fields: &[&'static str]) -> Self { + // Start at 1 to reserve 0 for default. + static COUNTER: AtomicU32 = AtomicU32::new(1); + + let names: Vec<&'static str> = metadata.fields().iter().map(|f| f.name()).collect(); + + // get all the indices of span fields we want to focus + let extract = extract_fields + .iter() + // use rposition, since we want last match wins. 
+ .map(|f1| names.iter().rposition(|f2| f1 == f2).unwrap_or(usize::MAX)) + .collect(); + + // normalized_name is unique for each callsite, but it is not + // unified across separate proxy instances. + // todo: can we do better here? + let cid = COUNTER.fetch_add(1, Ordering::Relaxed); + let normalized_name = format!("{}#{cid}", metadata.name()).into(); + + Self { + extract, + normalized_name, + } } } /// Stores span field values recorded during the spans lifetime. -#[derive(Default)] struct SpanFields { - // TODO: Switch to custom enum with lasso::Spur for Strings? - fields: papaya::HashMap<&'static str, serde_json::Value>, + values: [serde_json::Value; MAX_TRACING_FIELDS], + + /// cached span info so we can avoid extra hashmap lookups in the hot path. + span_info: CallsiteSpanInfo, } impl SpanFields { - #[inline] - fn record_fields(&self, fields: R) { - fields.record(&mut SpanFieldsRecorder { - fields: self.fields.pin(), - }); + fn new(span_info: CallsiteSpanInfo) -> Self { + Self { + span_info, + values: [const { serde_json::Value::Null }; MAX_TRACING_FIELDS], + } } } -/// Implements a tracing field visitor to convert and store values. 
-struct SpanFieldsRecorder<'m, S, G> { - fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>, -} - -impl tracing::field::Visit for SpanFieldsRecorder<'_, S, G> { +impl tracing::field::Visit for SpanFields { #[inline] fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } #[inline] fn record_i64(&mut self, field: &tracing::field::Field, value: i64) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } #[inline] fn record_u64(&mut self, field: &tracing::field::Field, value: u64) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } #[inline] fn record_i128(&mut self, field: &tracing::field::Field, value: i128) { if let Ok(value) = i64::try_from(value) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } else { - self.fields - .insert(field.name(), serde_json::Value::from(format!("{value}"))); + self.values[field.index()] = serde_json::Value::from(format!("{value}")); } } #[inline] fn record_u128(&mut self, field: &tracing::field::Field, value: u128) { if let Ok(value) = u64::try_from(value) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } else { - self.fields - .insert(field.name(), serde_json::Value::from(format!("{value}"))); + self.values[field.index()] = serde_json::Value::from(format!("{value}")); } } #[inline] fn record_bool(&mut self, field: &tracing::field::Field, value: bool) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } #[inline] fn record_bytes(&mut self, field: 
&tracing::field::Field, value: &[u8]) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } #[inline] fn record_str(&mut self, field: &tracing::field::Field, value: &str) { - self.fields - .insert(field.name(), serde_json::Value::from(value)); + self.values[field.index()] = serde_json::Value::from(value); } #[inline] fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) { - self.fields - .insert(field.name(), serde_json::Value::from(format!("{value:?}"))); + self.values[field.index()] = serde_json::Value::from(format!("{value:?}")); } #[inline] @@ -457,38 +459,33 @@ impl tracing::field::Visit for SpanFieldsRecor field: &tracing::field::Field, value: &(dyn std::error::Error + 'static), ) { - self.fields - .insert(field.name(), serde_json::Value::from(format!("{value}"))); + self.values[field.index()] = serde_json::Value::from(format!("{value}")); } } /// List of field indices skipped during logging. Can list duplicate fields or /// metafields not meant to be logged. 
-#[derive(Clone, Default)] +#[derive(Copy, Clone, Default)] struct SkippedFieldIndices { - bits: u64, + // 32-bits is large enough for `MAX_TRACING_FIELDS` + bits: u32, } impl SkippedFieldIndices { #[inline] - fn is_empty(&self) -> bool { + fn is_empty(self) -> bool { self.bits == 0 } #[inline] - fn push(&mut self, index: usize) { - self.bits |= 1u64 - .checked_shl(index as u32) - .expect("field index too large"); + fn set(&mut self, index: usize) { + debug_assert!(index <= 32, "index out of bounds of 32-bit set"); + self.bits |= 1 << index; } #[inline] - fn contains(&self, index: usize) -> bool { - self.bits - & 1u64 - .checked_shl(index as u32) - .expect("field index too large") - != 0 + fn contains(self, index: usize) -> bool { + self.bits & (1 << index) != 0 } } @@ -500,7 +497,7 @@ struct EventFormatter { impl EventFormatter { #[inline] - fn new() -> Self { + const fn new() -> Self { EventFormatter { logline_buffer: Vec::new(), } @@ -516,14 +513,13 @@ impl EventFormatter { self.logline_buffer.clear(); } - fn format( + fn format( &mut self, now: DateTime, event: &Event<'_>, ctx: &Context<'_, S>, - skipped_field_indices: &papaya::HashMap, - callsite_ids: &papaya::HashMap, - extract_fields: &IndexSet<&'static str>, + skipped_field_indices: &CallsiteMap, + extract_fields: &'static [&'static str], ) -> io::Result<()> where S: Subscriber + for<'a> LookupSpan<'a>, @@ -534,8 +530,11 @@ impl EventFormatter { let normalized_meta = event.normalized_metadata(); let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata()); - let skipped_field_indices = skipped_field_indices.pin(); - let skipped_field_indices = skipped_field_indices.get(&meta.callsite()); + let skipped_field_indices = skipped_field_indices + .pin() + .get(&meta.callsite()) + .copied() + .unwrap_or_default(); let mut serialize = || { let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer); @@ -566,9 +565,11 @@ impl EventFormatter { } let spans = SerializableSpans { - ctx, - 
callsite_ids, - extract: ExtractedSpanFields::<'_, F>::new(extract_fields), + // collect all spans from parent to root. + spans: ctx + .event_span(event) + .map_or(vec![], |parent| parent.scope().collect()), + extracted: ExtractedSpanFields::new(extract_fields), }; serializer.serialize_entry("spans", &spans)?; @@ -621,9 +622,9 @@ impl EventFormatter { } } - if spans.extract.has_values() { + if spans.extracted.has_values() { // TODO: add fields from event, too? - serializer.serialize_entry("extract", &spans.extract)?; + serializer.serialize_entry("extract", &spans.extracted)?; } serializer.end() @@ -636,15 +637,15 @@ impl EventFormatter { } /// Extracts the message field that's mixed will other fields. -struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> { +struct MessageFieldExtractor { serializer: S, - skipped_field_indices: Option<&'a SkippedFieldIndices>, + skipped_field_indices: SkippedFieldIndices, state: Option>, } -impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> { +impl MessageFieldExtractor { #[inline] - fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self { + fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self { Self { serializer, skipped_field_indices, @@ -666,13 +667,11 @@ impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> { fn accept_field(&self, field: &tracing::field::Field) -> bool { self.state.is_none() && field.name() == MESSAGE_FIELD - && !self - .skipped_field_indices - .is_some_and(|i| i.contains(field.index())) + && !self.skipped_field_indices.contains(field.index()) } } -impl tracing::field::Visit for MessageFieldExtractor<'_, S> { +impl tracing::field::Visit for MessageFieldExtractor { #[inline] fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { if self.accept_field(field) { @@ -752,14 +751,14 @@ impl tracing::field::Visit for MessageFieldExtracto /// can be skipped. 
// This is entirely optional and only cosmetic, though maybe helps a // bit during log parsing in dashboards when there's no field with empty object. -struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>); +struct FieldsPresent(pub bool, SkippedFieldIndices); // Even though some methods have an overhead (error, bytes) it is assumed the // compiler won't include this since we ignore the value entirely. -impl tracing::field::Visit for FieldsPresent<'_> { +impl tracing::field::Visit for FieldsPresent { #[inline] fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) { - if !self.1.is_some_and(|i| i.contains(field.index())) + if !self.1.contains(field.index()) && field.name() != MESSAGE_FIELD && !field.name().starts_with("log.") { @@ -769,10 +768,7 @@ impl tracing::field::Visit for FieldsPresent<'_> { } /// Serializes the fields directly supplied with a log event. -struct SerializableEventFields<'a, 'event>( - &'a tracing::Event<'event>, - Option<&'a SkippedFieldIndices>, -); +struct SerializableEventFields<'a, 'event>(&'a tracing::Event<'event>, SkippedFieldIndices); impl serde::ser::Serialize for SerializableEventFields<'_, '_> { fn serialize(&self, serializer: S) -> Result @@ -789,15 +785,15 @@ impl serde::ser::Serialize for SerializableEventFields<'_, '_> { } /// A tracing field visitor that skips the message field. 
-struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> { +struct MessageFieldSkipper { serializer: S, - skipped_field_indices: Option<&'a SkippedFieldIndices>, + skipped_field_indices: SkippedFieldIndices, state: Result<(), S::Error>, } -impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> { +impl MessageFieldSkipper { #[inline] - fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self { + fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self { Self { serializer, skipped_field_indices, @@ -810,9 +806,7 @@ impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> { self.state.is_ok() && field.name() != MESSAGE_FIELD && !field.name().starts_with("log.") - && !self - .skipped_field_indices - .is_some_and(|i| i.contains(field.index())) + && !self.skipped_field_indices.contains(field.index()) } #[inline] @@ -822,7 +816,7 @@ impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> { } } -impl tracing::field::Visit for MessageFieldSkipper<'_, S> { +impl tracing::field::Visit for MessageFieldSkipper { #[inline] fn record_f64(&mut self, field: &tracing::field::Field, value: f64) { if self.accept_field(field) { @@ -906,18 +900,17 @@ impl tracing::field::Visit for MessageFieldSkipper< /// with the span names as keys. To prevent collision we append a numberic value /// to the name. Also, collects any span fields we're interested in. Last one /// wins. 
-struct SerializableSpans<'a, 'ctx, Span, const F: usize> +struct SerializableSpans<'ctx, S> where - Span: Subscriber + for<'lookup> LookupSpan<'lookup>, + S: for<'lookup> LookupSpan<'lookup>, { - ctx: &'a Context<'ctx, Span>, - callsite_ids: &'a papaya::HashMap, - extract: ExtractedSpanFields<'a, F>, + spans: Vec>, + extracted: ExtractedSpanFields, } -impl serde::ser::Serialize for SerializableSpans<'_, '_, Span, F> +impl serde::ser::Serialize for SerializableSpans<'_, S> where - Span: Subscriber + for<'lookup> LookupSpan<'lookup>, + S: for<'lookup> LookupSpan<'lookup>, { fn serialize(&self, serializer: Ser) -> Result where @@ -925,25 +918,22 @@ where { let mut serializer = serializer.serialize_map(None)?; - if let Some(leaf_span) = self.ctx.lookup_current() { - for span in leaf_span.scope().from_root() { - // Append a numeric callsite ID to the span name to keep the name unique - // in the JSON object. - let cid = self - .callsite_ids - .pin() - .get(&span.metadata().callsite()) - .copied() - .unwrap_or_default(); + for span in self.spans.iter().rev() { + let ext = span.extensions(); - // Loki turns the # into an underscore during field name concatenation. - serializer.serialize_key(&format_args!("{}#{}", span.metadata().name(), &cid))?; + // all spans should have this extension. + let Some(fields) = ext.get() else { continue }; - serializer.serialize_value(&SerializableSpanFields { - span: &span, - extract: &self.extract, - })?; - } + self.extracted.layer_span(fields); + + let SpanFields { values, span_info } = fields; + serializer.serialize_entry( + &*span_info.normalized_name, + &SerializableSpanFields { + fields: span.metadata().fields(), + values, + }, + )?; } serializer.end() @@ -951,80 +941,77 @@ where } /// Serializes the span fields as object. 
-struct SerializableSpanFields<'a, 'span, Span, const F: usize> -where - Span: for<'lookup> LookupSpan<'lookup>, -{ - span: &'a SpanRef<'span, Span>, - extract: &'a ExtractedSpanFields<'a, F>, +struct SerializableSpanFields<'span> { + fields: &'span tracing::field::FieldSet, + values: &'span [serde_json::Value; MAX_TRACING_FIELDS], } -impl serde::ser::Serialize for SerializableSpanFields<'_, '_, Span, F> -where - Span: for<'lookup> LookupSpan<'lookup>, -{ +impl serde::ser::Serialize for SerializableSpanFields<'_> { fn serialize(&self, serializer: S) -> Result where S: serde::ser::Serializer, { let mut serializer = serializer.serialize_map(None)?; - let ext = self.span.extensions(); - if let Some(data) = ext.get::() { - for (name, value) in &data.fields.pin() { - serializer.serialize_entry(name, value)?; - // TODO: replace clone with reference, if possible. - self.extract.set(name, value.clone()); + for (field, value) in std::iter::zip(self.fields, self.values) { + if value.is_null() { + continue; } + serializer.serialize_entry(field.name(), value)?; } serializer.end() } } -struct ExtractedSpanFields<'a, const F: usize> { - names: &'a IndexSet<&'static str>, - // TODO: replace TryLock with something local thread and interior mutability. - // serde API doesn't let us use `mut`. 
- values: TryLock<([Option; F], bool)>, +struct ExtractedSpanFields { + names: &'static [&'static str], + values: RefCell>, } -impl<'a, const F: usize> ExtractedSpanFields<'a, F> { - fn new(names: &'a IndexSet<&'static str>) -> Self { +impl ExtractedSpanFields { + fn new(names: &'static [&'static str]) -> Self { ExtractedSpanFields { names, - values: TryLock::new((array::from_fn(|_| Option::default()), false)), + values: RefCell::new(vec![serde_json::Value::Null; names.len()]), } } - #[inline] - fn set(&self, name: &'static str, value: serde_json::Value) { - if let Some((index, _)) = self.names.get_full(name) { - let mut fields = self.values.try_lock().expect("thread-local use"); - fields.0[index] = Some(value); - fields.1 = true; + fn layer_span(&self, fields: &SpanFields) { + let mut v = self.values.borrow_mut(); + let SpanFields { values, span_info } = fields; + + // extract the fields + for (i, &j) in span_info.extract.iter().enumerate() { + let Some(value) = values.get(j) else { continue }; + + if !value.is_null() { + // TODO: replace clone with reference, if possible. 
+ v[i] = value.clone(); + } } } #[inline] fn has_values(&self) -> bool { - self.values.try_lock().expect("thread-local use").1 + self.values.borrow().iter().any(|v| !v.is_null()) } } -impl serde::ser::Serialize for ExtractedSpanFields<'_, F> { +impl serde::ser::Serialize for ExtractedSpanFields { fn serialize(&self, serializer: S) -> Result where S: serde::ser::Serializer, { let mut serializer = serializer.serialize_map(None)?; - let values = self.values.try_lock().expect("thread-local use"); - for (i, value) in values.0.iter().enumerate() { - if let Some(value) = value { - let key = self.names[i]; - serializer.serialize_entry(key, value)?; + let values = self.values.borrow(); + for (key, value) in std::iter::zip(self.names, &*values) { + if value.is_null() { + continue; } + + serializer.serialize_entry(key, value)?; } serializer.end() @@ -1033,7 +1020,6 @@ impl serde::ser::Serialize for ExtractedSpanFields<'_, F> { #[cfg(test)] mod tests { - use std::marker::PhantomData; use std::sync::{Arc, Mutex, MutexGuard}; use assert_json_diff::assert_json_eq; @@ -1082,10 +1068,9 @@ mod tests { let log_layer = JsonLoggingLayer { clock: clock.clone(), skipped_field_indices: papaya::HashMap::default(), - callsite_ids: papaya::HashMap::default(), + span_info: papaya::HashMap::default(), writer: buffer.clone(), - extract_fields: IndexSet::from_iter(["x"]), - _marker: PhantomData::<[&'static str; 1]>, + extract_fields: &["x"], }; let registry = tracing_subscriber::Registry::default().with(log_layer); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index e5fc0b724b..4b22c912eb 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -115,8 +115,8 @@ pub struct ProxyMetrics { #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_vpc_endpoint_ids: Histogram<10>, - /// Number of connections (per sni). 
- pub accepted_connections_by_sni: CounterVec>, + /// Number of connections, by the method we used to determine the endpoint. + pub accepted_connections_by_sni: CounterVec, /// Number of connection failures (per kind). pub connection_failures_total: CounterVec>, @@ -342,11 +342,20 @@ pub enum LatencyExclusions { ClientCplaneComputeRetry, } +#[derive(LabelGroup)] +#[label(set = SniSet)] +pub struct SniGroup { + pub protocol: Protocol, + pub kind: SniKind, +} + #[derive(FixedCardinalityLabel, Copy, Clone)] -#[label(singleton = "kind")] pub enum SniKind { + /// Domain name based routing. SNI for libpq/websockets. Host for HTTP Sni, + /// Metadata based routing. `options` for libpq/websockets. Header for HTTP NoSni, + /// Metadata based routing, using the password field. PasswordHack, } diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index c05031ad97..54c02f2c15 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -24,9 +24,6 @@ pub(crate) enum HandshakeError { #[error("protocol violation")] ProtocolViolation, - #[error("missing certificate")] - MissingCertificate, - #[error("{0}")] StreamUpgradeError(#[from] StreamUpgradeError), @@ -42,10 +39,6 @@ impl ReportableError for HandshakeError { match self { HandshakeError::EarlyData => crate::error::ErrorKind::User, HandshakeError::ProtocolViolation => crate::error::ErrorKind::User, - // This error should not happen, but will if we have no default certificate and - // the client sends no SNI extension. - // If they provide SNI then we can be sure there is a certificate that matches. 
- HandshakeError::MissingCertificate => crate::error::ErrorKind::Service, HandshakeError::StreamUpgradeError(upgrade) => match upgrade { StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service, StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect, @@ -146,7 +139,7 @@ pub(crate) async fn handshake( // try parse endpoint let ep = conn_info .server_name() - .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten()); + .and_then(|sni| endpoint_sni(sni, &tls.common_names)); if let Some(ep) = ep { ctx.set_endpoint_id(ep); } @@ -161,10 +154,8 @@ pub(crate) async fn handshake( } } - let (_, tls_server_end_point) = tls - .cert_resolver - .resolve(conn_info.server_name()) - .ok_or(HandshakeError::MissingCertificate)?; + let (_, tls_server_end_point) = + tls.cert_resolver.resolve(conn_info.server_name()); stream = PqStream { framed: Framed { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index cf331b8bc0..0a86022e78 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -383,9 +383,7 @@ pub(crate) async fn handle_client( let cancellation_handler_clone = Arc::clone(&cancellation_handler); let session = cancellation_handler_clone.get_key(); - session - .write_cancel_key(node.cancel_closure.clone()) - .await?; + session.write_cancel_key(node.cancel_closure.clone())?; prepare_client_connection(&node, *session.key(), &mut stream).await?; diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index c100b8d716..8f9bd2de2d 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -94,7 +94,7 @@ impl ProxyPassthrough { tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database"); } - drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error + drop(self.cancel.remove_cancel_key()); // we don't need a result. 
If the queue is full, we just log the error res } diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 42d1491782..0879564ced 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -48,7 +48,7 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError { use postgres_client::error::SqlState; // Here are errors that happens after the user successfully authenticated to the database. // TODO: there are pgbouncer errors that should be retried, but they are not listed here. - !matches!( + let non_retriable_pg_errors = matches!( self.code(), &SqlState::TOO_MANY_CONNECTIONS | &SqlState::OUT_OF_MEMORY @@ -56,8 +56,20 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError { | &SqlState::T_R_SERIALIZATION_FAILURE | &SqlState::INVALID_CATALOG_NAME | &SqlState::INVALID_SCHEMA_NAME - | &SqlState::INVALID_PARAMETER_VALUE - ) + | &SqlState::INVALID_PARAMETER_VALUE, + ); + if non_retriable_pg_errors { + return false; + } + // PGBouncer errors that should not trigger a wake_compute retry. + if self.code() == &SqlState::PROTOCOL_VIOLATION { + // Source for the error message: + // https://github.com/pgbouncer/pgbouncer/blob/f15997fe3effe3a94ba8bcc1ea562e6117d1a131/src/client.c#L1070 + return !self + .message() + .contains("no more connections allowed (max_client_conn)"); + } + true } } @@ -110,3 +122,55 @@ pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Durati .base_delay .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1)) } + +#[cfg(test)] +mod tests { + use super::ShouldRetryWakeCompute; + use postgres_client::error::{DbError, SqlState}; + + #[test] + fn should_retry_wake_compute_for_db_error() { + // These SQLStates should NOT trigger a wake_compute retry. 
+ let non_retry_states = [ + SqlState::TOO_MANY_CONNECTIONS, + SqlState::OUT_OF_MEMORY, + SqlState::SYNTAX_ERROR, + SqlState::T_R_SERIALIZATION_FAILURE, + SqlState::INVALID_CATALOG_NAME, + SqlState::INVALID_SCHEMA_NAME, + SqlState::INVALID_PARAMETER_VALUE, + ]; + for state in non_retry_states { + let err = DbError::new_test_error(state.clone(), "oops".to_string()); + assert!( + !err.should_retry_wake_compute(), + "State {state:?} unexpectedly retried" + ); + } + + // Errors coming from pgbouncer should not trigger a wake_compute retry + let non_retry_pgbouncer_errors = ["no more connections allowed (max_client_conn)"]; + for error in non_retry_pgbouncer_errors { + let err = DbError::new_test_error(SqlState::PROTOCOL_VIOLATION, error.to_string()); + assert!( + !err.should_retry_wake_compute(), + "PGBouncer error {error:?} unexpectedly retried" + ); + } + + // These SQLStates should trigger a wake_compute retry. + let retry_states = [ + SqlState::CONNECTION_FAILURE, + SqlState::CONNECTION_EXCEPTION, + SqlState::CONNECTION_DOES_NOT_EXIST, + SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, + ]; + for state in retry_states { + let err = DbError::new_test_error(state.clone(), "oops".to_string()); + assert!( + err.should_retry_wake_compute(), + "State {state:?} unexpectedly skipped retry" + ); + } + } +} diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 9a6864c33e..be6426a63c 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -15,6 +15,7 @@ use rstest::rstest; use rustls::crypto::ring; use rustls::pki_types; use tokio::io::DuplexStream; +use tracing_test::traced_test; use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; @@ -98,8 +99,7 @@ fn generate_tls_config<'a>( .with_no_client_auth() .with_single_cert(vec![cert.clone()], key.clone_key())?; - let mut cert_resolver = CertResolver::new(); - cert_resolver.add_cert(key, vec![cert], true)?; + let cert_resolver = CertResolver::new(key, 
vec![cert])?; let common_names = cert_resolver.get_common_names(); @@ -382,8 +382,14 @@ enum ConnectAction { WakeFail, WakeRetry, Connect, + // connect_once -> Err, could_retry = true, should_retry_wake_compute = true Retry, + // connect_once -> Err, could_retry = true, should_retry_wake_compute = false + RetryNoWake, + // connect_once -> Err, could_retry = false, should_retry_wake_compute = true Fail, + // connect_once -> Err, could_retry = false, should_retry_wake_compute = false + FailNoWake, } #[derive(Clone)] @@ -425,6 +431,7 @@ struct TestConnection; #[derive(Debug)] struct TestConnectError { retryable: bool, + wakeable: bool, kind: crate::error::ErrorKind, } @@ -449,7 +456,7 @@ impl CouldRetry for TestConnectError { } impl ShouldRetryWakeCompute for TestConnectError { fn should_retry_wake_compute(&self) -> bool { - true + self.wakeable } } @@ -472,10 +479,22 @@ impl ConnectMechanism for TestConnectMechanism { ConnectAction::Connect => Ok(TestConnection), ConnectAction::Retry => Err(TestConnectError { retryable: true, + wakeable: true, + kind: ErrorKind::Compute, + }), + ConnectAction::RetryNoWake => Err(TestConnectError { + retryable: true, + wakeable: false, kind: ErrorKind::Compute, }), ConnectAction::Fail => Err(TestConnectError { retryable: false, + wakeable: true, + kind: ErrorKind::Compute, + }), + ConnectAction::FailNoWake => Err(TestConnectError { + retryable: false, + wakeable: false, kind: ErrorKind::Compute, }), x => panic!("expecting action {x:?}, connect is called instead"), @@ -710,3 +729,92 @@ async fn wake_non_retry() { .unwrap_err(); mechanism.verify(); } + +#[tokio::test] +#[traced_test] +async fn fail_but_wake_invalidates_cache() { + let ctx = RequestContext::test(); + let mech = TestConnectMechanism::new(vec![ + ConnectAction::Wake, + ConnectAction::Fail, + ConnectAction::Wake, + ConnectAction::Connect, + ]); + let user = helper_create_connect_info(&mech); + let cfg = config(); + + connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg) + 
.await + .unwrap(); + + assert!(logs_contain( + "invalidating stalled compute node info cache entry" + )); +} + +#[tokio::test] +#[traced_test] +async fn fail_no_wake_skips_cache_invalidation() { + let ctx = RequestContext::test(); + let mech = TestConnectMechanism::new(vec![ + ConnectAction::Wake, + ConnectAction::FailNoWake, + ConnectAction::Connect, + ]); + let user = helper_create_connect_info(&mech); + let cfg = config(); + + connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg) + .await + .unwrap(); + + assert!(!logs_contain( + "invalidating stalled compute node info cache entry" + )); +} + +#[tokio::test] +#[traced_test] +async fn retry_but_wake_invalidates_cache() { + let _ = env_logger::try_init(); + use ConnectAction::*; + + let ctx = RequestContext::test(); + // Wake → Retry (retryable + wakeable) → Wake → Connect + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + let cfg = config(); + + connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg) + .await + .unwrap(); + mechanism.verify(); + + // Because Retry has wakeable=true, we should see invalidate_cache + assert!(logs_contain( + "invalidating stalled compute node info cache entry" + )); +} + +#[tokio::test] +#[traced_test] +async fn retry_no_wake_skips_invalidation() { + let _ = env_logger::try_init(); + use ConnectAction::*; + + let ctx = RequestContext::test(); + // Wake → RetryNoWake (retryable + NOT wakeable) + let mechanism = TestConnectMechanism::new(vec![Wake, RetryNoWake]); + let user_info = helper_create_connect_info(&mechanism); + let cfg = config(); + + connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg) + .await + .unwrap_err(); + mechanism.verify(); + + // Because RetryNoWake has wakeable=false, we must NOT see invalidate_cache + assert!(!logs_contain( + "invalidating stalled compute node info cache entry" + )); +} diff --git a/proxy/src/redis/kv_ops.rs 
b/proxy/src/redis/kv_ops.rs index aa627b29a6..f71730c533 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -47,7 +47,7 @@ impl RedisKVClient { pub(crate) async fn query( &mut self, - q: impl Queryable, + q: &impl Queryable, ) -> anyhow::Result { if !self.limiter.check() { tracing::info!("Rate limit exceeded. Skipping query"); diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index 9c559e9082..7f48e00c41 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -13,22 +13,19 @@ pub(crate) struct Pbkdf2 { // inspired from impl Pbkdf2 { pub(crate) fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self { - let hmac = + // key the HMAC and derive the first block in-place + let mut hmac = Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); - - let prev = hmac - .clone() - .chain_update(salt) - .chain_update(1u32.to_be_bytes()) - .finalize() - .into_bytes(); + hmac.update(salt); + hmac.update(&1u32.to_be_bytes()); + let init_block = hmac.finalize_reset().into_bytes(); Self { hmac, - // one consumed for the hash above + // one iteration spent above iterations: iterations - 1, - hi: prev, - prev, + hi: init_block, + prev: init_block, } } @@ -44,14 +41,17 @@ impl Pbkdf2 { iterations, } = self; - // only do 4096 iterations per turn before sharing the thread for fairness + // only do up to 4096 iterations per turn for fairness let n = (*iterations).clamp(0, 4096); for _ in 0..n { - *prev = hmac.clone().chain_update(*prev).finalize().into_bytes(); + hmac.update(prev); + let block = hmac.finalize_reset().into_bytes(); - for (hi, prev) in hi.iter_mut().zip(*prev) { - *hi ^= prev; + for (hi_byte, &b) in hi.iter_mut().zip(block.iter()) { + *hi_byte ^= b; } + + *prev = block; } *iterations -= n; diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 1d9b35f41d..bb5637cd5f 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ 
b/proxy/src/serverless/local_conn_pool.rs @@ -41,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; -pub(crate) const EXT_VERSION: &str = "0.3.0"; +pub(crate) const EXT_VERSION: &str = "0.3.1"; pub(crate) const EXT_SCHEMA: &str = "auth"; #[derive(Clone)] diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 6f24ad3dec..2a7069b1c2 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -56,6 +56,7 @@ use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; +pub(crate) const AUTH_BROKER_SNI: &str = "apiauth"; pub async fn task_main( config: &'static ProxyConfig, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7fb39553f9..1c5bb64480 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -14,7 +14,9 @@ use hyper::http::{HeaderName, HeaderValue}; use hyper::{HeaderMap, Request, Response, StatusCode, header}; use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; -use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; +use postgres_client::{ + GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, RowStream, Transaction, +}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; @@ -38,7 +40,7 @@ use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::http::{ReadBodyError, read_body_with_limit}; -use crate::metrics::{HttpDirection, Metrics}; +use crate::metrics::{HttpDirection, Metrics, SniGroup, SniKind}; use crate::proxy::{NeonOptions, run_until_cancelled}; use 
crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; @@ -199,8 +201,7 @@ fn get_conn_info( let endpoint = match connection_url.host() { Some(url::Host::Domain(hostname)) => { if let Some(tls) = tls { - endpoint_sni(hostname, &tls.common_names)? - .ok_or(ConnInfoError::MalformedEndpoint)? + endpoint_sni(hostname, &tls.common_names).ok_or(ConnInfoError::MalformedEndpoint)? } else { hostname .split_once('.') @@ -228,6 +229,32 @@ fn get_conn_info( } } + // check the URL that was used, for metrics + { + let host_endpoint = headers + // get the host header + .get("host") + // extract the domain + .and_then(|h| { + let (host, _port) = h.to_str().ok()?.split_once(':')?; + Some(host) + }) + // get the endpoint prefix + .map(|h| h.split_once('.').map_or(h, |(prefix, _)| prefix)); + + let kind = if host_endpoint == Some(&*endpoint) { + SniKind::Sni + } else { + SniKind::NoSni + }; + + let protocol = ctx.protocol(); + Metrics::get() + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); + } + ctx.set_user_agent( headers .get(hyper::header::USER_AGENT) @@ -1067,22 +1094,41 @@ async fn query_to_json( let query_start = Instant::now(); let query_params = data.params; - let mut row_stream = std::pin::pin!( - client - .query_raw_txt(&data.query, query_params) - .await - .map_err(SqlOverHttpError::Postgres)? 
- ); + let mut row_stream = client + .query_raw_txt(&data.query, query_params) + .await + .map_err(SqlOverHttpError::Postgres)?; let query_acknowledged = Instant::now(); + let columns_len = row_stream.statement.columns().len(); + let mut fields = Vec::with_capacity(columns_len); + let mut types = Vec::with_capacity(columns_len); + + for c in row_stream.statement.columns() { + fields.push(json!({ + "name": c.name().to_owned(), + "dataTypeID": c.type_().oid(), + "tableID": c.table_oid(), + "columnID": c.column_id(), + "dataTypeSize": c.type_size(), + "dataTypeModifier": c.type_modifier(), + "format": "text", + })); + + types.push(c.type_().clone()); + } + + let raw_output = parsed_headers.raw_output; + let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode); + // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too // big. - let mut rows: Vec = Vec::new(); + let mut rows = Vec::new(); while let Some(row) = row_stream.next().await { let row = row.map_err(SqlOverHttpError::Postgres)?; *current_size += row.body_len(); - rows.push(row); + // we don't have a streaming response support yet so this is to prevent OOM // from a malicious query (eg a cross join) if *current_size > config.max_response_size_bytes { @@ -1090,13 +1136,26 @@ async fn query_to_json( config.max_response_size_bytes, )); } + + let row = pg_text_row_to_json(&row, &types, raw_output, array_mode)?; + rows.push(row); + + // assumption: parsing pg text and converting to json takes CPU time. + // let's assume it is slightly expensive, so we should consume some cooperative budget. + // Especially considering that `RowStream::next` might be pulling from a batch + // of rows and never hit the tokio mpsc for a long time (although unlikely). 
+ tokio::task::consume_budget().await; } let query_resp_end = Instant::now(); - let ready = row_stream.ready_status(); + let RowStream { + command_tag, + status: ready, + .. + } = row_stream; // grab the command tag and number of rows affected - let command_tag = row_stream.command_tag().unwrap_or_default(); + let command_tag = command_tag.unwrap_or_default(); let mut command_tag_split = command_tag.split(' '); let command_tag_name = command_tag_split.next().unwrap_or_default(); let command_tag_count = if command_tag_name == "INSERT" { @@ -1117,38 +1176,6 @@ async fn query_to_json( "finished executing query" ); - let columns_len = row_stream.columns().len(); - let mut fields = Vec::with_capacity(columns_len); - let mut columns = Vec::with_capacity(columns_len); - - for c in row_stream.columns() { - fields.push(json!({ - "name": c.name().to_owned(), - "dataTypeID": c.type_().oid(), - "tableID": c.table_oid(), - "columnID": c.column_id(), - "dataTypeSize": c.type_size(), - "dataTypeModifier": c.type_modifier(), - "format": "text", - })); - - match client.get_type(c.type_oid()).await { - Ok(t) => columns.push(t), - Err(err) => { - tracing::warn!(?err, "unable to query type information"); - return Err(SqlOverHttpError::InternalPostgres(err)); - } - } - } - - let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode); - - // convert rows to JSON - let rows = rows - .iter() - .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode)) - .collect::, _>>()?; - // Resulting JSON format is based on the format of node-postgres result. 
let results = json!({ "command": command_tag_name.to_string(), diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 5a95e69fde..66c53b3aff 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -1,10 +1,12 @@ use std::collections::{HashMap, HashSet}; +use std::path::Path; use std::sync::Arc; use anyhow::{Context, bail}; use itertools::Itertools; use rustls::crypto::ring::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; +use rustls::sign::CertifiedKey; use x509_cert::der::{Reader, SliceReader}; use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint}; @@ -20,15 +22,13 @@ pub struct TlsConfig { /// Configure TLS for the main endpoint. pub fn configure_tls( - key_path: &str, - cert_path: &str, - certs_dir: Option<&String>, + key_path: &Path, + cert_path: &Path, + certs_dir: Option<&Path>, allow_tls_keylogfile: bool, ) -> anyhow::Result { - let mut cert_resolver = CertResolver::new(); - // add default certificate - cert_resolver.add_cert_path(key_path, cert_path, true)?; + let mut cert_resolver = CertResolver::parse_new(key_path, cert_path)?; // add extra certificates if let Some(certs_dir) = certs_dir { @@ -40,11 +40,7 @@ pub fn configure_tls( let key_path = path.join("tls.key"); let cert_path = path.join("tls.crt"); if key_path.exists() && cert_path.exists() { - cert_resolver.add_cert_path( - &key_path.to_string_lossy(), - &cert_path.to_string_lossy(), - false, - )?; + cert_resolver.add_cert_path(&key_path, &cert_path)?; } } } @@ -83,92 +79,42 @@ pub fn configure_tls( }) } -#[derive(Default, Debug)] +#[derive(Debug)] pub struct CertResolver { certs: HashMap, TlsServerEndPoint)>, - default: Option<(Arc, TlsServerEndPoint)>, + default: (Arc, TlsServerEndPoint), } impl CertResolver { - pub fn new() -> Self { - Self::default() + fn parse_new(key_path: &Path, cert_path: &Path) -> anyhow::Result { + let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; + Self::new(priv_key, cert_chain) } - 
fn add_cert_path( - &mut self, - key_path: &str, - cert_path: &str, - is_default: bool, - ) -> anyhow::Result<()> { - let priv_key = { - let key_bytes = std::fs::read(key_path) - .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; - rustls_pemfile::private_key(&mut &key_bytes[..]) - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - }; + pub fn new( + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, + ) -> anyhow::Result { + let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?; - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? - }; - - self.add_cert(priv_key, cert_chain, is_default) + let mut certs = HashMap::new(); + let default = (cert.clone(), tls_server_end_point); + certs.insert(common_name, (cert, tls_server_end_point)); + Ok(Self { certs, default }) } - pub fn add_cert( + fn add_cert_path(&mut self, key_path: &Path, cert_path: &Path) -> anyhow::Result<()> { + let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; + self.add_cert(priv_key, cert_chain) + } + + fn add_cert( &mut self, priv_key: PrivateKeyDer<'static>, cert_chain: Vec>, - is_default: bool, ) -> anyhow::Result<()> { - let key = sign::any_supported_type(&priv_key).context("invalid private key")?; - - let first_cert = &cert_chain[0]; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - - let certificate = SliceReader::new(first_cert) - .context("Failed to parse cerficiate")? 
- .decode::() - .context("Failed to parse cerficiate")?; - - let common_name = certificate.tbs_certificate.subject.to_string(); - - // We need to get the canonical name for this certificate so we can match them against any domain names - // seen within the proxy codebase. - // - // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. - // We need to remove the wildcard prefix for the purposes of certificate selection. - // - // auth-broker does not use SNI and instead uses the Neon-Connection-String header. - // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. - // - // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string - // validation, so let's we can continue with any common-name - let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { - s.to_string() - } else if let Some(s) = common_name.strip_prefix("CN=") { - s.to_string() - } else { - bail!("Failed to parse common name from certificate") - }; - - let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); - - if is_default { - self.default = Some((cert.clone(), tls_server_end_point)); - } - + let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?; self.certs.insert(common_name, (cert, tls_server_end_point)); - Ok(()) } @@ -177,12 +123,85 @@ impl CertResolver { } } +fn parse_key_cert( + key_path: &Path, + cert_path: &Path, +) -> anyhow::Result<(PrivateKeyDer<'static>, Vec>)> { + let priv_key = { + let key_bytes = std::fs::read(key_path) + .with_context(|| format!("Failed to read TLS keys at '{}'", key_path.display()))?; + rustls_pemfile::private_key(&mut &key_bytes[..]) + .with_context(|| format!("Failed to parse TLS keys at '{}'", key_path.display()))? 
+ .with_context(|| format!("Failed to parse TLS keys at '{}'", key_path.display()))? + }; + + let cert_chain_bytes = std::fs::read(cert_path).context(format!( + "Failed to read TLS cert file at '{}.'", + cert_path.display() + ))?; + + let cert_chain = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + format!( + "Failed to read TLS certificate chain from bytes from file at '{}'.", + cert_path.display() + ) + })? + }; + + Ok((priv_key, cert_chain)) +} + +fn process_key_cert( + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, +) -> anyhow::Result<(String, Arc, TlsServerEndPoint)> { + let key = sign::any_supported_type(&priv_key).context("invalid private key")?; + + let first_cert = &cert_chain[0]; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let certificate = SliceReader::new(first_cert) + .context("Failed to parse cerficiate")? + .decode::() + .context("Failed to parse cerficiate")?; + + let common_name = certificate.tbs_certificate.subject.to_string(); + + // We need to get the canonical name for this certificate so we can match them against any domain names + // seen within the proxy codebase. + // + // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. + // We need to remove the wildcard prefix for the purposes of certificate selection. + // + // auth-broker does not use SNI and instead uses the Neon-Connection-String header. + // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. 
+ // + // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string + // validation, so let's we can continue with any common-name + let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=") { + s.to_string() + } else { + bail!("Failed to parse common name from certificate") + }; + + let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); + + Ok((common_name, cert, tls_server_end_point)) +} + impl rustls::server::ResolvesServerCert for CertResolver { fn resolve( &self, client_hello: rustls::server::ClientHello<'_>, ) -> Option> { - self.resolve(client_hello.server_name()).map(|x| x.0) + Some(self.resolve(client_hello.server_name()).0) } } @@ -190,7 +209,7 @@ impl CertResolver { pub fn resolve( &self, server_name: Option<&str>, - ) -> Option<(Arc, TlsServerEndPoint)> { + ) -> (Arc, TlsServerEndPoint) { // loop here and cut off more and more subdomains until we find // a match to get a proper wildcard support. OTOH, we now do not // use nested domains, so keep this simple for now. @@ -200,12 +219,17 @@ impl CertResolver { if let Some(mut sni_name) = server_name { loop { if let Some(cert) = self.certs.get(sni_name) { - return Some(cert.clone()); + return cert.clone(); } if let Some((_, rest)) = sni_name.split_once('.') { sni_name = rest; } else { - return None; + // The customer has some custom DNS mapping - just return + // a default certificate. + // + // This will error if the customer uses anything stronger + // than sslmode=require. That's a choice they can make. 
+ return self.default.clone(); } } } else { diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 270cd7c24d..7dce36be2f 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -43,6 +43,12 @@ impl std::ops::Deref for ApiUrl { } } +impl std::ops::DerefMut for ApiUrl { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl std::fmt::Display for ApiUrl { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index a0d5970bd5..c48def3483 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.86.0" +channel = "1.87.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 5849df0343..b364ac8e48 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -121,6 +121,20 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn switch_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/membership", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.put(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_tenant(&self, tenant_id: TenantId) -> Result { let uri = format!("{}/v1/tenant/{}", self.mgmt_api_endpoint, tenant_id); let resp = self diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index dd71420efb..8d31ada24f 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -1,7 +1,6 @@ // // Main entry point for the safekeeper executable // -use std::env::{VarError, var}; use std::fs::{self, File}; use 
std::io::{ErrorKind, Write}; use std::str::FromStr; @@ -23,9 +22,10 @@ use safekeeper::defaults::{ DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE, }; +use safekeeper::wal_backup::WalBackup; use safekeeper::{ BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, - WAL_SERVICE_RUNTIME, broker, control_file, http, wal_backup, wal_service, + WAL_SERVICE_RUNTIME, broker, control_file, http, wal_service, }; use sd_notify::NotifyState; use storage_broker::{DEFAULT_ENDPOINT, Uri}; @@ -354,29 +354,13 @@ async fn main() -> anyhow::Result<()> { }; // Load JWT auth token to connect to other safekeepers for pull_timeline. - // First check if the env var is present, then check the arg with the path. - // We want to deprecate and remove the env var method in the future. - let sk_auth_token = match var("SAFEKEEPER_AUTH_TOKEN") { - Ok(v) => { - info!("loaded JWT token for authentication with safekeepers"); - Some(SecretString::from(v)) - } - Err(VarError::NotPresent) => { - if let Some(auth_token_path) = args.auth_token_path.as_ref() { - info!( - "loading JWT token for authentication with safekeepers from {auth_token_path}" - ); - let auth_token = tokio::fs::read_to_string(auth_token_path).await?; - Some(SecretString::from(auth_token.trim().to_owned())) - } else { - info!("no JWT token for authentication with safekeepers detected"); - None - } - } - Err(_) => { - warn!("JWT token for authentication with safekeepers is not unicode"); - None - } + let sk_auth_token = if let Some(auth_token_path) = args.auth_token_path.as_ref() { + info!("loading JWT token for authentication with safekeepers from {auth_token_path}"); + let auth_token = tokio::fs::read_to_string(auth_token_path).await?; + Some(SecretString::from(auth_token.trim().to_owned())) + } else { + info!("no JWT token for authentication with safekeepers detected"); + None }; let ssl_ca_certs = match 
args.ssl_ca_file.as_ref() { @@ -501,15 +485,15 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { None => None, }; - let global_timelines = Arc::new(GlobalTimelines::new(conf.clone())); + let wal_backup = Arc::new(WalBackup::new(&conf).await?); + + let global_timelines = Arc::new(GlobalTimelines::new(conf.clone(), wal_backup.clone())); // Register metrics collector for active timelines. It's important to do this // after daemonizing, otherwise process collector will be upset. let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone()); metrics::register_internal(Box::new(timeline_collector))?; - wal_backup::init_remote_storage(&conf).await; - // Keep handles to main tasks to die if any of them disappears. let mut tasks_handles: FuturesUnordered> = FuturesUnordered::new(); diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 11daff22cb..7984c2e2b9 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use anyhow::{Result, bail}; use camino::Utf8PathBuf; use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use remote_storage::GenericRemoteStorage; use safekeeper_api::membership::Configuration; use tokio::fs::OpenOptions; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; @@ -30,6 +31,7 @@ pub struct Request { pub async fn handle_request( request: Request, global_timelines: Arc, + storage: Arc, ) -> Result<()> { // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :( // if LSN will point to the middle of a WAL record, timeline will be in "broken" state @@ -127,6 +129,7 @@ pub async fn handle_request( assert!(first_ondisk_segment >= first_segment); copy_s3_segments( + &storage, wal_seg_size, &request.source_ttid, &request.destination_ttid, diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 2b2d721db2..384c582678 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ 
-243,8 +243,7 @@ async fn timeline_pull_handler(mut request: Request) -> Result) -> Result, // so create the chan and write to it in another task. @@ -270,6 +270,7 @@ async fn timeline_snapshot_handler(request: Request) -> Result) -> Result bool { - self.remote_storage.is_some() && self.wal_backup_enabled - } -} - impl SafeKeeperConf { pub fn dummy() -> Self { SafeKeeperConf { diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 653b084ad8..14aef1ee5e 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -7,7 +7,9 @@ use bytes::Bytes; use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; +use http_utils::error::ApiError; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; +use remote_storage::GenericRemoteStorage; use reqwest::Certificate; use safekeeper_api::Term; use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}; @@ -30,7 +32,7 @@ use utils::pausable_failpoint; use crate::control_file::CONTROL_FILE_NAME; use crate::state::{EvictionState, TimelinePersistentState}; -use crate::timeline::{Timeline, WalResidentTimeline}; +use crate::timeline::{Timeline, TimelineError, WalResidentTimeline}; use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}; use crate::wal_storage::open_wal_file; use crate::{GlobalTimelines, debug_dump, wal_backup}; @@ -42,6 +44,7 @@ pub async fn stream_snapshot( source: NodeId, destination: NodeId, tx: mpsc::Sender>, + storage: Option>, ) { match tli.try_wal_residence_guard().await { Err(e) => { @@ -52,10 +55,32 @@ pub async fn stream_snapshot( Ok(maybe_resident_tli) => { if let Err(e) = match maybe_resident_tli { Some(resident_tli) => { - stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone()) - .await + stream_snapshot_resident_guts( + resident_tli, + source, + destination, + tx.clone(), + storage, + ) + .await + } + None => { + if let 
Some(storage) = storage { + stream_snapshot_offloaded_guts( + tli, + source, + destination, + tx.clone(), + &storage, + ) + .await + } else { + tx.send(Err(anyhow!("remote storage not configured"))) + .await + .ok(); + return; + } } - None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await, } { // Error type/contents don't matter as they won't can't reach the client // (hyper likely doesn't do anything with it), but http stream will be @@ -122,10 +147,12 @@ pub(crate) async fn stream_snapshot_offloaded_guts( source: NodeId, destination: NodeId, tx: mpsc::Sender>, + storage: &GenericRemoteStorage, ) -> Result<()> { let mut ar = prepare_tar_stream(tx); - tli.snapshot_offloaded(&mut ar, source, destination).await?; + tli.snapshot_offloaded(&mut ar, source, destination, storage) + .await?; ar.finish().await?; @@ -138,10 +165,13 @@ pub async fn stream_snapshot_resident_guts( source: NodeId, destination: NodeId, tx: mpsc::Sender>, + storage: Option>, ) -> Result<()> { let mut ar = prepare_tar_stream(tx); - let bctx = tli.start_snapshot(&mut ar, source, destination).await?; + let bctx = tli + .start_snapshot(&mut ar, source, destination, storage) + .await?; pausable_failpoint!("sk-snapshot-after-list-pausable"); let tli_dir = tli.get_timeline_dir(); @@ -181,6 +211,7 @@ impl Timeline { ar: &mut tokio_tar::Builder, source: NodeId, destination: NodeId, + storage: &GenericRemoteStorage, ) -> Result<()> { // Take initial copy of control file, then release state lock let mut control_file = { @@ -215,6 +246,7 @@ impl Timeline { // can fail if the timeline was un-evicted and modified in the background. 
let remote_timeline_path = &self.remote_path; wal_backup::copy_partial_segment( + storage, &replace.previous.remote_path(remote_timeline_path), &replace.current.remote_path(remote_timeline_path), ) @@ -261,6 +293,7 @@ impl WalResidentTimeline { ar: &mut tokio_tar::Builder, source: NodeId, destination: NodeId, + storage: Option>, ) -> Result { let mut shared_state = self.write_shared_state().await; let wal_seg_size = shared_state.get_wal_seg_size(); @@ -282,6 +315,7 @@ impl WalResidentTimeline { let remote_timeline_path = &self.tli.remote_path; wal_backup::copy_partial_segment( + &*storage.context("remote storage not configured")?, &replace.previous.remote_path(remote_timeline_path), &replace.current.remote_path(remote_timeline_path), ) @@ -395,20 +429,25 @@ pub async fn handle_request( sk_auth_token: Option, ssl_ca_certs: Vec, global_timelines: Arc, -) -> Result { +) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( request.tenant_id, request.timeline_id, )); if existing_tli.is_ok() { - bail!("Timeline {} already exists", request.timeline_id); + info!("Timeline {} already exists", request.timeline_id); + return Ok(PullTimelineResponse { + safekeeper_host: None, + }); } let mut http_client = reqwest::Client::builder(); for ssl_ca_cert in ssl_ca_certs { http_client = http_client.add_root_certificate(ssl_ca_cert); } - let http_client = http_client.build()?; + let http_client = http_client + .build() + .map_err(|e| ApiError::InternalServerError(e.into()))?; let http_hosts = request.http_hosts.clone(); @@ -425,8 +464,25 @@ pub async fn handle_request( let mut statuses = Vec::new(); for (i, response) in responses.into_iter().enumerate() { - let status = response.context(format!("fetching status from {}", http_hosts[i]))?; - statuses.push((status, i)); + match response { + Ok(status) => { + statuses.push((status, i)); + } + Err(e) => { + info!("error fetching status from {}: {e}", http_hosts[i]); + } + } + } + + // Allow missing responses from up 
to one safekeeper (say due to downtime) + // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes + // offline and C comes online. Then we want a pull on C with A and B as hosts to work. + let min_required_successful = (http_hosts.len() - 1).max(1); + if statuses.len() < min_required_successful { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "only got {} successful status responses. required: {min_required_successful}", + statuses.len() + ))); } // Find the most advanced safekeeper @@ -445,14 +501,32 @@ pub async fn handle_request( assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); - pull_timeline( + let check_tombstone = !request.ignore_tombstone.unwrap_or_default(); + + match pull_timeline( status, safekeeper_host, sk_auth_token, http_client, global_timelines, + check_tombstone, ) .await + { + Ok(resp) => Ok(resp), + Err(e) => { + match e.downcast_ref::() { + Some(TimelineError::AlreadyExists(_)) => Ok(PullTimelineResponse { + safekeeper_host: None, + }), + Some(TimelineError::CreationInProgress(_)) => { + // We don't return success here because creation might still fail. + Err(ApiError::Conflict("Creation in progress".to_owned())) + } + _ => Err(ApiError::InternalServerError(e)), + } + } + } } async fn pull_timeline( @@ -461,6 +535,7 @@ async fn pull_timeline( sk_auth_token: Option, http_client: reqwest::Client, global_timelines: Arc, + check_tombstone: bool, ) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( @@ -532,10 +607,10 @@ async fn pull_timeline( // Finally, load the timeline. 
let _tli = global_timelines - .load_temp_timeline(ttid, &tli_dir_path, false) + .load_temp_timeline(ttid, &tli_dir_path, check_tombstone) .await?; Ok(PullTimelineResponse { - safekeeper_host: host, + safekeeper_host: Some(host), }) } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 9975153f6c..eb8eee6ab8 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -32,7 +32,7 @@ use crate::metrics::{ WAL_RECEIVERS, }; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage}; -use crate::timeline::WalResidentTimeline; +use crate::timeline::{TimelineError, WalResidentTimeline}; const DEFAULT_FEEDBACK_CAPACITY: usize = 8; @@ -357,9 +357,14 @@ impl NetworkReader<'_, IO> { .await .context("create timeline")? } else { - self.global_timelines - .get(self.ttid) - .context("get timeline")? + let timeline_res = self.global_timelines.get(self.ttid); + match timeline_res { + Ok(tl) => tl, + Err(TimelineError::NotFound(_)) => { + return Err(CopyStreamHandlerEnd::TimelineNoCreate); + } + other => other.context("get_timeline")?, + } }; tli.wal_residence_guard().await? 
} diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 33e3d0485c..05f827494e 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -513,7 +513,7 @@ impl SafekeeperPostgresHandler { let end_pos = end_watch.get(); if end_pos < start_pos { - warn!( + info!( "requested start_pos {} is ahead of available WAL end_pos {}", start_pos, end_pos ); diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index 618e2b59d2..e2817c8337 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -18,7 +18,7 @@ use crate::send_wal::EndWatch; use crate::state::{TimelinePersistentState, TimelineState}; use crate::timeline::{SharedState, StateSK, Timeline, get_timeline_dir}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup::remote_timeline_path; +use crate::wal_backup::{WalBackup, remote_timeline_path}; use crate::{SafeKeeperConf, control_file, receive_wal, wal_storage}; /// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop. 
@@ -101,18 +101,22 @@ impl Env { let safekeeper = self.make_safekeeper(node_id, ttid, start_lsn).await?; let shared_state = SharedState::new(StateSK::Loaded(safekeeper)); + let wal_backup = Arc::new(WalBackup::new(&conf).await?); + let timeline = Timeline::new( ttid, &timeline_dir, &remote_path, shared_state, conf.clone(), + wal_backup.clone(), ); timeline.bootstrap( &mut timeline.write_shared_state().await, &conf, Arc::new(TimelinesSet::default()), // ignored for now RateLimiter::new(0, 0), + wal_backup, ); Ok(timeline) } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index b7ba28f435..588bd4f2c9 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -35,7 +35,8 @@ use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, Tim use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup::{self, remote_timeline_path}; +use crate::wal_backup; +use crate::wal_backup::{WalBackup, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage}; @@ -452,6 +453,8 @@ pub struct Timeline { manager_ctl: ManagerCtl, conf: Arc, + pub(crate) wal_backup: Arc, + remote_deletion: std::sync::Mutex>, /// Hold this gate from code that depends on the Timeline's non-shut-down state. 
While holding @@ -476,6 +479,7 @@ impl Timeline { remote_path: &RemotePath, shared_state: SharedState, conf: Arc, + wal_backup: Arc, ) -> Arc { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(shared_state.sk.state().commit_lsn); @@ -509,6 +513,7 @@ impl Timeline { wal_backup_active: AtomicBool::new(false), last_removed_segno: AtomicU64::new(0), mgr_status: AtomicStatus::new(), + wal_backup, }) } @@ -516,6 +521,7 @@ impl Timeline { pub fn load_timeline( conf: Arc, ttid: TenantTimelineId, + wal_backup: Arc, ) -> Result> { let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); @@ -529,6 +535,7 @@ impl Timeline { &remote_path, shared_state, conf, + wal_backup, )) } @@ -539,6 +546,7 @@ impl Timeline { conf: &SafeKeeperConf, broker_active_set: Arc, partial_backup_rate_limiter: RateLimiter, + wal_backup: Arc, ) { let (tx, rx) = self.manager_ctl.bootstrap_manager(); @@ -561,6 +569,7 @@ impl Timeline { tx, rx, partial_backup_rate_limiter, + wal_backup, ) .await } @@ -606,9 +615,10 @@ impl Timeline { // it is cancelled, so WAL storage won't be opened again. 
shared_state.sk.close_wal_store(); - if !only_local && self.conf.is_wal_backup_enabled() { + if !only_local { self.remote_delete().await?; } + let dir_existed = delete_dir(&self.timeline_dir).await?; Ok(dir_existed) } @@ -675,11 +685,20 @@ impl Timeline { guard: &mut std::sync::MutexGuard>, ) -> RemoteDeletionReceiver { tracing::info!("starting remote deletion"); + let storage = self.wal_backup.get_storage().clone(); let (result_tx, result_rx) = tokio::sync::watch::channel(None); let ttid = self.ttid; tokio::task::spawn( async move { - let r = wal_backup::delete_timeline(&ttid).await; + let r = if let Some(storage) = storage { + wal_backup::delete_timeline(&storage, &ttid).await + } else { + tracing::info!( + "skipping remote deletion because no remote storage is configured; this effectively leaks the objects in remote storage" + ); + Ok(()) + }; + if let Err(e) = &r { // Log error here in case nobody ever listens for our result (e.g. dropped API request) tracing::error!("remote deletion failed: {e}"); @@ -1046,14 +1065,13 @@ impl WalResidentTimeline { pub async fn get_walreader(&self, start_lsn: Lsn) -> Result { let (_, persisted_state) = self.get_state().await; - let enable_remote_read = self.conf.is_wal_backup_enabled(); WalReader::new( &self.ttid, self.timeline_dir.clone(), &persisted_state, start_lsn, - enable_remote_read, + self.wal_backup.clone(), ) } diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 84c636daf6..e817dbf6f9 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -6,7 +6,7 @@ use anyhow::Context; use camino::Utf8PathBuf; -use remote_storage::RemotePath; +use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::fs::File; use tokio::io::{AsyncRead, AsyncWriteExt}; use tracing::{debug, info, instrument, warn}; @@ -68,6 +68,10 @@ impl Manager { #[instrument(name = "evict_timeline", skip_all)] pub(crate) async fn evict_timeline(&mut self) -> bool { 
assert!(!self.is_offloaded); + let Some(storage) = self.wal_backup.get_storage() else { + warn!("no remote storage configured, skipping uneviction"); + return false; + }; let partial_backup_uploaded = match &self.partial_backup_uploaded { Some(p) => p.clone(), None => { @@ -87,7 +91,7 @@ impl Manager { .inc(); }); - if let Err(e) = do_eviction(self, &partial_backup_uploaded).await { + if let Err(e) = do_eviction(self, &partial_backup_uploaded, &storage).await { warn!("failed to evict timeline: {:?}", e); return false; } @@ -102,6 +106,10 @@ impl Manager { #[instrument(name = "unevict_timeline", skip_all)] pub(crate) async fn unevict_timeline(&mut self) { assert!(self.is_offloaded); + let Some(storage) = self.wal_backup.get_storage() else { + warn!("no remote storage configured, skipping uneviction"); + return; + }; let partial_backup_uploaded = match &self.partial_backup_uploaded { Some(p) => p.clone(), None => { @@ -121,7 +129,7 @@ impl Manager { .inc(); }); - if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await { + if let Err(e) = do_uneviction(self, &partial_backup_uploaded, &storage).await { warn!("failed to unevict timeline: {:?}", e); return; } @@ -137,8 +145,12 @@ impl Manager { /// Ensure that content matches the remote partial backup, if local segment exists. /// Then change state in control file and in-memory. If `delete_offloaded_wal` is set, /// delete the local segment. 
-async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { - compare_local_segment_with_remote(mgr, partial).await?; +async fn do_eviction( + mgr: &mut Manager, + partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, +) -> anyhow::Result<()> { + compare_local_segment_with_remote(mgr, partial, storage).await?; mgr.tli.switch_to_offloaded(partial).await?; // switch manager state as soon as possible @@ -153,12 +165,16 @@ async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyho /// Ensure that content matches the remote partial backup, if local segment exists. /// Then download segment to local disk and change state in control file and in-memory. -async fn do_uneviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { +async fn do_uneviction( + mgr: &mut Manager, + partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, +) -> anyhow::Result<()> { // if the local segment is present, validate it - compare_local_segment_with_remote(mgr, partial).await?; + compare_local_segment_with_remote(mgr, partial, storage).await?; // atomically download the partial segment - redownload_partial_segment(mgr, partial).await?; + redownload_partial_segment(mgr, partial, storage).await?; mgr.tli.switch_to_present().await?; // switch manager state as soon as possible @@ -181,6 +197,7 @@ async fn delete_local_segment(mgr: &Manager, partial: &PartialRemoteSegment) -> async fn redownload_partial_segment( mgr: &Manager, partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, ) -> anyhow::Result<()> { let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp"); let remote_segfile = remote_segment_path(mgr, partial); @@ -190,7 +207,7 @@ async fn redownload_partial_segment( remote_segfile, tmp_file ); - let mut reader = wal_backup::read_object(&remote_segfile, 0).await?; + let mut reader = wal_backup::read_object(storage, &remote_segfile, 0).await?; let mut file = 
File::create(&tmp_file).await?; let actual_len = tokio::io::copy(&mut reader, &mut file).await?; @@ -234,13 +251,16 @@ async fn redownload_partial_segment( async fn compare_local_segment_with_remote( mgr: &Manager, partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, ) -> anyhow::Result<()> { let local_path = local_segment_path(mgr, partial); match File::open(&local_path).await { - Ok(mut local_file) => do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial) - .await - .context("validation failed"), + Ok(mut local_file) => { + do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial, storage) + .await + .context("validation failed") + } Err(_) => { info!( "local WAL file {} is not present, skipping validation", @@ -258,6 +278,7 @@ async fn do_validation( file: &mut File, wal_seg_size: usize, partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, ) -> anyhow::Result<()> { let local_size = file.metadata().await?.len() as usize; if local_size != wal_seg_size { @@ -270,7 +291,7 @@ async fn do_validation( let remote_segfile = remote_segment_path(mgr, partial); let mut remote_reader: std::pin::Pin> = - wal_backup::read_object(&remote_segfile, 0).await?; + wal_backup::read_object(storage, &remote_segfile, 0).await?; // remote segment should have bytes excatly up to `flush_lsn` let expected_remote_size = partial.flush_lsn.segment_offset(mgr.wal_seg_size); diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 71e99a4de7..48eda92fed 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -35,7 +35,7 @@ use crate::state::TimelineState; use crate::timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}; use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard}; use crate::timelines_set::{TimelineSetGuard, TimelinesSet}; -use crate::wal_backup::{self, WalBackupTaskHandle}; +use crate::wal_backup::{self, WalBackup, WalBackupTaskHandle}; 
use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}; pub(crate) struct StateSnapshot { @@ -200,6 +200,7 @@ pub(crate) struct Manager { pub(crate) conf: SafeKeeperConf, pub(crate) wal_seg_size: usize, pub(crate) walsenders: Arc, + pub(crate) wal_backup: Arc, // current state pub(crate) state_version_rx: tokio::sync::watch::Receiver, @@ -238,6 +239,7 @@ pub async fn main_task( manager_tx: tokio::sync::mpsc::UnboundedSender, mut manager_rx: tokio::sync::mpsc::UnboundedReceiver, global_rate_limiter: RateLimiter, + wal_backup: Arc, ) { tli.set_status(Status::Started); @@ -256,6 +258,7 @@ pub async fn main_task( broker_active_set, manager_tx, global_rate_limiter, + wal_backup, ) .await; @@ -371,7 +374,7 @@ pub async fn main_task( mgr.tli_broker_active.set(false); // shutdown background tasks - if mgr.conf.is_wal_backup_enabled() { + if let Some(storage) = mgr.wal_backup.get_storage() { if let Some(backup_task) = mgr.backup_task.take() { // If we fell through here, then the timeline is shutting down. This is important // because otherwise joining on the wal_backup handle might hang. 
@@ -379,7 +382,7 @@ pub async fn main_task( backup_task.join().await; } - wal_backup::update_task(&mut mgr, false, &last_state).await; + wal_backup::update_task(&mut mgr, storage, false, &last_state).await; } if let Some(recovery_task) = &mut mgr.recovery_task { @@ -415,11 +418,13 @@ impl Manager { broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, global_rate_limiter: RateLimiter, + wal_backup: Arc, ) -> Manager { let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await; Manager { wal_seg_size: tli.get_wal_seg_size().await, walsenders: tli.get_walsenders().clone(), + wal_backup, state_version_rx: tli.get_state_version_rx(), num_computes_rx: tli.get_walreceivers().get_num_rx(), tli_broker_active: broker_active_set.guard(tli.clone()), @@ -477,8 +482,8 @@ impl Manager { let is_wal_backup_required = wal_backup::is_wal_backup_required(self.wal_seg_size, num_computes, state); - if self.conf.is_wal_backup_enabled() { - wal_backup::update_task(self, is_wal_backup_required, state).await; + if let Some(storage) = self.wal_backup.get_storage() { + wal_backup::update_task(self, storage, is_wal_backup_required, state).await; } // update the state in Arc @@ -624,9 +629,9 @@ impl Manager { /// Spawns partial WAL backup task if needed. async fn update_partial_backup(&mut self, state: &StateSnapshot) { // check if WAL backup is enabled and should be started - if !self.conf.is_wal_backup_enabled() { + let Some(storage) = self.wal_backup.get_storage() else { return; - } + }; if self.partial_backup_task.is_some() { // partial backup is already running @@ -650,6 +655,7 @@ impl Manager { self.conf.clone(), self.global_rate_limiter.clone(), cancel.clone(), + storage, )); self.partial_backup_task = Some((handle, cancel)); } @@ -669,6 +675,10 @@ impl Manager { /// Reset partial backup state and remove its remote storage data. Since it /// might concurrently uploading something, cancel the task first. 
async fn backup_partial_reset(&mut self) -> anyhow::Result> { + let Some(storage) = self.wal_backup.get_storage() else { + anyhow::bail!("remote storage is not enabled"); + }; + info!("resetting partial backup state"); // Force unevict timeline if it is evicted before erasing partial backup // state. The intended use of this function is to drop corrupted remote @@ -689,7 +699,7 @@ impl Manager { } let tli = self.wal_resident_timeline()?; - let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await; + let mut partial_backup = PartialBackup::new(tli, self.conf.clone(), storage).await; // Reset might fail e.g. when cfile is already reset but s3 removal // failed, so set manager state to None beforehand. In any case caller // is expected to retry until success. diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 41abee369e..af33bcbd20 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -25,6 +25,7 @@ use crate::rate_limit::RateLimiter; use crate::state::TimelinePersistentState; use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_timeline_dir}; use crate::timelines_set::TimelinesSet; +use crate::wal_backup::WalBackup; use crate::wal_storage::Storage; use crate::{SafeKeeperConf, control_file, wal_storage}; @@ -47,15 +48,24 @@ struct GlobalTimelinesState { conf: Arc, broker_active_set: Arc, global_rate_limiter: RateLimiter, + wal_backup: Arc, } impl GlobalTimelinesState { /// Get dependencies for a timeline constructor. - fn get_dependencies(&self) -> (Arc, Arc, RateLimiter) { + fn get_dependencies( + &self, + ) -> ( + Arc, + Arc, + RateLimiter, + Arc, + ) { ( self.conf.clone(), self.broker_active_set.clone(), self.global_rate_limiter.clone(), + self.wal_backup.clone(), ) } @@ -84,7 +94,7 @@ pub struct GlobalTimelines { impl GlobalTimelines { /// Create a new instance of the global timelines map. 
- pub fn new(conf: Arc) -> Self { + pub fn new(conf: Arc, wal_backup: Arc) -> Self { Self { state: Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), @@ -92,6 +102,7 @@ impl GlobalTimelines { conf, broker_active_set: Arc::new(TimelinesSet::default()), global_rate_limiter: RateLimiter::new(1, 1), + wal_backup, }), } } @@ -147,7 +158,7 @@ impl GlobalTimelines { /// just lock and unlock it for each timeline -- this function is called /// during init when nothing else is running, so this is fine. async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> { - let (conf, broker_active_set, partial_backup_rate_limiter) = { + let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = { let state = self.state.lock().unwrap(); state.get_dependencies() }; @@ -162,7 +173,7 @@ impl GlobalTimelines { TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or("")) { let ttid = TenantTimelineId::new(tenant_id, timeline_id); - match Timeline::load_timeline(conf.clone(), ttid) { + match Timeline::load_timeline(conf.clone(), ttid, wal_backup.clone()) { Ok(tli) => { let mut shared_state = tli.write_shared_state().await; self.state @@ -175,6 +186,7 @@ impl GlobalTimelines { &conf, broker_active_set.clone(), partial_backup_rate_limiter.clone(), + wal_backup.clone(), ); } // If we can't load a timeline, it's most likely because of a corrupted @@ -212,6 +224,10 @@ impl GlobalTimelines { self.state.lock().unwrap().broker_active_set.clone() } + pub fn get_wal_backup(&self) -> Arc { + self.state.lock().unwrap().wal_backup.clone() + } + /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. pub(crate) async fn create( @@ -222,7 +238,7 @@ impl GlobalTimelines { start_lsn: Lsn, commit_lsn: Lsn, ) -> Result> { - let (conf, _, _) = { + let (conf, _, _, _) = { let state = self.state.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. 
@@ -267,7 +283,7 @@ impl GlobalTimelines { check_tombstone: bool, ) -> Result> { // Check for existence and mark that we're creating it. - let (conf, broker_active_set, partial_backup_rate_limiter) = { + let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = { let mut state = self.state.lock().unwrap(); match state.timelines.get(&ttid) { Some(GlobalMapTimeline::CreationInProgress) => { @@ -296,7 +312,14 @@ impl GlobalTimelines { }; // Do the actual move and reflect the result in the map. - match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await { + match GlobalTimelines::install_temp_timeline( + ttid, + tmp_path, + conf.clone(), + wal_backup.clone(), + ) + .await + { Ok(timeline) => { let mut timeline_shared_state = timeline.write_shared_state().await; let mut state = self.state.lock().unwrap(); @@ -314,6 +337,7 @@ impl GlobalTimelines { &conf, broker_active_set, partial_backup_rate_limiter, + wal_backup, ); drop(timeline_shared_state); Ok(timeline) @@ -336,6 +360,7 @@ impl GlobalTimelines { ttid: TenantTimelineId, tmp_path: &Utf8PathBuf, conf: Arc, + wal_backup: Arc, ) -> Result> { let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id); let timeline_path = get_timeline_dir(conf.as_ref(), &ttid); @@ -377,7 +402,7 @@ impl GlobalTimelines { // Do the move. durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?; - Timeline::load_timeline(conf, ttid) + Timeline::load_timeline(conf, ttid, wal_backup) } /// Get a timeline from the global map. 
If it's not present, it doesn't exist on disk, diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 56f4a2faf9..0beb272a60 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -2,6 +2,7 @@ use std::cmp::min; use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; +use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; @@ -17,7 +18,7 @@ use safekeeper_api::models::PeerInfo; use tokio::fs::File; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::{OnceCell, watch}; +use tokio::sync::watch; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; @@ -63,7 +64,12 @@ pub(crate) fn is_wal_backup_required( /// Based on peer information determine which safekeeper should offload; if it /// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task /// is running, kill it. -pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &StateSnapshot) { +pub(crate) async fn update_task( + mgr: &mut Manager, + storage: Arc, + need_backup: bool, + state: &StateSnapshot, +) { let (offloader, election_dbg_str) = determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf); let elected_me = Some(mgr.conf.my_id) == offloader; @@ -82,7 +88,12 @@ pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &St return; }; - let async_task = backup_task_main(resident, mgr.conf.backup_parallel_jobs, shutdown_rx); + let async_task = backup_task_main( + resident, + storage, + mgr.conf.backup_parallel_jobs, + shutdown_rx, + ); let handle = if mgr.conf.current_thread_runtime { tokio::spawn(async_task) @@ -169,33 +180,31 @@ fn determine_offloader( } } -static REMOTE_STORAGE: OnceCell> = OnceCell::const_new(); - -// Storage must be configured and initialized when this is called. 
-fn get_configured_remote_storage() -> &'static GenericRemoteStorage { - REMOTE_STORAGE - .get() - .expect("failed to get remote storage") - .as_ref() - .unwrap() +pub struct WalBackup { + storage: Option>, } -pub async fn init_remote_storage(conf: &SafeKeeperConf) { - // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide - // dependencies to all tasks instead. - REMOTE_STORAGE - .get_or_init(|| async { - if let Some(conf) = conf.remote_storage.as_ref() { - Some( - GenericRemoteStorage::from_config(conf) - .await - .expect("failed to create remote storage"), - ) - } else { - None +impl WalBackup { + /// Create a new WalBackup instance. + pub async fn new(conf: &SafeKeeperConf) -> Result { + if !conf.wal_backup_enabled { + return Ok(Self { storage: None }); + } + + match conf.remote_storage.as_ref() { + Some(config) => { + let storage = GenericRemoteStorage::from_config(config).await?; + Ok(Self { + storage: Some(Arc::new(storage)), + }) } - }) - .await; + None => Ok(Self { storage: None }), + } + } + + pub fn get_storage(&self) -> Option> { + self.storage.clone() + } } struct WalBackupTask { @@ -204,12 +213,14 @@ struct WalBackupTask { wal_seg_size: usize, parallel_jobs: usize, commit_lsn_watch_rx: watch::Receiver, + storage: Arc, } /// Offload single timeline. 
#[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))] async fn backup_task_main( tli: WalResidentTimeline, + storage: Arc, parallel_jobs: usize, mut shutdown_rx: Receiver<()>, ) { @@ -223,6 +234,7 @@ async fn backup_task_main( timeline_dir: tli.get_timeline_dir(), timeline: tli, parallel_jobs, + storage, }; // task is spinned up only when wal_seg_size already initialized @@ -293,6 +305,7 @@ impl WalBackupTask { match backup_lsn_range( &self.timeline, + self.storage.clone(), &mut backup_lsn, commit_lsn, self.wal_seg_size, @@ -322,6 +335,7 @@ impl WalBackupTask { async fn backup_lsn_range( timeline: &WalResidentTimeline, + storage: Arc, backup_lsn: &mut Lsn, end_lsn: Lsn, wal_seg_size: usize, @@ -352,7 +366,12 @@ async fn backup_lsn_range( loop { let added_task = match iter.next() { Some(s) => { - uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path)); + uploads.push_back(backup_single_segment( + &storage, + s, + timeline_dir, + remote_timeline_path, + )); true } None => false, @@ -388,6 +407,7 @@ async fn backup_lsn_range( } async fn backup_single_segment( + storage: &GenericRemoteStorage, seg: &Segment, timeline_dir: &Utf8Path, remote_timeline_path: &RemotePath, @@ -395,7 +415,13 @@ async fn backup_single_segment( let segment_file_path = seg.file_path(timeline_dir)?; let remote_segment_path = seg.remote_path(remote_timeline_path); - let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await; + let res = backup_object( + storage, + &segment_file_path, + &remote_segment_path, + seg.size(), + ) + .await; if res.is_ok() { BACKED_UP_SEGMENTS.inc(); } else { @@ -455,12 +481,11 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { } async fn backup_object( + storage: &GenericRemoteStorage, source_file: &Utf8Path, target_file: &RemotePath, size: usize, ) -> Result<()> { - let storage = get_configured_remote_storage(); - let file = File::open(&source_file) .await .with_context(|| format!("Failed 
to open file {source_file:?} for wal backup"))?; @@ -475,12 +500,11 @@ async fn backup_object( } pub(crate) async fn backup_partial_segment( + storage: &GenericRemoteStorage, source_file: &Utf8Path, target_file: &RemotePath, size: usize, ) -> Result<()> { - let storage = get_configured_remote_storage(); - let file = File::open(&source_file) .await .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?; @@ -504,25 +528,20 @@ pub(crate) async fn backup_partial_segment( } pub(crate) async fn copy_partial_segment( + storage: &GenericRemoteStorage, source: &RemotePath, destination: &RemotePath, ) -> Result<()> { - let storage = get_configured_remote_storage(); let cancel = CancellationToken::new(); storage.copy_object(source, destination, &cancel).await } pub async fn read_object( + storage: &GenericRemoteStorage, file_path: &RemotePath, offset: u64, ) -> anyhow::Result>> { - let storage = REMOTE_STORAGE - .get() - .context("Failed to get remote storage")? - .as_ref() - .context("No remote storage configured")?; - info!("segment download about to start from remote path {file_path:?} at offset {offset}"); let cancel = CancellationToken::new(); @@ -547,8 +566,10 @@ pub async fn read_object( /// Delete WAL files for the given timeline. Remote storage must be configured /// when called. -pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { - let storage = get_configured_remote_storage(); +pub async fn delete_timeline( + storage: &GenericRemoteStorage, + ttid: &TenantTimelineId, +) -> Result<()> { let remote_path = remote_timeline_path(ttid)?; // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE @@ -618,14 +639,14 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { } /// Used by wal_backup_partial. 
-pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> { +pub async fn delete_objects(storage: &GenericRemoteStorage, paths: &[RemotePath]) -> Result<()> { let cancel = CancellationToken::new(); // not really used - let storage = get_configured_remote_storage(); storage.delete_objects(paths, &cancel).await } /// Copy segments from one timeline to another. Used in copy_timeline. pub async fn copy_s3_segments( + storage: &GenericRemoteStorage, wal_seg_size: usize, src_ttid: &TenantTimelineId, dst_ttid: &TenantTimelineId, @@ -634,12 +655,6 @@ pub async fn copy_s3_segments( ) -> Result<()> { const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024; - let storage = REMOTE_STORAGE - .get() - .expect("failed to get remote storage") - .as_ref() - .unwrap(); - let remote_dst_path = remote_timeline_path(dst_ttid)?; let cancel = CancellationToken::new(); diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 049852a048..fe0f1b3607 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -19,9 +19,11 @@ //! file. Code updates state in the control file before doing any S3 operations. //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. 
+use std::sync::Arc; + use camino::Utf8PathBuf; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; -use remote_storage::RemotePath; +use remote_storage::{GenericRemoteStorage, RemotePath}; use safekeeper_api::Term; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; @@ -154,12 +156,16 @@ pub struct PartialBackup { conf: SafeKeeperConf, local_prefix: Utf8PathBuf, remote_timeline_path: RemotePath, - + storage: Arc, state: State, } impl PartialBackup { - pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup { + pub async fn new( + tli: WalResidentTimeline, + conf: SafeKeeperConf, + storage: Arc, + ) -> PartialBackup { let (_, persistent_state) = tli.get_state().await; let wal_seg_size = tli.get_wal_seg_size().await; @@ -173,6 +179,7 @@ impl PartialBackup { conf, local_prefix, remote_timeline_path, + storage, } } @@ -240,7 +247,8 @@ impl PartialBackup { let remote_path = prepared.remote_path(&self.remote_timeline_path); // Upload first `backup_bytes` bytes of the segment to the remote storage. - wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; + wal_backup::backup_partial_segment(&self.storage, &local_path, &remote_path, backup_bytes) + .await?; PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64); // We uploaded the segment, now let's verify that the data is still actual. @@ -326,7 +334,7 @@ impl PartialBackup { let remote_path = self.remote_timeline_path.join(seg); objects_to_delete.push(remote_path); } - wal_backup::delete_objects(&objects_to_delete).await + wal_backup::delete_objects(&self.storage, &objects_to_delete).await } /// Delete all non-Uploaded segments from the remote storage. 
There should be only one @@ -424,6 +432,7 @@ pub async fn main_task( conf: SafeKeeperConf, limiter: RateLimiter, cancel: CancellationToken, + storage: Arc, ) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; @@ -432,7 +441,7 @@ pub async fn main_task( let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); - let mut backup = PartialBackup::new(tli, conf).await; + let mut backup = PartialBackup::new(tli, conf, storage).await; debug!("state: {:?}", backup.state); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index f0bac4b40a..8ba3e7cc47 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -21,6 +21,7 @@ use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion}; use pq_proto::SystemId; use remote_storage::RemotePath; +use std::sync::Arc; use tokio::fs::{self, File, OpenOptions, remove_file}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; use tracing::*; @@ -32,7 +33,7 @@ use crate::metrics::{ REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure, }; use crate::state::TimelinePersistentState; -use crate::wal_backup::{read_object, remote_timeline_path}; +use crate::wal_backup::{WalBackup, read_object, remote_timeline_path}; pub trait Storage { // Last written LSN. 
@@ -645,7 +646,7 @@ pub struct WalReader { wal_segment: Option>>, // S3 will be used to read WAL if LSN is not available locally - enable_remote_read: bool, + wal_backup: Arc, // We don't have WAL locally if LSN is less than local_start_lsn local_start_lsn: Lsn, @@ -664,7 +665,7 @@ impl WalReader { timeline_dir: Utf8PathBuf, state: &TimelinePersistentState, start_pos: Lsn, - enable_remote_read: bool, + wal_backup: Arc, ) -> Result { if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) { bail!("state uninitialized, no data to read"); @@ -693,7 +694,7 @@ impl WalReader { wal_seg_size: state.server.wal_seg_size as usize, pos: start_pos, wal_segment: None, - enable_remote_read, + wal_backup, local_start_lsn: state.local_start_lsn, timeline_start_lsn: state.timeline_start_lsn, pg_version: state.server.pg_version / 10000, @@ -812,9 +813,9 @@ impl WalReader { } // Try to open remote file, if remote reads are enabled - if self.enable_remote_read { + if let Some(storage) = self.wal_backup.get_storage() { let remote_wal_file_path = self.remote_path.join(&wal_file_name); - return read_object(&remote_wal_file_path, xlogoff as u64).await; + return read_object(&storage, &remote_wal_file_path, xlogoff as u64).await; } bail!("WAL segment is not found") diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py index a9a90c7370..c74ef9d899 100755 --- a/scripts/benchmark_durations.py +++ b/scripts/benchmark_durations.py @@ -32,12 +32,6 @@ BENCHMARKS_DURATION_QUERY = """ # the total duration varies from 8 to 40 minutes. # We use some pre-collected durations as a fallback to have a better distribution. 
FALLBACK_DURATION = { - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]": 400.15, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30]": 372.521, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]": 420.017, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-6-30]": 373.769, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-13-30]": 678.742, - "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-6-30]": 512.135, "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 58.036, "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 22.104, "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 126.073, diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 476d5f03ea..bae5ccb36c 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -17,12 +17,14 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Duration; +use bytes::Bytes; use camino::Utf8PathBuf; use clap::{Parser, command}; use futures::future::OptionFuture; use futures_core::Stream; use futures_util::StreamExt; -use http_body_util::Full; +use http_body_util::combinators::BoxBody; +use http_body_util::{Empty, Full}; use 
http_utils::tls_certs::ReloadingCertificateResolver; use hyper::body::Incoming; use hyper::header::CONTENT_TYPE; @@ -46,7 +48,6 @@ use tokio::net::TcpListener; use tokio::sync::broadcast; use tokio::sync::broadcast::error::RecvError; use tokio::time; -use tonic::body::{self, BoxBody, empty_body}; use tonic::codegen::Service; use tonic::{Code, Request, Response, Status}; use tracing::*; @@ -634,7 +635,7 @@ impl BrokerService for Broker { // We serve only metrics and healthcheck through http1. async fn http1_handler( req: hyper::Request, -) -> Result, Infallible> { +) -> Result>, Infallible> { let resp = match (req.method(), req.uri().path()) { (&Method::GET, "/metrics") => { let mut buffer = vec![]; @@ -645,16 +646,16 @@ async fn http1_handler( hyper::Response::builder() .status(StatusCode::OK) .header(CONTENT_TYPE, encoder.format_type()) - .body(body::boxed(Full::new(bytes::Bytes::from(buffer)))) + .body(BoxBody::new(Full::new(Bytes::from(buffer)))) .unwrap() } (&Method::GET, "/status") => hyper::Response::builder() .status(StatusCode::OK) - .body(empty_body()) + .body(BoxBody::new(Empty::new())) .unwrap(), _ => hyper::Response::builder() .status(StatusCode::NOT_FOUND) - .body(empty_body()) + .body(BoxBody::new(Empty::new())) .unwrap(), }; Ok(resp) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 649113b8ce..02c02c0e7f 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -31,7 +31,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::TenantShardId; use pageserver_api::upcall_api::{ - PutTimelineImportStatusRequest, ReAttachRequest, ValidateRequest, + PutTimelineImportStatusRequest, ReAttachRequest, TimelineImportStatusRequest, ValidateRequest, }; use pageserver_client::{BlockUnblock, mgmt_api}; use routerify::Middleware; @@ -157,6 +157,29 @@ async fn handle_validate(req: Request) -> Result, ApiError> json_response(StatusCode::OK, state.service.validate(validate_req).await?) 
} +async fn handle_get_timeline_import_status(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::GenerationsApi)?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let get_req = json_request::(&mut req).await?; + + let state = get_state(&req); + + json_response( + StatusCode::OK, + state + .service + .handle_timeline_shard_import_progress(get_req) + .await?, + ) +} + async fn handle_put_timeline_import_status(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; @@ -2008,6 +2031,13 @@ pub fn make_router( .post("/upcall/v1/validate", |r| { named_request_span(r, handle_validate, RequestName("upcall_v1_validate")) }) + .get("/upcall/v1/timeline_import_status", |r| { + named_request_span( + r, + handle_get_timeline_import_status, + RequestName("upcall_v1_timeline_import_status"), + ) + }) .post("/upcall/v1/timeline_import_status", |r| { named_request_span( r, diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 71dde9e126..2eea2f9d10 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -19,7 +19,8 @@ use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, - PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, Service, + PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT, Service, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -132,6 +133,10 @@ struct Cli { #[arg(long)] priority_reconciler_concurrency: Option, + /// Maximum number of safekeeper reconciliations that may run in parallel (per safekeeper) + #[arg(long)] + safekeeper_reconciler_concurrency: 
Option, + /// Tenant API rate limit, as requests per second per tenant. #[arg(long, default_value = "10")] tenant_rate_limit: NonZeroU32, @@ -403,6 +408,9 @@ async fn async_main() -> anyhow::Result<()> { priority_reconciler_concurrency: args .priority_reconciler_concurrency .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT), + safekeeper_reconciler_concurrency: args + .safekeeper_reconciler_concurrency + .unwrap_or(SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT), tenant_rate_limit: args.tenant_rate_limit, split_threshold: args.split_threshold, max_split_shards: args.max_split_shards, diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 554ca375f5..817409e112 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,3 +1,5 @@ +use std::time::Duration; + use pageserver_api::models::detach_ancestor::AncestorDetached; use pageserver_api::models::{ DetachBehavior, LocationConfig, LocationConfigListResponse, LsnLease, PageserverUtilization, @@ -212,6 +214,7 @@ impl PageserverClient { ) } + #[allow(unused)] pub(crate) async fn timeline_detail( &self, tenant_shard_id: TenantShardId, @@ -357,4 +360,20 @@ impl PageserverClient { self.inner.wait_lsn(tenant_shard_id, request).await ) } + + pub(crate) async fn activate_post_import( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + timeline_activate_timeout: Duration, + ) -> Result { + measured_request!( + "activate_post_import", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .activate_post_import(tenant_shard_id, timeline_id, timeline_activate_timeout) + .await + ) + } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 9ffcf9b9e6..052c0f02eb 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1666,6 +1666,39 @@ impl Persistence { } } + pub(crate) async fn get_timeline_import( + &self, + 
tenant_id: TenantId, + timeline_id: TimelineId, + ) -> DatabaseResult> { + use crate::schema::timeline_imports::dsl; + let persistent_import = self + .with_measured_conn(DatabaseOperation::ListTimelineImports, move |conn| { + Box::pin(async move { + let mut from_db: Vec = dsl::timeline_imports + .filter(dsl::tenant_id.eq(tenant_id.to_string())) + .filter(dsl::timeline_id.eq(timeline_id.to_string())) + .load(conn) + .await?; + + if from_db.len() > 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + from_db.len() + ))); + } + + Ok(from_db.pop()) + }) + }) + .await?; + + persistent_import + .map(TimelineImport::from_persistent) + .transpose() + .map_err(|err| DatabaseError::Logical(format!("failed to deserialize import: {err}"))) + } + pub(crate) async fn delete_timeline_import( &self, tenant_id: TenantId, diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index 988159af4a..1f3ea96d96 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -98,6 +98,23 @@ impl SafekeeperClient { ) } + #[allow(unused)] + pub(crate) async fn switch_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + measured_request!( + "switch_timeline_membership", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .switch_timeline_membership(tenant_id, timeline_id, req) + .await + ) + } + pub(crate) async fn delete_tenant( &self, tenant_id: TenantId, diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 3d5f36fb98..773373391e 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -628,11 +628,7 @@ impl Scheduler { tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node); } - 
if node.attached_shard_count < expected_attached_shards_per_node { - expected_attached_shards_per_node - node.attached_shard_count - } else { - 0 - } + expected_attached_shards_per_node.saturating_sub(node.attached_shard_count) } pub(crate) fn expected_attached_shard_count(&self) -> usize { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ca9b911c4d..7e4bb627af 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -35,19 +35,19 @@ use pageserver_api::controller_api::{ }; use pageserver_api::models::{ self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, - PageserverUtilization, SecondaryProgress, ShardParameters, TenantConfig, + PageserverUtilization, SecondaryProgress, ShardImportStatus, ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, TenantShardSplitResponse, TenantSorting, TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateResponseStorcon, - TimelineInfo, TimelineState, TopTenantShardItem, TopTenantShardsRequest, + TimelineInfo, TopTenantShardItem, TopTenantShardsRequest, }; use pageserver_api::shard::{ DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, }; use pageserver_api::upcall_api::{ PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, - ValidateRequest, ValidateResponse, ValidateResponseTenant, + TimelineImportStatusRequest, ValidateRequest, ValidateResponse, ValidateResponseTenant, }; use pageserver_client::{BlockUnblock, mgmt_api}; use reqwest::{Certificate, StatusCode}; @@ -61,6 +61,7 @@ use utils::completion::Barrier; use utils::generation::Generation; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; +use utils::shard::ShardIndex; use utils::sync::gate::{Gate, 
GateGuard}; use utils::{failpoint_support, pausable_failpoint}; @@ -98,7 +99,8 @@ use crate::tenant_shard::{ ScheduleOptimization, ScheduleOptimizationAction, TenantShard, }; use crate::timeline_import::{ - ShardImportStatuses, TimelineImport, TimelineImportState, UpcallClient, + ImportResult, ShardImportStatuses, TimelineImport, TimelineImportFinalizeError, + TimelineImportState, UpcallClient, }; const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500); @@ -192,8 +194,17 @@ pub(crate) enum LeadershipStatus { Candidate, } +enum ShardGenerationValidity { + Valid, + Mismatched { + claimed: Generation, + actual: Option, + }, +} + pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; +pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly @@ -382,6 +393,9 @@ pub struct Config { /// How many high-priority Reconcilers may be spawned concurrently pub priority_reconciler_concurrency: usize, + /// How many safekeeper reconciles may happen concurrently (per safekeeper) + pub safekeeper_reconciler_concurrency: usize, + /// How many API requests per second to allow per tenant, across all /// tenant-scoped API endpoints. Further API requests queue until ready. 
pub tenant_rate_limit: NonZeroU32, @@ -3659,7 +3673,7 @@ impl Service { locations: ShardMutationLocations, http_client: reqwest::Client, jwt: Option, - create_req: TimelineCreateRequest, + mut create_req: TimelineCreateRequest, ) -> Result { let latest = locations.latest.node; @@ -3678,6 +3692,15 @@ impl Service { .await .map_err(|e| passthrough_api_error(&latest, e))?; + // If we are going to create the timeline on some stale locations for shard 0, then ask them to re-use + // the initdb generated by the latest location, rather than generating their own. This avoids racing uploads + // of initdb to S3 which might not be binary-identical if different pageservers have different postgres binaries. + if tenant_shard_id.is_shard_zero() { + if let models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } = &mut create_req.mode { + *existing_initdb_timeline_id = Some(create_req.new_timeline_id); + } + } + // We propagate timeline creations to all attached locations such that a compute // for the new timeline is able to start regardless of the current state of the // tenant shard reconciliation. @@ -3720,6 +3743,10 @@ impl Service { // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then // use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard // that will get the first creation request, and propagate the LSN to all the >0 shards. + // + // This also enables non-zero shards to use the initdb that shard 0 generated and uploaded to S3, rather than + // independently generating their own initdb. This guarantees that shards cannot end up with different initial + // states if e.g. they have different postgres binary versions. 
let timeline_info = create_one( shard_zero_tid, shard_zero_locations, @@ -3729,11 +3756,16 @@ impl Service { ) .await?; - // Propagate the LSN that shard zero picked, if caller didn't provide one + // Update the create request for shards >= 0 match &mut create_req.mode { models::TimelineCreateRequestMode::Branch { ancestor_start_lsn, .. } if ancestor_start_lsn.is_none() => { + // Propagate the LSN that shard zero picked, if caller didn't provide one *ancestor_start_lsn = timeline_info.ancestor_lsn; }, + models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } => { + // For shards >= 0, do not run initdb: use the one that shard 0 uploaded to S3 + *existing_initdb_timeline_id = Some(create_req.new_timeline_id) + } _ => {} } @@ -3864,10 +3896,10 @@ impl Service { None } else if safekeepers { - // Note that we do not support creating the timeline on the safekeepers - // for imported timelines. The `start_lsn` of the timeline is not known - // until the import finshes. - // https://github.com/neondatabase/neon/issues/11569 + // Note that for imported timelines, we do not create the timeline on the safekeepers + // straight away. Instead, we do it once the import finalized such that we know what + // start LSN to provide for the safekeepers. This is done in + // [`Self::finalize_timeline_import`]. 
let res = self .tenant_timeline_create_safekeepers(tenant_id, &timeline_info) .instrument(tracing::info_span!("timeline_create_safekeepers", %tenant_id, timeline_id=%timeline_info.timeline_id)) @@ -3883,10 +3915,77 @@ impl Service { }) } + pub(crate) async fn handle_timeline_shard_import_progress( + self: &Arc, + req: TimelineImportStatusRequest, + ) -> Result { + let validity = self + .validate_shard_generation(req.tenant_shard_id, req.generation) + .await?; + match validity { + ShardGenerationValidity::Valid => { + // fallthrough + } + ShardGenerationValidity::Mismatched { claimed, actual } => { + tracing::info!( + claimed=?claimed.into(), + actual=?actual.and_then(|g| g.into()), + "Rejecting import progress fetch from stale generation" + ); + + return Err(ApiError::BadRequest(anyhow::anyhow!("Invalid generation"))); + } + } + + let maybe_import = self + .persistence + .get_timeline_import(req.tenant_shard_id.tenant_id, req.timeline_id) + .await?; + + let import = maybe_import.ok_or_else(|| { + ApiError::NotFound( + format!( + "import for {}/{} not found", + req.tenant_shard_id.tenant_id, req.timeline_id + ) + .into(), + ) + })?; + + import + .shard_statuses + .0 + .get(&req.tenant_shard_id.to_index()) + .cloned() + .ok_or_else(|| { + ApiError::NotFound( + format!("shard {} not found", req.tenant_shard_id.shard_slug()).into(), + ) + }) + } + pub(crate) async fn handle_timeline_shard_import_progress_upcall( self: &Arc, req: PutTimelineImportStatusRequest, ) -> Result<(), ApiError> { + let validity = self + .validate_shard_generation(req.tenant_shard_id, req.generation) + .await?; + match validity { + ShardGenerationValidity::Valid => { + // fallthrough + } + ShardGenerationValidity::Mismatched { claimed, actual } => { + tracing::info!( + claimed=?claimed.into(), + actual=?actual.and_then(|g| g.into()), + "Rejecting import progress update from stale generation" + ); + + return Err(ApiError::PreconditionFailed("Invalid generation".into())); + } + } + let res = self 
.persistence .update_timeline_import(req.tenant_shard_id, req.timeline_id, req.status) @@ -3921,55 +4020,147 @@ impl Service { Ok(()) } + /// Check that a provided generation for some tenant shard is the most recent one. + /// + /// Validate with the in-mem state first, and, if that passes, validate with the + /// database state which is authoritative. + async fn validate_shard_generation( + self: &Arc, + tenant_shard_id: TenantShardId, + generation: Generation, + ) -> Result { + { + let locked = self.inner.read().unwrap(); + let tenant_shard = + locked + .tenants + .get(&tenant_shard_id) + .ok_or(ApiError::InternalServerError(anyhow::anyhow!( + "{} shard not found", + tenant_shard_id + )))?; + + if tenant_shard.generation != Some(generation) { + return Ok(ShardGenerationValidity::Mismatched { + claimed: generation, + actual: tenant_shard.generation, + }); + } + } + + let mut db_generations = self + .persistence + .shard_generations(std::iter::once(&tenant_shard_id)) + .await?; + let (_tid, db_generation) = + db_generations + .pop() + .ok_or(ApiError::InternalServerError(anyhow::anyhow!( + "{} shard not found", + tenant_shard_id + )))?; + + if db_generation != Some(generation) { + return Ok(ShardGenerationValidity::Mismatched { + claimed: generation, + actual: db_generation, + }); + } + + Ok(ShardGenerationValidity::Valid) + } + + /// Finalize the import of a timeline + /// + /// This method should be called once all shards have reported that the import is complete. + /// Firstly, it polls the post import timeline activation endpoint exposed by the pageserver. + /// Once the timeline is active on all shards, the timeline also gets created on the + /// safekeepers. Finally, notify cplane of the import completion (whether failed or + /// successful), and remove the import from the database and in-memory. + /// + /// If this method gets pre-empted by shut down, it will be called again at start-up (on-going + /// imports are stored in the database). 
#[instrument(skip_all, fields( tenant_id=%import.tenant_id, - shard_id=%import.timeline_id, + timeline_id=%import.timeline_id, ))] async fn finalize_timeline_import( self: &Arc, import: TimelineImport, - ) -> anyhow::Result<()> { + ) -> Result<(), TimelineImportFinalizeError> { tracing::info!("Finalizing timeline import"); pausable_failpoint!("timeline-import-pre-cplane-notification"); - let import_failed = import.completion_error().is_some(); + let tenant_id = import.tenant_id; + let timeline_id = import.timeline_id; - if !import_failed { - loop { - if self.cancel.is_cancelled() { - anyhow::bail!("Shut down requested while finalizing import"); - } - - let active = self.timeline_active_on_all_shards(&import).await?; - - match active { - true => { - tracing::info!("Timeline became active on all shards"); - break; - } - false => { - tracing::info!("Timeline not active on all shards yet"); - - tokio::select! { - _ = self.cancel.cancelled() => { - anyhow::bail!("Shut down requested while finalizing import"); - }, - _ = tokio::time::sleep(Duration::from_secs(5)) => {} - }; - } - } + let import_error = import.completion_error(); + match import_error { + Some(err) => { + self.notify_cplane_and_delete_import(tenant_id, timeline_id, Err(err)) + .await?; + tracing::warn!("Timeline import completed with shard errors"); + Ok(()) } - } + None => match self.activate_timeline_post_import(&import).await { + Ok(timeline_info) => { + tracing::info!("Post import timeline activation complete"); + if self.config.timelines_onto_safekeepers { + // Now that we know the start LSN of this timeline, create it on the + // safekeepers. 
+ self.tenant_timeline_create_safekeepers_until_success( + import.tenant_id, + timeline_info, + ) + .await?; + } + + self.notify_cplane_and_delete_import(tenant_id, timeline_id, Ok(())) + .await?; + + tracing::info!("Timeline import completed successfully"); + Ok(()) + } + Err(TimelineImportFinalizeError::ShuttingDown) => { + // We got pre-empted by shut down and will resume after the restart. + Err(TimelineImportFinalizeError::ShuttingDown) + } + Err(err) => { + // Any finalize error apart from shut down is permanent and requires us to notify + // cplane such that it can clean up. + tracing::error!("Import finalize failed with permanent error: {err}"); + self.notify_cplane_and_delete_import( + tenant_id, + timeline_id, + Err(err.to_string()), + ) + .await?; + Err(err) + } + }, + } + } + + async fn notify_cplane_and_delete_import( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + import_result: ImportResult, + ) -> Result<(), TimelineImportFinalizeError> { + let import_failed = import_result.is_err(); tracing::info!(%import_failed, "Notifying cplane of import completion"); let client = UpcallClient::new(self.get_config(), self.cancel.child_token()); - client.notify_import_complete(&import).await?; + client + .notify_import_complete(tenant_id, timeline_id, import_result) + .await + .map_err(|_err| TimelineImportFinalizeError::ShuttingDown)?; if let Err(err) = self .persistence - .delete_timeline_import(import.tenant_id, import.timeline_id) + .delete_timeline_import(tenant_id, timeline_id) .await { tracing::warn!("Failed to delete timeline import entry from database: {err}"); @@ -3979,17 +4170,113 @@ impl Service { .write() .unwrap() .tenants - .range_mut(TenantShardId::tenant_range(import.tenant_id)) + .range_mut(TenantShardId::tenant_range(tenant_id)) .for_each(|(_id, shard)| shard.importing = TimelineImportState::Idle); - // TODO(vlad): Timeline creations in import mode do not return a correct initdb lsn, - // so we can't create the timeline on the 
safekeepers. Fix by moving creation here. - // https://github.com/neondatabase/neon/issues/11569 - tracing::info!(%import_failed, "Timeline import complete"); - Ok(()) } + /// Activate an imported timeline on all shards once the import is complete. + /// Returns the [`TimelineInfo`] reported by shard zero. + async fn activate_timeline_post_import( + self: &Arc, + import: &TimelineImport, + ) -> Result { + const TIMELINE_ACTIVATE_TIMEOUT: Duration = Duration::from_millis(128); + + let mut shards_to_activate: HashSet = + import.shard_statuses.0.keys().cloned().collect(); + let mut shard_zero_timeline_info = None; + + while !shards_to_activate.is_empty() { + if self.cancel.is_cancelled() { + return Err(TimelineImportFinalizeError::ShuttingDown); + } + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in locked + .tenants + .range(TenantShardId::tenant_range(import.tenant_id)) + { + if !import + .shard_statuses + .0 + .contains_key(&tenant_shard_id.to_index()) + { + return Err(TimelineImportFinalizeError::MismatchedShards( + tenant_shard_id.to_index(), + )); + } + + if let Some(node_id) = shard.intent.get_attached() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + targets.push((*tenant_shard_id, node.clone())); + } + } + + targets + }; + + let targeted_tenant_shards: Vec<_> = targets.iter().map(|(tid, _node)| *tid).collect(); + + let results = self + .tenant_for_shards_api( + targets, + |tenant_shard_id, client| async move { + client + .activate_post_import( + tenant_shard_id, + import.timeline_id, + TIMELINE_ACTIVATE_TIMEOUT, + ) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + let mut failed = 0; + for (tid, result) in targeted_tenant_shards.iter().zip(results.into_iter()) { + match result { + Ok(ok) => { + if tid.is_shard_zero() { + shard_zero_timeline_info = Some(ok); + } + + 
shards_to_activate.remove(&tid.to_index()); + } + Err(_err) => { + failed += 1; + } + } + } + + if failed > 0 { + tracing::info!( + "Failed to activate timeline on {failed} shards post import. Will retry" + ); + } + + tokio::select! { + _ = tokio::time::sleep(Duration::from_millis(250)) => {}, + _ = self.cancel.cancelled() => { + return Err(TimelineImportFinalizeError::ShuttingDown); + } + } + } + + Ok(shard_zero_timeline_info.expect("All shards replied")) + } + async fn finalize_timeline_imports(self: &Arc, imports: Vec) { futures::future::join_all( imports @@ -3999,61 +4286,6 @@ impl Service { .await; } - async fn timeline_active_on_all_shards( - self: &Arc, - import: &TimelineImport, - ) -> anyhow::Result { - let targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); - - for (tenant_shard_id, shard) in locked - .tenants - .range(TenantShardId::tenant_range(import.tenant_id)) - { - if !import - .shard_statuses - .0 - .contains_key(&tenant_shard_id.to_index()) - { - anyhow::bail!("Shard layout change detected on completion"); - } - - if let Some(node_id) = shard.intent.get_attached() { - let node = locked - .nodes - .get(node_id) - .expect("Pageservers may not be deleted while referenced"); - targets.push((*tenant_shard_id, node.clone())); - } else { - return Ok(false); - } - } - - targets - }; - - let results = self - .tenant_for_shards_api( - targets, - |tenant_shard_id, client| async move { - client - .timeline_detail(tenant_shard_id, import.timeline_id) - .await - }, - 1, - 1, - SHORT_RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; - - Ok(results.into_iter().all(|res| match res { - Ok(info) => info.state == TimelineState::Active, - Err(_) => false, - })) - } - pub(crate) async fn tenant_timeline_archival_config( &self, tenant_id: TenantId, @@ -5159,7 +5391,8 @@ impl Service { } // We don't expect any new_shard_count shards to exist here, but drop them just in case - tenants.retain(|_id, s| s.shard.count != *new_shard_count); + 
tenants + .retain(|id, s| !(id.tenant_id == *tenant_id && s.shard.count == *new_shard_count)); detach_locations }; @@ -8462,7 +8695,7 @@ impl Service { // By default, live migrations are generous about the wait time for getting // the secondary location up to speed. When draining, give up earlier in order // to not stall the operation when a cold secondary is encountered. - const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) @@ -8795,7 +9028,7 @@ impl Service { node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index b15772a36c..f756d98c64 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -1,9 +1,17 @@ -use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration}; +use std::{ + collections::HashMap, + str::FromStr, + sync::{Arc, atomic::AtomicU64}, + time::Duration, +}; use clashmap::{ClashMap, Entry}; use safekeeper_api::models::PullTimelineRequest; use safekeeper_client::mgmt_api; -use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender}; +use tokio::sync::{ + Semaphore, + mpsc::{self, UnboundedReceiver, UnboundedSender}, +}; use tokio_util::sync::CancellationToken; use 
tracing::Instrument; use utils::{ @@ -166,10 +174,17 @@ pub(crate) struct ScheduleRequest { pub(crate) kind: SafekeeperTimelineOpKind, } +/// A way to keep ongoing/queued reconcile requests apart +#[derive(Copy, Clone, PartialEq, Eq)] +struct TokenId(u64); + +type OngoingTokens = ClashMap<(TenantId, Option), (CancellationToken, TokenId)>; + /// Handle to per safekeeper reconciler. struct ReconcilerHandle { - tx: UnboundedSender<(ScheduleRequest, CancellationToken)>, - ongoing_tokens: Arc), CancellationToken>>, + tx: UnboundedSender<(ScheduleRequest, CancellationToken, TokenId)>, + ongoing_tokens: Arc, + token_id_counter: AtomicU64, cancel: CancellationToken, } @@ -182,47 +197,66 @@ impl ReconcilerHandle { &self, tenant_id: TenantId, timeline_id: Option, - ) -> CancellationToken { + ) -> (CancellationToken, TokenId) { + let token_id = self + .token_id_counter + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let token_id = TokenId(token_id); let entry = self.ongoing_tokens.entry((tenant_id, timeline_id)); if let Entry::Occupied(entry) = &entry { - let cancel: &CancellationToken = entry.get(); + let (cancel, _) = entry.get(); cancel.cancel(); } - entry.insert(self.cancel.child_token()).clone() + entry.insert((self.cancel.child_token(), token_id)).clone() } /// Cancel an ongoing reconciliation fn cancel_reconciliation(&self, tenant_id: TenantId, timeline_id: Option) { - if let Some((_, cancel)) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) { + if let Some((_, (cancel, _id))) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) { cancel.cancel(); } } fn schedule_reconcile(&self, req: ScheduleRequest) { - let cancel = self.new_token_slot(req.tenant_id, req.timeline_id); + let (cancel, token_id) = self.new_token_slot(req.tenant_id, req.timeline_id); let hostname = req.safekeeper.skp.host.clone(); - if let Err(err) = self.tx.send((req, cancel)) { + if let Err(err) = self.tx.send((req, cancel, token_id)) { tracing::info!("scheduling request onto 
{hostname} returned error: {err}"); } } } pub(crate) struct SafekeeperReconciler { - service: Arc, - rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>, + inner: SafekeeperReconcilerInner, + concurrency_limiter: Arc, + rx: UnboundedReceiver<(ScheduleRequest, CancellationToken, TokenId)>, cancel: CancellationToken, } +/// Thin wrapper over `Service` to not clutter its inherent functions +#[derive(Clone)] +struct SafekeeperReconcilerInner { + ongoing_tokens: Arc, + service: Arc, +} + impl SafekeeperReconciler { fn spawn(cancel: CancellationToken, service: Arc) -> ReconcilerHandle { // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking. let (tx, rx) = mpsc::unbounded_channel(); + let concurrency = service.config.safekeeper_reconciler_concurrency; + let ongoing_tokens = Arc::new(ClashMap::new()); let mut reconciler = SafekeeperReconciler { - service, + inner: SafekeeperReconcilerInner { + service, + ongoing_tokens: ongoing_tokens.clone(), + }, rx, + concurrency_limiter: Arc::new(Semaphore::new(concurrency)), cancel: cancel.clone(), }; let handle = ReconcilerHandle { tx, - ongoing_tokens: Arc::new(ClashMap::new()), + ongoing_tokens, + token_id_counter: AtomicU64::new(0), cancel, }; tokio::spawn(async move { reconciler.run().await }); @@ -230,33 +264,54 @@ impl SafekeeperReconciler { } async fn run(&mut self) { loop { - // TODO add parallelism with semaphore here let req = tokio::select! { req = self.rx.recv() => req, _ = self.cancel.cancelled() => break, }; - let Some((req, req_cancel)) = req else { break }; + let Some((req, req_cancel, req_token_id)) = req else { + break; + }; + + let permit_res = tokio::select! 
{ + req = self.concurrency_limiter.clone().acquire_owned() => req, + _ = self.cancel.cancelled() => break, + }; + let Ok(_permit) = permit_res else { return }; + + let inner = self.inner.clone(); if req_cancel.is_cancelled() { continue; } - let kind = req.kind; - let tenant_id = req.tenant_id; - let timeline_id = req.timeline_id; - let node_id = req.safekeeper.skp.id; - self.reconcile_one(req, req_cancel) - .instrument(tracing::info_span!( - "reconcile_one", - ?kind, - %tenant_id, - ?timeline_id, - %node_id, - )) - .await; + tokio::task::spawn(async move { + let kind = req.kind; + let tenant_id = req.tenant_id; + let timeline_id = req.timeline_id; + let node_id = req.safekeeper.skp.id; + inner + .reconcile_one(req, req_cancel, req_token_id) + .instrument(tracing::info_span!( + "reconcile_one", + ?kind, + %tenant_id, + ?timeline_id, + %node_id, + )) + .await; + }); } } - async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) { +} + +impl SafekeeperReconcilerInner { + async fn reconcile_one( + &self, + req: ScheduleRequest, + req_cancel: CancellationToken, + req_token_id: TokenId, + ) { let req_host = req.safekeeper.skp.host.clone(); + let success; match req.kind { SafekeeperTimelineOpKind::Pull => { let Some(timeline_id) = req.timeline_id else { @@ -276,19 +331,24 @@ impl SafekeeperReconciler { http_hosts, tenant_id: req.tenant_id, timeline_id, + ignore_tombstone: Some(false), }; - self.reconcile_inner( - req, - async |client| client.pull_timeline(&pull_req).await, - |resp| { - tracing::info!( - "pulled timeline from {} onto {req_host}", - resp.safekeeper_host, - ); - }, - req_cancel, - ) - .await; + success = self + .reconcile_inner( + &req, + async |client| client.pull_timeline(&pull_req).await, + |resp| { + if let Some(host) = resp.safekeeper_host { + tracing::info!("pulled timeline from {host} onto {req_host}"); + } else { + tracing::info!( + "timeline already present on safekeeper on {req_host}" + ); + } + }, + req_cancel, + ) + 
.await; } SafekeeperTimelineOpKind::Exclude => { // TODO actually exclude instead of delete here @@ -299,22 +359,23 @@ impl SafekeeperReconciler { ); return; }; - self.reconcile_inner( - req, - async |client| client.delete_timeline(tenant_id, timeline_id).await, - |_resp| { - tracing::info!("deleted timeline from {req_host}"); - }, - req_cancel, - ) - .await; + success = self + .reconcile_inner( + &req, + async |client| client.delete_timeline(tenant_id, timeline_id).await, + |_resp| { + tracing::info!("deleted timeline from {req_host}"); + }, + req_cancel, + ) + .await; } SafekeeperTimelineOpKind::Delete => { let tenant_id = req.tenant_id; if let Some(timeline_id) = req.timeline_id { - let deleted = self + success = self .reconcile_inner( - req, + &req, async |client| client.delete_timeline(tenant_id, timeline_id).await, |_resp| { tracing::info!("deleted timeline from {req_host}"); @@ -322,13 +383,13 @@ impl SafekeeperReconciler { req_cancel, ) .await; - if deleted { + if success { self.delete_timeline_from_db(tenant_id, timeline_id).await; } } else { - let deleted = self + success = self .reconcile_inner( - req, + &req, async |client| client.delete_tenant(tenant_id).await, |_resp| { tracing::info!(%tenant_id, "deleted tenant from {req_host}"); @@ -336,12 +397,21 @@ impl SafekeeperReconciler { req_cancel, ) .await; - if deleted { + if success { self.delete_tenant_timelines_from_db(tenant_id).await; } } } } + if success { + self.ongoing_tokens.remove_if( + &(req.tenant_id, req.timeline_id), + |_ttid, (_cancel, token_id)| { + // Ensure that this request is indeed the request we just finished and not a new one + req_token_id == *token_id + }, + ); + } } async fn delete_timeline_from_db(&self, tenant_id: TenantId, timeline_id: TimelineId) { match self @@ -395,10 +465,10 @@ impl SafekeeperReconciler { self.delete_timeline_from_db(tenant_id, timeline_id).await; } } - /// Returns whether the reconciliation happened successfully + /// Returns whether the reconciliation 
happened successfully (or we got cancelled) async fn reconcile_inner( &self, - req: ScheduleRequest, + req: &ScheduleRequest, closure: impl Fn(SafekeeperClient) -> F, log_success: impl FnOnce(T) -> U, req_cancel: CancellationToken, diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 5eecf0d415..cd5ace449d 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -10,6 +10,7 @@ use crate::persistence::{ DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence, }; use crate::safekeeper::Safekeeper; +use crate::timeline_import::TimelineImportFinalizeError; use anyhow::Context; use http_utils::error::ApiError; use pageserver_api::controller_api::{ @@ -323,6 +324,42 @@ impl Service { }) } + pub(crate) async fn tenant_timeline_create_safekeepers_until_success( + self: &Arc, + tenant_id: TenantId, + timeline_info: TimelineInfo, + ) -> Result<(), TimelineImportFinalizeError> { + const BACKOFF: Duration = Duration::from_secs(5); + + loop { + if self.cancel.is_cancelled() { + return Err(TimelineImportFinalizeError::ShuttingDown); + } + + let res = self + .tenant_timeline_create_safekeepers(tenant_id, &timeline_info) + .await; + + match res { + Ok(_) => { + tracing::info!("Timeline created on safekeepers"); + break; + } + Err(err) => { + tracing::error!("Failed to create timeline on safekeepers: {err}"); + tokio::select! { + _ = self.cancel.cancelled() => { + return Err(TimelineImportFinalizeError::ShuttingDown); + }, + _ = tokio::time::sleep(BACKOFF) => {} + }; + } + } + } + + Ok(()) + } + /// Directly insert the timeline into the database without reconciling it with safekeepers. 
/// /// Useful if the timeline already exists on the specified safekeepers, diff --git a/storage_controller/src/timeline_import.rs b/storage_controller/src/timeline_import.rs index 6dcc538c4b..909e8e2899 100644 --- a/storage_controller/src/timeline_import.rs +++ b/storage_controller/src/timeline_import.rs @@ -5,7 +5,7 @@ use http_utils::error::ApiError; use reqwest::Method; use serde::{Deserialize, Serialize}; -use pageserver_api::models::ShardImportStatus; +use pageserver_api::models::{ShardImportProgress, ShardImportStatus}; use tokio_util::sync::CancellationToken; use utils::{ id::{TenantId, TimelineId}, @@ -28,7 +28,12 @@ impl ShardImportStatuses { ShardImportStatuses( shards .into_iter() - .map(|ts_id| (ts_id, ShardImportStatus::InProgress)) + .map(|ts_id| { + ( + ts_id, + ShardImportStatus::InProgress(None::), + ) + }) .collect(), ) } @@ -46,6 +51,14 @@ pub(crate) enum TimelineImportUpdateFollowUp { None, } +#[derive(thiserror::Error, Debug)] +pub(crate) enum TimelineImportFinalizeError { + #[error("Shut down interrupted import finalize")] + ShuttingDown, + #[error("Mismatched shard detected during import finalize: {0}")] + MismatchedShards(ShardIndex), +} + pub(crate) enum TimelineImportUpdateError { ImportNotFound { tenant_id: TenantId, @@ -151,6 +164,8 @@ impl TimelineImport { } } +pub(crate) type ImportResult = Result<(), String>; + pub(crate) struct UpcallClient { authorization_header: Option, client: reqwest::Client, @@ -198,7 +213,9 @@ impl UpcallClient { /// eventual cplane availability. The cplane API is idempotent. 
pub(crate) async fn notify_import_complete( &self, - import: &TimelineImport, + tenant_id: TenantId, + timeline_id: TimelineId, + import_result: ImportResult, ) -> anyhow::Result<()> { let endpoint = if self.base_url.ends_with('/') { format!("{}import_complete", self.base_url) @@ -206,15 +223,13 @@ impl UpcallClient { format!("{}/import_complete", self.base_url) }; - tracing::info!("Endpoint is {endpoint}"); - let request = self .client .request(Method::PUT, endpoint) .json(&ImportCompleteRequest { - tenant_id: import.tenant_id, - timeline_id: import.timeline_id, - error: import.completion_error(), + tenant_id, + timeline_id, + error: import_result.err(), }) .timeout(IMPORT_COMPLETE_REQUEST_TIMEOUT); diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index f0ba632fd4..865f0908f9 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -13,7 +13,7 @@ use pageserver::tenant::remote_timeline_client::{ }; use pageserver::tenant::storage_layer::LayerName; use pageserver_api::shard::ShardIndex; -use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; +use remote_storage::{DownloadError, GenericRemoteStorage, ListingObject, RemotePath}; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; @@ -165,22 +165,34 @@ pub(crate) async fn branch_cleanup_and_check_errors( .head_object(&path, &CancellationToken::new()) .await; - if response.is_err() { - // Object is not present. - let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta()); + match response { + Ok(_) => {} + Err(DownloadError::NotFound) => { + // Object is not present. 
+ let is_l0 = + LayerMap::is_l0(layer.key_range(), layer.is_delta()); - let msg = format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", - layer, - metadata.generation.get_suffix(), - metadata.shard, - is_l0, - ); + let msg = format!( + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + layer, + metadata.generation.get_suffix(), + metadata.shard, + is_l0, + ); - if is_l0 || ignore_error { - result.warnings.push(msg); - } else { - result.errors.push(msg); + if is_l0 || ignore_error { + result.warnings.push(msg); + } else { + result.errors.push(msg); + } + } + Err(e) => { + tracing::warn!( + "cannot check if the layer {}{} is present in remote storage (error: {})", + layer, + metadata.generation.get_suffix(), + e, + ); } } } @@ -354,6 +366,7 @@ pub(crate) async fn list_timeline_blobs( match res { ListTimelineBlobsResult::Ready(data) => Ok(data), ListTimelineBlobsResult::MissingIndexPart(_) => { + tracing::warn!("listing raced with removal of an index, retrying"); // Retry if listing raced with removal of an index let data = list_timeline_blobs_impl(remote_client, id, root_target) .await? 
@@ -440,7 +453,7 @@ async fn list_timeline_blobs_impl( } if index_part_keys.is_empty() && s3_layers.is_empty() { - tracing::debug!("Timeline is empty: expected post-deletion state."); + tracing::info!("Timeline is empty: expected post-deletion state."); if initdb_archive { tracing::info!("Timeline is post deletion but initdb archive is still present."); } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index f14341c7bc..49ab192285 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -137,11 +137,10 @@ struct TenantRefAccumulator { impl TenantRefAccumulator { fn update(&mut self, ttid: TenantShardTimelineId, index_part: &IndexPart) { let this_shard_idx = ttid.tenant_shard_id.to_index(); - (*self - .shards_seen + self.shards_seen .entry(ttid.tenant_shard_id.tenant_id) - .or_default()) - .insert(this_shard_idx); + .or_default() + .insert(this_shard_idx); let mut ancestor_refs = Vec::new(); for (layer_name, layer_metadata) in &index_part.layer_metadata { @@ -594,6 +593,7 @@ async fn gc_timeline( index_part_snapshot_time: _, } => (index_part, *index_part_generation, data.unused_index_keys), BlobDataParseResult::Relic => { + tracing::info!("Skipping timeline {ttid}, it is a relic"); // Post-deletion tenant location: don't try and GC it. return Ok(summary); } @@ -767,10 +767,13 @@ pub async fn pageserver_physical_gc( stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id).await?, ); Ok(try_stream! 
{ + let mut cnt = 0; while let Some(ttid_res) = timelines.next().await { let ttid = ttid_res?; + cnt += 1; yield (ttid, tenant_manifest_arc.clone()); } + tracing::info!(%tenant_shard_id, "Found {} timelines", cnt); }) } }); @@ -790,6 +793,7 @@ pub async fn pageserver_physical_gc( &accumulator, tenant_manifest_arc, ) + .instrument(info_span!("gc_timeline", %ttid)) }); let timelines = timelines.try_buffered(CONCURRENCY); let mut timelines = std::pin::pin!(timelines); diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index ba75f25984..77c7987aa7 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -153,7 +153,10 @@ pub async fn scan_pageserver_metadata( const CONCURRENCY: usize = 32; // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t)); + let timelines = tenants.map_ok(|t| { + tracing::info!("Found tenant: {}", t); + stream_tenant_timelines(&remote_client, &target, t) + }); let timelines = timelines.try_buffered(CONCURRENCY); let timelines = timelines.try_flatten(); diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 24231e32fc..d0ca53f8ab 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -24,7 +24,6 @@ pub struct SnapshotDownloader { remote_client: GenericRemoteStorage, #[allow(dead_code)] target: RootTarget, - bucket_config: BucketConfig, tenant_id: TenantId, output_path: Utf8PathBuf, concurrency: usize, @@ -43,7 +42,6 @@ impl SnapshotDownloader { Ok(Self { remote_client, target, - bucket_config, tenant_id, output_path, concurrency, @@ -218,11 +216,9 @@ impl SnapshotDownloader { } pub async fn download(&self) -> anyhow::Result<()> { - let (remote_client, target) = - init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?; - // Generate a stream of 
TenantShardId - let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?; + let shards = + stream_tenant_shards(&self.remote_client, &self.target, self.tenant_id).await?; let shards: Vec = shards.try_collect().await?; // Only read from shards that have the highest count: avoids redundantly downloading @@ -240,7 +236,8 @@ impl SnapshotDownloader { for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { // Generate a stream of TenantTimelineId - let timelines = stream_tenant_timelines(&remote_client, &target, shard).await?; + let timelines = + stream_tenant_timelines(&self.remote_client, &self.target, shard).await?; // Generate a stream of S3TimelineBlobData async fn load_timeline_index( @@ -251,8 +248,8 @@ impl SnapshotDownloader { let data = list_timeline_blobs(remote_client, ttid, target).await?; Ok((ttid, data)) } - let timelines = - timelines.map_ok(|ttid| load_timeline_index(&remote_client, &target, ttid)); + let timelines = timelines + .map_ok(|ttid| load_timeline_index(&self.remote_client, &self.target, ttid)); let mut timelines = std::pin::pin!(timelines.try_buffered(8)); while let Some(i) = timelines.next().await { diff --git a/test_runner/bin/neon_local_create_deep_l0_stack.py b/test_runner/bin/neon_local_create_deep_l0_stack.py new file mode 100644 index 0000000000..ebe11f7308 --- /dev/null +++ b/test_runner/bin/neon_local_create_deep_l0_stack.py @@ -0,0 +1,59 @@ +""" +Script to creates a stack of L0 deltas each of which should have 1 Value::Delta per page in `data`, +in your running neon_local setup. + +Use this bash setup to reset your neon_local environment. +The last line of this bash snippet will run this file here. 
+``` + export NEON_REPO_DIR=$PWD/.neon + export NEON_BIN_DIR=$PWD/target/release + $NEON_BIN_DIR/neon_local stop + rm -rf $NEON_REPO_DIR + $NEON_BIN_DIR/neon_local init + cat >> $NEON_REPO_DIR/pageserver_1/pageserver.toml <<"EOF" + # customizations + virtual_file_io_mode = "direct-rw" + page_service_pipelining={mode="pipelined", max_batch_size=32, execution="concurrent-futures"} + get_vectored_concurrent_io={mode="sidecar-task"} +EOF + $NEON_BIN_DIR/neon_local start + + psql 'postgresql://localhost:1235/storage_controller' -c 'DELETE FROM tenant_shards' + sed 's/.*get_vectored_concurrent_io.*/get_vectored_concurrent_io={mode="sidecar-task"}/' -i $NEON_REPO_DIR/pageserver_1/pageserver.toml + $NEON_BIN_DIR/neon_local pageserver restart + sleep 2 + $NEON_BIN_DIR/neon_local tenant create --set-default + ./target/debug/neon_local endpoint stop foo + rm -rf $NEON_REPO_DIR/endpoints/foo + ./target/debug/neon_local endpoint create foo + echo 'full_page_writes=off' >> $NEON_REPO_DIR/endpoints/foo/postgresql.conf + ./target/debug/neon_local endpoint start foo + + pushd test_runner; poetry run python3 -m bin.neon_local_create_deep_l0_stack 10; popd +``` +""" + +import sys + +import psycopg2 +from fixtures.common_types import TenantShardId, TimelineId +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.makelayers.l0stack import L0StackShape, make_l0_stack_standalone + +ps_http = PageserverHttpClient(port=9898, is_testing_enabled_or_skip=lambda: None) +vps_http = PageserverHttpClient(port=1234, is_testing_enabled_or_skip=lambda: None) + +tenants = ps_http.tenant_list() +assert len(tenants) == 1 +tenant_shard_id = TenantShardId.parse(tenants[0]["id"]) + +timlines = ps_http.timeline_list(tenant_shard_id) +assert len(timlines) == 1 +timeline_id = TimelineId(timlines[0]["timeline_id"]) + +connstr = "postgresql://cloud_admin@localhost:55432/postgres" +conn = psycopg2.connect(connstr) + +shape = L0StackShape(logical_table_size_mib=50, 
delta_stack_height=int(sys.argv[1])) + +make_l0_stack_standalone(vps_http, ps_http, tenant_shard_id, timeline_id, conn, shape) diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 652c38f5c3..4b4b98aa6c 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -1,6 +1,7 @@ from __future__ import annotations import urllib.parse +from enum import StrEnum from typing import TYPE_CHECKING, final import requests @@ -9,11 +10,23 @@ from requests.auth import AuthBase from typing_extensions import override from fixtures.log_helper import log +from fixtures.utils import wait_until if TYPE_CHECKING: from requests import PreparedRequest +COMPUTE_AUDIENCE = "compute" +""" +The value to place in the `aud` claim. +""" + + +@final +class ComputeClaimsScope(StrEnum): + ADMIN = "admin" + + @final class BearerAuth(AuthBase): """ @@ -50,6 +63,35 @@ class EndpointHttpClient(requests.Session): res.raise_for_status() return res.json() + def prewarm_lfc_status(self) -> dict[str, str]: + res = self.get(f"http://localhost:{self.external_port}/lfc/prewarm") + res.raise_for_status() + json: dict[str, str] = res.json() + return json + + def prewarm_lfc(self): + self.post(f"http://localhost:{self.external_port}/lfc/prewarm").raise_for_status() + + def prewarmed(): + json = self.prewarm_lfc_status() + status, err = json["status"], json.get("error") + assert status == "completed", f"{status}, error {err}" + + wait_until(prewarmed) + + def offload_lfc(self): + url = f"http://localhost:{self.external_port}/lfc/offload" + self.post(url).raise_for_status() + + def offloaded(): + res = self.get(url) + res.raise_for_status() + json = res.json() + status, err = json["status"], json.get("error") + assert status == "completed", f"{status}, error {err}" + + wait_until(offloaded) + def database_schema(self, database: str): res = self.get( 
f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}", diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 879808b7ba..1dd4fe8316 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -184,6 +184,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", + "pageserver_tenant_offloaded_timelines", counter("pageserver_tenant_throttling_count_accounted_start"), counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index b5d69b5ab6..bb07e2b6d1 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: Any, ) + from fixtures.endpoint.http import ComputeClaimsScope from fixtures.pg_version import PgVersion @@ -102,7 +103,7 @@ class AbstractNeonCli: else: stdout = "" - log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}") + log.warning(f"CLI timeout: stderr={stderr}, stdout={stdout}") raise indent = " " @@ -535,12 +536,16 @@ class NeonLocalCli(AbstractNeonCli): res.check_returncode() return res - def endpoint_generate_jwt(self, endpoint_id: str) -> str: + def endpoint_generate_jwt( + self, endpoint_id: str, scope: ComputeClaimsScope | None = None + ) -> str: """ Generate a JWT for making requests to the endpoint's external HTTP server. 
""" args = ["endpoint", "generate-jwt", endpoint_id] + if scope: + args += ["--scope", str(scope)] cmd = self.raw_cli(args) cmd.check_returncode() @@ -552,7 +557,7 @@ class NeonLocalCli(AbstractNeonCli): endpoint_id: str, safekeepers_generation: int | None = None, safekeepers: list[int] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, create_test_user: bool = False, @@ -567,8 +572,8 @@ class NeonLocalCli(AbstractNeonCli): extra_env_vars = env or {} if basebackup_request_tries is not None: extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) - if remote_ext_config is not None: - args.extend(["--remote-ext-config", remote_ext_config]) + if remote_ext_base_url is not None: + args.extend(["--remote-ext-base-url", remote_ext_base_url]) if safekeepers_generation is not None: args.extend(["--safekeepers-generation", str(safekeepers_generation)]) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1d668d4b2d..5c92f2e2d0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -51,7 +51,7 @@ from fixtures.common_types import ( TimelineId, ) from fixtures.compute_migrations import NUM_COMPUTE_MIGRATIONS -from fixtures.endpoint.http import EndpointHttpClient +from fixtures.endpoint.http import ComputeClaimsScope, EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.neon_cli import NeonLocalCli, Pagectl @@ -682,7 +682,7 @@ class NeonEnvBuilder: log.info( f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}" ) - shutil.copytree(tenants_from_dir, tenants_to_dir) + subprocess.run(["cp", "-a", tenants_from_dir, tenants_to_dir], check=True) else: log.info( f"Creating overlayfs mount of pageserver tenants directory {tenants_from_dir} to 
{tenants_to_dir}" @@ -698,8 +698,9 @@ class NeonEnvBuilder: shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True) if self.test_overlay_dir is None: log.info("Copying local_fs_remote_storage directory from snapshot") - shutil.copytree( - repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" + subprocess.run( + ["cp", "-a", f"{repo_dir / 'local_fs_remote_storage'}", f"{self.repo_dir}"], + check=True, ) else: log.info("Creating overlayfs mount of local_fs_remote_storage directory from snapshot") @@ -1185,7 +1186,9 @@ class NeonEnv: "broker": {}, "safekeepers": [], "pageservers": [], - "endpoint_storage": {"port": self.port_distributor.get_port()}, + "endpoint_storage": { + "listen_addr": f"127.0.0.1:{self.port_distributor.get_port()}", + }, "generate_local_ssl_certs": self.generate_local_ssl_certs, } @@ -1194,8 +1197,7 @@ class NeonEnv: else: cfg["broker"]["listen_addr"] = self.broker.listen_addr() - if self.control_plane_api is not None: - cfg["control_plane_api"] = self.control_plane_api + cfg["control_plane_api"] = self.control_plane_api if self.control_plane_hooks_api is not None: cfg["control_plane_hooks_api"] = self.control_plane_hooks_api @@ -1254,6 +1256,12 @@ class NeonEnv: "no_sync": True, # Look for gaps in WAL received from safekeepeers "validate_wal_contiguity": True, + # TODO(vlad): make these configurable through the builder + "timeline_import_config": { + "import_job_concurrency": 4, + "import_job_soft_size_limit": 512 * 1024, + "import_job_checkpoint_threshold": 4, + }, } # Batching (https://github.com/neondatabase/neon/issues/9377): @@ -1273,6 +1281,8 @@ class NeonEnv: if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine + if self.pageserver_virtual_file_io_mode is not None: + ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode if config.pageserver_default_tenant_config_compaction_algorithm is not None: 
tenant_config = ps_cfg.setdefault("tenant_config", {}) tenant_config["compaction_algorithm"] = ( @@ -1280,7 +1290,8 @@ class NeonEnv: ) tenant_config = ps_cfg.setdefault("tenant_config", {}) - tenant_config["rel_size_v2_enabled"] = True # Enable relsize_v2 by default in tests + # This feature is pending rollout. + # tenant_config["rel_size_v2_enabled"] = True if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( @@ -1297,13 +1308,6 @@ class NeonEnv: for key, value in override.items(): ps_cfg[key] = value - if self.pageserver_virtual_file_io_mode is not None: - # TODO(christian): https://github.com/neondatabase/neon/issues/11598 - if not config.test_may_use_compatibility_snapshot_binaries: - ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode - else: - log.info("ignoring virtual_file_io_mode parametrization for compatibility test") - if self.pageserver_wal_receiver_protocol is not None: key, value = PageserverWalReceiverProtocol.to_config_key_value( self.pageserver_wal_receiver_protocol @@ -1374,7 +1378,11 @@ class NeonEnv: force=config.config_init_force, ) - def start(self, timeout_in_seconds: int | None = None): + def start( + self, + timeout_in_seconds: int | None = None, + extra_ps_env_vars: dict[str, str] | None = None, + ): # Storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup self.storage_controller.start(timeout_in_seconds=timeout_in_seconds) @@ -1393,7 +1401,10 @@ class NeonEnv: for pageserver in self.pageservers: futs.append( executor.submit( - lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] + lambda ps=pageserver: ps.start( # type: ignore[misc] + extra_env_vars=extra_ps_env_vars or {}, + timeout_in_seconds=timeout_in_seconds, + ), ) ) @@ -1407,30 +1418,6 @@ class NeonEnv: for f in futs: f.result() - # Last step: register safekeepers at the storage controller - if ( - 
self.storage_controller_config is not None - and self.storage_controller_config.get("timelines_onto_safekeepers") is True - ): - for sk_id, sk in enumerate(self.safekeepers): - # 0 is an invalid safekeeper id - sk_id = sk_id + 1 - body = { - "id": sk_id, - "created_at": "2023-10-25T09:11:25Z", - "updated_at": "2024-08-28T11:32:43Z", - "region_id": "aws-us-east-2", - "host": "127.0.0.1", - "port": sk.port.pg, - "http_port": sk.port.http, - "https_port": None, - "version": 5957, - "availability_zone_id": f"us-east-2b-{sk_id}", - } - - self.storage_controller.on_safekeeper_deploy(sk_id, body) - self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active") - self.endpoint_storage.start(timeout_in_seconds=timeout_in_seconds) def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): @@ -3634,6 +3621,8 @@ class NeonProxy(PgProtocol): http_port: int, mgmt_port: int, external_http_port: int, + router_port: int, + router_tls_port: int, auth_backend: NeonProxy.AuthBackend, metric_collection_endpoint: str | None = None, metric_collection_interval: str | None = None, @@ -3650,6 +3639,8 @@ class NeonProxy(PgProtocol): self.test_output_dir = test_output_dir self.proxy_port = proxy_port self.mgmt_port = mgmt_port + self.router_port = router_port + self.router_tls_port = router_tls_port self.auth_backend = auth_backend self.metric_collection_endpoint = metric_collection_endpoint self.metric_collection_interval = metric_collection_interval @@ -3664,6 +3655,14 @@ class NeonProxy(PgProtocol): key_path = self.test_output_dir / "proxy.key" generate_proxy_tls_certs("*.local.neon.build", key_path, crt_path) + # generate key for pg-sni-router. 
+ # endpoint.namespace.local.neon.build resolves to 127.0.0.1 + generate_proxy_tls_certs( + "endpoint.namespace.local.neon.build", + self.test_output_dir / "router.key", + self.test_output_dir / "router.crt", + ) + args = [ str(self.neon_binpath / "proxy"), *["--http", f"{self.host}:{self.http_port}"], @@ -3673,6 +3672,11 @@ class NeonProxy(PgProtocol): *["--sql-over-http-timeout", f"{self.http_timeout_seconds}s"], *["-c", str(crt_path)], *["-k", str(key_path)], + *["--sni-router-listen", f"{self.host}:{self.router_port}"], + *["--sni-router-listen-tls", f"{self.host}:{self.router_tls_port}"], + *["--sni-router-tls-cert", str(self.test_output_dir / "router.crt")], + *["--sni-router-tls-key", str(self.test_output_dir / "router.key")], + *["--sni-router-destination", "local.neon.build"], *self.auth_backend.extra_args(), ] @@ -3864,7 +3868,7 @@ class NeonAuthBroker: external_http_port: int, auth_backend: NeonAuthBroker.ProxyV1, ): - self.domain = "apiauth.local.neon.build" # resolves to 127.0.0.1 + self.domain = "local.neon.build" # resolves to 127.0.0.1 self.host = "127.0.0.1" self.http_port = http_port self.external_http_port = external_http_port @@ -3881,7 +3885,7 @@ class NeonAuthBroker: # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("apiauth.local.neon.build", key_path, crt_path) + generate_proxy_tls_certs(f"apiauth.{self.domain}", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3925,10 +3929,10 @@ class NeonAuthBroker: log.info(f"Executing http query: {query}") - connstr = f"postgresql://{user}@{self.domain}/postgres" + connstr = f"postgresql://{user}@ep-foo-bar-1234.{self.domain}/postgres" async with httpx.AsyncClient(verify=str(self.test_output_dir / "proxy.crt")) as client: response = await client.post( - f"https://{self.domain}:{self.external_http_port}/sql", + f"https://apiauth.{self.domain}:{self.external_http_port}/sql", 
json={"query": query, "params": args}, headers={ "Neon-Connection-String": connstr, @@ -3972,6 +3976,8 @@ def link_proxy( proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() external_http_port = port_distributor.get_port() + router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -3979,6 +3985,8 @@ def link_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + router_port=router_port, + router_tls_port=router_tls_port, external_http_port=external_http_port, auth_backend=NeonProxy.Link(), ) as proxy: @@ -4012,6 +4020,8 @@ def static_proxy( mgmt_port = port_distributor.get_port() http_port = port_distributor.get_port() external_http_port = port_distributor.get_port() + router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -4019,6 +4029,8 @@ def static_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + router_port=router_port, + router_tls_port=router_tls_port, external_http_port=external_http_port, auth_backend=NeonProxy.Postgres(auth_endpoint), ) as proxy: @@ -4218,13 +4230,13 @@ class Endpoint(PgProtocol, LogUtils): self.config(config_lines) - self.__jwt = self.env.neon_cli.endpoint_generate_jwt(self.endpoint_id) + self.__jwt = self.generate_jwt() return self def start( self, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, safekeeper_generation: int | None = None, safekeepers: list[int] | None = None, @@ -4250,7 +4262,7 @@ class Endpoint(PgProtocol, LogUtils): self.endpoint_id, safekeepers_generation=safekeeper_generation, safekeepers=self.active_safekeepers, - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, allow_multiple=allow_multiple, create_test_user=create_test_user, @@ -4265,6 +4277,14 @@ class 
Endpoint(PgProtocol, LogUtils): return self + def generate_jwt(self, scope: ComputeClaimsScope | None = None) -> str: + """ + Generate a JWT for making requests to the endpoint's external HTTP + server. + """ + assert self.endpoint_id is not None + return self.env.neon_cli.endpoint_generate_jwt(self.endpoint_id, scope) + def endpoint_path(self) -> Path: """Path to endpoint directory""" assert self.endpoint_id @@ -4457,7 +4477,7 @@ class Endpoint(PgProtocol, LogUtils): hot_standby: bool = False, lsn: Lsn | None = None, config_lines: list[str] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, basebackup_request_tries: int | None = None, @@ -4476,7 +4496,7 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id=pageserver_id, allow_multiple=allow_multiple, ).start( - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, allow_multiple=allow_multiple, basebackup_request_tries=basebackup_request_tries, @@ -4560,7 +4580,7 @@ class EndpointFactory: lsn: Lsn | None = None, hot_standby: bool = False, config_lines: list[str] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, basebackup_request_tries: int | None = None, ) -> Endpoint: @@ -4580,7 +4600,7 @@ class EndpointFactory: hot_standby=hot_standby, config_lines=config_lines, lsn=lsn, - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, basebackup_request_tries=basebackup_request_tries, ) @@ -4634,7 +4654,10 @@ class EndpointFactory: return self def new_replica( - self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None + self, + origin: Endpoint, + endpoint_id: str | None = None, + config_lines: list[str] | None = None, ): branch_name = origin.branch_name assert origin in self.endpoints @@ -4650,7 
+4673,10 @@ class EndpointFactory: ) def new_replica_start( - self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None + self, + origin: Endpoint, + endpoint_id: str | None = None, + config_lines: list[str] | None = None, ): branch_name = origin.branch_name assert origin in self.endpoints @@ -5467,6 +5493,13 @@ def wait_for_last_flush_lsn( if last_flush_lsn is None: last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + # The last_flush_lsn may not correspond to a record boundary. + # For example, if the compute flushed WAL on a page boundary, + # the remaining part of the record might not be flushed for a long time. + # This would prevent the pageserver from reaching last_flush_lsn promptly. + # To ensure the rest of the record reaches the pageserver quickly, + # we forcibly flush the WAL by using CHECKPOINT. + endpoint.safe_psql("CHECKPOINT") results = [] for tenant_shard_id, pageserver in shards: diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 24c856e279..9b564f0a60 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -111,6 +111,13 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*stalling layer flushes for compaction backpressure.*", ".*layer roll waiting for flush due to compaction backpressure.*", ".*BatchSpanProcessor.*", + *( + [ + r".*your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs.*" + ] + if sys.platform != "linux" + else [] + ), ) @@ -122,6 +129,10 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ ".*Call to node.*management API.*failed.*Timeout.*", ".*Failed to update node .+ after heartbeat round.*error sending request for url.*", ".*background_reconcile: failed to fetch top tenants:.*client error \\(Connect\\).*", + # Many tests will take safekeepers offline + ".*Call to safekeeper.*management 
API.*failed.*receive body.*", ".*Call to safekeeper.*management API.*failed.*ReceiveBody.*", ".*Call to safekeeper.*management API.*failed.*Timeout.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode diff --git a/test_runner/fixtures/pageserver/makelayers/__init__.py b/test_runner/fixtures/pageserver/makelayers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/pageserver/makelayers/l0stack.py b/test_runner/fixtures/pageserver/makelayers/l0stack.py new file mode 100644 index 0000000000..408ba1254f --- /dev/null +++ b/test_runner/fixtures/pageserver/makelayers/l0stack.py @@ -0,0 +1,148 @@ +from dataclasses import dataclass + +from psycopg2.extensions import connection as PgConnection + +from fixtures.common_types import Lsn, TenantShardId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn + + +@dataclass +class L0StackShape: + logical_table_size_mib: int = 50 + delta_stack_height: int = 20 + + +def make_l0_stack(endpoint: Endpoint, shape: L0StackShape): + """ + Creates stack of L0 deltas each of which should have 1 Value::Delta per page in table `data`. + """ + env = endpoint.env + + # TODO: wait for storcon to finish any reconciles before jumping to action here?
+ description = env.storage_controller.tenant_describe(endpoint.tenant_id) + shards = description["shards"] + assert len(shards) == 1, "does not support sharding" + tenant_shard_id = TenantShardId.parse(shards[0]["tenant_shard_id"]) + + endpoint.config(["full_page_writes=off"]) + endpoint.reconfigure() + + ps = env.get_pageserver(shards[0]["node_attached"]) + + timeline_id = endpoint.show_timeline_id() + + vps_http = env.storage_controller.pageserver_api() + ps_http = ps.http_client() + endpoint_conn = endpoint.connect() + make_l0_stack_standalone(vps_http, ps_http, tenant_shard_id, timeline_id, endpoint_conn, shape) + + +def make_l0_stack_standalone( + vps_http: PageserverHttpClient, + ps_http: PageserverHttpClient, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + endpoint_conn: PgConnection, + shape: L0StackShape, +): + """ + See make_l0_stack for details. + + This function is a standalone version of make_l0_stack, usable from not-test code. + """ + + assert not tenant_shard_id.shard_index.is_sharded, ( + "the current implementation only supports unsharded tenants" + ) + + tenant_id = tenant_shard_id.tenant_id + conn = endpoint_conn + desired_size = shape.logical_table_size_mib * 1024 * 1024 + + config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "1h", # doesn't matter, but 0 value will kill walredo every 10s + "compaction_threshold": 100000, # we just want L0s + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 100000, # we just want L0s + } + + vps_http.set_tenant_config(tenant_id, config) + + conn.autocommit = True + cur = conn.cursor() + + # Ensure full_page_writes are disabled so that all Value::Delta in + # pageserver are !will_init, and therefore a getpage needs to read + # the entire delta stack. 
+ cur.execute("SHOW full_page_writes") + assert cur.fetchall()[0][0] == "off", "full_page_writes should be off" + + # each tuple is 23 (header) + 100 bytes = 123 bytes + # page header is 24 bytes + # 8k page size + # (8k-24bytes) / 123 bytes = 63 tuples per page + # set fillfactor to 10 to have 6 tuples per page + cur.execute("DROP TABLE IF EXISTS data") + cur.execute("CREATE TABLE data(id bigint, row char(92)) with (fillfactor=10)") + need_pages = desired_size // 8192 + need_rows = need_pages * 6 + log.info(f"Need {need_pages} pages, {need_rows} rows") + cur.execute(f"INSERT INTO data SELECT i,'row'||i FROM generate_series(1, {need_rows}) as i") + # Raise fillfactor to 100% so that all updates are HOT updates. + # We assert they're hot updates by checking fetch_id_to_page_mapping remains the same. + cur.execute("ALTER TABLE data SET (fillfactor=100)") + + def settle_and_flush(): + cur.execute("SELECT pg_current_wal_flush_lsn()") + flush_lsn = Lsn(cur.fetchall()[0][0]) + wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, flush_lsn) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + + # create an L0 for the initial data we just inserted + settle_and_flush() + + # assert we wrote what we think we wrote + cur.execute(""" + with ntuples_per_page as ( + select (ctid::text::point)[0]::bigint pageno,count(*) ntuples from data group by pageno + ) + select ntuples, count(*) npages from ntuples_per_page group by ntuples order by ntuples; + """) + rows = cur.fetchall() + log.info(f"initial table layout: {rows}") + assert len(rows) == 1 + assert rows[0][0] == 6, f"expected 6 tuples per page, got {rows[0][0]}" + assert rows[0][1] == need_pages, f"expected {need_pages} pages, got {rows[0][1]}" + + def fetch_id_to_page_mapping(): + cur.execute(""" + SELECT id,(ctid::text::point)[0]::bigint pageno FROM data ORDER BY id + """) + return cur.fetchall() + + initial_mapping = fetch_id_to_page_mapping() + + # every iteration updates one tuple in each page
delta_stack_height = shape.delta_stack_height + for i in range(0, delta_stack_height): + log.info(i) + cur.execute(f"UPDATE data set row = row||',u' where id % 6 = {i % 6}") + log.info(f"modified rows: {cur.rowcount}") + assert cur.rowcount == need_pages + settle_and_flush() + post_update_mapping = fetch_id_to_page_mapping() + assert initial_mapping == post_update_mapping, "Postgres should be doing HOT updates" + + # Assert the layer count is what we expect it is + layer_map = vps_http.layer_map_info(tenant_id, timeline_id) + assert ( + len(layer_map.delta_l0_layers()) == delta_stack_height + 1 + 1 + ) # +1 for the initdb layer + 1 for the table creation & fill + assert len(layer_map.delta_l0_layers()) == len(layer_map.delta_layers()) # it's all L0s + assert len(layer_map.image_layers()) == 0 # no images diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 3b25a60e9b..21844648d1 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -15,7 +15,8 @@ Some handy pytest flags for local development: - `-k` selects a test to run - `--timeout=0` disables our default timeout of 300s (see `setup.cfg`) - `--preserve-database-files` to skip cleanup -- `--out-dir` to produce a JSON with the recorded test metrics +- `--out-dir` to produce a JSON with the recorded test metrics. + There is a post-processing tool at `test_runner/performance/out_dir_to_csv.py`. # What performance tests do we have and how we run them diff --git a/test_runner/performance/out_dir_to_csv.py b/test_runner/performance/out_dir_to_csv.py new file mode 100644 index 0000000000..8647ad4acc --- /dev/null +++ b/test_runner/performance/out_dir_to_csv.py @@ -0,0 +1,57 @@ +# Tool to convert the JSON output from running a perf test with `--out-dir` to a CSV that +# can be easily pasted into a spreadsheet for quick viz & analysis. +# Check the `./README.md` in this directory for `--out-dir`. 
+# +# TODO: add the pytest.mark.parametrize to the json and make them columns here +# https://github.com/neondatabase/neon/issues/11878 + +import csv +import json +import os +import sys + + +def json_to_csv(json_file): + with open(json_file) as f: + data = json.load(f) + + # Collect all possible metric names to form headers + all_metrics = set() + for result in data.get("result", []): + for metric in result.get("data", []): + all_metrics.add(metric["name"]) + + # Sort metrics for consistent output + metrics = sorted(list(all_metrics)) + + # Create headers + headers = ["suit"] + metrics + + # Prepare rows + rows = [] + for result in data.get("result", []): + row = {"suit": result["suit"]} + + # Initialize all metrics to empty + for metric in metrics: + row[metric] = "" + + # Fill in available metrics + for item in result.get("data", []): + row[item["name"]] = item["value"] + + rows.append(row) + + # Write to stdout as CSV + writer = csv.DictWriter(sys.stdout, fieldnames=headers) + writer.writeheader() + writer.writerows(rows) + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print(f"Usage: python {os.path.basename(__file__)} ") + sys.exit(1) + + json_file = sys.argv[1] + json_to_csv(json_file) diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 8874fe663b..41696bf887 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -14,7 +14,7 @@ from fixtures.neon_fixtures import ( PgBin, wait_for_last_flush_lsn, ) -from fixtures.utils import get_scale_for_db, humantime_to_ms, skip_on_ci +from fixtures.utils import get_scale_for_db, humantime_to_ms from performance.pageserver.util import setup_pageserver_with_tenants @@ -36,9 +36,6 @@ 
if TYPE_CHECKING: @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) @pytest.mark.parametrize("n_tenants", [500]) @pytest.mark.timeout(10000) -@skip_on_ci( - "This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI" -) def test_pageserver_characterize_throughput_with_n_tenants( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, @@ -63,9 +60,6 @@ def test_pageserver_characterize_throughput_with_n_tenants( @pytest.mark.parametrize("n_clients", [1, 64]) @pytest.mark.parametrize("n_tenants", [1]) @pytest.mark.timeout(2400) -@skip_on_ci( - "This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI" -) def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index b17ca772c9..9e2312311a 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -10,7 +10,8 @@ from typing import Any import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin +from fixtures.pageserver.makelayers import l0stack from fixtures.utils import humantime_to_ms TARGET_RUNTIME = 30 @@ -34,28 +35,18 @@ class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig): mode: str = "pipelined" -EXECUTION = ["concurrent-futures"] -BATCHING = ["uniform-lsn", "scattered-lsn"] - -NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] -for max_batch_size in [1, 32]: - for execution in 
EXECUTION: - for batching in BATCHING: - NON_BATCHABLE.append( - PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) - ) - -BATCHABLE: list[PageServicePipeliningConfig] = [] +PS_IO_CONCURRENCY = ["sidecar-task"] +PIPELINING_CONFIGS: list[PageServicePipeliningConfig] = [] for max_batch_size in [32]: - for execution in EXECUTION: - for batching in BATCHING: - BATCHABLE.append( + for execution in ["concurrent-futures"]: + for batching in ["scattered-lsn"]: + PIPELINING_CONFIGS.append( PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) ) @pytest.mark.parametrize( - "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name", + "tablesize_mib, pipelining_config, target_runtime, ps_io_concurrency, effective_io_concurrency, readhead_buffer_size, name", [ # batchable workloads should show throughput and CPU efficiency improvements *[ @@ -63,20 +54,23 @@ for max_batch_size in [32]: 50, config, TARGET_RUNTIME, + ps_io_concurrency, 100, 128, f"batchable {dataclasses.asdict(config)}", ) - for config in BATCHABLE + for config in PIPELINING_CONFIGS + for ps_io_concurrency in PS_IO_CONCURRENCY ], ], ) -def test_throughput( +def test_postgres_seqscan( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, tablesize_mib: int, pipelining_config: PageServicePipeliningConfig, target_runtime: int, + ps_io_concurrency: str, effective_io_concurrency: int, readhead_buffer_size: int, name: str, @@ -97,6 +91,10 @@ def test_throughput( If the compute provides pipeline depth (effective_io_concurrency=100), then pipelining configs, especially with max_batch_size>1 should yield dramatic improvements in all performance metrics. + + We advance the LSN from a disruptor thread to simulate the effect of a workload with concurrent writes + in another table. The `scattered-lsn` batching mode handles this well whereas the + initial implementation (`uniform-lsn`) would break the batch.
""" # @@ -114,7 +112,19 @@ def test_throughput( } ) # For storing configuration as a metric, insert a fake 0 with labels with actual data - params.update({"pipelining_config": (0, {"labels": dataclasses.asdict(pipelining_config)})}) + params.update( + { + "config": ( + 0, + { + "labels": { + "pipelining_config": dataclasses.asdict(pipelining_config), + "ps_io_concurrency": ps_io_concurrency, + } + }, + ) + } + ) log.info("params: %s", params) @@ -266,7 +276,10 @@ def test_throughput( return iters env.pageserver.patch_config_toml_nonrecursive( - {"page_service_pipelining": dataclasses.asdict(pipelining_config)} + { + "page_service_pipelining": dataclasses.asdict(pipelining_config), + "get_vectored_concurrent_io": {"mode": ps_io_concurrency}, + } ) # set trace for log analysis below @@ -318,77 +331,63 @@ def test_throughput( ) -PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] -for max_batch_size in [1, 32]: - for execution in EXECUTION: - for batching in BATCHING: - PRECISION_CONFIGS.append( - PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) - ) - - @pytest.mark.parametrize( - "pipelining_config,name", - [(config, f"{dataclasses.asdict(config)}") for config in PRECISION_CONFIGS], + "pipelining_config,ps_io_concurrency,l0_stack_height,queue_depth,name", + [ + (config, ps_io_concurrency, l0_stack_height, queue_depth, f"{dataclasses.asdict(config)}") + for config in PIPELINING_CONFIGS + for ps_io_concurrency in PS_IO_CONCURRENCY + for queue_depth in [1, 2, 32] + for l0_stack_height in [0, 20] + ], ) -def test_latency( +def test_random_reads( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, pipelining_config: PageServicePipeliningConfig, + ps_io_concurrency: str, + l0_stack_height: int, + queue_depth: int, name: str, ): """ - Measure the latency impact of pipelining in an un-batchable workloads. 
- - An ideal implementation should not increase average or tail latencies for such workloads. - - We don't have support in pagebench to create queue depth yet. - => https://github.com/neondatabase/neon/issues/9837 + Throw pagebench random getpage at latest lsn workload from a single client against pageserver. """ # # Setup # + def build_snapshot_cb(neon_env_builder: NeonEnvBuilder) -> NeonEnv: + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + l0stack.make_l0_stack( + endpoint, + l0stack.L0StackShape(logical_table_size_mib=50, delta_stack_height=l0_stack_height), + ) + return env + + env = neon_env_builder.build_and_use_snapshot( + f"test_page_service_batching--test_pagebench-{l0_stack_height}", build_snapshot_cb + ) + def patch_ps_config(ps_config): - if pipelining_config is not None: - ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config) + ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config) + ps_config["get_vectored_concurrent_io"] = {"mode": ps_io_concurrency} - neon_env_builder.pageserver_config_override = patch_ps_config + env.pageserver.edit_config_toml(patch_ps_config) - env = neon_env_builder.init_start() - endpoint = env.endpoints.create_start("main") - conn = endpoint.connect() - cur = conn.cursor() + env.start() - cur.execute("SET max_parallel_workers_per_gather=0") # disable parallel backends - cur.execute("SET effective_io_concurrency=1") - - cur.execute("CREATE EXTENSION IF NOT EXISTS neon;") - cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") - - log.info("Filling the table") - cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)") - tablesize = 50 * 1024 * 1024 - npages = tablesize // (8 * 1024) - cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,)) - # TODO: can we force postgres to do sequential scans? 
- - cur.close() - conn.close() - - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) - - endpoint.stop() + lsn = env.safekeepers[0].get_commit_lsn(env.initial_tenant, env.initial_timeline) + ep = env.endpoints.create_start("main", lsn=lsn) + data_table_relnode_oid = ep.safe_psql_scalar("SELECT 'data'::regclass::oid") + ep.stop_and_destroy() for sk in env.safekeepers: sk.stop() - # - # Run single-threaded pagebench (TODO: dedup with other benchmark code) - # - env.pageserver.allowed_errors.append( # https://github.com/neondatabase/neon/issues/6925 r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" @@ -396,6 +395,8 @@ def test_latency( ps_http = env.pageserver.http_client() + metrics_before = ps_http.get_metrics() + cmd = [ str(env.neon_binpath / "pagebench"), "get-page-latest-lsn", @@ -405,6 +406,10 @@ def test_latency( env.pageserver.connstr(password=None), "--num-clients", "1", + "--queue-depth", + str(queue_depth), + "--only-relnode", + str(data_table_relnode_oid), "--runtime", "10s", ] @@ -413,12 +418,22 @@ def test_latency( results_path = Path(basepath + ".stdout") log.info(f"Benchmark results at: {results_path}") + metrics_after = ps_http.get_metrics() + with open(results_path) as f: results = json.load(f) log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") total = results["total"] + metric = "request_count" + zenbenchmark.record( + metric, + metric_value=total[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + metric = "latency_mean" zenbenchmark.record( metric, @@ -435,3 +450,17 @@ def test_latency( unit="ms", report=MetricReport.LOWER_IS_BETTER, ) + + reads_before = metrics_before.query_one( + "pageserver_io_operations_seconds_count", filter={"operation": "read"} + ) + reads_after = metrics_after.query_one( + "pageserver_io_operations_seconds_count", filter={"operation": "read"} + ) + + zenbenchmark.record( + "virtual_file_reads", + 
metric_value=reads_after.value - reads_before.value, + unit="", + report=MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/performance/test_cumulative_statistics_persistence.py b/test_runner/performance/test_cumulative_statistics_persistence.py index 061467bbad..5e9e55cb0f 100644 --- a/test_runner/performance/test_cumulative_statistics_persistence.py +++ b/test_runner/performance/test_cumulative_statistics_persistence.py @@ -1,4 +1,5 @@ import math # Add this import +import os import time import traceback from pathlib import Path @@ -87,7 +88,10 @@ def test_cumulative_statistics_persistence( - insert additional tuples that by itself are not enough to trigger auto-vacuum but in combination with the previous tuples are - verify that autovacuum is triggered by the combination of tuples inserted before and after endpoint suspension """ - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, + f"Test cumulative statistics persistence, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}", + ) project_id = project["project"]["id"] neon_api.wait_for_operation_to_finish(project_id) endpoint_id = project["endpoints"][0]["id"] diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index bdafa2d657..c580bfcc14 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -62,7 +62,9 @@ def test_ro_replica_lag( pgbench_duration = f"-T{test_duration_min * 60 * 2}" - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, f"Test readonly replica lag, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" + ) project_id = project["project"]["id"] log.info("Project ID: %s", project_id) log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"]) @@ -195,7 +197,9 @@ def test_replication_start_stop( pgbench_duration = f"-T{2**num_replicas * configuration_test_time_sec}" 
error_occurred = False - project = neon_api.create_project(pg_version) + project = neon_api.create_project( + pg_version, f"Test replication start stop, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" + ) project_id = project["project"]["id"] log.info("Project ID: %s", project_id) log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"]) diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py index 643151fa11..645c9b7b9d 100644 --- a/test_runner/random_ops/test_random_ops.py +++ b/test_runner/random_ops/test_random_ops.py @@ -206,7 +206,7 @@ class NeonProject: self.neon_api = neon_api self.pg_bin = pg_bin proj = self.neon_api.create_project( - pg_version, f"Automatic random API test {os.getenv('GITHUB_RUN_ID')}" + pg_version, f"Automatic random API test GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}" ) self.id: str = proj["project"]["id"] self.name: str = proj["project"]["name"] diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index ee408e3c65..3eb6b7193c 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -186,7 +186,8 @@ def test_fully_custom_config(positive_env: NeonEnv): "type": "interpreted", "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, }, - "rel_size_v2_enabled": False, # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it + "rel_size_v2_enabled": True, + "relsize_snapshot_cache_capacity": 10000, "gc_compaction_enabled": True, "gc_compaction_verification": False, "gc_compaction_initial_threshold_kb": 1024000, diff --git a/test_runner/regress/test_basebackup.py b/test_runner/regress/test_basebackup.py new file mode 100644 index 0000000000..b083c394c7 --- /dev/null +++ b/test_runner/regress/test_basebackup.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from typing import 
TYPE_CHECKING + +from fixtures.utils import wait_until + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_basebackup_cache(neon_env_builder: NeonEnvBuilder): + """ + Simple test for basebackup cache. + 1. Check that we always hit the cache after compute restart. + 2. Check that we eventually delete old basebackup files, but not the latest one. + 3. Check that we delete basebackup file for timeline with active compute. + """ + + neon_env_builder.pageserver_config_override = """ + tenant_config = { basebackup_cache_enabled = true } + basebackup_cache_config = { cleanup_period = '1s' } + """ + + env = neon_env_builder.init_start() + ep = env.endpoints.create("main") + ps = env.pageserver + ps_http = ps.http_client() + + # 1. Check that we always hit the cache after compute restart. + for i in range(3): + ep.start() + ep.stop() + + def check_metrics(i=i): + metrics = ps_http.get_metrics() + # Never miss. + # The first time compute_ctl sends `get_basebackup` with lsn=None, we do not cache such requests. + # All other requests should be a hit + assert ( + metrics.query_one( + "pageserver_basebackup_cache_read_total", {"result": "miss"} + ).value + == 0 + ) + # All but the first requests are hits. + assert ( + metrics.query_one("pageserver_basebackup_cache_read_total", {"result": "hit"}).value + == i + ) + # Every compute shut down should trigger a prepare request. + assert ( + metrics.query_one( + "pageserver_basebackup_cache_prepare_total", {"result": "ok"} + ).value + == i + 1 + ) + + wait_until(check_metrics) + + # 2. Check that we eventually delete old basebackup files, but not the latest one. + def check_bb_file_count(): + bb_files = list(ps.workdir.joinpath("basebackup_cache").iterdir()) + # tmp dir + 1 basebackup file. + assert len(bb_files) == 2 + + wait_until(check_bb_file_count) + + # 3. Check that we delete basebackup file for timeline with active compute.
+ ep.start() + ep.safe_psql("create table t1 as select generate_series(1, 10) as n") + + def check_bb_dir_empty(): + bb_files = list(ps.workdir.joinpath("basebackup_cache").iterdir()) + # only tmp dir. + assert len(bb_files) == 1 + + wait_until(check_bb_dir_empty) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 53edf9f79e..370f57b19d 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -202,6 +202,8 @@ def test_pageserver_gc_compaction_preempt( env = neon_env_builder.init_start(initial_tenant_conf=conf) env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*") + env.pageserver.allowed_errors.append(".*flush task cancelled.*") + env.pageserver.allowed_errors.append(".*failed to pipe.*") tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -229,7 +231,7 @@ def test_pageserver_gc_compaction_preempt( @skip_in_debug_build("only run with release build") -@pytest.mark.timeout(600) # This test is slow with sanitizers enabled, especially on ARM +@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM @pytest.mark.parametrize( "with_branches", ["with_branches", "no_branches"], diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 37208c9fff..6ee6837cd2 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -19,6 +19,16 @@ TEST_ROLE_NAMES = [ {"name": "role$"}, {"name": "role$$"}, {"name": "role$x$"}, + {"name": "x"}, + {"name": "xx"}, + {"name": "$x"}, + {"name": "x$"}, + {"name": "$x$"}, + {"name": "xx$"}, + {"name": "$xx"}, + {"name": "$xx$"}, + # 63 bytes is the limit for role/DB names in Postgres + {"name": "x" * 63}, ] TEST_DB_NAMES = [ @@ -74,6 +84,43 @@ TEST_DB_NAMES = [ "name": "db name$x$", "owner": "role$x$", }, + { + "name": "x", + "owner": "x", + }, + { + "name": "xx", + "owner": 
"xx", + }, + { + "name": "$x", + "owner": "$x", + }, + { + "name": "x$", + "owner": "x$", + }, + { + "name": "$x$", + "owner": "$x$", + }, + { + "name": "xx$", + "owner": "xx$", + }, + { + "name": "$xx", + "owner": "$xx", + }, + { + "name": "$xx$", + "owner": "$xx$", + }, + # 63 bytes is the limit for role/DB names in Postgres + { + "name": "x" * 63, + "owner": "x" * 63, + }, ] @@ -146,6 +193,10 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): """ Test that compute_ctl can create and work with databases and roles with special characters (whitespaces, %, tabs, etc.) in the name. + Also use `drop_subscriptions_before_start: true`. We do not actually + have any subscriptions in this test, so it should be no-op, but it + i) simulates the case when we create a second dev branch together with + a new project creation, and ii) just generally stresses more code paths. """ env = neon_simple_env @@ -159,6 +210,7 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): **{ "spec": { "skip_pg_catalog_updates": False, + "drop_subscriptions_before_start": True, "cluster": { "roles": TEST_ROLE_NAMES, "databases": TEST_DB_NAMES, @@ -202,6 +254,7 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): **{ "spec": { "skip_pg_catalog_updates": False, + "drop_subscriptions_before_start": True, "cluster": { "roles": [], "databases": [], @@ -544,3 +597,69 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env ) role = cursor.fetchone() assert role is None + + +def test_db_with_custom_settings(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can work with databases that have some custom settings. + For example, role=some_other_role, default_transaction_read_only=on, + search_path=non_public_schema, statement_timeout=1 (1ms). 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + TEST_ROLE = "some_other_role" + TEST_DB = "db_with_custom_settings" + TEST_SCHEMA = "non_public_schema" + + endpoint.respec_deep( + **{ + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": [ + { + "name": TEST_DB, + "owner": TEST_ROLE, + } + ], + "roles": [ + { + "name": TEST_ROLE, + } + ], + }, + } + } + ) + + endpoint.reconfigure() + + with endpoint.cursor(dbname=TEST_DB) as cursor: + cursor.execute(f"CREATE SCHEMA {TEST_SCHEMA}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET role = {TEST_ROLE}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET default_transaction_read_only = on") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET search_path = {TEST_SCHEMA}") + cursor.execute(f"ALTER DATABASE {TEST_DB} SET statement_timeout = 1") + + with endpoint.cursor(dbname=TEST_DB) as cursor: + cursor.execute("SELECT current_role") + role = cursor.fetchone() + assert role is not None + assert role[0] == TEST_ROLE + + cursor.execute("SHOW default_transaction_read_only") + default_transaction_read_only = cursor.fetchone() + assert default_transaction_read_only is not None + assert default_transaction_read_only[0] == "on" + + cursor.execute("SHOW search_path") + search_path = cursor.fetchone() + assert search_path is not None + assert search_path[0] == TEST_SCHEMA + + # Do not check statement_timeout, because we force it to 2min + # in `endpoint.cursor()` fixture. 
+ + endpoint.reconfigure() diff --git a/test_runner/regress/test_compute_http.py b/test_runner/regress/test_compute_http.py new file mode 100644 index 0000000000..9846d44ce2 --- /dev/null +++ b/test_runner/regress/test_compute_http.py @@ -0,0 +1,78 @@ +from __future__ import annotations + +from http.client import FORBIDDEN, UNAUTHORIZED +from typing import TYPE_CHECKING + +import jwt +import pytest +from fixtures.endpoint.http import COMPUTE_AUDIENCE, ComputeClaimsScope, EndpointHttpClient +from fixtures.utils import run_only_on_default_postgres +from requests import RequestException + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +@run_only_on_default_postgres("The code path being tested is not dependent on Postgres version") +def test_compute_no_scope_claim(neon_simple_env: NeonEnv): + """ + Test that if the JWT scope is not admin and no compute_id is specified, + the external HTTP server returns a 403 Forbidden error. + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + # Encode nothing in the token + token = jwt.encode({}, env.auth_keys.priv, algorithm="EdDSA") + + # Create an admin-scoped HTTP client + client = EndpointHttpClient( + external_port=endpoint.external_http_port, + internal_port=endpoint.internal_http_port, + jwt=token, + ) + + try: + client.status() + pytest.fail("Exception should have been raised") + except RequestException as e: + assert e.response is not None + assert e.response.status_code == FORBIDDEN + + +@pytest.mark.parametrize( + "audience", + (COMPUTE_AUDIENCE, "invalid", None), + ids=["with_audience", "with_invalid_audience", "without_audience"], +) +@run_only_on_default_postgres("The code path being tested is not dependent on Postgres version") +def test_compute_admin_scope_claim(neon_simple_env: NeonEnv, audience: str | None): + """ + Test that an admin-scoped JWT can access the compute's external HTTP server + without the compute_id being specified in the claims. 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + data: dict[str, str | list[str]] = {"scope": str(ComputeClaimsScope.ADMIN)} + if audience: + data["aud"] = [audience] + + token = jwt.encode(data, env.auth_keys.priv, algorithm="EdDSA") + + # Create an admin-scoped HTTP client + client = EndpointHttpClient( + external_port=endpoint.external_http_port, + internal_port=endpoint.internal_http_port, + jwt=token, + ) + + try: + client.status() + if audience != COMPUTE_AUDIENCE: + pytest.fail("Exception should have been raised") + except RequestException as e: + assert e.response is not None + assert e.response.status_code == UNAUTHORIZED diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 5e3f8671a2..2cb2ee7b58 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -217,11 +217,11 @@ if SQL_EXPORTER is None: self, logs_dir: Path, config_file: Path, collector_file: Path, port: int ) -> None: # NOTE: Keep the version the same as in - # compute/Dockerfile.compute-node and Dockerfile.build-tools. + # compute/compute-node.Dockerfile and build-tools.Dockerfile. # # The "host" network mode allows sql_exporter to talk to the # endpoint which is running on the host. 
- super().__init__("docker.io/burningalchemist/sql_exporter:0.17.0", network_mode="host") + super().__init__("docker.io/burningalchemist/sql_exporter:0.17.3", network_mode="host") self.__logs_dir = logs_dir self.__port = port diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 3b6c94a268..24ba0713d2 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -14,7 +14,7 @@ from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.paths import BASE_DIR from fixtures.pg_config import PgConfigKey -from fixtures.utils import subprocess_capture +from fixtures.utils import WITH_SANITIZERS, subprocess_capture from werkzeug.wrappers.response import Response if TYPE_CHECKING: @@ -148,6 +148,15 @@ def test_remote_extensions( pg_config: PgConfig, extension: RemoteExtension, ): + if WITH_SANITIZERS and extension is RemoteExtension.WITH_LIB: + pytest.skip( + """ + For this test to work with sanitizers enabled, we would need to + compile the dummy Postgres extension with the same CFLAGS that we + compile Postgres and the neon extension with to link the sanitizers. + """ + ) + # Setup a mock nginx S3 gateway which will return our test extension. (host, port) = httpserver_listen_address extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway" @@ -212,7 +221,7 @@ def test_remote_extensions( endpoint.create_remote_extension_spec(spec) - endpoint.start(remote_ext_config=extensions_endpoint) + endpoint.start(remote_ext_base_url=extensions_endpoint) with endpoint.connect() as conn: with conn.cursor() as cur: @@ -240,7 +249,7 @@ def test_remote_extensions( # Remove the extension files to force a redownload of the extension. 
extension.remove(test_output_dir, pg_version) - endpoint.start(remote_ext_config=extensions_endpoint) + endpoint.start(remote_ext_base_url=extensions_endpoint) # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions. with endpoint.connect() as conn: diff --git a/test_runner/regress/test_endpoint_storage.py b/test_runner/regress/test_endpoint_storage.py index 04029114ec..1e27ef4b14 100644 --- a/test_runner/regress/test_endpoint_storage.py +++ b/test_runner/regress/test_endpoint_storage.py @@ -4,10 +4,12 @@ import pytest from aiohttp import ClientSession from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import run_only_on_default_postgres from jwcrypto import jwk, jwt @pytest.mark.asyncio +@run_only_on_default_postgres("test doesn't use postgres") async def test_endpoint_storage_insert_retrieve_delete(neon_simple_env: NeonEnv): """ Inserts, retrieves, and deletes test file using a JWT token @@ -35,7 +37,6 @@ async def test_endpoint_storage_insert_retrieve_delete(neon_simple_env: NeonEnv) key = f"http://{base_url}/{tenant_id}/{timeline_id}/{endpoint_id}/key" headers = {"Authorization": f"Bearer {token}"} log.info(f"cache key url {key}") - log.info(f"token {token}") async with ClientSession(headers=headers) as session: async with session.get(key) as res: diff --git a/test_runner/regress/test_gist.py b/test_runner/regress/test_gist.py new file mode 100644 index 0000000000..89e3b9b2b1 --- /dev/null +++ b/test_runner/regress/test_gist.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +# +# Test unlogged build for GIST index +# +def test_gist(neon_simple_env: NeonEnv): + env = neon_simple_env + endpoint = env.endpoints.create_start("main") + con = endpoint.connect() + cur = con.cursor() + iterations = 100 + + for _ in range(iterations): + cur.execute( + "CREATE TABLE pvactst (i INT, a INT[], p 
POINT) with (autovacuum_enabled = off)" + ) + cur.execute( + "INSERT INTO pvactst SELECT i, array[1,2,3], point(i, i+1) FROM generate_series(1,1000) i" + ) + cur.execute("CREATE INDEX gist_pvactst ON pvactst USING gist (p)") + cur.execute("VACUUM pvactst") + cur.execute("DROP TABLE pvactst") diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index a26c3994a5..0472b92145 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -24,6 +24,7 @@ from fixtures.utils import ( skip_in_debug_build, wait_until, ) +from fixtures.workload import Workload from mypy_boto3_kms import KMSClient from mypy_boto3_kms.type_defs import EncryptResponseTypeDef from mypy_boto3_s3 import S3Client @@ -97,6 +98,10 @@ def test_pgdata_import_smoke( f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/" ) + if neon_env_builder.storage_controller_config is None: + neon_env_builder.storage_controller_config = {} + neon_env_builder.storage_controller_config["timelines_onto_safekeepers"] = True + env = neon_env_builder.init_start() # The test needs LocalFs support, which is only built in testing mode. @@ -125,9 +130,8 @@ def test_pgdata_import_smoke( elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD: target_relblock_size = (shard_count or 1) * stripe_size * 8192 * 2 elif rel_block_size == RelBlockSize.MULTIPLE_RELATION_SEGMENTS: - # Postgres uses a 1GiB segment size, fixed at compile time, so we must use >2GB of data - # to exercise multiple segments. 
- target_relblock_size = int(((2.333 * 1024 * 1024 * 1024) // 8192) * 8192) + segment_size = 16 * 1024 * 1024 + target_relblock_size = segment_size * 8 else: raise ValueError @@ -286,34 +290,28 @@ def test_pgdata_import_smoke( # # validate that we can write # - rw_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, - endpoint_id="rw", - tenant_id=tenant_id, - config_lines=ep_config, - ) - rw_endpoint.safe_psql("create table othertable(values text)") - rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()")) + workload = Workload(env, tenant_id, timeline_id, branch_name=import_branch_name) + workload.init() + workload.write_rows(64) + workload.validate() - # TODO: consider using `class Workload` here - # to do compaction and whatnot? + rw_lsn = Lsn(workload.endpoint().safe_psql_scalar("select pg_current_wal_flush_lsn()")) # # validate that we can branch (important use case) # # ... at the tip - _ = env.create_branch( + child_timeline_id = env.create_branch( new_branch_name="br-tip", ancestor_branch_name=import_branch_name, tenant_id=tenant_id, ancestor_start_lsn=rw_lsn, ) - br_tip_endpoint = env.endpoints.create_start( - branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config - ) - validate_vanilla_equivalence(br_tip_endpoint) - br_tip_endpoint.safe_psql("select * from othertable") + child_workload = workload.branch(timeline_id=child_timeline_id, branch_name="br-tip") + child_workload.validate() + + validate_vanilla_equivalence(child_workload.endpoint()) # ... 
at the initdb lsn _ = env.create_branch( @@ -330,7 +328,7 @@ def test_pgdata_import_smoke( ) validate_vanilla_equivalence(br_initdb_endpoint) with pytest.raises(psycopg2.errors.UndefinedTable): - br_initdb_endpoint.safe_psql("select * from othertable") + br_initdb_endpoint.safe_psql(f"select * from {workload.table}") @run_only_on_default_postgres(reason="PG version is irrelevant here") @@ -414,6 +412,88 @@ def test_import_completion_on_restart( wait_until(cplane_notified) +@run_only_on_default_postgres(reason="PG version is irrelevant here") +def test_import_respects_tenant_shutdown( + neon_env_builder: NeonEnvBuilder, vanilla_pg: VanillaPostgres, make_httpserver: HTTPServer +): + """ + Validate that importing timelines respect the usual timeline life cycle: + 1. Shut down on tenant shut-down and resumes upon re-attach + 2. Deletion on timeline deletion (TODO) + """ + # Set up mock control plane HTTP server to listen for import completions + import_completion_signaled = Event() + + def handler(request: Request) -> Response: + log.info(f"control plane /import_complete request: {request.json}") + import_completion_signaled.set() + return Response(json.dumps({}), status=200) + + cplane_mgmt_api_server = make_httpserver + cplane_mgmt_api_server.expect_request( + "/storage/api/v1/import_complete", method="PUT" + ).respond_with_handler(handler) + + # Plug the cplane mock in + neon_env_builder.control_plane_hooks_api = ( + f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/" + ) + + # The import will specify a local filesystem path mocking remote storage + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + vanilla_pg.start() + vanilla_pg.stop() + + env = neon_env_builder.init_configs() + env.start() + + importbucket_path = neon_env_builder.repo_dir / "test_import_completion_bucket" + mock_import_bucket(vanilla_pg, importbucket_path) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + 
idempotency = ImportPgdataIdemptencyKey.random() + + # Pause before sending the notification + failpoint_name = "import-timeline-pre-execute-pausable" + env.pageserver.http_client().configure_failpoints((failpoint_name, "pause")) + + env.storage_controller.tenant_create(tenant_id) + env.storage_controller.timeline_create( + tenant_id, + { + "new_timeline_id": str(timeline_id), + "import_pgdata": { + "idempotency_key": str(idempotency), + "location": {"LocalFs": {"path": str(importbucket_path.absolute())}}, + }, + }, + ) + + def hit_failpoint(): + log.info("Checking log for pattern...") + try: + assert env.pageserver.log_contains(f".*at failpoint {failpoint_name}.*") + except Exception: + log.exception("Failed to find pattern in log") + raise + + wait_until(hit_failpoint) + assert not import_completion_signaled.is_set() + + # Restart the pageserver while an import job is in progress. + # This clears the failpoint and we expect that the import starts up afresh + # after the restart and eventually completes. 
+ env.pageserver.stop() + env.pageserver.start() + + def cplane_notified(): + assert import_completion_signaled.is_set() + + wait_until(cplane_notified) + + def test_fast_import_with_pageserver_ingest( test_output_dir, vanilla_pg: VanillaPostgres, @@ -521,7 +601,9 @@ def test_fast_import_with_pageserver_ingest( env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) # Run fast_import - fast_import.set_aws_creds(mock_s3_server, {"RUST_LOG": "aws_config=debug,aws_sdk_kms=debug"}) + fast_import.set_aws_creds( + mock_s3_server, {"RUST_LOG": "info,aws_config=debug,aws_sdk_kms=debug"} + ) pg_port = port_distributor.get_port() fast_import.run_pgdata(pg_port=pg_port, s3prefix=f"s3://{bucket}/{key_prefix}") @@ -641,6 +723,55 @@ def test_fast_import_binary( assert res[0][0] == 10 +def test_fast_import_event_triggers( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, +): + vanilla_pg.start() + vanilla_pg.safe_psql(""" + CREATE FUNCTION test_event_trigger_for_drops() + RETURNS event_trigger LANGUAGE plpgsql AS $$ + DECLARE + obj record; + BEGIN + FOR obj IN SELECT * FROM pg_event_trigger_dropped_objects() + LOOP + RAISE NOTICE '% dropped object: % %.% %', + tg_tag, + obj.object_type, + obj.schema_name, + obj.object_name, + obj.object_identity; + END LOOP; + END + $$; + + CREATE EVENT TRIGGER test_event_trigger_for_drops + ON sql_drop + EXECUTE PROCEDURE test_event_trigger_for_drops(); + """) + + pg_port = port_distributor.get_port() + p = fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr()) + assert p.returncode == 0 + + vanilla_pg.stop() + + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from 
normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + res = conn.safe_psql("SELECT count(*) FROM pg_event_trigger;") + log.info(f"Result: {res}") + assert res[0][0] == 0, f"Neon does not support importing event triggers, got: {res[0][0]}" + + def test_fast_import_restore_to_connstring( test_output_dir, vanilla_pg: VanillaPostgres, diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py new file mode 100644 index 0000000000..82e1e9fcba --- /dev/null +++ b/test_runner/regress/test_lfc_prewarm.py @@ -0,0 +1,221 @@ +import random +import threading +import time +from enum import Enum + +import pytest +from fixtures.endpoint.http import EndpointHttpClient +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import USE_LFC +from prometheus_client.parser import text_string_to_metric_families as prom_parse_impl + + +class LfcQueryMethod(Enum): + COMPUTE_CTL = False + POSTGRES = True + + +PREWARM_LABEL = "compute_ctl_lfc_prewarm_requests_total" +OFFLOAD_LABEL = "compute_ctl_lfc_offload_requests_total" +QUERY_OPTIONS = LfcQueryMethod.POSTGRES, LfcQueryMethod.COMPUTE_CTL + + +def check_pinned_entries(cur): + # some LFC buffers can be temporarily locked by autovacuum or background writer + for _ in range(10): + cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'") + n_pinned = cur.fetchall()[0][0] + if n_pinned == 0: + break + time.sleep(1) + assert n_pinned == 0 + + +def prom_parse(client: EndpointHttpClient) -> dict[str, float]: + return { + sample.name: sample.value + for family in prom_parse_impl(client.metrics()) + for sample in family.samples + if sample.name in (PREWARM_LABEL, OFFLOAD_LABEL) + } + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"]) +def test_lfc_prewarm(neon_simple_env: NeonEnv, 
query: LfcQueryMethod): + env = neon_simple_env + n_records = 1000000 + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000", + ], + ) + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create extension neon version '1.6'") + pg_cur.execute("create database lfc") + + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + log.info(f"Inserting {n_records} rows") + lfc_cur.execute("create table t(pk integer primary key, payload text default repeat('?', 128))") + lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + http_client = endpoint.http_client() + if query is LfcQueryMethod.COMPUTE_CTL: + status = http_client.prewarm_lfc_status() + assert status["status"] == "not_prewarmed" + assert "error" not in status + http_client.offload_lfc() + assert http_client.prewarm_lfc_status()["status"] == "not_prewarmed" + assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0} + else: + pg_cur.execute("select get_local_cache_state()") + lfc_state = pg_cur.fetchall()[0][0] + + endpoint.stop() + endpoint.start() + + # wait until compute_ctl completes downgrade of extension to default version + time.sleep(1) + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("alter extension neon update to '1.6'") + + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.prewarm_lfc() + else: + pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + + pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'") + lfc_used_pages = pg_cur.fetchall()[0][0] + log.info(f"Used LFC size: {lfc_used_pages}") + pg_cur.execute("select * from get_prewarm_info()") + 
prewarm_info = pg_cur.fetchall()[0] + log.info(f"Prewarm info: {prewarm_info}") + total, prewarmed, skipped, _ = prewarm_info + progress = (prewarmed + skipped) * 100 // total + log.info(f"Prewarm progress: {progress}%") + + assert lfc_used_pages > 10000 + assert ( + prewarm_info[0] > 0 + and prewarm_info[1] > 0 + and prewarm_info[0] == prewarm_info[1] + prewarm_info[2] + ) + + lfc_cur.execute("select sum(pk) from t") + assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 + + check_pinned_entries(pg_cur) + + desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped} + if query is LfcQueryMethod.COMPUTE_CTL: + assert http_client.prewarm_lfc_status() == desired + assert prom_parse(http_client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1} + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"]) +def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMethod): + env = neon_simple_env + n_records = 10000 + n_threads = 4 + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000000", + ], + ) + + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + pg_cur.execute("create extension neon version '1.6'") + pg_cur.execute("CREATE DATABASE lfc") + + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + lfc_cur.execute( + "create table accounts(id integer primary key, balance bigint default 0, payload text default repeat('?', 1000)) with (fillfactor=10)" + ) + log.info(f"Inserting {n_records} rows") + lfc_cur.execute(f"insert into accounts(id) values (generate_series(1,{n_records}))") + log.info(f"Inserted {n_records} rows") + + http_client = endpoint.http_client() + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.offload_lfc() + 
else: + pg_cur.execute("select get_local_cache_state()") + lfc_state = pg_cur.fetchall()[0][0] + + running = True + n_prewarms = 0 + + def workload(): + lfc_conn = endpoint.connect(dbname="lfc") + lfc_cur = lfc_conn.cursor() + n_transfers = 0 + while running: + src = random.randint(1, n_records) + dst = random.randint(1, n_records) + lfc_cur.execute("update accounts set balance=balance-100 where id=%s", (src,)) + lfc_cur.execute("update accounts set balance=balance+100 where id=%s", (dst,)) + n_transfers += 1 + log.info(f"Number of transfers: {n_transfers}") + + def prewarm(): + pg_conn = endpoint.connect() + pg_cur = pg_conn.cursor() + while running: + pg_cur.execute("alter system set neon.file_cache_size_limit='1MB'") + pg_cur.execute("select pg_reload_conf()") + pg_cur.execute("alter system set neon.file_cache_size_limit='1GB'") + pg_cur.execute("select pg_reload_conf()") + + if query is LfcQueryMethod.COMPUTE_CTL: + http_client.prewarm_lfc() + else: + pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + + nonlocal n_prewarms + n_prewarms += 1 + log.info(f"Number of prewarms: {n_prewarms}") + + workload_threads = [] + for _ in range(n_threads): + t = threading.Thread(target=workload) + workload_threads.append(t) + t.start() + + prewarm_thread = threading.Thread(target=prewarm) + prewarm_thread.start() + + time.sleep(20) + + running = False + for t in workload_threads: + t.join() + prewarm_thread.join() + + lfc_cur.execute("select sum(balance) from accounts") + total_balance = lfc_cur.fetchall()[0][0] + assert total_balance == 0 + + check_pinned_entries(pg_cur) + if query is LfcQueryMethod.COMPUTE_CTL: + assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: n_prewarms} diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index fa1cd61206..e3f9982486 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ 
-3,7 +3,7 @@ Tests in this module exercise the pageserver's behavior around generation numbers, as defined in docs/rfcs/025-generation-numbers.md. Briefly, the behaviors we require of the pageserver are: -- Do not start a tenant without a generation number if control_plane_api is set +- Do not start a tenant without a generation number - Remote objects must be suffixed with generation - Deletions may only be executed after validating generation - Updates to remote_consistent_lsn may only be made visible after validating generation diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index acec0ba44a..474258c9eb 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -506,9 +506,11 @@ class SyntheticSizeVerifier: PER_METRIC_VERIFIERS = { "remote_storage_size": CannotVerifyAnything, - "resident_size": CannotVerifyAnything, "written_size": WrittenDataVerifier, "written_data_bytes_delta": WrittenDataDeltaVerifier, + "written_size_since_parent": WrittenDataVerifier, # same as written_size on root + "pitr_cutoff": CannotVerifyAnything, + "pitr_history_size_since_parent": WrittenDataVerifier, # same as written_size on root w/o GC "timeline_logical_size": CannotVerifyAnything, "synthetic_storage_size": SyntheticSizeVerifier, } diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 3aa0c63979..f2523ec9b5 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -510,7 +510,7 @@ def list_elegible_layers( except KeyError: # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map # matches what's on disk. 
- log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}") + log.warning(f"Lookup {layer_file_name} from {list(visible_map.keys())}") raise return list(c for c in candidates if is_visible(c)) @@ -636,7 +636,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): except: # On assertion failures, log some details to help with debugging heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) - log.warn(f"heatmap contents: {json.dumps(heatmap, indent=2)}") + log.warning(f"heatmap contents: {json.dumps(heatmap, indent=2)}") raise # Scrub the remote storage diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 0fea706888..474002353b 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -471,7 +471,7 @@ def test_tx_abort_with_many_relations( try: # Rollback phase should be fast: this is one WAL record that we should process efficiently fut = exec.submit(rollback_and_wait) - fut.result(timeout=15) + fut.result(timeout=15 if reldir_type == "v1" else 30) except: exec.shutdown(wait=False, cancel_futures=True) raise diff --git a/test_runner/regress/test_proxy_metric_collection.py b/test_runner/regress/test_proxy_metric_collection.py index 85d8a6daaa..7442d50f68 100644 --- a/test_runner/regress/test_proxy_metric_collection.py +++ b/test_runner/regress/test_proxy_metric_collection.py @@ -52,6 +52,8 @@ def proxy_with_metric_collector( proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() external_http_port = port_distributor.get_port() + router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() (host, port) = httpserver_listen_address metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" @@ -63,6 +65,8 @@ def proxy_with_metric_collector( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + router_port=router_port, + router_tls_port=router_tls_port, 
external_http_port=external_http_port, metric_collection_endpoint=metric_collection_endpoint, metric_collection_interval=metric_collection_interval, diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index e2a22cc769..c88bc7aace 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -27,8 +27,9 @@ from contextlib import closing import psycopg2 import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup +from fixtures.neon_fixtures import NeonEnv, PgBin, wait_for_last_flush_lsn, wait_replica_caughtup from fixtures.pg_version import PgVersion from fixtures.utils import query_scalar, skip_on_postgres, wait_until @@ -695,3 +696,110 @@ def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv): with secondary.cursor() as secondary_cur: secondary_cur.execute("select count(*) from t") assert secondary_cur.fetchone() == (n_restarts,) + + +def test_ephemeral_endpoints_vacuum(neon_simple_env: NeonEnv, pg_bin: PgBin): + env = neon_simple_env + endpoint = env.endpoints.create_start("main") + + sql = """ +CREATE TABLE CHAR_TBL(f1 char(4)); +CREATE TABLE FLOAT8_TBL(f1 float8); +CREATE TABLE INT2_TBL(f1 int2); +CREATE TABLE INT4_TBL(f1 int4); +CREATE TABLE INT8_TBL(q1 int8, q2 int8); +CREATE TABLE POINT_TBL(f1 point); +CREATE TABLE TEXT_TBL (f1 text); +CREATE TABLE VARCHAR_TBL(f1 varchar(4)); +CREATE TABLE onek (unique1 int4); +CREATE TABLE onek2 AS SELECT * FROM onek; +CREATE TABLE tenk1 (unique1 int4); +CREATE TABLE tenk2 AS SELECT * FROM tenk1; +CREATE TABLE person (name text, age int4,location point); +CREATE TABLE emp (salary int4, manager name) INHERITS (person); +CREATE TABLE student (gpa float8) INHERITS (person); +CREATE TABLE stud_emp ( percent int4) INHERITS (emp, student); +CREATE TABLE road (name text,thepath path); +CREATE TABLE 
ihighway () INHERITS (road); +CREATE TABLE shighway(surface text) INHERITS (road); +CREATE TABLE BOOLTBL3 (d text, b bool, o int); +CREATE TABLE booltbl4(isfalse bool, istrue bool, isnul bool); +DROP TABLE BOOLTBL3; +DROP TABLE BOOLTBL4; +CREATE TABLE ceil_floor_round (a numeric); +DROP TABLE ceil_floor_round; +CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8); +DROP TABLE width_bucket_test; +CREATE TABLE num_input_test (n1 numeric); +CREATE TABLE num_variance (a numeric); +INSERT INTO num_variance VALUES (0); +CREATE TABLE snapshot_test (nr integer, snap txid_snapshot); +CREATE TABLE guid1(guid_field UUID, text_field TEXT DEFAULT(now())); +CREATE TABLE guid2(guid_field UUID, text_field TEXT DEFAULT(now())); +CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field); +CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field); +TRUNCATE guid1; +DROP TABLE guid1; +DROP TABLE guid2 CASCADE; +CREATE TABLE numrange_test (nr NUMRANGE); +CREATE INDEX numrange_test_btree on numrange_test(nr); +CREATE TABLE numrange_test2(nr numrange); +CREATE INDEX numrange_test2_hash_idx on numrange_test2 using hash (nr); +INSERT INTO numrange_test2 VALUES('[, 5)'); +CREATE TABLE textrange_test (tr text); +CREATE INDEX textrange_test_btree on textrange_test(tr); +CREATE TABLE test_range_gist(ir int4range); +CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir); +DROP INDEX test_range_gist_idx; +CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir); +CREATE TABLE test_range_spgist(ir int4range); +CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir); +DROP INDEX test_range_spgist_idx; +CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir); +CREATE TABLE test_range_elem(i int4); +CREATE INDEX test_range_elem_idx on test_range_elem (i); +CREATE INDEX ON test_range_elem using spgist(int4range(i,i+10)); +DROP TABLE test_range_elem; +CREATE TABLE test_range_excl(room int4range, speaker int4range, during 
tsrange, exclude using gist (room with =, during with &&), exclude using gist (speaker with =, during with &&)); +CREATE TABLE f_test(f text, i int); +CREATE TABLE i8r_array (f1 int, f2 text); +CREATE TYPE arrayrange as range (subtype=int4[]); +CREATE TYPE two_ints as (a int, b int); +DROP TYPE two_ints cascade; +CREATE TABLE text_support_test (t text); +CREATE TABLE TEMP_FLOAT (f1 FLOAT8); +CREATE TABLE TEMP_INT4 (f1 INT4); +CREATE TABLE TEMP_INT2 (f1 INT2); +CREATE TABLE TEMP_GROUP (f1 INT4, f2 INT4, f3 FLOAT8); +CREATE TABLE POLYGON_TBL(f1 polygon); +CREATE TABLE quad_poly_tbl (id int, p polygon); +INSERT INTO quad_poly_tbl SELECT (x - 1) * 100 + y, polygon(circle(point(x * 10, y * 10), 1 + (x + y) % 10)) FROM generate_series(1, 200) x, generate_series(1, 100) y; +CREATE TABLE quad_poly_tbl_ord_seq2 AS SELECT 1 FROM quad_poly_tbl; +CREATE TABLE quad_poly_tbl_ord_idx2 AS SELECT 1 FROM quad_poly_tbl; +""" + + with endpoint.cursor() as cur: + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + env.endpoints.create_start(branch_name="main", lsn=lsn) + log.info(f"lsn: {lsn}") + + for line in sql.split("\n"): + if len(line.strip()) == 0 or line.startswith("--"): + continue + cur.execute(line) + + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + env.endpoints.create_start(branch_name="main", lsn=lsn) + log.info(f"lsn: {lsn}") + + cur.execute("VACUUM FULL pg_class;") + + for ep in env.endpoints.endpoints: + log.info(f"{ep.endpoint_id} / {ep.pg_port}") + pg_dump_command = ["pg_dumpall", "-f", f"/tmp/dump-{ep.endpoint_id}.sql"] + env_vars = { + "PGPORT": str(ep.pg_port), + "PGUSER": endpoint.default_options["user"], + "PGHOST": endpoint.default_options["host"], + } + pg_bin.run_capture(pg_dump_command, env=env_vars) diff --git a/test_runner/regress/test_role_grants.py b/test_runner/regress/test_role_grants.py index b2251875f0..5b13d461f0 100644 --- a/test_runner/regress/test_role_grants.py +++ b/test_runner/regress/test_role_grants.py @@ 
-39,3 +39,10 @@ def test_role_grants(neon_simple_env: NeonEnv): res = cur.fetchall() assert res == [(1,)], "select should not succeed" + + # confirm that replicas can also ensure the grants are correctly set. + replica = env.endpoints.new_replica_start(endpoint) + replica_client = replica.http_client() + replica_client.set_role_grants( + "test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"] + ) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 0bfc4b1d8c..4c9887fb92 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1334,6 +1334,13 @@ def test_sharding_split_failures( tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}' ) + # Create bystander tenants with various shard counts. They should not be affected by the aborted + # splits. Regression test for https://github.com/neondatabase/cloud/issues/28589. + bystanders = {} # id → shard_count + for bystander_shard_count in [1, 2, 4, 8]: + id, _ = env.create_tenant(shard_count=bystander_shard_count) + bystanders[id] = bystander_shard_count + env.storage_controller.allowed_errors.extend( [ # All split failures log a warning when then enqueue the abort operation @@ -1394,6 +1401,8 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) + if tenant_shard_id.tenant_id != tenant_id: + continue # skip bystanders log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == initial_shard_count if loc[1]["mode"] == "Secondary": @@ -1414,6 +1423,8 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) + if tenant_shard_id.tenant_id != tenant_id: + continue # skip bystanders log.info(f"Shard 
{tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == split_shard_count if loc[1]["mode"] == "Secondary": @@ -1496,6 +1507,12 @@ def test_sharding_split_failures( # the scheduler reaches an idle state env.storage_controller.reconcile_until_idle(timeout_secs=30) + # Check that all bystanders are still around. + for bystander_id, bystander_shard_count in bystanders.items(): + response = env.storage_controller.tenant_describe(bystander_id) + assert TenantId(response["tenant_id"]) == bystander_id + assert len(response["shards"]) == bystander_shard_count + env.storage_controller.consistency_check() diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index 19952fc71b..61893f22ba 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING import backoff from fixtures.log_helper import log -from fixtures.neon_fixtures import PgProtocol, VanillaPostgres +from fixtures.neon_fixtures import NeonProxy, PgProtocol, VanillaPostgres if TYPE_CHECKING: from pathlib import Path @@ -41,6 +41,7 @@ class PgSniRouter(PgProtocol): self, neon_binpath: Path, port: int, + tls_port: int, destination: str, tls_cert: Path, tls_key: Path, @@ -53,6 +54,7 @@ class PgSniRouter(PgProtocol): self.host = host self.neon_binpath = neon_binpath self.port = port + self.tls_port = tls_port self.destination = destination self.tls_cert = tls_cert self.tls_key = tls_key @@ -64,6 +66,7 @@ class PgSniRouter(PgProtocol): args = [ str(self.neon_binpath / "pg_sni_router"), *["--listen", f"127.0.0.1:{self.port}"], + *["--listen-tls", f"127.0.0.1:{self.tls_port}"], *["--tls-cert", str(self.tls_cert)], *["--tls-key", str(self.tls_key)], *["--destination", self.destination], @@ -127,10 +130,12 @@ def test_pg_sni_router( pg_port = vanilla_pg.default_options["port"] router_port = port_distributor.get_port() + router_tls_port = 
port_distributor.get_port() with PgSniRouter( neon_binpath=neon_binpath, port=router_port, + tls_port=router_tls_port, destination="local.neon.build", tls_cert=test_output_dir / "router.crt", tls_key=test_output_dir / "router.key", @@ -146,3 +151,22 @@ def test_pg_sni_router( hostaddr="127.0.0.1", ) assert out[0][0] == 1 + + +def test_pg_sni_router_in_proxy( + static_proxy: NeonProxy, + vanilla_pg: VanillaPostgres, +): + # static_proxy starts this. + assert vanilla_pg.is_running() + pg_port = vanilla_pg.default_options["port"] + + out = static_proxy.safe_psql( + "select 1", + dbname="postgres", + sslmode="require", + host=f"endpoint--namespace--{pg_port}.local.neon.build", + hostaddr="127.0.0.1", + port=static_proxy.router_port, + ) + assert out[0][0] == 1 diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 4360b42d68..8d46ef8306 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -193,6 +193,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" ) + offloaded_count = ps_http.get_metric_value( + "pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"} + ) + assert offloaded_count == 0 + ps_http.timeline_archival_config( tenant_id, leaf_timeline_id, @@ -244,6 +249,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b wait_until(leaf_offloaded) wait_until(parent_offloaded) + offloaded_count = ps_http.get_metric_value( + "pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"} + ) + assert offloaded_count == 2 + # Offloaded child timelines should still prevent deletion with pytest.raises( PageserverApiException, diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index a71652af8a..d42c5d403e 100644 --- 
a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1822,7 +1822,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( endpoint2.safe_psql( "SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')" ) - lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + lsn3 = wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set( ["pg_replslot/test_slot_restore/state"] @@ -1839,7 +1839,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( assert all_reparented == set([]) # We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN. - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set( ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"] ), "main branch unaffected" diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index d3c9d61fb7..55c0d45abe 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit d3c9d61fb7a362a165dac7060819dd9d6ad68c28 +Subproject commit 55c0d45abe6467c02084c2192bca117eda6ce1e7 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 8ecb12f21d..de7640f55d 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3 +Subproject commit de7640f55da07512834d5cc40c4b3fb376b5f04f diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 37496f87b5..0bf96bd6d7 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ 
-Subproject commit 37496f87b5324af53c56127e278ee5b1e8435253 +Subproject commit 0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index eab3a37834..8be779fd3a 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit eab3a37834cac6ec0719bf817ac918a201712d66 +Subproject commit 8be779fd3ab9e87206da96a7e4842ef1abf04f44 diff --git a/vendor/revisions.json b/vendor/revisions.json index 90d878d0f7..3e999760f4 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.4", - "eab3a37834cac6ec0719bf817ac918a201712d66" + "17.5", + "8be779fd3ab9e87206da96a7e4842ef1abf04f44" ], "v16": [ - "16.8", - "37496f87b5324af53c56127e278ee5b1e8435253" + "16.9", + "0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198" ], "v15": [ - "15.12", - "8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3" + "15.13", + "de7640f55da07512834d5cc40c4b3fb376b5f04f" ], "v14": [ - "14.17", - "d3c9d61fb7a362a165dac7060819dd9d6ad68c28" + "14.18", + "55c0d45abe6467c02084c2192bca117eda6ce1e7" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f3d8b951a8..9e1123ac0e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -39,8 +39,10 @@ env_logger = { version = "0.11" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } form_urlencoded = { version = "1" } futures-channel = { version = "0.3", features = ["sink"] } +futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } +futures-task = { version = "0.3", default-features = false, features = ["std"] } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } @@ -51,14 +53,14 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] } 
hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["client", "http1", "http2", "runtime", "server", "stream"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } hyper-util = { version = "0.1", features = ["client-legacy", "http1", "http2", "server", "service"] } -indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } -indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } +indexmap = { version = "2", features = ["serde"] } itertools = { version = "0.12" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } -nix = { version = "0.26" } +nix-2f80eeee3b1b6c7e = { package = "nix", version = "0.26" } +nix-fa1f6196edfd7249 = { package = "nix", version = "0.30", features = ["dir", "ioctl", "mman", "poll", "signal", "socket"] } nom = { version = "7" } num = { version = "0.4" } num-bigint = { version = "0.4" } @@ -70,6 +72,7 @@ num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } p256 = { version = "0.13", features = ["jwk"] } parquet = { version = "53", default-features = false, features = ["zstd"] } +percent-encoding = { version = "2" } prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } @@ -78,7 +81,7 @@ regex-syntax = { version = "0.8" } reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "rustls-tls-native-roots", "stream"] } rustls = { version = "0.23", default-features = false, features = ["logging", "ring", "std", "tls12"] } rustls-pki-types = { version = "1", features = ["std"] } -rustls-webpki = { version = "0.102", 
default-features = false, features = ["ring", "std"] } +rustls-webpki = { version = "0.103", default-features = false, features = ["ring", "std"] } scopeguard = { version = "1" } sec1 = { version = "0.7", features = ["pem", "serde", "std", "subtle"] } serde = { version = "1", features = ["alloc", "derive"] } @@ -98,11 +101,11 @@ tokio-rustls = { version = "0.26", default-features = false, features = ["loggin tokio-stream = { version = "0.1" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } -tonic = { version = "0.12", default-features = false, features = ["codegen", "prost", "tls-roots"] } -tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] } +tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "log"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } tracing-log = { version = "0.2" } +tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive", "serde"] } @@ -120,8 +123,7 @@ either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } -indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } -indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } +indexmap = { version = "2", features = ["serde"] } itertools = { version = "0.12" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] }